def save_data(data_1,
              data_2,
              data_3,
              data_path_1,
              data_path_2,
              data_path_3,
              stop_words_path=''):
    """Segment three parallel datasets, filter stop words, and write each
    non-empty result as one space-joined line to the matching output path."""
    stopwords = read_stopwords(stop_words_path)
    with open(data_path_1, 'w', encoding='utf-8') as f1:
        count_1 = 0
        for line in data_1:
            if isinstance(line, str):
                seg_list = segment(line.strip(), cut_type='word')
                seg_list = remove_words(seg_list)
                # drop stop words so they never reach the output file
                seg_list = [w for w in seg_list if w not in stopwords]
                if len(seg_list) > 0:
                    seg_line = ' '.join(seg_list)
                    f1.write(seg_line + '\n')
                    count_1 += 1
        print('train_x_length is ', count_1)

    with open(data_path_2, 'w', encoding='utf-8') as f2:
        count_2 = 0
        for line in data_2:
            if isinstance(line, str):
                seg_list = segment(line.strip(), cut_type='word')
                seg_list = remove_words(seg_list)
                # drop stop words so they never reach the output file
                seg_list = [w for w in seg_list if w not in stopwords]
                if len(seg_list) > 0:
                    seg_line = ' '.join(seg_list)
                    f2.write(seg_line + '\n')
                    count_2 += 1
        print('train_y_length is ', count_2)

    with open(data_path_3, 'w', encoding='utf-8') as f3:
        count_3 = 0
        for line in data_3:
            if isinstance(line, str):
                seg_list = segment(line.strip(), cut_type='word')
                seg_list = remove_words(seg_list)
                # apply the same stop-word filter as the first two datasets
                seg_list = [w for w in seg_list if w not in stopwords]
                if len(seg_list) > 0:
                    seg_line = ' '.join(seg_list)
                    f3.write(seg_line + '\n')
                    count_3 += 1
        print('test_y_length is ', count_3)
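This example leans on two helpers, read_stopwords and remove_words, defined elsewhere in the repo. A minimal sketch of what they might look like, assuming a one-stop-word-per-line file and a small filler-token list (both are assumptions, not the original implementations):

def read_stopwords(path):
    # one stop word per line; an empty path means no filtering
    if not path:
        return set()
    with open(path, encoding='utf-8') as f:
        return {line.strip() for line in f if line.strip()}

def remove_words(words):
    # hypothetical filler tokens often stripped from chat-style corpora
    fillers = {'|', '[', ']', '语音', '图片'}
    return [w for w in words if w not in fillers]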
Example #2
def save_data(data, path, remove_words, label=False):
    """
    @description: 输入数据并在去除停用词及无效词后进行分词
    @param: data-数据集, path-写入文件的路径, remove_words-待去除的无效词
    @return: None
    """
    with open(path, 'w', encoding='utf-8') as f:
        count = 0
        for line in data:
            if isinstance(line, str):
                seg_list = segment(line.strip())
                seg_list = remove_word(seg_list, remove_words)
                ## the label column must keep one output row per input row,
                ## so empty segmentations are padded with placeholder text
                if label:
                    if len(seg_list) > 0:
                        seg_line = ' '.join(seg_list)
                        f.write(seg_line + '\n')
                    else:
                        # pad with a placeholder phrase ("contact anytime")
                        f.write('随时 联系\n')
                    count += 1
                else:
                    if len(seg_list) > 0:
                        seg_line = ' '.join(seg_list)
                        f.write(seg_line + '\n')
                        count += 1
    print('%s is finished, length is ' % path, count)
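With label=True every input row yields exactly one output row, so the label file stays line-aligned with its feature file even when segmentation leaves nothing. A usage sketch (the paths and the remove_words list are placeholders, not from the original project):

train_x = ['方向机重，助力泵换了', '']          # raw feature texts
train_y = ['随时联系', '']                      # raw label texts
invalid = ['|', '[', ']', '图片', '语音']       # hypothetical filler tokens

save_data(train_x, 'train_set.seg_x.txt', invalid)              # empty rows skipped
save_data(train_y, 'train_set.seg_y.txt', invalid, label=True)  # empty rows padded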
def save_data(data_1,
              data_2,
              data_3,
              data_path_1,
              data_path_2,
              data_path_3,
              stop_words_path=''):
    """Variant that writes a newline for every input row (even non-strings),
    keeping each output file line-aligned with its input list."""
    stopwords = read_stopwords(stop_words_path)
    with open(data_path_1, 'w', encoding='utf-8') as f1:
        for line in data_1:
            if isinstance(line, str):
                seg_list = segment(line.strip(), cut_type='word')
                seg_list = remove_words(seg_list)
                # drop stop words so they never reach the output file
                seg_list = [w for w in seg_list if w not in stopwords]
                f1.write(' '.join(seg_list))
            # a newline for every row keeps the file line-aligned with the input
            f1.write('\n')

    with open(data_path_2, 'w', encoding='utf-8') as f2:
        for line in data_2:
            if isinstance(line, str):
                seg_list = segment(line.strip(), cut_type='word')
                seg_list = remove_words(seg_list)
                # drop stop words so they never reach the output file
                seg_list = [w for w in seg_list if w not in stopwords]
                f2.write(' '.join(seg_list))
            f2.write('\n')

    with open(data_path_3, 'w', encoding='utf-8') as f3:
        for line in data_3:
            if isinstance(line, str):
                seg_list = segment(line.strip(), cut_type='word')
                seg_list = remove_words(seg_list)
                # apply the same stop-word filter as the first two datasets
                seg_list = [w for w in seg_list if w not in stopwords]
                f3.write(' '.join(seg_list))
            f3.write('\n')
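Unlike the first save_data above, this variant never drops a row, which matters when the three files must stay parallel. A quick sanity check under that assumption (paths are placeholders):

save_data(train_x, train_y, test_x,
          'train_x.seg.txt', 'train_y.seg.txt', 'test_x.seg.txt',
          stop_words_path='stopwords.txt')

with open('train_x.seg.txt', encoding='utf-8') as f:
    assert len(f.readlines()) == len(train_x)  # one output line per input row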
Example #4
from itertools import chain

def segment_for_fasttext(content):
    """在split_to_sentence的基础上,分词并按空格隔开。采用hanlp的StandardTokenizer。处理一个文本。
    处理时用pandas读取数据,然后对每一个文本的行apply该函数。保存为txt,一行为一个文本。

    """
    total_tokens = []
    sents = split_to_sentence(content)
    for sent in chain.from_iterable(sents):
        tokens = [str(item.word) for item in tokenizer.segment(sent)]
        total_tokens.extend(tokens)
    return ' '.join(total_tokens)
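The function assumes a module-level tokenizer and a split_to_sentence helper. One way to wire it up with pyhanlp and pandas, following the workflow the docstring describes (the column name and file paths are placeholders):

import pandas as pd
from pyhanlp import JClass

# HanLP's StandardTokenizer, loaded through pyhanlp's JVM bridge
tokenizer = JClass('com.hankcs.hanlp.tokenizer.StandardTokenizer')

df = pd.read_csv('corpus.csv')
df['segmented'] = df['text'].astype(str).apply(segment_for_fasttext)
df['segmented'].to_csv('fasttext_input.txt', index=False, header=False)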
Example #5
def segment_for_lda(sentences, stopwords):
    """在split_to_sentence的基础上,分词并去停用词。采用hanlp的StandardTokenizer。

    return:
        list of tokens for a doc.
    """
    total_tokens = []
    for sent in chain.from_iterable(sentences):
        tokens = [item.word for item in tokenizer.segment(sent)
                  if item.word not in stopwords]
        total_tokens.extend(tokens)
    return total_tokens
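The token lists this returns drop straight into gensim's LDA pipeline; a minimal sketch, assuming raw_texts and stopwords are already loaded and num_topics is arbitrary:

from gensim import corpora, models

docs = [segment_for_lda(split_to_sentence(text), stopwords) for text in raw_texts]
dictionary = corpora.Dictionary(docs)                 # token -> id mapping
corpus = [dictionary.doc2bow(doc) for doc in docs]    # bag-of-words vectors
lda = models.LdaModel(corpus, num_topics=10, id2word=dictionary)
print(lda.show_topics())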
def preprocess_sentence(sentence):
    """
    预处理三步(按一个一个的句子读入):
    将句子切词
    去掉要去掉的词
    把list 重新按空格 封装成string
    :param sentence:
    :return:
    """
    seg_list = segment(sentence.strip(), cut_type='word')
    seg_list = remove_words(seg_list)
    seg_line = ' '.join(seg_list)
    return seg_line
def preprocess_sentence(sentence):
    """Variant of the above that skips the remove_words step."""
    seg_list = segment(sentence.strip(), cut_type='word')
    seg_line = ' '.join(seg_list)
    return seg_line
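A quick check of the first variant (the exact tokens depend on the underlying segmenter and the remove_words list):

print(preprocess_sentence('方向机重，助力泵，方向机都换了还是一样'))
# e.g. '方向机 重 助力 泵 方向机 都 换 了 还是 一样'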