def save_data(data_1, data_2, data_3, data_path_1, data_path_2, data_path_3, stop_words_path=''):
    """Segment three datasets line by line, drop stopwords, and write one
    space-joined line per non-empty sample, reporting each file's length."""
    stopwords = read_stopwords(stop_words_path)
    with open(data_path_1, 'w', encoding='utf-8') as f1:
        count_1 = 0
        for line in data_1:
            if isinstance(line, str):
                seg_list = segment(line.strip(), cut_type='word')
                seg_list = remove_words(seg_list)
                # drop the stopwords loaded from stop_words_path
                seg_list = [w for w in seg_list if w not in stopwords]
                if len(seg_list) > 0:
                    f1.write(' '.join(seg_list) + '\n')
                    count_1 += 1
        print('train_x_length is ', count_1)
    with open(data_path_2, 'w', encoding='utf-8') as f2:
        count_2 = 0
        for line in data_2:
            if isinstance(line, str):
                seg_list = segment(line.strip(), cut_type='word')
                seg_list = remove_words(seg_list)
                seg_list = [w for w in seg_list if w not in stopwords]
                if len(seg_list) > 0:
                    f2.write(' '.join(seg_list) + '\n')
                    count_2 += 1
        print('train_y_length is ', count_2)
    with open(data_path_3, 'w', encoding='utf-8') as f3:
        count_3 = 0
        for line in data_3:
            if isinstance(line, str):
                seg_list = segment(line.strip(), cut_type='word')
                seg_list = remove_words(seg_list)
                seg_list = [w for w in seg_list if w not in stopwords]
                if len(seg_list) > 0:
                    f3.write(' '.join(seg_list) + '\n')
                    count_3 += 1
        print('test_y_length is ', count_3)

def save_data(data, path, remove_words, label=False):
    """
    @description: segment each input line after stripping stop words and invalid words
    @param: data - dataset; path - output file path; remove_words - invalid words to strip
    @return: None
    """
    with open(path, 'w', encoding='utf-8') as f:
        count = 0
        for line in data:
            if isinstance(line, str):
                seg_list = segment(line.strip())
                seg_list = remove_word(seg_list, remove_words)
                if label:
                    # the last column holds label values, so every row must be
                    # filled; pad empty lines with a default placeholder label
                    if len(seg_list) > 0:
                        f.write(' '.join(seg_list) + '\n')
                    else:
                        f.write('随时 联系\n')
                    count += 1
                else:
                    if len(seg_list) > 0:
                        f.write(' '.join(seg_list) + '\n')
                        count += 1
    print('%s is finished, length is ' % path, count)

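# A minimal usage sketch for the label-padding variant above. segment and
# remove_word are project-level helpers; the versions below are stand-in stubs
# for illustration only, and 'train_y.txt' is a hypothetical output path.
def segment(text):
    return text.split()

def remove_word(tokens, remove_words):
    return [t for t in tokens if t not in remove_words]

labels = ['质量 很好', '', '物流 太慢']
save_data(labels, 'train_y.txt', remove_words={'的'}, label=True)
# train_y.txt gets three lines; the empty label is padded with '随时 联系'
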
def save_data(data_1, data_2, data_3, data_path_1, data_path_2, data_path_3, stop_words_path=''):
    stopwords = read_stopwords(stop_words_path)
    with open(data_path_1, 'w', encoding='utf-8') as f1:
        for line in data_1:
            if isinstance(line, str):
                seg_list = segment(line.strip(), cut_type='word')
                seg_list = remove_words(seg_list)
                # drop the stopwords loaded from stop_words_path
                seg_list = [w for w in seg_list if w not in stopwords]
                # write the line even when empty, so rows stay aligned across files
                f1.write(' '.join(seg_list) + '\n')
    with open(data_path_2, 'w', encoding='utf-8') as f2:
        for line in data_2:
            if isinstance(line, str):
                seg_list = segment(line.strip(), cut_type='word')
                seg_list = remove_words(seg_list)
                seg_list = [w for w in seg_list if w not in stopwords]
                f2.write(' '.join(seg_list) + '\n')
    with open(data_path_3, 'w', encoding='utf-8') as f3:
        for line in data_3:
            if isinstance(line, str):
                seg_list = segment(line.strip(), cut_type='word')
                seg_list = remove_words(seg_list)
                seg_list = [w for w in seg_list if w not in stopwords]
                f3.write(' '.join(seg_list) + '\n')

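# Both three-path save_data variants above repeat the same block once per
# output file. A sketch of a factored version under the same assumptions
# (segment, remove_words and read_stopwords as defined in this project);
# the name save_data_compact is hypothetical.
def save_data_compact(datasets, paths, stop_words_path=''):
    """Factored version: one loop body shared by all (data, path) pairs."""
    stopwords = read_stopwords(stop_words_path)
    for data, path in zip(datasets, paths):
        with open(path, 'w', encoding='utf-8') as f:
            for line in data:
                if not isinstance(line, str):
                    continue
                seg_list = remove_words(segment(line.strip(), cut_type='word'))
                seg_list = [w for w in seg_list if w not in stopwords]
                f.write(' '.join(seg_list) + '\n')
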
from itertools import chain  # needed for chain.from_iterable below

def segment_for_fasttext(content):
    """Building on split_to_sentence: segment each sentence and join the tokens
    with spaces, using HanLP's StandardTokenizer. Processes one document.
    Typical use: read the data with pandas, apply this function to each
    document row, and save as txt with one document per line.
    """
    total_tokens = []
    sents = split_to_sentence(content)
    for sent in chain.from_iterable(sents):
        tokens = [str(item.word) for item in tokenizer.segment(sent)]
        total_tokens.extend(tokens)
    return ' '.join(total_tokens)

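# The pandas workflow described in the docstring, sketched out. 'corpus.csv'
# and the 'content' column are placeholders for the real data layout.
import pandas as pd

df = pd.read_csv('corpus.csv')
with open('fasttext_input.txt', 'w', encoding='utf-8') as f:
    for doc in df['content'].astype(str):
        f.write(segment_for_fasttext(doc) + '\n')  # one document per line
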
def segment_for_lda(sentences, stopwords):
    """Building on split_to_sentence: segment each sentence and drop stopwords,
    using HanLP's StandardTokenizer.
    return: list of tokens for a doc.
    """
    total_tokens = []
    for sent in chain.from_iterable(sentences):
        tokens = [item.word for item in tokenizer.segment(sent)
                  if item.word not in stopwords]
        total_tokens.extend(tokens)
    return total_tokens

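# segment_for_lda's token lists plug directly into a bag-of-words pipeline.
# A sketch using gensim, which is an assumption (the source does not name an
# LDA library); raw_docs, split_to_sentence and stopwords come from the
# surrounding project.
from gensim import corpora
from gensim.models import LdaModel

docs_tokens = [segment_for_lda(split_to_sentence(doc), stopwords) for doc in raw_docs]
dictionary = corpora.Dictionary(docs_tokens)
corpus = [dictionary.doc2bow(tokens) for tokens in docs_tokens]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, passes=5)
print(lda.print_topics(num_topics=5, num_words=8))
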
def preprocess_sentence(sentence):
    """
    Three preprocessing steps (applied to one sentence at a time):
        1. segment the sentence into words
        2. remove the words that should be dropped
        3. join the list back into a space-separated string
    :param sentence: raw input sentence
    :return: space-separated string of the remaining tokens
    """
    seg_list = segment(sentence.strip(), cut_type='word')
    seg_list = remove_words(seg_list)
    seg_line = ' '.join(seg_list)
    return seg_line

def preprocess_sentence(sentence):
    # variant of the function above that keeps all tokens (no remove_words step)
    seg_list = segment(sentence.strip(), cut_type='word')
    seg_line = ' '.join(seg_list)
    return seg_line

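# A quick demo of the two preprocess_sentence variants, assuming segment wraps
# a Chinese word segmenter (e.g. jieba) and remove_words drops punctuation and
# stopwords; the output shown is illustrative only.
if __name__ == '__main__':
    raw = '今天天气很好,我们去公园散步。'
    print(preprocess_sentence(raw))
    # with remove_words: e.g. '今天 天气 很好 我们 去 公园 散步'
    # without it, punctuation such as ',' and '。' would survive as tokens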