import os
import random

import jieba
import numpy as np

# Project-internal modules (import paths below are assumptions; adjust to the repo layout).
import config
import bigfile as __bigfile
import splitsentence as __splitsentence
import srpre


def wikisplit2word():
    """Segment the extracted Chinese Wikipedia corpus into words."""
    if os.path.exists(config.CORPUS_DIC + '/wiki_chs'):
        with open(config.PREDATA_DIC + '/totalpart.txt', 'a', encoding='utf-8') as write_file:
            print('Starting word segmentation')
            for line in __bigfile.get_lines(config.CORPUS_DIC + '/wiki_chs'):
                if line:
                    write_file.write(' '.join(jieba.lcut(line)))
            print('Word segmentation finished')
    else:
        raise FileNotFoundError('{} does not exist'.format(config.CORPUS_DIC + '/wiki_chs'))
def othersplit2word(filepath: str):
    """Segment an arbitrary corpus file into words, writing the result under PREDATA_DIC."""
    if os.path.exists(filepath):
        with open(config.PREDATA_DIC + '/' + filepath.split('/')[-1], 'a', encoding='utf-8') as write_file:
            print('Starting word segmentation')
            for line in __bigfile.get_lines(filepath):
                if line:
                    write_file.write(' '.join(jieba.lcut(line)))
            print('Word segmentation finished')
    else:
        raise FileNotFoundError('{} does not exist'.format(filepath))
def deal_tagdata(tagdata_filepaths: list, rate: float = config.SR_RATE):
    """Load tagged data, vectorize it, and split it into train/test sets by `rate`."""
    datas = []
    for tagdata_filepath in tagdata_filepaths:
        if os.path.exists(tagdata_filepath):
            for line in __bigfile.get_lines(tagdata_filepath):
                datas.append(line)
        else:
            raise FileNotFoundError('Tagged data file {} does not exist'.format(tagdata_filepath))
    random.shuffle(datas)  # shuffle the samples
    sentences, labels = __split_tagdata(datas)
    datas.clear()
    words_list = __tagsentence2regwords(sentences)
    sentences.clear()
    sentencevec_list, labelvec_list = __data2vec(words_list, labels)
    words_list.clear()
    labels.clear()
    # Persist the data.
    total_size = len(labelvec_list)
    split = int(total_size * rate)
    train_x = sentencevec_list[:split]
    train_y = labelvec_list[:split]
    test_x = sentencevec_list[split:]
    test_y = labelvec_list[split:]
    sentencevec_list.clear()
    labelvec_list.clear()
    if rate == 1.0:  # special case: everything goes to the training set
        if len(train_x) > 0:
            np.save(config.PREDATA_DIC + '/strain_x.npy', np.array(train_x))
            np.save(config.PREDATA_DIC + '/strain_y.npy', np.array(train_y))
        else:
            raise ValueError('rate is 1.0 but the data is empty')
    elif rate == 0.0:  # special case: everything goes to the test set
        if len(test_x) > 0:
            np.save(config.PREDATA_DIC + '/stest_x.npy', np.array(test_x))
            np.save(config.PREDATA_DIC + '/stest_y.npy', np.array(test_y))
        else:
            raise ValueError('rate is 0.0 but the data is empty')
    elif 0.0 < rate < 1.0:
        if len(train_x) <= 0 or len(test_x) <= 0:
            raise ValueError('Either the train or the test split is empty')
        # Normal case: save both splits.
        np.save(config.PREDATA_DIC + '/strain_x.npy', np.array(train_x))
        np.save(config.PREDATA_DIC + '/strain_y.npy', np.array(train_y))
        np.save(config.PREDATA_DIC + '/stest_x.npy', np.array(test_x))
        np.save(config.PREDATA_DIC + '/stest_y.npy', np.array(test_y))
    else:
        raise ValueError('rate out of range; rate should be between 0.0 and 1.0, got: {}'.format(rate))
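# Usage sketch for deal_tagdata (the path below is a hypothetical placeholder;
# it assumes a tagged-data file in the format expected by __split_tagdata):
# deal_tagdata(['tag_data/resume_tags.txt'], rate=0.8) shuffles the samples,
# saves the first 80% as strain_x.npy / strain_y.npy and the remaining 20%
# as stest_x.npy / stest_y.npy, all under config.PREDATA_DIC.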
def __get_sentences_generator(filepath: str):
    """Yield the sentence list of each resume in `filepath`."""
    for resume in __bigfile.get_lines(filepath):
        yield __splitsentence.resume2sentences(resume)
def __get_inputs_generator(filepath: str):
    """Yield the model-input vectors of each resume in `filepath`."""
    for resume in __bigfile.get_lines(filepath):
        sentences = __splitsentence.resume2sentences(resume)
        words_list = srpre.sentence2regwords(sentences)
        yield srpre.sentence2vec(words_list)
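# Minimal driver sketch, assuming config.CORPUS_DIC already contains the
# extracted 'wiki_chs' dump; 'tag_data/resume_tags.txt' is a hypothetical
# tagged-data file, not a path defined by this project.
if __name__ == '__main__':
    wikisplit2word()  # segment the Wikipedia corpus into totalpart.txt
    deal_tagdata(['tag_data/resume_tags.txt'], rate=config.SR_RATE)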