def convert(input_path, output_path):
    with open(input_path) as file:
        lines = file.readlines()
    processed_news = []
    for line in lines:
        news_piece = {}
        sina_id, emotions, text = line.split('\t')
        emotions = emotions.split(' ')
        news_piece['total_votes'] = emotions[0].split(':')[1]
        emotion_vec = []
        for e_text in emotions[1:]:
            e_type, e_votes = e_text.split(':')
            emotion_vec.append(int(e_votes))
        max_vote = max(emotion_vec)
        if emotion_vec.count(max_vote) > 1:
            # multiple emotions share the highest vote count; the entry
            # cannot be labeled unambiguously, so skip it
            continue
        news_piece['label'] = emotion_vec.index(max_vote)
        news_piece['emotions'] = emotion_vec
        text = remove_redundant(text)
        news_piece['text'] = text
        processed_news.append(news_piece)
    function.write_json_file(output_path, processed_news)
    print(
        f"Finished preprocessing {input_path}. {len(processed_news)} entries. "
        f"Saved at {output_path}.")
    return processed_news
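

def _demo_convert():
    # Hypothetical usage sketch; the paths and the sample line below are
    # placeholders, not files shipped with the project. Each raw line is
    # assumed to look like
    # "<sina_id>\tTotal:12 感动:3 同情:1 无聊:0 愤怒:6 搞笑:1 难过:1 新奇:0\t<news text>",
    # i.e. the first vote field is the total and the remaining
    # "<emotion>:<votes>" pairs form the emotion vector.
    processed = convert("data/sinanews.train", "data/train.json")
    # label is the index of the emotion with the most votes
    print(processed[0]["label"], processed[0]["emotions"])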
def init_char(char_path):
    char_table = {}
    with open(char_path, encoding="gbk") as file:
        line = file.readline()
    for char in line:
        char_table[char] = True
    function.write_json_file(CHAR_TABLE_PATH, char_table)
    print("init char table done.")
def build_embedding(word2id, ori_emb_path):
    embedding = {}
    with open(ori_emb_path) as file:
        lines = file.readlines()
    # the first line is assumed to hold "<vocab size> <dimension>" (word2vec text format)
    embedding['dimension'] = int(lines[0].split(' ')[1])
    # start every word with its own zero vector; words missing from the
    # pre-trained file keep this zero embedding
    emb_list = [[0.0] * embedding['dimension'] for _ in range(len(word2id))]
    for line in lines[1:]:
        line_list = line.strip().split(' ')
        if line_list[0] in word2id:
            emb_list[word2id[line_list[0]]] = list(map(float, line_list[1:]))
    embedding['list'] = emb_list
    function.write_json_file(EMBEDDING_PATH, embedding)
    print(f"Embedding built. Saved at {EMBEDDING_PATH}.")
def build_vocabulary(texts):
    word_count = {}
    for sentence in texts:
        word_list = sentence.split(' ')
        for word in word_list:
            if word in word_count:
                word_count[word] += 1
            else:
                word_count[word] = 1
    function.write_json_file(WORD_COUNT_PATH, word_count)
    vocab_list = list(word_count.items())
    vocab_list.sort(key=lambda x: x[1], reverse=True)
    vocab_list.extend([(UNKNOWN, 0), (PADDING, 0)])
    word2id = {}
    for idx, word in enumerate(vocab_list):
        word2id[word[0]] = idx
    function.write_json_file(WORD2ID_PATH, word2id)
    print(f"{len(word_count)} words in vocabulary. "
          f"Saved at {WORD_COUNT_PATH} and {WORD2ID_PATH}.")
    return word2id
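

def _demo_build_vocab_and_embedding():
    # Hypothetical sketch chaining build_vocabulary() with build_embedding().
    # The toy texts and the embedding path are placeholders; the embedding file
    # is assumed to be a word2vec-style text file whose first line is
    # "<vocab size> <dimension>".
    texts = ["今天 天气 很 好", "今天 很 开心"]
    word2id = build_vocabulary(texts)
    build_embedding(word2id, "data/pretrained_word_vectors.txt")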
def init_pinyin(pinyin_path):
    pinyin2char = {}
    char2pinyin = {}
    # read the file and build a pinyin -> characters dict without homograph ids
    with open(pinyin_path, encoding="gbk") as file:
        lines = file.readlines()
    for line in lines:
        line_arr = line.strip().split(" ")
        pinyin2char[line_arr[0]] = line_arr[1:]
    # homograph init: give each (character, pinyin) pair a numeric id and
    # rewrite the pinyin2char entries as "<char><id>"
    for pinyin, chars in pinyin2char.items():
        pinyin_new_chars = []
        for char in chars:
            if char in char2pinyin:
                char2pinyin[char][pinyin] = len(char2pinyin[char])
            else:
                char2pinyin[char] = {pinyin: 0}
            pinyin_new_chars.append(char + str(char2pinyin[char][pinyin]))
        pinyin2char[pinyin] = pinyin_new_chars
    function.write_json_file(HOMO_DIC_PATH, char2pinyin)
    function.write_json_file(PINYIN2CHAR_PATH, pinyin2char)
    print("init pinyin done.")
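

def _demo_init_pinyin():
    # Hypothetical sketch of the structures produced above; the path and the
    # example entries are illustrative, not real table contents. char2pinyin
    # maps a character to its pronunciations with homograph ids, e.g.
    # {"长": {"chang": 0, "zhang": 1}}, and pinyin2char lists characters with
    # those ids appended, e.g. {"chang": ["长0", "常0", ...]}.
    init_pinyin("data/pinyin_table.txt")
    char2pinyin = function.read_json_file(HOMO_DIC_PATH)
    print(char2pinyin.get("长"))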
def train(folder_path, n, model_name):
    model = Model(n)

    def train_file(file_path):
        data = function.read_json_file(file_path)
        for sentence in list(data):
            # pad with (n - 1) 'bb' markers at the beginning and 'ee' at the end
            sentence = ('bb' * (n - 1)) + sentence + 'ee'
            model.train(sentence)

    all_files_paths = os.listdir(folder_path)
    for rel_path in all_files_paths:
        path = folder_path + "/" + rel_path
        print(f"Begin training with {path}")
        try:
            train_file(path)
        except UnicodeDecodeError:
            print("Illegal file, skipping.")
            continue
        print(f"Training with {path} finished.")
    save_path = MODEL_PATH + f"/{model_name}.json"
    function.write_json_file(save_path, model.to_dict())
    print(f"Training finished. Model saved as {save_path}")
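

def _demo_train():
    # Hypothetical sketch of training an n-gram model over the preprocessed
    # corpus folder; the model name is a placeholder, while Model and the path
    # constants come from the surrounding project.
    train(TRAINING_DATA_PATH, 3, "trigram")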
def gen_test_set(file_path, test_count=500, only_long_sentence=True):
    homo_dic = function.read_json_file(HOMO_DIC_PATH)
    all_sentences = function.read_json_file(file_path)
    all_length = len(all_sentences)
    test_index = set(random.sample(range(0, all_length), test_count))
    answers = []
    inputs = []
    char_count = 0
    for index in test_index:
        sentence = all_sentences[index]
        length = len(sentence)
        if only_long_sentence and length < 10:
            continue
        # even positions hold characters, odd positions hold homograph ids
        chars = [sentence[i] for i in range(0, length, 2)]
        pinyin_ids = [int(sentence[i]) for i in range(1, length, 2)]
        pinyins = []
        for char, pinyin_id in zip(chars, pinyin_ids):
            for dic_pinyin, dic_id in homo_dic[char].items():
                if dic_id == pinyin_id:
                    pinyins.append(dic_pinyin)
                    break
        answers.append(''.join(chars) + '\n')
        inputs.append(' '.join(pinyins) + '\n')
        char_count += len(chars)
    # delete the sampled test sentences from the training file
    new_all_sentences = []
    for index, sentence in enumerate(all_sentences):
        if index not in test_index:
            new_all_sentences.append(sentence)
    function.write_json_file(file_path, new_all_sentences)
    with open(TEST_INPUT, "a") as file:
        file.writelines(inputs)
    with open(TEST_ANSWER, "a", encoding='gbk') as file:
        file.writelines(answers)
    print(
        f"Generated a test set with {len(inputs)} sentences and {char_count} characters. "
        f"Test input added at {TEST_INPUT}. Answer added at {TEST_ANSWER}")
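

# Note on the encoding consumed by gen_test_set(): label_homo() is assumed to
# emit strings of alternating characters and single-digit homograph ids, e.g.
# "今0天0很0长1" (example made up for illustration), so the ids at odd indices
# can be looked back up in the char2pinyin dict stored at HOMO_DIC_PATH.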
def process_file(file_path, cnt, batch_name):
    all_sentences = []
    if batch_name == 'sina':
        with open(file_path, encoding="gbk") as file:
            lines = file.readlines()
        for line in lines:
            news_piece = json.loads(line)
            title = news_piece["title"]
            content = news_piece["html"]
            all_sentences += cut_sentences(title)
            all_sentences += cut_sentences(content)
    if batch_name == 'weixin':
        with open(file_path) as file:
            lines = file.readlines()
        length = len(lines)
        # keep every third line, i.e. roughly 1/3 of the weixin corpus
        for line_index in range(0, length, 3):
            content = json.loads(lines[line_index])['content']
            all_sentences += cut_sentences(content, ignore_number=True)
    sentences_with_pinyin = []
    for sentence in all_sentences:
        sentences_with_pinyin.append(label_homo(sentence))
    save_path = TRAINING_DATA_PATH + f"/{batch_name}-{cnt}.json"
    function.write_json_file(save_path, sentences_with_pinyin)
    print(f"{file_path} processed. Saved as {save_path}")
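

def _demo_pinyin_pipeline():
    # Hypothetical end-to-end sketch; the file names are placeholders. Build
    # the pinyin tables, convert one raw corpus file into homograph-labeled
    # sentences, carve a test set out of the result, then train an n-gram
    # model on what remains.
    init_pinyin("data/pinyin_table.txt")
    process_file("data/sina_news_01.txt", cnt=1, batch_name="sina")
    gen_test_set(TRAINING_DATA_PATH + "/sina-1.json", test_count=500)
    train(TRAINING_DATA_PATH, 3, "trigram")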