def len_ratio(src_fname, trg_fname, low, high, encoding):
    """Print the distribution of target/source token-count ratios.

    Ratios are bucketed into (k-0.1, k] intervals, pairs whose ratio falls
    outside [low, high] are printed and counted as illegal, and the final
    distribution is plotted as a bar graph.

    Args:
        src_fname: path of the source-side corpus file.
        trg_fname: path of the target-side corpus file (must be line-aligned).
        low: lower bound of the legal ratio range.
        high: upper bound of the legal ratio range.
        encoding: text encoding used to open both files.
    """
    lines_cnt = reader.count_lines(src_fname, encoding)
    assert lines_cnt == reader.count_lines(trg_fname, encoding), 'line count does not match...'
    ratio_stat = {}
    illegal_cnt = 0
    zero_src_cnt = 0  # pairs whose source side has no tokens (ratio undefined)
    with open(src_fname, 'r', encoding=encoding) as src_file, \
            open(trg_fname, 'r', encoding=encoding) as trg_file:
        for src_line, trg_line in zip(src_file, trg_file):
            src_len = len(src_line.split())
            if src_len == 0:
                # BUG FIX: previously raised ZeroDivisionError on a token-less
                # source line; count it as illegal and skip the bucket stats.
                illegal_cnt += 1
                zero_src_cnt += 1
                print(src_line, trg_line)
                continue
            sen_len_ratio = len(trg_line.split()) / src_len
            if sen_len_ratio < low or sen_len_ratio > high:
                illegal_cnt += 1
                print(src_line, trg_line)
            # Round the ratio up to the upper bound of its 0.1-wide bucket.
            sen_len_ratio = math.ceil(sen_len_ratio * 10) / 10
            ratio_stat[sen_len_ratio] = ratio_stat.get(sen_len_ratio, 0) + 1
    # Every line is either bucketed or skipped as a zero-token source line.
    assert lines_cnt == sum(ratio_stat.values()) + zero_src_cnt
    ratio_stat = dict(sorted(ratio_stat.items()))
    for ratio, cnt in ratio_stat.items():
        print('(' + str(ratio - 0.1) + ', ' + str(ratio) + ']', cnt)
    print('=================================================')
    print('low =', low, '| high =', high)
    print('illegal sentences count:', illegal_cnt)
    tools.plot_bargraph('trg len ratio to src len distribution', ratio_stat)
def segment_src_trg(src_fname, trg_fname, src_seg, trg_seg, char_level, encoding):
    """Tokenize a parallel corpus and write the segmented sides to new files.

    Pairs for which either side segments to an empty sentence are dropped
    from BOTH sides so the two output files stay line-aligned.

    Args:
        src_fname: raw source-side file path.
        trg_fname: raw target-side file path.
        src_seg: output path for the segmented source side.
        trg_seg: output path for the segmented target side.
        char_level: forwarded to segment_sen to select char-level tokens.
        encoding: text encoding for all four files.
    """
    assert src_fname != src_seg, 'source raw and segmented filename are the same'
    assert trg_fname != trg_seg, 'target raw and segmented filename are the same'
    with open(src_fname, 'r', encoding=encoding) as src_file, \
            open(trg_fname, 'r', encoding=encoding) as trg_file, \
            open(src_seg, 'w', encoding=encoding) as src_seg_file, \
            open(trg_seg, 'w', encoding=encoding) as trg_seg_file:
        lines_cnt = 0
        for src_line, trg_line in zip(src_file, trg_file):
            src_to_write = ' '.join(segment_sen(src_line, char_level)) + '\n'
            trg_to_write = ' '.join(segment_sen(trg_line, char_level)) + '\n'
            # Skip the pair if either side is empty after segmentation.
            # (The old `to_write in ['']` tests were dead code: the string
            # always ends with '\n', so it can never equal ''.)
            if empty_pattern.match(src_to_write) or empty_pattern.match(trg_to_write):
                continue
            src_seg_file.write(src_to_write)
            trg_seg_file.write(trg_to_write)
            lines_cnt += 1
            if lines_cnt % 100000 == 0:
                print(lines_cnt, 'lines have been processed.')
    print('=================================================')
    print(lines_cnt, 'lines have been processed finally.')
    print('raw file lines count:', reader.count_lines(src_fname, encoding))
    segmented_lines_cnt = reader.count_lines(src_seg, encoding)
    assert segmented_lines_cnt == reader.count_lines(trg_seg, encoding), 'segmented lines count does not match...'
    print('segmented file lines count:', segmented_lines_cnt)
def setup(self, hyperparams, corpus_params):
    """Initialize paths and hyperparameters, split the corpus, fit the
    tokenizer and record corpus statistics to a setup record file.

    Args:
        hyperparams: hyperparameter object (mode, keep_word_num, word_vec_dim,
            time_step, batch_size, filters, oov_tag, char_level, pad, cut).
        corpus_params: corpus configuration (file urls and encodings).

    Side effects: splits the raw corpus into train/val/test files on disk,
    creates a timestamped result directory, and writes a record file there.
    """
    self.pretrained_word_vecs_fname = corpus_params.pretrained_word_vecs_url
    self.raw_url = corpus_params.raw_url
    self.train_fname = corpus_params.train_url
    self.val_fname = corpus_params.val_url
    self.test_fname = corpus_params.test_url
    self.open_encoding = corpus_params.open_file_encoding
    self.save_encoding = corpus_params.save_file_encoding
    # Materialize the train/val/test split before anything reads those files.
    reader.split_train_val_test(self.raw_url, self.train_fname, self.val_fname,
                                self.test_fname, self.open_encoding,
                                self.save_encoding)
    # NOTE(review): these dicts presumably map full class names to short
    # tags for the directory name — defined elsewhere in the project.
    run_which_model = model_name_full_abbr[self.__class__.__name__]
    corpus_name = corpus_name_full_abbr[corpus_params.__class__.__name__]
    setup_time = tools.get_current_time()
    # Result dir: <RESULT_SAVE_DIR>/<model>_<corpus>_<timestamp>
    self.this_model_save_dir = \
        params.RESULT_SAVE_DIR + os.path.sep + \
        run_which_model + '_' + corpus_name + '_' + setup_time
    if not os.path.exists(self.this_model_save_dir):
        os.makedirs(self.this_model_save_dir)
    self.hyperparams = hyperparams
    self.mode = hyperparams.mode
    self.keep_word_num = hyperparams.keep_word_num
    self.word_vec_dim = hyperparams.word_vec_dim
    self.time_step = hyperparams.time_step
    self.batch_size = hyperparams.batch_size
    self.tokenizer = reader.fit_tokenizer(self.raw_url, self.keep_word_num,
                                          hyperparams.filters, hyperparams.oov_tag,
                                          hyperparams.char_level, self.open_encoding)
    # Vocab size is capped by keep_word_num even if the tokenizer indexed
    # more distinct words.
    max_id = max(map(lambda x: x[1], list(self.tokenizer.word_index.items())))
    self.vocab_size = min(max_id, self.keep_word_num)
    self.pad = self.hyperparams.pad
    self.cut = self.hyperparams.cut
    self.total_samples_count = reader.count_lines(self.raw_url, self.open_encoding)
    self.train_samples_count = reader.count_lines(self.train_fname, self.open_encoding)
    self.val_samples_count = reader.count_lines(self.val_fname, self.open_encoding)
    self.test_samples_count = reader.count_lines(self.test_fname, self.open_encoding)
    # Assemble the human-readable setup summary and persist it.
    record_info = list()
    record_info.append('\n================ In setup ================\n')
    record_info.append('Vocab size: %d\n' % self.vocab_size)
    record_info.append('Total samples count: %d\n' % self.total_samples_count)
    record_info.append('Train samples count: %d\n' % self.train_samples_count)
    record_info.append('Val samples count: %d\n' % self.val_samples_count)
    record_info.append('Test samples count: %d\n' % self.test_samples_count)
    record_str = ''.join(record_info)
    record_url = self.this_model_save_dir + os.path.sep + params.TRAIN_RECORD_FNAME
    tools.print_save_str(record_str, record_url)
def sim_two_files(first_file, second_file, show_diff, encoding):
    """Report how many lines of the first file also appear in the second.

    Prints the overlap count divided by each file's total line count; when
    show_diff is true, lines of the first file that are missing from the
    second are printed as they are found.
    """
    lines_in_first = reader.count_lines(first_file, encoding)
    lines_in_second = reader.count_lines(second_file, encoding)
    overlap = 0
    with open(first_file, 'r', encoding=encoding) as first, \
            open(second_file, 'r', encoding=encoding) as second:
        # Load the second file once for O(1) membership tests.
        known = set(second.read().splitlines())
        for line in tqdm(first):
            if line.replace('\n', '') in known:
                overlap += 1
            elif show_diff:
                print(line)
    print('similarity:', str(overlap / lines_in_first), '|', str(overlap / lines_in_second))
def remove_same(raw_fname, after_fname, encoding):
    """Copy raw_fname to after_fname keeping only the first occurrence of
    each line, then report how much data was removed."""
    assert raw_fname != after_fname, 'raw and trg filename are the same'
    seen = set()
    with open(raw_fname, 'r', encoding=encoding) as source, \
            open(after_fname, 'w', encoding=encoding) as sink:
        for line in source:
            if line in seen:
                continue
            seen.add(line)
            sink.write(line)
    print('=================================================')
    before_cnt = reader.count_lines(raw_fname, encoding)
    after_cnt = reader.count_lines(after_fname, encoding)
    print('before lines count:', before_cnt)
    print('after remove:', after_cnt)
    print('remove %.2f%% data' % ((before_cnt - after_cnt) / before_cnt * 100))
def main(args):
    """Randomly split a parallel corpus into train and dev sets.

    Output names are derived from the source file name; each non-empty
    sentence pair goes to the dev set with probability
    dev_samples_num / total line count, otherwise to the train set.
    """
    prefix = re.split('/+', args.all_src_fname)[-1].replace('.src', '')
    train_src = os.path.join(args.output_dir, prefix + '.train.tok.src')
    train_trg = os.path.join(args.output_dir, prefix + '.train.tok.trg')
    dev_src = os.path.join(args.output_dir, prefix + '.dev.tok.src')
    dev_trg = os.path.join(args.output_dir, prefix + '.dev.tok.trg')
    all_train_src = args.all_src_fname
    all_train_trg = args.all_trg_fname
    all_train_lines_cnt = reader.count_lines(all_train_src, args.encoding)
    dev_ratio = args.dev_samples_num / all_train_lines_cnt
    line_cnt = 0
    with open(all_train_src, 'r', encoding=args.encoding) as all_src_file, \
            open(all_train_trg, 'r', encoding=args.encoding) as all_trg_file, \
            open(train_src, 'w', encoding=args.encoding) as train_src_file, \
            open(train_trg, 'w', encoding=args.encoding) as train_trg_file, \
            open(dev_src, 'w', encoding=args.encoding) as dev_src_file, \
            open(dev_trg, 'w', encoding=args.encoding) as dev_trg_file:
        for src_line, trg_line in zip(all_src_file, all_trg_file):
            # remove empty src or target sentence pairs
            if src_line in ('', '\n') or trg_line in ('', '\n'):
                continue
            if rdm.rand() < dev_ratio:
                dev_src_file.write(src_line)
                dev_trg_file.write(trg_line)
            else:
                train_src_file.write(src_line)
                train_trg_file.write(trg_line)
            line_cnt += 1
            if line_cnt % 100000 == 0:
                print(line_cnt, 'lines have been processed.')
    print('=================================================')
    print(line_cnt, 'lines have been processed finally.')
    print('train src data lines count:', reader.count_lines(train_src, args.encoding))
    print('train trg data lines count:', reader.count_lines(train_trg, args.encoding))
    print('dev src data lines count:', reader.count_lines(dev_src, args.encoding))
    print('dev trg data lines count:', reader.count_lines(dev_trg, args.encoding))
def remove_long_short(src_fname, trg_fname, short, long, encoding):
    """Keep only sentence pairs whose source AND target token counts fall
    within [short, long]; survivors go to suffixed output files."""
    suffix = '.remove_short' + str(short) + '_long' + str(long)
    raw_lines_cnt = reader.count_lines(src_fname, encoding)
    assert raw_lines_cnt == reader.count_lines(trg_fname, encoding), 'line count does not match...'
    with open(src_fname, 'r', encoding=encoding) as src_file, \
            open(trg_fname, 'r', encoding=encoding) as trg_file, \
            open(src_fname + suffix, 'w', encoding=encoding) as src_remove, \
            open(trg_fname + suffix, 'w', encoding=encoding) as trg_remove:
        for src_line, trg_line in zip(src_file, trg_file):
            src_ok = short <= len(src_line.split()) <= long
            trg_ok = short <= len(trg_line.split()) <= long
            if src_ok and trg_ok:
                src_remove.write(src_line)
                trg_remove.write(trg_line)
    print('=================================================')
    after_lines_cnt = reader.count_lines(src_fname + suffix, encoding)
    assert after_lines_cnt == reader.count_lines(trg_fname + suffix, encoding), 'line count does not match...'
    print('before lines count:', raw_lines_cnt)
    print('after remove:', after_lines_cnt)
    print('remove %.2f%% data' % ((raw_lines_cnt - after_lines_cnt) / raw_lines_cnt * 100))
def remove_len_ratio(src_fname, trg_fname, low, high, encoding):
    """Drop sentence pairs whose target/source token-count ratio is outside
    [low, high]; survivors are written to suffixed output files.

    Args:
        src_fname: source-side file path (output gets a descriptive suffix).
        trg_fname: target-side file path (must be line-aligned with source).
        low: lower bound of the legal ratio range.
        high: upper bound of the legal ratio range.
        encoding: text encoding for all files.
    """
    suffix = '.remove_low' + str(low) + '_high' + str(high)
    raw_lines_cnt = reader.count_lines(src_fname, encoding)
    assert raw_lines_cnt == reader.count_lines(trg_fname, encoding), 'line count does not match...'
    with open(src_fname, 'r', encoding=encoding) as src_file, \
            open(trg_fname, 'r', encoding=encoding) as trg_file, \
            open(src_fname + suffix, 'w', encoding=encoding) as src_remove, \
            open(trg_fname + suffix, 'w', encoding=encoding) as trg_remove:
        for src_line, trg_line in zip(src_file, trg_file):
            src_len = len(src_line.split())
            if src_len == 0:
                # BUG FIX: a token-less source line previously crashed with
                # ZeroDivisionError; its ratio is undefined, so drop the pair.
                continue
            sen_len_ratio = len(trg_line.split()) / src_len
            if low <= sen_len_ratio <= high:
                src_remove.write(src_line)
                trg_remove.write(trg_line)
    print('=================================================')
    after_lines_cnt = reader.count_lines(src_fname + suffix, encoding)
    assert after_lines_cnt == reader.count_lines(trg_fname + suffix, encoding), 'line count does not match...'
    print('before lines count:', raw_lines_cnt)
    print('after remove:', after_lines_cnt)
    print('remove %.2f%% data' % ((raw_lines_cnt - after_lines_cnt) / raw_lines_cnt * 100))
def remove_same_src_trg(src_fname, trg_fname, encoding):
    """Remove duplicate sentence PAIRS from a parallel corpus, keeping the
    first occurrence of each (source, target) pair.

    Args:
        src_fname: source-side file path (output gets '.removesame' suffix).
        trg_fname: target-side file path (must be line-aligned with source).
        encoding: text encoding for all files.
    """
    raw_lines_cnt = reader.count_lines(src_fname, encoding)
    assert raw_lines_cnt == reader.count_lines(trg_fname, encoding), 'lines count does not match...'
    src_after = src_fname + '.removesame'
    trg_after = trg_fname + '.removesame'
    with open(src_fname, 'r', encoding=encoding) as src_file, \
            open(trg_fname, 'r', encoding=encoding) as trg_file, \
            open(src_after, 'w', encoding=encoding) as src_after_file, \
            open(trg_after, 'w', encoding=encoding) as trg_after_file:
        distinct = set()
        for src_line, trg_line in zip(src_file, trg_file):
            # BUG FIX: key on the tuple rather than src_line+trg_line —
            # string concatenation can make two different pairs look equal
            # (e.g. when the final line has no trailing newline).
            pair = (src_line, trg_line)
            if pair not in distinct:
                distinct.add(pair)
                src_after_file.write(src_line)
                trg_after_file.write(trg_line)
    print('=================================================')
    after_lines_cnt = reader.count_lines(src_after, encoding)
    assert after_lines_cnt == reader.count_lines(trg_after, encoding), 'lines count does not match...'
    print('before lines count:', raw_lines_cnt)
    print('after remove:', after_lines_cnt)
    print('remove %.2f%% data' % ((raw_lines_cnt - after_lines_cnt) / raw_lines_cnt * 100))
def main(args):
    """Segment every line of a raw file, writing space-joined tokens to the
    output file. Bails out early if input and output paths are identical."""
    current_func_name = sys._getframe().f_code.co_name
    if args.raw_fname == args.seg_fname:
        print('\n======== In', current_func_name, '========')
        print('raw and segmented file are the same')
        print('nothing to do')
        return
    line_cnt = 0
    with open(args.raw_fname, 'r', encoding=args.encoding) as raw_file, \
            open(args.seg_fname, 'w', encoding=args.encoding) as seg_file:
        for line in raw_file:
            tokens = segment_sen(line, args.char_level, args.noHMM)
            seg_file.write(' '.join(tokens) + '\n')
            line_cnt += 1
            if line_cnt % 100000 == 0:
                print(line_cnt, 'lines have been processed.')
    print('=================================================')
    print(line_cnt, 'lines have been processed finally.')
    print('raw file line count:', reader.count_lines(args.raw_fname, args.encoding))
    print('segmented file line count:', reader.count_lines(args.seg_fname, args.encoding))
def stat_sens_distribution(corpus_fname, short, long, encoding='utf-8'):
    """Print a histogram of sentence lengths bucketed into 10-token bins.

    Sentences shorter than `short` or longer than `long` tokens are printed
    as they are encountered; the final distribution is plotted.
    """
    lines_cnt = reader.count_lines(corpus_fname, encoding)
    print('=================== sentences distribution ========================')
    len_stat = {}
    with open(corpus_fname, 'r', encoding=encoding) as file:
        for line in file:
            sen_len = len(line.split())
            if sen_len < short or sen_len > long:
                print(line)
            # Bucket is the multiple of 10 at or above the length.
            bucket = math.ceil(sen_len / 10) * 10
            len_stat[bucket] = len_stat.get(bucket, 0) + 1
    assert lines_cnt == sum(len_stat.values())
    len_stat = dict(sorted(len_stat.items()))
    for length, cnt in len_stat.items():
        print('[' + str(length - 9) + ', ' + str(length) + ']:', cnt)
    tools.plot_bargraph('sentence length distribution', len_stat)
def main(args):
    """Randomly split training data into sharded train and heldout files.

    Reads one file or every file under a directory; each line goes to a
    random heldout shard with probability test_samples_num / total lines,
    otherwise to a random train shard.
    """
    current_func_name = sys._getframe().f_code.co_name
    if args.train_data in [args.split_train_dir, args.split_heldout_dir]:
        print('\n======== In', current_func_name, '========')
        print('Raw data and split data path are the same.')
        print('No split.')
        return
    # NOTE(review): train_data may be a directory here — presumably
    # reader.count_lines handles directories; confirm against its definition.
    total_train_line_cnt = reader.count_lines(args.train_data, args.encoding)
    train_data = args.train_data
    if os.path.isfile(train_data):
        train_data = [train_data]
    elif os.path.isdir(train_data):
        train_data = tools.get_fnames_under_path(train_data)
    if not os.path.exists(args.split_train_dir):
        os.makedirs(args.split_train_dir)
    if not os.path.exists(args.split_heldout_dir):
        os.makedirs(args.split_heldout_dir)
    # NOTE(review): the two guards below are asymmetric — train shards are
    # cleared unless ALL samples are held out, heldout shards unless NONE
    # are. Looks intentional (skip clearing a side that won't be written),
    # but worth confirming.
    if args.test_samples_num != total_train_line_cnt:
        tools.del_file_under_path(args.split_train_dir)
    if args.test_samples_num != 0:
        tools.del_file_under_path(args.split_heldout_dir)
    # Open all shard files up front; they are closed after the split loop.
    split_train_files = []
    for i in range(args.split_train_num):
        train_split_file = open(os.path.join(args.split_train_dir, 'train-' + str(i)),
                                'w', encoding=args.encoding)
        split_train_files.append(train_split_file)
    split_heldout_files = []
    for i in range(args.split_heldout_num):
        heldout_split_file = open(os.path.join(args.split_heldout_dir, 'heldout-' + str(i)),
                                  'w', encoding=args.encoding)
        split_heldout_files.append(heldout_split_file)

    def random_togo(num):
        # Pick a shard index in [0, num). round() plus clamping makes the
        # endpoint indices slightly non-uniform — presumably acceptable for
        # shard balancing, but not an exactly uniform draw.
        suffix = round(rdm.rand() * num)
        suffix = 0 if suffix < 0 else suffix
        suffix = num - 1 if suffix >= num else suffix
        return suffix

    test_ratio = args.test_samples_num / total_train_line_cnt
    line_cnt = 0
    for fname in train_data:
        with open(fname, 'r', encoding=args.encoding) as file:
            for line in file:
                if rdm.rand() < test_ratio:
                    split_heldout_files[random_togo(
                        args.split_heldout_num)].write(line)
                else:
                    split_train_files[random_togo(
                        args.split_train_num)].write(line)
                line_cnt += 1
                if line_cnt % 10000 == 0:
                    print(line_cnt, 'lines have been processed.')
    print('=================================================')
    print(line_cnt, 'lines have been processed finally.')
    # Flush shard contents before counting lines below.
    for file in split_train_files + split_heldout_files:
        file.close()
    print('split train data total samples count:',
          reader.count_lines(args.split_train_dir, args.encoding))
    print('split heldout data total samples count:',
          reader.count_lines(args.split_heldout_dir, args.encoding))