Example 1
def len_ratio(src_fname, trg_fname, low, high, encoding):
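    # Report the trg/src token-length-ratio distribution of a parallel corpus
    # and flag sentence pairs whose ratio falls outside [low, high].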
    lines_cnt = reader.count_lines(src_fname, encoding)
    assert lines_cnt == reader.count_lines(
        trg_fname, encoding), 'line count does not match...'
    ratio_stat = {}
    illegal_cnt = 0
    with open(src_fname, 'r', encoding=encoding) as src_file, \
            open(trg_fname, 'r', encoding=encoding) as trg_file:
        for src_line, trg_line in zip(src_file, trg_file):
            # NOTE: assumes no empty source lines; an empty src line would
            # raise ZeroDivisionError here
            sen_len_ratio = len(trg_line.split()) / len(src_line.split())
            if sen_len_ratio < low or sen_len_ratio > high:
                illegal_cnt += 1
                print(src_line, trg_line)
            sen_len_ratio = math.ceil(sen_len_ratio * 10) / 10
            ratio_stat[sen_len_ratio] = ratio_stat.get(sen_len_ratio, 0) + 1
    assert lines_cnt == sum(ratio_stat.values())
    ratio_stat = dict(sorted(ratio_stat.items()))
    for ratio, cnt in ratio_stat.items():
        print('(%.1f, %.1f]' % (ratio - 0.1, ratio), cnt)
    print('=================================================')
    print('low =', low, '| high =', high)
    print('illegal sentences count:', illegal_cnt)
    tools.plot_bargraph('trg len ratio to src len distribution', ratio_stat)
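
A minimal usage sketch (hypothetical file names; assumes the module-level `math`, `reader`, and `tools` helpers this function relies on):

len_ratio('corpus.tok.src', 'corpus.tok.trg', low=0.5, high=2.0, encoding='utf-8')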
Example 2
def segment_src_trg(src_fname, trg_fname, src_seg, trg_seg, char_level, encoding):
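    # Word-segment a parallel corpus line by line, drop pairs where either
    # side segments to an empty string, and write the results in parallel.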
    assert src_fname != src_seg, 'source raw and segmented filename are the same'
    assert trg_fname != trg_seg, 'target raw and segmented filename are the same'
    with open(src_fname, 'r', encoding=encoding) as src_file, \
            open(trg_fname, 'r', encoding=encoding) as trg_file, \
            open(src_seg, 'w', encoding=encoding) as src_seg_file, \
            open(trg_seg, 'w', encoding=encoding) as trg_seg_file:
        lines_cnt = 0
        for src_line, trg_line in zip(src_file, trg_file):
            src_to_write = ' '.join(segment_sen(src_line, char_level)) + '\n'
            trg_to_write = ' '.join(segment_sen(trg_line, char_level)) + '\n'
            # skip pairs where either side is empty after segmentation
            if empty_pattern.match(src_to_write) or empty_pattern.match(trg_to_write):
                continue
            src_seg_file.write(src_to_write)
            trg_seg_file.write(trg_to_write)
            lines_cnt += 1
            if lines_cnt % 100000 == 0:
                print(lines_cnt, 'lines have been processed.')

    print('=================================================')
    print(lines_cnt, 'lines processed in total.')
    print('raw file lines count:', reader.count_lines(src_fname, encoding))
    segmented_lines_cnt = reader.count_lines(src_seg, encoding)
    assert segmented_lines_cnt == reader.count_lines(trg_seg, encoding), 'segmented lines count does not match...'
    print('segmented file lines count:', segmented_lines_cnt)
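
A hypothetical invocation (file names are placeholders; `segment_sen`, `empty_pattern`, and `reader` are defined elsewhere in this module):

segment_src_trg('corpus.raw.src', 'corpus.raw.trg',
                'corpus.seg.src', 'corpus.seg.trg',
                char_level=False, encoding='utf-8')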
Example 3
    def setup(self, hyperparams, corpus_params):
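        # Split the raw corpus into train/val/test, create this run's result
        # directory, fit the tokenizer, and record the basic corpus statistics.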
        self.pretrained_word_vecs_fname = corpus_params.pretrained_word_vecs_url
        self.raw_url = corpus_params.raw_url
        self.train_fname = corpus_params.train_url
        self.val_fname = corpus_params.val_url
        self.test_fname = corpus_params.test_url
        self.open_encoding = corpus_params.open_file_encoding
        self.save_encoding = corpus_params.save_file_encoding
        reader.split_train_val_test(self.raw_url,
                                    self.train_fname, self.val_fname, self.test_fname,
                                    self.open_encoding, self.save_encoding)

        run_which_model = model_name_full_abbr[self.__class__.__name__]
        corpus_name = corpus_name_full_abbr[corpus_params.__class__.__name__]
        setup_time = tools.get_current_time()
        self.this_model_save_dir = \
            params.RESULT_SAVE_DIR + os.path.sep + \
            run_which_model + '_' + corpus_name + '_' + setup_time
        if not os.path.exists(self.this_model_save_dir):
            os.makedirs(self.this_model_save_dir)

        self.hyperparams = hyperparams
        self.mode = hyperparams.mode
        self.keep_word_num = hyperparams.keep_word_num
        self.word_vec_dim = hyperparams.word_vec_dim
        self.time_step = hyperparams.time_step
        self.batch_size = hyperparams.batch_size
        self.tokenizer = reader.fit_tokenizer(self.raw_url, self.keep_word_num,
                                              hyperparams.filters, hyperparams.oov_tag,
                                              hyperparams.char_level,
                                              self.open_encoding)

        max_id = max(self.tokenizer.word_index.values())
        self.vocab_size = min(max_id, self.keep_word_num)

        self.pad = self.hyperparams.pad
        self.cut = self.hyperparams.cut

        self.total_samples_count = reader.count_lines(self.raw_url, self.open_encoding)
        self.train_samples_count = reader.count_lines(self.train_fname, self.open_encoding)
        self.val_samples_count = reader.count_lines(self.val_fname, self.open_encoding)
        self.test_samples_count = reader.count_lines(self.test_fname, self.open_encoding)

        record_info = list()
        record_info.append('\n================ In setup ================\n')
        record_info.append('Vocab size: %d\n' % self.vocab_size)
        record_info.append('Total samples count: %d\n' % self.total_samples_count)
        record_info.append('Train samples count: %d\n' % self.train_samples_count)
        record_info.append('Val samples count: %d\n' % self.val_samples_count)
        record_info.append('Test samples count: %d\n' % self.test_samples_count)
        record_str = ''.join(record_info)
        record_url = self.this_model_save_dir + os.path.sep + params.TRAIN_RECORD_FNAME
        tools.print_save_str(record_str, record_url)
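
A sketch of how this method might be driven; the model class name here is hypothetical, and `hyperparams`/`corpus_params` stand for the configuration objects whose attributes are read above:

model = SomeSeq2SeqModel()
model.setup(hyperparams, corpus_params)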
Example 4
def sim_two_files(first_file, second_file, show_diff, encoding):
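    # Count lines of first_file that also occur in second_file and print the
    # overlap relative to each file's line count.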
    fir_lines_cnt = reader.count_lines(first_file, encoding)
    sec_lines_cnt = reader.count_lines(second_file, encoding)
    same_count = 0
    with open(first_file, 'r', encoding=encoding) as first, \
            open(second_file, 'r', encoding=encoding) as second:
        sec_lines = set(second.read().splitlines())
        for fir_line in tqdm(first):
            if fir_line.replace('\n', '') in sec_lines:
                same_count += 1
            elif show_diff:
                print(fir_line)
    print('similarity:', str(same_count / fir_lines_cnt), '|',
          str(same_count / sec_lines_cnt))
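
A minimal sketch of a call (hypothetical file names; assumes `tqdm` and the project's `reader` module are imported):

sim_two_files('system_a.out', 'system_b.out', show_diff=False, encoding='utf-8')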
Example 5
def remove_same(raw_fname, after_fname, encoding):
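    # Deduplicate a file line by line, keeping the first occurrence of each line.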
    assert raw_fname != after_fname, 'raw and deduplicated filename are the same'
    with open(raw_fname, 'r', encoding=encoding) as raw_file, \
            open(after_fname, 'w', encoding=encoding) as after_file:
        distinct = set()
        for line in raw_file:
            if line not in distinct:
                distinct.add(line)
                after_file.write(line)
    print('=================================================')
    raw_lines_cnt = reader.count_lines(raw_fname, encoding)
    after_lines_cnt = reader.count_lines(after_fname, encoding)
    print('before lines count:', raw_lines_cnt)
    print('after remove:', after_lines_cnt)
    print('remove %.2f%% data' % ((raw_lines_cnt - after_lines_cnt)/raw_lines_cnt*100))
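
A hypothetical call (placeholder file names):

remove_same('corpus.txt', 'corpus.dedup.txt', encoding='utf-8')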
Example 6
def main(args):
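    # Randomly split a tokenized parallel corpus into train and dev sets,
    # holding out roughly args.dev_samples_num sentence pairs for dev.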
    prefix = os.path.basename(args.all_src_fname).replace('.src', '')
    train_src = prefix + '.train.tok.src'
    train_trg = prefix + '.train.tok.trg'
    dev_src = prefix + '.dev.tok.src'
    dev_trg = prefix + '.dev.tok.trg'

    all_train_src = args.all_src_fname
    all_train_trg = args.all_trg_fname
    train_src = os.path.join(args.output_dir, train_src)
    train_trg = os.path.join(args.output_dir, train_trg)
    dev_src = os.path.join(args.output_dir, dev_src)
    dev_trg = os.path.join(args.output_dir, dev_trg)

    all_train_lines_cnt = reader.count_lines(all_train_src, args.encoding)
    dev_ratio = args.dev_samples_num / all_train_lines_cnt

    with open(all_train_src, 'r', encoding=args.encoding) as all_src_file, \
            open(all_train_trg, 'r', encoding=args.encoding) as all_trg_file, \
            open(train_src, 'w', encoding=args.encoding) as train_src_file, \
            open(train_trg, 'w', encoding=args.encoding) as train_trg_file, \
            open(dev_src, 'w', encoding=args.encoding) as dev_src_file, \
            open(dev_trg, 'w', encoding=args.encoding) as dev_trg_file:
        line_cnt = 0
        for src_line, trg_line in zip(all_src_file, all_trg_file):
            # remove empty src or target sentence pairs
            if src_line in ['', '\n'] or trg_line in ['', '\n']:
                continue
            if rdm.rand() < dev_ratio:
                dev_src_file.write(src_line)
                dev_trg_file.write(trg_line)
            else:
                train_src_file.write(src_line)
                train_trg_file.write(trg_line)
            line_cnt += 1
            if line_cnt % 100000 == 0:
                print(line_cnt, 'lines have been processed.')

    print('=================================================')
    print(line_cnt, 'lines processed in total.')
    print('train src data lines count:',
          reader.count_lines(train_src, args.encoding))
    print('train trg data lines count:',
          reader.count_lines(train_trg, args.encoding))
    print('dev src data lines count:',
          reader.count_lines(dev_src, args.encoding))
    print('dev trg data lines count:',
          reader.count_lines(dev_trg, args.encoding))
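
This presumably sits behind an argparse CLI; a sketch of an invocation with a hypothetical script name and flags mirroring the `args` attributes used above:

python split_dev.py --all_src_fname data/all.src --all_trg_fname data/all.trg \
    --output_dir data --dev_samples_num 2000 --encoding utf-8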
Example 7
def remove_long_short(src_fname, trg_fname, short, long, encoding):
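    # Drop sentence pairs in which either side has fewer than `short` or more
    # than `long` tokens; survivors go to new files with `suffix` appended.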
    suffix = '.remove_short'+str(short)+'_long'+str(long)
    raw_lines_cnt = reader.count_lines(src_fname, encoding)
    assert raw_lines_cnt == reader.count_lines(trg_fname, encoding), 'line count does not match...'
    with open(src_fname, 'r', encoding=encoding) as src_file, \
            open(trg_fname, 'r', encoding=encoding) as trg_file, \
            open(src_fname+suffix, 'w', encoding=encoding) as src_remove, \
            open(trg_fname+suffix, 'w', encoding=encoding) as trg_remove:
        for src_line, trg_line in zip(src_file, trg_file):
            if short <= len(src_line.split()) <= long and short <= len(trg_line.split()) <= long:
                src_remove.write(src_line)
                trg_remove.write(trg_line)
    print('=================================================')
    after_lines_cnt = reader.count_lines(src_fname+suffix, encoding)
    assert after_lines_cnt == reader.count_lines(trg_fname+suffix, encoding), 'line count does not match...'
    print('before lines count:', raw_lines_cnt)
    print('after remove:', after_lines_cnt)
    print('remove %.2f%% data' % ((raw_lines_cnt - after_lines_cnt) / raw_lines_cnt * 100))
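
A hypothetical call that keeps pairs where both sides have 1 to 100 tokens:

remove_long_short('corpus.tok.src', 'corpus.tok.trg', short=1, long=100, encoding='utf-8')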
Example 8
def remove_len_ratio(src_fname, trg_fname, low, high, encoding):
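    # Drop sentence pairs whose trg/src token-length ratio falls outside
    # [low, high]; survivors go to new files with `suffix` appended.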
    suffix = '.remove_low'+str(low)+'_high'+str(high)
    raw_lines_cnt = reader.count_lines(src_fname, encoding)
    assert raw_lines_cnt == reader.count_lines(trg_fname, encoding), 'line count does not match...'
    with open(src_fname, 'r', encoding=encoding) as src_file, \
            open(trg_fname, 'r', encoding=encoding) as trg_file, \
            open(src_fname+suffix, 'w', encoding=encoding) as src_remove, \
            open(trg_fname+suffix, 'w', encoding=encoding) as trg_remove:
        for src_line, trg_line in zip(src_file, trg_file):
            sen_len_ratio = len(trg_line.split()) / len(src_line.split())
            if low <= sen_len_ratio <= high:
                src_remove.write(src_line)
                trg_remove.write(trg_line)
    print('=================================================')
    after_lines_cnt = reader.count_lines(src_fname+suffix, encoding)
    assert after_lines_cnt == reader.count_lines(trg_fname+suffix, encoding), 'line count does not match...'
    print('before lines count:', raw_lines_cnt)
    print('after remove:', after_lines_cnt)
    print('remove %.2f%% data' % ((raw_lines_cnt - after_lines_cnt) / raw_lines_cnt * 100))
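
A hypothetical call that keeps pairs whose trg/src length ratio lies in [0.5, 2.0]:

remove_len_ratio('corpus.tok.src', 'corpus.tok.trg', low=0.5, high=2.0, encoding='utf-8')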
Example 9
def remove_same_src_trg(src_fname, trg_fname, encoding):
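    # Deduplicate a parallel corpus on exact (src, trg) pairs, keeping the
    # first occurrence of each pair.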
    raw_lines_cnt = reader.count_lines(src_fname, encoding)
    assert raw_lines_cnt == reader.count_lines(trg_fname, encoding), 'lines count does not match...'
    src_after = src_fname+'.removesame'
    trg_after = trg_fname+'.removesame'
    with open(src_fname, 'r', encoding=encoding) as src_file, \
            open(trg_fname, 'r', encoding=encoding) as trg_file, \
            open(src_after, 'w', encoding=encoding) as src_after_file, \
            open(trg_after, 'w', encoding=encoding) as trg_after_file:
        distinct = set()
        for src_line, trg_line in zip(src_file, trg_file):
            # use a tuple key; concatenating src+trg could make distinct
            # pairs collide
            pair = (src_line, trg_line)
            if pair not in distinct:
                distinct.add(pair)
                src_after_file.write(src_line)
                trg_after_file.write(trg_line)
    print('=================================================')
    after_lines_cnt = reader.count_lines(src_after, encoding)
    assert after_lines_cnt == reader.count_lines(trg_after, encoding), 'lines count does not match...'
    print('before lines count:', raw_lines_cnt)
    print('after remove:', after_lines_cnt)
    print('remove %.2f%% data' % ((raw_lines_cnt - after_lines_cnt) / raw_lines_cnt * 100))
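
A hypothetical call (placeholder file names):

remove_same_src_trg('corpus.tok.src', 'corpus.tok.trg', encoding='utf-8')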
Example 10
def main(args):
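    # Word-segment a raw monolingual file line by line and write the
    # space-joined tokens to args.seg_fname.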
    current_func_name = sys._getframe().f_code.co_name
    if args.raw_fname == args.seg_fname:
        print('\n======== In', current_func_name, '========')
        print('raw and segmented file are the same')
        print('nothing to do')
        return
    with open(args.raw_fname, 'r', encoding=args.encoding) as raw_file, \
            open(args.seg_fname, 'w', encoding=args.encoding) as seg_file:
        line_cnt = 0
        for line in raw_file:
            to_write = ' '.join(segment_sen(line, args.char_level,
                                            args.noHMM)) + '\n'
            seg_file.write(to_write)
            line_cnt += 1
            if line_cnt % 100000 == 0:
                print(line_cnt, 'lines have been processed.')

    print('=================================================')
    print(line_cnt, 'lines processed in total.')
    print('raw file line count:',
          reader.count_lines(args.raw_fname, args.encoding))
    print('segmented file line count:',
          reader.count_lines(args.seg_fname, args.encoding))
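
Presumably run as a CLI; a sketch with a hypothetical script name and flag names guessed from the `args` attributes used above:

python segment.py --raw_fname corpus.raw --seg_fname corpus.seg \
    --char_level --encoding utf-8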
Example 11
def stat_sens_distribution(corpus_fname, short, long, encoding='utf-8'):
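    # Print the corpus's sentence-length histogram in buckets of 10 tokens
    # and echo sentences shorter than `short` or longer than `long`.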
    lines_cnt = reader.count_lines(corpus_fname, encoding)
    print(
        '=================== sentences distribution ========================')
    len_stat = {}
    with open(corpus_fname, 'r', encoding=encoding) as file:
        for line in file:
            sen_len = len(line.split())
            if sen_len < short or sen_len > long:
                print(line)
            sen_len = math.ceil(sen_len / 10) * 10
            len_stat[sen_len] = len_stat.get(sen_len, 0) + 1
    assert lines_cnt == sum(len_stat.values())
    len_stat = dict(sorted(len_stat.items()))
    for length, cnt in len_stat.items():
        print('[' + str(length - 9) + ', ' + str(length) + ']:', cnt)
    tools.plot_bargraph('sentence length distribution', len_stat)
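
A hypothetical call that flags sentences shorter than 1 or longer than 100 tokens:

stat_sens_distribution('corpus.tok.txt', short=1, long=100)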
Example 12
def main(args):
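    # Randomly split training data into args.split_train_num train shards and
    # args.split_heldout_num heldout shards, holding out roughly
    # args.test_samples_num lines in total.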
    current_func_name = sys._getframe().f_code.co_name
    if args.train_data in [args.split_train_dir, args.split_heldout_dir]:
        print('\n======== In', current_func_name, '========')
        print('Raw data and split data path are the same.')
        print('No split.')
        return

    total_train_line_cnt = reader.count_lines(args.train_data, args.encoding)
    train_data = args.train_data
    if os.path.isfile(train_data):
        train_data = [train_data]
    elif os.path.isdir(train_data):
        train_data = tools.get_fnames_under_path(train_data)

    if not os.path.exists(args.split_train_dir):
        os.makedirs(args.split_train_dir)
    if not os.path.exists(args.split_heldout_dir):
        os.makedirs(args.split_heldout_dir)
    if args.test_samples_num != total_train_line_cnt:
        tools.del_file_under_path(args.split_train_dir)
    if args.test_samples_num != 0:
        tools.del_file_under_path(args.split_heldout_dir)

    split_train_files = []
    for i in range(args.split_train_num):
        train_split_file = open(os.path.join(args.split_train_dir,
                                             'train-' + str(i)),
                                'w',
                                encoding=args.encoding)
        split_train_files.append(train_split_file)
    split_heldout_files = []
    for i in range(args.split_heldout_num):
        heldout_split_file = open(os.path.join(args.split_heldout_dir,
                                               'heldout-' + str(i)),
                                  'w',
                                  encoding=args.encoding)
        split_heldout_files.append(heldout_split_file)

    def random_togo(num):
        # pick a uniformly random shard index in [0, num)
        return min(int(rdm.rand() * num), num - 1)

    test_ratio = args.test_samples_num / total_train_line_cnt
    line_cnt = 0
    for fname in train_data:
        with open(fname, 'r', encoding=args.encoding) as file:
            for line in file:
                if rdm.rand() < test_ratio:
                    split_heldout_files[random_togo(
                        args.split_heldout_num)].write(line)
                else:
                    split_train_files[random_togo(
                        args.split_train_num)].write(line)
                line_cnt += 1
                if line_cnt % 10000 == 0:
                    print(line_cnt, 'lines have been processed.')
    print('=================================================')
    print(line_cnt, 'lines processed in total.')

    for file in split_train_files + split_heldout_files:
        file.close()

    print('split train data total samples count:',
          reader.count_lines(args.split_train_dir, args.encoding))
    print('split heldout data total samples count:',
          reader.count_lines(args.split_heldout_dir, args.encoding))
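
Presumably run as a CLI; a sketch with placeholder paths and a hypothetical script name (flag names mirror the `args` attributes read above):

python split_shards.py --train_data data/train.txt \
    --split_train_dir shards/train --split_heldout_dir shards/heldout \
    --split_train_num 8 --split_heldout_num 2 \
    --test_samples_num 10000 --encoding utf-8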