Example #1
import tensorflow as tf

# Assumes sentence2int, chinese_parse, english_parse, and _convert_to_example
# are defined elsewhere in the module.
def process(zh, en, zh_vocab, en_vocab, writer_name):
    assert len(zh) == len(en), 'parallel corpus sides must have the same number of lines'
    writer = tf.python_io.TFRecordWriter(writer_name)
    for i in range(len(zh)):
        # Tokenize each side and map its tokens to vocabulary ids.
        zh_line = sentence2int(chinese_parse(zh[i].strip()), zh_vocab)
        en_line = sentence2int(english_parse(en[i].strip()), en_vocab)
        example = _convert_to_example(zh_line, en_line)
        writer.write(example.SerializeToString())
    writer.close()
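The _convert_to_example helper is referenced but not shown in this example. A minimal sketch, assuming each side is packed as an int64 feature list (the feature keys 'zh' and 'en' are assumptions, not part of the original):

import tensorflow as tf

def _convert_to_example(zh_line, en_line):
    # Pack the two id sequences into a tf.train.Example proto.
    def _int64_feature(values):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))
    return tf.train.Example(features=tf.train.Features(feature={
        'zh': _int64_feature(zh_line),  # assumed feature key
        'en': _int64_feature(en_line),  # assumed feature key
    }))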
Example #2

def build_partial_vocab_dict(partial_lines, filename):
    # Count token frequencies for one shard of lines and track the longest sentence.
    vocab_count = {}
    max_len = 0
    # Choose the tokenizer from the language tag in the filename
    # (e.g. 'corpus.zh.txt' selects the Chinese parser).
    if filename.split('.')[-2] == 'zh':
        parse = chinese_parse
    else:
        parse = english_parse
    for i, line in enumerate(partial_lines):
        try:
            line = parse(line)
        except UnicodeError:
            print('unicode error line ' + str(i + 1))
            continue  # skip undecodable lines instead of counting their raw characters
        if len(line) > max_len:
            max_len = len(line)
        for word in line:
            vocab_count[word] = vocab_count.get(word, 0) + 1
    return vocab_count, max_len
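build_partial_vocab_dict only counts one shard; merging the shards and assigning word ids is left to the caller. A minimal sketch of that step, assuming the helper names merge_partial_vocabs and counts_to_vocab and the reserved <pad>/<unk> tokens (none of which appear in the original):

from collections import Counter

def merge_partial_vocabs(partials):
    # partials: iterable of (vocab_count, max_len) pairs from build_partial_vocab_dict.
    total = Counter()
    max_len = 0
    for vocab_count, shard_max in partials:
        total.update(vocab_count)
        max_len = max(max_len, shard_max)
    return total, max_len

def counts_to_vocab(total, min_count=1):
    # Most frequent words get the smallest ids; ids 0 and 1 are reserved
    # for hypothetical <pad> and <unk> tokens.
    vocab = {'<pad>': 0, '<unk>': 1}
    for word, count in total.most_common():
        if count >= min_count:
            vocab[word] = len(vocab)
    return vocab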
Example #3
def en_process(en, num_threads, thread_id, i, en_vocab):
    # Strided sharding: thread t handles lines t, t + num_threads, t + 2*num_threads, ...
    return sentence2int(english_parse(en[i * num_threads + thread_id].strip()), en_vocab)
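A sketch of how en_process might be driven, assuming a thread-pool driver named encode_all (hypothetical, not part of the original) that gives each worker its own stride of the corpus:

from concurrent.futures import ThreadPoolExecutor

def encode_all(en, num_threads, en_vocab):
    # Each worker walks its own stride, mirroring the
    # i * num_threads + thread_id indexing used by en_process.
    def worker(thread_id):
        encoded = []
        i = 0
        while i * num_threads + thread_id < len(en):
            encoded.append(en_process(en, num_threads, thread_id, i, en_vocab))
            i += 1
        return encoded
    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        # Results come back ordered by thread_id.
        return list(pool.map(worker, range(num_threads)))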