def process(zh, en, zh_vocab, en_vocab, writer_name):
    """Encode aligned zh/en sentence lists and write them to a TFRecord file."""
    assert len(zh) == len(en), 'bad data set'
    writer = tf.python_io.TFRecordWriter(writer_name)
    for i in range(len(zh)):
        # Tokenize each sentence pair and map tokens to vocabulary ids.
        zh_line = sentence2int(chinese_parse(zh[i].strip()), zh_vocab)
        en_line = sentence2int(english_parse(en[i].strip()), en_vocab)
        example = _convert_to_example(zh_line, en_line)
        writer.write(example.SerializeToString())
    writer.close()  # flush buffered records; the original never closed the writer
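
# A minimal, hypothetical sketch of driving `process`. The corpus paths and
# the token -> id dict format assumed for zh_vocab/en_vocab are illustrative
# only; `build_partial_vocab_dict` below shows how the raw counts behind such
# a vocabulary are gathered.
def _demo_process(zh_vocab, en_vocab):
    with open('train.zh.txt', encoding='utf-8') as f:
        zh_lines = f.readlines()
    with open('train.en.txt', encoding='utf-8') as f:
        en_lines = f.readlines()
    process(zh_lines, en_lines, zh_vocab, en_vocab, 'train.tfrecords')
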
def build_partial_vocab_dict(partial_lines, filename):
    """Count token frequencies and the longest sentence in one shard of lines."""
    vocab_count = {}
    max_len = 0
    # Choose the tokenizer from the language tag embedded in the filename,
    # e.g. 'train.zh.txt' -> Chinese segmentation, anything else -> English.
    if filename.split('.')[-2] == 'zh':
        parse = chinese_parse
    else:
        parse = english_parse
    # parse = lambda x: pynlpir.segment(x, pos_tagging=False)
    for i, line in enumerate(partial_lines):
        try:
            line = parse(line)
        except UnicodeError:
            print('unicode error line ' + str(i + 1))
            continue  # skip the bad line instead of counting its raw characters
        if len(line) > max_len:
            max_len = len(line)
        for word in line:
            vocab_count[word] = vocab_count.get(word, 0) + 1
    return vocab_count, max_len
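
# The "partial" in build_partial_vocab_dict suggests each worker counts one
# shard of the corpus. Below is a sketch of merging those per-shard results
# into a single frequency table; the helper name and the shard split are
# assumptions, not part of this module.
def merge_vocab_counts(shards, filename):
    total_count = {}
    max_len = 0
    for shard in shards:
        count, shard_max = build_partial_vocab_dict(shard, filename)
        for word, n in count.items():
            total_count[word] = total_count.get(word, 0) + n
        max_len = max(max_len, shard_max)
    return total_count, max_len
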
def en_process(en, num_threads, thread_id, i, en_vocab):
    """Encode the i-th English sentence assigned to this thread.

    Sentences are distributed round-robin: thread `thread_id` owns indices
    thread_id, thread_id + num_threads, thread_id + 2 * num_threads, ...
    """
    return sentence2int(english_parse(en[i * num_threads + thread_id].strip()),
                        en_vocab)
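
# A sketch of the dispatch loop implied by en_process's indexing: with the
# round-robin split, each thread encodes every num_threads-th sentence and
# results land back at their original corpus positions. `en_worker`,
# `_demo_threads`, and the use of threading.Thread are assumptions; the tail
# of a corpus whose length is not divisible by num_threads is dropped here
# for brevity.
import threading

def en_worker(en, num_threads, thread_id, en_vocab, results):
    for i in range(len(en) // num_threads):
        idx = i * num_threads + thread_id  # matches en_process's layout
        results[idx] = en_process(en, num_threads, thread_id, i, en_vocab)

def _demo_threads(en, en_vocab, num_threads=4):
    results = [None] * len(en)
    threads = [threading.Thread(target=en_worker,
                                args=(en, num_threads, t, en_vocab, results))
               for t in range(num_threads)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return results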