        elif len(result) > 1 and word == "t" and result[-1] == "'" and result[-2] == "n":
            result.pop()
            result[-1] = "n't"
'''


def split_and_convert_to_ints(words_uncorrected, edits):
    words_uncorrected = words_uncorrected.split(' ')
    edits = edits.split(' ')[0:len(words_uncorrected)]
    edits = list(map(int, edits))
    return words_uncorrected, edits


if __name__ == "__main__":
    corrected = []

    pretty.pheader('Reading Input')
    edits = read_file_lines(config.INPUT_EDITS)
    #uncorrected = read_file_lines(config.INPUT_UNCORRECTED)
    words_uncorrected = read_file_lines(config.INPUT_UNCORRECTED_WORDS)

    if len(edits) != len(words_uncorrected):
        pretty.fail('FATAL ERROR: Lengths of edits and uncorrected files not equal')
        exit()

    pretty.pheader('Splitting and converting to integers')
    if not DO_PARALLEL:
        for i in tqdm(range(len(edits))):
            edits[i] = list(map(int, edits[i].split(' ')))
            #uncorrected[i] = list(map(int, uncorrected[i].split(' ')))
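# Illustrative call to split_and_convert_to_ints (the values below are made up for
# demonstration; the real strings come from config.INPUT_UNCORRECTED_WORDS and
# config.INPUT_EDITS):
#
#   words, edit_ids = split_and_convert_to_ints("He go to school .", "3 7 3 3 3 3")
#   # words    -> ['He', 'go', 'to', 'school', '.']
#   # edit_ids -> [3, 7, 3, 3, 3]   (one integer edit id per word; extra ids are dropped)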
            else:
                # check for a replace operation if the transformation match failed
                if q_gram in opcodes.REP:
                    edits[-1] = opcodes.REP[q_gram]
                else:
                    # replacement with q_gram is not supported
                    # we ignore the replacement and UNDO the delete by setting edits[-1] to COPY
                    edits[-1] = opcodes.CPY
        else:
            # since inserts are merged in diffs, edits[-1] is either a CPY or a DEL when op[0] == "+"
            print("This should never occur")
            exit(1)

    return edits


pretty.pheader('Reading Input')
incorrect_lines_generator = generator_based_read_file(FLAGS.incorr_sents, 'incorrect lines')
correct_lines_generator = generator_based_read_file(FLAGS.correct_sents, 'correct lines')

with open(FLAGS.incorr_tokens, "w") as ic_toks, \
        open(FLAGS.correct_tokens, "w") as c_toks, \
        open(FLAGS.incorr_token_ids, "w") as ic_tok_ids, \
        open(FLAGS.edit_ids, "w") as e_ids:

    for incorrect_lines, correct_lines in zip(incorrect_lines_generator, correct_lines_generator):
        processed = Parallel(n_jobs=-1)(delayed(seq2edits)(*s) for s in tqdm(
            zip(incorrect_lines, correct_lines), total=len(incorrect_lines)))
        # keep only the pairs that seq2edits successfully converted (drop falsy results)
        processed = [p for p in processed if p]
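# Illustration of the replace-or-copy fallback in seq2edits above (the q-grams are
# hypothetical; the actual opcode values live in the project's opcodes module):
#
#   q_gram in opcodes.REP      -> edits[-1] = opcodes.REP[q_gram]  (emit the replace edit)
#   q_gram not in opcodes.REP  -> edits[-1] = opcodes.CPY          (undo the delete, keep the token)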
    tokens = custom_tokenize(line, wordpiece_tokenizer)
    token_ids = wordpiece_tokenizer.convert_tokens_to_ids(tokens)
    #print(tokens)
    #print(token_ids)
    return tokens, token_ids


def write_output(raw_lines, tokens_file, token_ids_file):
    tuples = Parallel(n_jobs=1)(delayed(get_tuple)(raw_lines[i])
                                for i in tqdm(range(len(raw_lines))))
    for i in range(len(tuples)):
        tokens, token_ids = tuples[i]
        # Write text output
        tokens_file.write(' '.join(tokens))
        token_ids_file.write(' '.join(str(x) for x in token_ids))
        tokens_file.write('\n')
        token_ids_file.write('\n')
    return


if __name__ == "__main__":
    incorrect_lines = read_file_lines(FLAGS.input, 'incorrect lines')

    with open_w(FLAGS.output_tokens) as tokens_file, \
            open_w(FLAGS.output_token_ids) as token_ids_file:
        pretty.pheader('Tokenizing Incorrect sentences')
        write_output(incorrect_lines, tokens_file, token_ids_file)
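# Output format sketch (illustrative; the actual wordpieces depend on the vocabulary
# loaded into wordpiece_tokenizer):
#   FLAGS.output_tokens    gets one line per input sentence: space-joined wordpiece tokens
#   FLAGS.output_token_ids gets one line per input sentence: the matching integer ids,
#                          aligned one-to-one with the tokens line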