Example #1
    # Re-merge the contraction "n't": when the previous two tokens are "n"
    # and "'" and the current word is "t", collapse them into a single token.
    elif len(result) > 1 and word == "t" and result[-1] == "'" and result[-2] == "n":
        result.pop()
        result[-1] = "n't"
def split_and_convert_to_ints(words_uncorrected, edits):
    # Split both space-separated strings, truncate the edit sequence to the
    # number of words, and parse each edit id as an int.
    words_uncorrected = words_uncorrected.split(' ')
    edits = edits.split(' ')[0:len(words_uncorrected)]
    edits = list(map(int, edits))
    return words_uncorrected, edits
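
A quick usage sketch (the input strings here are made up for illustration, not taken from the project's data files):

words, edits = split_and_convert_to_ints("He go to school", "0 3 0 0 0")
# words -> ['He', 'go', 'to', 'school']
# edits -> [0, 3, 0, 0]  (truncated to len(words), then parsed as ints)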


if __name__=="__main__":

    corrected = []

    pretty.pheader('Reading Input')
    edits = read_file_lines(config.INPUT_EDITS)
    #uncorrected = read_file_lines(config.INPUT_UNCORRECTED)
    words_uncorrected = read_file_lines(config.INPUT_UNCORRECTED_WORDS)

    if len(edits) != len(words_uncorrected):
        pretty.fail('FATAL ERROR: Lengths of edits and uncorrected files not equal')
        exit()

       
    pretty.pheader('Splitting and converting to integers')

    if not DO_PARALLEL:
        for i in tqdm(range(len(edits))):
            edits[i] = list(map(int, edits[i].split(' ')))
            #uncorrected[i] = list(map(int, uncorrected[i].split(' ')))
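
The DO_PARALLEL branch is cut off in this snippet; a minimal sketch of what it plausibly looks like, assuming the same joblib Parallel/delayed pattern used in Example #2 (parse_ints is a hypothetical helper, not a name from the source):

from joblib import Parallel, delayed

def parse_ints(line):
    # Hypothetical helper mirroring the serial loop body above.
    return list(map(int, line.split(' ')))

if DO_PARALLEL:
    edits = Parallel(n_jobs=-1)(delayed(parse_ints)(e) for e in tqdm(edits))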
Example #2
                else:
                    # Check for a replace operation when the transformation
                    # match failed.
                    if q_gram in opcodes.REP:
                        edits[-1] = opcodes.REP[q_gram]
                    else:
                        # Replacement with this q_gram is not supported, so
                        # ignore the replacement and UNDO the delete by
                        # setting edits[-1] back to COPY.
                        edits[-1] = opcodes.CPY
            else:
                # Inserts are merged into earlier diffs, so edits[-1] must be
                # a CPY or a DEL when op[0] == "+"; this branch is unreachable.
                print("This should never occur")
                exit(1)
    return edits
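
For orientation, a minimal sketch of the opcodes interface this function relies on; the numeric ids and the REP entries are illustrative assumptions, not the project's real table:

class opcodes:
    CPY = 1  # copy the source token unchanged
    DEL = 2  # delete the source token
    # REP maps each supported replacement q-gram to a dedicated edit id
    # (q-grams shown as plain strings here for simplicity).
    REP = {'better': 3, 'worse': 4}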


pretty.pheader('Reading Input')
incorrect_lines_generator = generator_based_read_file(FLAGS.incorr_sents,
                                                      'incorrect lines')
correct_lines_generator = generator_based_read_file(FLAGS.correct_sents,
                                                    'correct lines')

with open(FLAGS.incorr_tokens, "w") as ic_toks, \
        open(FLAGS.correct_tokens, "w") as c_toks, \
        open(FLAGS.incorr_token_ids, "w") as ic_tok_ids, \
        open(FLAGS.edit_ids, "w") as e_ids:
    for incorrect_lines, correct_lines in zip(incorrect_lines_generator,
                                              correct_lines_generator):
        processed = Parallel(n_jobs=-1)(delayed(seq2edits)(*s) for s in tqdm(
            zip(incorrect_lines, correct_lines), total=len(incorrect_lines)))

        processed = [p for p in processed if p]  # keep only non-empty results
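
generator_based_read_file itself is not among these snippets; a hedged sketch of the chunked reader it appears to be (the signature and chunk size are assumptions):

def generator_based_read_file(path, desc, chunk_size=1000000):
    # Yield lists of stripped lines so large corpora are processed chunk by
    # chunk instead of being held in memory whole; desc would label progress
    # output in the real implementation (assumption).
    with open(path) as f:
        chunk = []
        for line in f:
            chunk.append(line.rstrip('\n'))
            if len(chunk) == chunk_size:
                yield chunk
                chunk = []
        if chunk:
            yield chunk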
Example #3
def get_tuple(line):
    # Tokenize one raw line with the wordpiece tokenizer and map the
    # resulting tokens to their vocabulary ids.
    tokens = custom_tokenize(line, wordpiece_tokenizer)
    token_ids = wordpiece_tokenizer.convert_tokens_to_ids(tokens)
    #print(tokens)
    #print(token_ids)
    return tokens, token_ids
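
A usage sketch; the wordpieces and ids shown are illustrative, since the real output depends on the loaded vocabulary:

# Hypothetical call; assumes wordpiece_tokenizer has been initialised.
tokens, token_ids = get_tuple("He go to school")
# tokens    -> e.g. ['he', 'go', 'to', 'school']
# token_ids -> the matching vocabulary ids, e.g. [2002, 2175, 2000, 2082]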


def write_output(raw_lines, tokens_file, token_ids_file):
    tuples = Parallel(n_jobs=1)(delayed(get_tuple)(raw_lines[i])
                                for i in tqdm(range(len(raw_lines))))

    for tokens, token_ids in tuples:
        # Write the space-separated tokens and their ids, one line each.
        tokens_file.write(' '.join(tokens))
        token_ids_file.write(' '.join(str(x) for x in token_ids))

        tokens_file.write('\n')
        token_ids_file.write('\n')
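
Note that n_jobs=1 keeps this joblib call effectively serial; presumably per-line tokenization is cheap enough that worker start-up would outweigh any parallel gain, while the Parallel/delayed shape stays consistent with the rest of the pipeline.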


if __name__ == "__main__":
    incorrect_lines = read_file_lines(FLAGS.input, 'incorrect lines')
    with open_w(FLAGS.output_tokens) as tokens_file,\
        open_w(FLAGS.output_token_ids) as token_ids_file:

        pretty.pheader('Tokenizing Incorrect sentences')
        write_output(incorrect_lines, tokens_file, token_ids_file)