def examples_from_file(path, seq_length_limit):
            """Return list[EditExample] from file path."""
            examples = []

            # count total lines before loading
            total_lines = num_lines(path)

            with codecs.open(path, 'r', encoding='utf-8') as f:
                lnum = 0
                for line in verboserate(f,
                                        desc='Reading data file.',
                                        total=total_lines):
                    split = line.strip().split('\t')
                    lnum += 1
                    input_words = []
                    try:
                        for c in config.source_cols:
                            input_words.append(split[c].split(' '))
                        trg_words = split[config.target_col].split(
                            ' ')  # gold answer
                        assert len(trg_words) > 0
                        ex = EditExample(input_words, trg_words)
                        # skip sequences that are too long, because they use up memory
                        if max_seq_length(ex) > seq_length_limit:
                            continue
                        examples.append(ex)
                    except:
                        print 'bad formatting in line ' + str(lnum)
                        print line

            return examples
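A minimal usage sketch for examples_from_file, assuming a tab-separated data file whose columns match config.source_cols and config.target_col; the import path and file name below are placeholders, not taken from the source.

# Hypothetical usage; the import path and data file name are placeholders.
from data_loading import examples_from_file  # wherever the function above lives

train_examples = examples_from_file('github_data/train.tsv', seq_length_limit=150)
print('loaded {} examples'.format(len(train_examples)))
print(train_examples[0].input_words, train_examples[0].target_words)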
def edit():
    # comma-separated, space-tokenized event strings arrive as a form field
    events = request.form['events']
    events = [event.split() for event in events.split(',')]

    # wrap each event as an EditExample whose inputs are the first token, the
    # second token, and the remaining tokens, with '?' as a placeholder target
    processedEvents = [EditExample([[event[0]], [event[1]], event[2:]], ['?'])
                       for event in tqdm(events)]

    # retrieve prototype examples from the training set and run the edit model
    valid_eval = ret_model.ret_and_make_ex(processedEvents, new_lsh, examples.train, 0, train_mode=False)
    beam_list, edit_traces = edit_model.edit(valid_eval)

    # base retriever (LSH baseline); ret_pred is not used below.
    import gtd.retrieval_func as rf
    lsh, lsh_dict = rf.make_hash(examples.train)
    output_index = rf.grab_nbs(processedEvents, lsh, lsh_dict)
    ret_pred = rf.generate_predictions(examples.train, output_index)

    ####
    # eval code
    gen_out = []
    for i in tqdm(range(len(edit_traces))):
        gen = beam_list[i][0]
        gen_out.append(gen)
        
    dist = []
    prob = []

    for i in tqdm(range(len(edit_traces))):
        dist.append(str(valid_eval[i].dist))
        prob.append(str(edit_traces[i].decoder_trace.candidates[0].prob))
    
    output = [gen_out, dist, prob]
    return json.dumps(output)  # json.dumps already returns a str
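Since the function reads request.form['events'], it presumably serves as a Flask view; a minimal sketch of wiring it up follows, where the app object, route name, and port are assumptions.

# Hypothetical wiring; route name and port are assumptions.
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/edit', 'edit', edit, methods=['POST'])

if __name__ == '__main__':
    # A client POSTs a form field 'events' holding comma-separated, space-tokenized
    # event strings and receives the JSON triple [generations, distances, probabilities].
    app.run(host='0.0.0.0', port=5000)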
def output_file(pickle_path):
    # for pickle_path in tqdm(tr_files, total = len(tr_files)):
    #     with open(str(pickle_path), 'rb') as f:
    #         result = pickle.load(f)# result: {(name_of_file, total_line_num) : [ExampleLines]}
    #     f.close()
    write_dir = pathlib2.Path.cwd() / 'github_data' / 'neural_ret_files' / 'train'

    df = pd.read_csv(pickle_path, skiprows=2, header=None, names=[0, 1], dtype=str).fillna(NO_CONTEXT_WORD)
    df[0] = df[0].apply(lambda x: tokenize_fine_grained(x))
    # df[0] = df[0].apply(lambda x: preprocess_tokens(x, MAX_LINE_LENGTH))
    df[1] = df[1].apply(lambda x: tokenize_fine_grained(x))
    max_seq_length = lambda ex: max(max(len(seq) for seq in ex.input_words), len(ex.target_words))

    try:
        ex = list(map(lambda x: EditExample(x[0], x[1]), zip(df[0].tolist(), df[1].tolist())))
        # skip sequences that are too long, because they use up memory

        ex = list(ifilterfalse(lambda x: max_seq_length(x) > 150, ex))
        # examples[(str(line).split('/')[-1], len(ex))] = ex
        result = {(str(pickle_path).split('/')[-1], len(ex)): ex}
        k = str(pickle_path).split('/')[-1].split('.')[0]

        k = list(result.keys())
        val = ex
        name, l = k[0]

        # try:
        new_vecs = None
        for batch in chunks(val, 32):  # loop over line numbers in file (get batches from file in order)
            # preprocess lines (includes tokenize_fine_grained
            # error checking and remove those lines from grabbing below
            # if line is bad, remove line from v which we use below so that idx below and idx in new_vecs match
            encin = ret_model.encode(batch, train_mode=False).data.cpu().numpy()
            # for vec in encin:
            #     new_vecs.append(vec)
            new_vecs = np.vstack([new_vecs, encin]) if new_vecs is not None else encin  # X --> x_i find closest in X

        ne = NearestNeighbors(n_neighbors=10, n_jobs=32, metric='minkowski')
        ne.fit(new_vecs)
        neighbors = ne.kneighbors()[1]
        new_repo = pd.DataFrame(np.array([int(l)] + [None] * 11).reshape(1, -1))
        for idx, row in enumerate(neighbors):
            filtered_idx = row[np.where((row < (idx - 2)) | (row > (idx + 2)))[0]][:5]
            retrieved_lines = list(pd.DataFrame([(' '.join(val[ret_idx].input_words[0]),
                                                  ' '.join(val[ret_idx].target_words)) for ret_idx in
                                                 filtered_idx]).values.flatten())  # .reshape(1, -1)

            full_line = pd.DataFrame(np.array(
                [' '.join(val[idx].input_words[0]), ' '.join(val[idx].target_words)] + retrieved_lines).reshape(1, -1))
            new_repo = pd.concat([new_repo, full_line], axis=0)
        # new_repo.head()

        new_repo.to_csv(str(write_dir / pickle_path), header=None, index=None)

        # total_threads[0] = total_threads[0] - 1

    except Exception as e:
        print(e)
        print('bad formatting in file ' + str(pickle_path).split('/')[-1])
        print(pickle_path)
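output_file handles one CSV of (source, target) line pairs at a time; below is a sketch of driving it over a directory of such files with a small thread pool. The directory layout and pool size are assumptions, not taken from the source.

# Hypothetical driver; directory layout and pool size are assumptions.
import pathlib2
from multiprocessing.dummy import Pool  # thread pool, since encoding happens on the GPU

csv_dir = pathlib2.Path.cwd() / 'github_data' / 'processed_repo_csv' / 'train'
csv_files = sorted(csv_dir.glob('*.csv'))

pool = Pool(4)
pool.map(output_file, csv_files)
pool.close()
pool.join()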
Example #4
    def make_editexamples(self, proto_list, edit_list):
        # pair the (single) edit example with every prototype: the prototype's inputs
        # and target are appended to the edit inputs; the target stays the edit's target
        example_list = []
        for proto in proto_list:
            el = EditExample(
                edit_list[0].input_words + proto.input_words + [proto.target_words],
                edit_list[0].target_words)
            example_list.append(el)
        return example_list
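A toy illustration of make_editexamples: each prototype is folded into a new EditExample that keeps the edit example's target, so one edit example fans out into len(proto_list) pairs. The tokens below are invented, and `model` stands in for the object owning the method.

# Invented tokens for illustration; `model` is a placeholder for the owning object.
edit_list = [EditExample([['add', 'logging', 'call']], ['logging', 'call', 'added'])]
proto_list = [EditExample([['add', 'print', 'call']], ['print', 'call', 'added'])]

combined = model.make_editexamples(proto_list, edit_list)
# combined[0].input_words == edit inputs + prototype inputs + [prototype target]
# combined[0].target_words == ['logging', 'call', 'added']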
Example #5
def make_eexs(inlist, outlist):
    fline = []
    for instr, outstr in zip(inlist, outlist):
        # collapse the first field into one token, stripping punctuation and math symbols
        cardname = regex.sub(r'[\p{P}\p{Sm}]+', '',
                             ''.join(instr[0].split(' ')))
        i1 = [cardname] + instr[0].split(' ')  # collapsed token followed by the original words
        i2 = instr[1:9]                        # middle fields, taken as one token each
        i3 = instr[9].split(' ')               # final free-text field
        tmp = EditExample([i1, i2, i3], outstr)
        fline.append(tmp)
    return fline
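A toy input for make_eexs showing the expected shape of each entry of inlist: element 0 is a space-separated name, elements 1 through 8 are taken as one token sequence as-is, and element 9 is a space-separated text field. The values are invented.

# Invented toy data; field contents are placeholders.
inlist = [
    ['Lightning Bolt',                              # instr[0]
     'R', 'instant', '', '', '', '', '', '',        # instr[1:9]
     'deal 3 damage to any target'],                # instr[9]
]
outlist = [['deal', '3', 'damage', 'to', 'any', 'target']]

examples = make_eexs(inlist, outlist)
print(examples[0])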
def edit(events):

    x = "with enraged yells <PRP> repeatedly throws Synset('entity.n.01') at <ORGANIZATION>8 Synset('natural_phenomenon.n.01') that seals the Synset('action.n.01') startling <PERSON>14 the Synset('defender.n.01') on Synset('group_action.n.01')".split(
    )
    processedEvents = [
        EditExample([[event[0]], [event[1]], event[2:]], x) for event in events
    ]

    print(processedEvents[0])

    valid_eval = ret_model.ret_and_make_ex(processedEvents,
                                           new_lsh,
                                           examples.train,
                                           0,
                                           train_mode=False)
    beam_list, edit_traces = edit_model.edit(valid_eval)

    # base retriever (LSH baseline); ret_pred is not used below.
    import gtd.retrieval_func as rf
    lsh, lsh_dict = rf.make_hash(examples.train)
    output_index = rf.grab_nbs(processedEvents, lsh, lsh_dict)
    ret_pred = rf.generate_predictions(examples.train, output_index)

    ####
    # eval code
    gen_out = []
    for i in range(len(edit_traces)):
        gen = beam_list[i][0]
        gen_out.append(gen)

    dist = []
    prob = []

    for i in range(len(edit_traces)):
        dist.append(str(valid_eval[i].dist))
        prob.append(str(edit_traces[i].decoder_trace.candidates[0].prob))

    output = {'output': gen_out, 'distances': dist, 'beamProb': prob}
    print(output)
    return output
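A sketch of calling this variant directly; each event must already be split into at least three tokens, since the first two become their own input sequences. The event tokens below are invented.

# Invented event tokens for illustration.
events = [
    ['<PERSON>0', 'throws', "Synset('entity.n.01')", 'at', '<ORGANIZATION>1'],
]
result = edit(events)
print(result['output'][0], result['distances'][0], result['beamProb'][0])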
Example #7
    def interact(self, beam_size=8, constrain_vocab=False, verbose=True):
        ex = EditExample.from_prompt()
        beam_list, edit_traces = self.edit([ex],
                                           beam_size=beam_size,
                                           constrain_vocab=constrain_vocab)
        beam = beam_list[0]
        output_words = beam[0]
        edit_trace = edit_traces[0]

        # nll = lambda example: self.loss([example]).data[0]

        # TODO: make this fully generative in the right way.. current NLL is wrong, disabled for now.
        # compare NLL of correct output and predicted output
        # output_ex = EditExample(ex.source_words, ex.insert_words, ex.delete_words, output_words)
        # gold_nll = nll(ex)
        # output_nll = nll(output_ex)

        print('output:')
        print(' '.join(output_words))

        if verbose:
            # print
            # print 'output NLL: {}, gold NLL: {}'.format(output_nll, gold_nll)
            print(edit_trace)
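A sketch of an interactive session, where `editor` stands in for a trained instance of the class owning interact(); EditExample.from_prompt() presumably reads the example fields from stdin.

# Hypothetical session; `editor` is a placeholder for the trained model object.
editor.interact(beam_size=8, constrain_vocab=False, verbose=True)
# The top beam candidate is printed after 'output:'; with verbose=True the
# decoder's edit trace is printed as well.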
    def input_remapper(self, batch):
        # autoencoding targets: each example's target is its own input sequences, flattened
        flatten = lambda l: [item for sublist in l for item in sublist]
        return [EditExample(input_words=ex.input_words, target_words=flatten(ex.input_words)) for ex in batch]
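A toy illustration of input_remapper, which builds autoencoding targets by flattening each example's own input sequences; the tokens and the `model` variable are invented.

# Invented tokens; `model` is a placeholder for the object owning input_remapper.
batch = [EditExample(input_words=[['fix', 'typo'], ['in', 'readme']], target_words=['?'])]
remapped = model.input_remapper(batch)
# remapped[0].target_words == ['fix', 'typo', 'in', 'readme']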
Example #9
 def examples_from_file(data_paths, seq_length_limit, fname):
     examples = {}
     MAX_LINE_LENGTH = 128
     name = '{}.pickle'.format(fname)
     file = pathlib2.Path.cwd() / 'github_data' / 'processed_repo_pkl' / name
     # if os.path.exists(str(file)):
     #     with open(str(file), 'rb') as f:
     #         examples = pickle.load(f)
     #     f.close()
     #     return list(examples.values())
     # count total lines before loading
     num_direct = len(data_paths)
     for line in verboserate(data_paths,
                             desc='Reading data file.',
                             total=num_direct):
         df = pd.read_csv(line,
                          skiprows=2,
                          header=None,
                          names=[0, 1],
                          dtype=str).fillna(NO_CONTEXT_WORD)
         df[0] = df[0].apply(lambda x: tokenize_fine_grained(x))
         # df[0] = df[0].apply(lambda x: preprocess_tokens(x, MAX_LINE_LENGTH))
         df[1] = df[1].apply(lambda x: tokenize_fine_grained(x))
         try:
             ex = []
             for i, row in df.iterrows():
                 try:
                     ex.append(EditExample(row[0], row[1]))
                 except Exception:
                     # bad formatting in this row; skip it
                     pass
             # skip sequences that are too long, because they use up memory
             # if max_seq_length(ex) > seq_length_limit:
             #     continue
             ex = list(
                 ifilterfalse(
                     lambda x: max_seq_length(x) > seq_length_limit,
                     ex))
             # examples[(str(line).split('/')[-1], len(ex))] = ex
             file = pathlib2.Path.cwd() / 'github_data' / 'processed_repo_pkl' / fname
             k = str(line).split('/')[-1].split('.')[0]
             pick_obj = {(str(line).split('/')[-1], len(ex)): ex}
             obj_name = str(file / k) + '.pickle'
             with open(obj_name, 'wb') as f:
                 pickle.dump(pick_obj, f)
         except Exception as e:
             print(e)
             print('bad formatting in file ' + str(line).split('/')[-1])
             print(line)
     # name = '{}.pickle'.format(fname)
     # file = pathlib2.Path.cwd() / 'github_data' / 'processed_repo_pkl' / name
     # if fname == 'train':
     # file = pathlib2.Path.cwd() / 'github_data' / 'processed_repo_pkl' / fname
     # for k, v in tqdm(examples.items()):
     #     obj_name = file / k[0].split('.')[0]
     #     pick_obj = {k : v}
     #     with open(str(obj_name), 'wb') as f:
     #         pickle.dump(pick_obj, f)
     #     f.close()
     # else:
     #     if not os.path.exists(str(file)):
     #         with open(str(file), 'wb') as f:
     #             pickle.dump(examples, f)
     #         f.close()
     return list(examples.values())
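A sketch of invoking this variant, assuming data_paths is a list of per-repository CSV files; the directory names are placeholders. Note that it writes one pickle per input file and only returns whatever accumulated in `examples`.

# Hypothetical invocation; directory names are placeholders.
import pathlib2

data_dir = pathlib2.Path.cwd() / 'github_data' / 'processed_repo_csv' / 'train'
data_paths = sorted(data_dir.glob('*.csv'))

train_examples = examples_from_file(data_paths, seq_length_limit=150, fname='train')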