    def __getitem__(self, idx):
        try:
            # Random access into the CSV: skip the header plus every row
            # before this index, then read a single chunk of rows.
            x = next(
                pd.read_csv(self.filename,
                            skiprows=idx * self.chunksize + 1,
                            chunksize=self.chunksize,
                            header=None,
                            dtype=str)).fillna(NO_CONTEXT_WORD).values

            # Malformed row (wrong column count): fall back to a random
            # valid index instead of crashing.
            if len(x[0]) != self.num_cols:
                return self.__getitem__(np.random.randint(0, self.len))
        except Exception:
            # Parsing failed, most likely on quoted strings that contain the
            # separator; retry with a regex separator and full quoting, then
            # repair the quoted fields.
            x = next(
                pd.read_csv(self.filename,
                            skiprows=idx * self.chunksize + 1,
                            chunksize=self.chunksize,
                            header=None,
                            sep=r',\s+',
                            engine='python',  # regex separators need the python engine
                            quoting=csv.QUOTE_ALL,
                            dtype=str)).fillna(NO_CONTEXT_WORD).values

            x = np.array(fix_quote_strings(x[0, 0]))

        x_tokens = preprocess_tokens(tokenize_fine_grained(x[0, 0]),
                                     self.max_dim)
        y_tokens = preprocess_tokens(tokenize_fine_grained(x[0, 1]),
                                     self.max_dim)

        return x_tokens, y_tokens
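# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original class): the random-access pattern
# used above, isolated. `skiprows=idx * chunksize + 1` skips the header plus
# every row before the requested index, and `chunksize` makes read_csv return
# an iterator so only that slice is parsed. File name and fill value are
# illustrative assumptions.
import pandas as pd

def read_row(filename, idx, chunksize=1, fill_value=''):
    reader = pd.read_csv(filename,
                         skiprows=idx * chunksize + 1,
                         chunksize=chunksize,
                         header=None,
                         dtype=str)
    # next() pulls just the requested chunk without loading the whole file.
    return next(reader).fillna(fill_value).values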
def output_file(pickle_path):
    write_dir = pathlib2.Path.cwd() / 'github_data' / 'neural_ret_files' / 'train'

    # Load the (source, target) line pairs and tokenize both columns.
    df = pd.read_csv(pickle_path, skiprows=2, header=None, names=[0, 1], dtype=str).fillna(NO_CONTEXT_WORD)
    df[0] = df[0].apply(tokenize_fine_grained)
    df[1] = df[1].apply(tokenize_fine_grained)
    max_seq_length = lambda ex: max(max(len(seq) for seq in ex.input_words), len(ex.target_words))

    try:
        ex = list(map(lambda x: EditExample(x[0], x[1]), zip(df[0].tolist(), df[1].tolist())))
        # Skip sequences that are too long, because they use up memory.
        ex = list(ifilterfalse(lambda x: max_seq_length(x) > 150, ex))
        result = {(str(pickle_path).split('/')[-1], len(ex)): ex}
        val = ex
        name, l = list(result.keys())[0]

        # Encode the examples in batches of 32 and stack the vectors; each
        # row of new_vecs is the encoding of one line of the file.
        new_vecs = None
        for batch in chunks(val, 32):
            encin = ret_model.encode(batch, train_mode=False).data.cpu().numpy()
            new_vecs = np.vstack([new_vecs, encin]) if new_vecs is not None else encin

        # Fit nearest neighbours over the encoded lines so each line can
        # retrieve the closest other lines in the same file.
        ne = NearestNeighbors(n_neighbors=10, n_jobs=32, metric='minkowski')
        ne.fit(new_vecs)
        neighbors = ne.kneighbors()[1]

        # First row stores the example count; each following row holds the
        # query pair followed by up to five retrieved (source, target) pairs.
        new_repo = pd.DataFrame(np.array([int(l)] + [None] * 11).reshape(1, -1))
        for idx, row in enumerate(neighbors):
            # Drop neighbours within +/-2 lines of the query, keep the top 5.
            filtered_idx = row[np.where((row < (idx - 2)) | (row > (idx + 2)))[0]][:5]
            retrieved_lines = list(pd.DataFrame([(' '.join(val[ret_idx].input_words[0]),
                                                  ' '.join(val[ret_idx].target_words)) for ret_idx in
                                                 filtered_idx]).values.flatten())

            full_line = pd.DataFrame(np.array(
                [' '.join(val[idx].input_words[0]), ' '.join(val[idx].target_words)] + retrieved_lines).reshape(1, -1))
            new_repo = pd.concat([new_repo, full_line], axis=0)

        new_repo.to_csv(str(write_dir / pickle_path), header=None, index=None)

    except Exception as e:
        print(e)
        print('bad formatting in file ' + str(pickle_path).split('/')[-1])
        print(pickle_path)
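# ---------------------------------------------------------------------------
# Hedged, self-contained sketch of the retrieval step above on toy vectors:
# fit nearest neighbours over the encodings, then drop hits within +/-2
# positions of the query so a line never retrieves itself or its immediate
# neighbours. The random vectors stand in for ret_model encodings.
import numpy as np
from sklearn.neighbors import NearestNeighbors

vecs = np.random.rand(100, 16)                     # stand-in for encoded lines
nn = NearestNeighbors(n_neighbors=10).fit(vecs)
neighbors = nn.kneighbors(return_distance=False)   # self already excluded

for idx, row in enumerate(neighbors):
    # keep at most 5 neighbours outside the +/-2 window around the query
    filtered_idx = row[(row < idx - 2) | (row > idx + 2)][:5]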
    def words2tokens(self, x):
        # Tokenize the source (optionally with retrieved context) and the
        # target, then map tokens to vocabulary indices with an UNK fallback.
        if self.retrieve_context:
            x_tokens = preprocess_context(x, self.n_retrieved, self.max_dim)
        else:
            x_tokens = preprocess_tokens(tokenize_fine_grained(x[0, 0]), self.max_dim)
        y_tokens = preprocess_tokens(tokenize_fine_grained(x[0, 1]), self.max_dim)

        x_tokens = [word2idx.get(token, UNKNOWN_IDX) for token in x_tokens]
        y_tokens = [word2idx.get(token, UNKNOWN_IDX) for token in y_tokens]

        return x_tokens, y_tokens
    def __getitem__(self, idx):
        try:
            x = self.read_pandas_line(idx)

            # Malformed row (wrong column count): step back and try a
            # neighbouring index instead of crashing.
            if len(x[0]) != self.num_cols:
                idx = max(0, idx - 1)
                return self.__getitem__(self.len - 1 if idx == 0 else idx)
        except Exception:
            # Parsing failed on quoted fields; re-read with quote handling
            # and repair the quoted strings.
            x = self.read_pandas_line_quote(idx)
            x = np.array(fix_quote_strings_context(x[0, 0], self.n_retrieved))

        # Column 0 is the query source line.
        query_x = [
            word2idx.get(token, UNKNOWN_IDX) for token in preprocess_tokens(
                tokenize_fine_grained(x[0, 0]), self.max_dim)
        ]

        # Columns 1..2*n_retrieved hold the retrieved (source, target) pairs.
        support_list_x = []
        support_list_y = []
        for i in range(self.n_retrieved):
            support_list_x.append([
                word2idx.get(token, UNKNOWN_IDX)
                for token in preprocess_tokens(
                    tokenize_fine_grained(x[0, i * 2 + 1]), self.max_dim)
            ])
            support_list_y.append([
                word2idx.get(token, UNKNOWN_IDX)
                for token in preprocess_tokens(
                    tokenize_fine_grained(x[0, i * 2 + 2]), self.max_dim)
            ])

        # The last column is the query target line.
        query_y = [
            word2idx.get(token, UNKNOWN_IDX) for token in preprocess_tokens(
                tokenize_fine_grained(x[0, -1]), self.max_dim)
        ]

        support_x = torch.LongTensor(
            pd.DataFrame(support_list_x).values.astype('int64'))
        support_y = torch.LongTensor(
            pd.DataFrame(support_list_y).values.astype('int64'))

        query_x = torch.LongTensor(
            pd.DataFrame(query_x).values.astype('int64')).contiguous().view(
                1, -1)
        query_y = torch.LongTensor(
            pd.DataFrame(query_y).values.astype('int64')).contiguous().view(
                1, -1)

        return support_x, support_y, query_x, query_y
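# ---------------------------------------------------------------------------
# Hedged sketch of the tensor assembly above: lists of (already padded)
# token ids become a 2-D LongTensor via a DataFrame round-trip. The token
# ids are toy values, not real vocabulary indices.
import pandas as pd
import torch

support_ids = [[5, 8, 2, 0], [7, 3, 1, 0]]     # n_retrieved x max_dim, toy
support_x = torch.LongTensor(pd.DataFrame(support_ids).values.astype('int64'))

query_ids = [5, 9, 4, 0]                       # max_dim, toy
query_x = torch.LongTensor(
    pd.DataFrame(query_ids).values.astype('int64')).contiguous().view(1, -1)

support_x.shape   # torch.Size([2, 4]) == (n_retrieved, max_dim)
query_x.shape     # torch.Size([1, 4])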
def threaded_tokenizer(lines, lock, tokens):
    # Build a shared token -> index vocabulary; the lock guards insertion of
    # new tokens so two threads cannot hand out the same index.
    for line in lines:
        for token in tokenize_fine_grained(line):
            if token in tokens:
                continue
            with lock:
                tokens[token] = tokens.get(token, len(tokens))
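# ---------------------------------------------------------------------------
# Hedged usage sketch for the vocabulary builder above: shard the lines
# across a few threads that share one dict and one lock. The input path,
# thread count and sharding scheme are illustrative assumptions.
import threading

lines = open('corpus.txt').read().splitlines()     # hypothetical input file
tokens = {}
lock = threading.Lock()
workers = []
n_threads = 4
for i in range(n_threads):
    t = threading.Thread(target=threaded_tokenizer,
                         args=(lines[i::n_threads], lock, tokens))
    t.start()
    workers.append(t)
for t in workers:
    t.join()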
def threaded_tokenizer(lines, tokenize_lock, tokens, max_line_sz, filepath):
    # Count token frequencies into a shared dict, truncating each line to
    # MAX_TOKENS. The per-token lock acquisition was dropped for speed, so
    # concurrent increments may occasionally lose a count.
    for line in lines:
        line_tokens = tokenize_fine_grained(line)[:MAX_TOKENS]
        for token in line_tokens:
            tokens[token] = tokens.get(token, 0) + 1
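# ---------------------------------------------------------------------------
# Hedged alternative sketch (not the original implementation): count tokens
# into a per-thread Counter and merge once under the lock at the end, which
# avoids both per-token locking and the lost-update race noted above.
from collections import Counter

def threaded_tokenizer_counted(lines, tokenize_lock, tokens):
    local = Counter()
    for line in lines:
        local.update(tokenize_fine_grained(line)[:MAX_TOKENS])
    with tokenize_lock:                    # one merge per thread
        for token, count in local.items():
            tokens[token] = tokens.get(token, 0) + count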
def examples_from_file(data_paths, seq_length_limit, fname):
    examples = {}
    # Count the files up front so the progress bar has a total.
    num_direct = len(data_paths)
    for line in verboserate(data_paths,
                            desc='Reading data file.',
                            total=num_direct):
        df = pd.read_csv(line,
                         skiprows=2,
                         header=None,
                         names=[0, 1],
                         dtype=str).fillna(NO_CONTEXT_WORD)
        df[0] = df[0].apply(tokenize_fine_grained)
        df[1] = df[1].apply(tokenize_fine_grained)
        try:
            ex = []
            for i, row in df.iterrows():
                try:
                    ex.append(EditExample(row[0], row[1]))
                except Exception:
                    # Skip badly formatted rows.
                    continue
            # Skip sequences that are too long, because they use up memory.
            ex = list(
                ifilterfalse(
                    lambda x: max_seq_length(x) > seq_length_limit,
                    ex))
            # Write this repo's examples to its own pickle under fname/.
            file = pathlib2.Path.cwd(
            ) / 'github_data' / 'processed_repo_pkl' / fname
            k = str(line).split('/')[-1].split('.')[0]
            pick_obj = {(str(line).split('/')[-1], len(ex)): ex}
            obj_name = str(file / k) + '.pickle'
            with open(obj_name, 'wb') as f:
                pickle.dump(pick_obj, f)
        except Exception as e:
            print(e)
            print('bad formatting in file ' + str(line).split('/')[-1])
            print(line)
    # examples is never populated here: each repo is pickled separately
    # above, so this returns an empty list.
    return list(examples.values())
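# ---------------------------------------------------------------------------
# Hedged driver sketch for examples_from_file: the glob pattern and split
# name are illustrative assumptions; 150 matches the length cut-off used in
# output_file above.
import glob

train_paths = sorted(glob.glob('github_data/repo_csvs/train/*.csv'))  # hypothetical layout
examples_from_file(train_paths, seq_length_limit=150, fname='train')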