def __getitem__(self, idx):
    try:
        x = next(pd.read_csv(self.filename,
                             skiprows=idx * self.chunksize + 1,
                             chunksize=self.chunksize,
                             header=None,
                             dtype=str)).fillna(NO_CONTEXT_WORD).values
        # malformed row (wrong column count): fall back to a random valid index
        if len(x[0]) != self.num_cols:
            return self.__getitem__(np.random.randint(0, self.len))
    except Exception:
        # the row contains quoted commas: re-read it with a regex separator
        # and repair the quoting before tokenizing
        x = next(pd.read_csv(self.filename,
                             skiprows=idx * self.chunksize + 1,
                             chunksize=self.chunksize,
                             header=None,
                             sep=r',\s+',
                             quoting=csv.QUOTE_ALL,
                             dtype=str)).fillna(NO_CONTEXT_WORD).values
        x = np.array(fix_quote_strings(x[0, 0]))
    x_tokens = preprocess_tokens(tokenize_fine_grained(x[0, 0]), self.max_dim)
    y_tokens = preprocess_tokens(tokenize_fine_grained(x[0, 1]), self.max_dim)
    return x_tokens, y_tokens
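# A minimal, self-contained sketch of the chunked-read pattern used in __getitem__ above:
# pd.read_csv with `chunksize` returns an iterator, and `skiprows` positions it at the
# idx-th chunk, so only one block of rows is parsed per call. The scratch file name,
# chunk size, and filler value below are illustrative assumptions, not part of the
# original dataset class.
def _sketch_chunked_row_read():
    import pandas as pd

    demo_path = '/tmp/_chunked_read_demo.csv'          # hypothetical scratch file
    pd.DataFrame({'src': ['a', 'b', 'c', 'd'],
                  'tgt': ['w', 'x', 'y', 'z']}).to_csv(demo_path, index=False)

    idx, chunksize = 2, 1
    # skiprows=idx * chunksize + 1 skips the header plus all earlier chunks,
    # and next(...) materializes just the chunk we asked for.
    chunk = next(pd.read_csv(demo_path,
                             skiprows=idx * chunksize + 1,
                             chunksize=chunksize,
                             header=None,
                             dtype=str)).fillna('<none>').values
    return chunk  # array([['c', 'y']], dtype=object)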
def output_file(pickle_path):
    write_dir = pathlib2.Path.cwd() / 'github_data' / 'neural_ret_files' / 'train'
    df = pd.read_csv(pickle_path, skiprows=2, header=None, names=[0, 1],
                     dtype=str).fillna(NO_CONTEXT_WORD)
    df[0] = df[0].apply(lambda x: tokenize_fine_grained(x))
    df[1] = df[1].apply(lambda x: tokenize_fine_grained(x))
    max_seq_length = lambda ex: max(max(len(seq) for seq in ex.input_words),
                                    len(ex.target_words))
    try:
        ex = list(map(lambda x: EditExample(x[0], x[1]),
                      zip(df[0].tolist(), df[1].tolist())))
        # skip sequences that are too long, because they use up memory
        ex = list(ifilterfalse(lambda x: max_seq_length(x) > 150, ex))
        result = {(str(pickle_path).split('/')[-1], len(ex)): ex}
        k = list(result.keys())
        val = ex
        name, l = k[0]

        # encode the file's examples in batches of 32 with the retrieval model
        new_vecs = None
        for batch in chunks(val, 32):
            encin = ret_model.encode(batch, train_mode=False).data.cpu().numpy()
            new_vecs = np.vstack([new_vecs, encin]) if new_vecs is not None else encin

        # for every encoded line x_i, find its nearest neighbours within the same file
        ne = NearestNeighbors(10, n_jobs=32, metric='minkowski')
        ne.fit(new_vecs)
        neighbors = ne.kneighbors()[1]

        new_repo = pd.DataFrame(np.array([int(l)] + [None] * 11).reshape(1, -1))
        for idx, row in enumerate(neighbors):
            # drop neighbours within two lines of the query line, keep the top 5
            filtered_idx = row[np.where((row < (idx - 2)) | (row > (idx + 2)))[0]][:5]
            retrieved_lines = list(pd.DataFrame(
                [(' '.join(val[ret_idx].input_words[0]),
                  ' '.join(val[ret_idx].target_words))
                 for ret_idx in filtered_idx]).values.flatten())
            full_line = pd.DataFrame(np.array(
                [' '.join(val[idx].input_words[0]),
                 ' '.join(val[idx].target_words)] + retrieved_lines).reshape(1, -1))
            new_repo = pd.concat([new_repo, full_line], axis=0)
        new_repo.to_csv(str(write_dir / pickle_path), header=None, index=None)
    except Exception as e:
        print e
        print 'bad formatting in file ' + str(pickle_path).split('/')[-1]
        print pickle_path
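# A self-contained sketch of the retrieval step above: fit NearestNeighbors on the
# encoded lines, then for each line keep up to 5 neighbours that are not within two
# positions of the query line, so a line never retrieves from its own immediate
# context. The random vectors stand in for ret_model encodings; the dimensions and
# counts are illustrative assumptions.
def _sketch_neighbour_filtering():
    import numpy as np
    from sklearn.neighbors import NearestNeighbors

    rng = np.random.RandomState(0)
    encodings = rng.randn(50, 16)                  # 50 "lines", 16-dim stand-in vectors

    ne = NearestNeighbors(n_neighbors=10, metric='minkowski')
    ne.fit(encodings)
    # kneighbors() with no argument queries the training set itself and
    # already excludes each point from its own neighbour list
    neighbors = ne.kneighbors()[1]

    retrieved = []
    for idx, row in enumerate(neighbors):
        # discard neighbours at positions idx-2 .. idx+2, keep the closest 5 of the rest
        filtered_idx = row[np.where((row < (idx - 2)) | (row > (idx + 2)))[0]][:5]
        retrieved.append(filtered_idx)
    return retrieved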
def words2tokens(self, x):
    x_tokens = (preprocess_context(x, self.n_retrieved, self.max_dim)
                if self.retrieve_context
                else preprocess_tokens(tokenize_fine_grained(x[0, 0]), self.max_dim))
    y_tokens = preprocess_tokens(tokenize_fine_grained(x[0, 1]), self.max_dim)
    x_tokens = [word2idx.get(token, UNKNOWN_IDX) for token in x_tokens]
    y_tokens = [word2idx.get(token, UNKNOWN_IDX) for token in y_tokens]
    return x_tokens, y_tokens
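# A tiny sketch of the token -> id lookup used in words2tokens: dict.get with a
# default maps any out-of-vocabulary token to the unknown index. The vocabulary and
# unknown id below are illustrative, not the project's real word2idx table.
def _sketch_token_lookup():
    demo_word2idx = {'<pad>': 0, '<unk>': 1, 'def': 2, 'return': 3}
    unknown_idx = 1
    tokens = ['def', 'foo', '(', ')', ':', 'return']
    return [demo_word2idx.get(t, unknown_idx) for t in tokens]  # [2, 1, 1, 1, 1, 3]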
def __getitem__(self, idx):
    try:
        x = self.read_pandas_line(idx)
        # malformed row (wrong column count): retry with a neighbouring index
        if len(x[0]) != self.num_cols:
            idx = max(0, idx - 1)
            return self.__getitem__(self.len - 1 if idx == 0 else idx)
    except Exception:
        # the row contains quoted commas: re-read and repair the quoting
        x = self.read_pandas_line_quote(idx)
        x = np.array(fix_quote_strings_context(x[0, 0], self.n_retrieved))

    query_x = [
        word2idx.get(token, UNKNOWN_IDX)
        for token in preprocess_tokens(tokenize_fine_grained(x[0, 0]), self.max_dim)
    ]
    support_list_x = []
    support_list_y = []
    for i in range(self.n_retrieved):
        support_list_x.append([
            word2idx.get(token, UNKNOWN_IDX)
            for token in preprocess_tokens(
                tokenize_fine_grained(x[0, i * 2 + 1]), self.max_dim)
        ])
        support_list_y.append([
            word2idx.get(token, UNKNOWN_IDX)
            for token in preprocess_tokens(
                tokenize_fine_grained(x[0, i * 2 + 2]), self.max_dim)
        ])
    query_y = [
        word2idx.get(token, UNKNOWN_IDX)
        for token in preprocess_tokens(tokenize_fine_grained(x[0, -1]), self.max_dim)
    ]

    support_x = torch.LongTensor(
        pd.DataFrame(support_list_x).values.astype('int64'))
    support_y = torch.LongTensor(
        pd.DataFrame(support_list_y).values.astype('int64'))
    query_x = torch.LongTensor(
        pd.DataFrame(query_x).values.astype('int64')).contiguous().view(1, -1)
    query_y = torch.LongTensor(
        pd.DataFrame(query_y).values.astype('int64')).contiguous().view(1, -1)
    return support_x, support_y, query_x, query_y
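# A minimal sketch of the tensor construction at the end of __getitem__ above: each
# retrieved (support) line is a fixed-length list of token ids (preprocess_tokens is
# assumed to pad every line to max_dim), so stacking them through a DataFrame yields
# an (n_retrieved, max_dim) LongTensor, while the query becomes (1, max_dim). The ids
# below are made up for illustration.
def _sketch_support_query_tensors():
    import pandas as pd
    import torch

    support_ids = [[2, 5, 7, 0, 0, 0],     # retrieved line 1, already padded to max_dim=6
                   [2, 9, 4, 3, 0, 0]]     # retrieved line 2
    query_ids = [2, 5, 8, 0, 0, 0]

    support_x = torch.LongTensor(pd.DataFrame(support_ids).values.astype('int64'))
    query_x = torch.LongTensor(
        pd.DataFrame(query_ids).values.astype('int64')).contiguous().view(1, -1)
    return support_x.shape, query_x.shape   # torch.Size([2, 6]), torch.Size([1, 6])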
def threaded_tokenizer(lines, lock, tokens):
    # build a shared token -> index vocabulary; the insert is re-checked under the
    # lock via dict.get, so each token receives exactly one index
    for line in lines:
        line_tokens = tokenize_fine_grained(line)
        for token in line_tokens:
            if token in tokens:
                continue
            lock.acquire()
            tokens[token] = tokens.get(token, len(tokens))
            lock.release()
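# A self-contained sketch of how threaded_tokenizer is meant to be driven: split the
# corpus lines across worker threads that all write into one shared dict guarded by a
# threading.Lock. The whitespace split stands in for tokenize_fine_grained, and the
# worker/launcher names are hypothetical.
def _sketch_threaded_vocab_build():
    import threading

    lines = ['def foo ( x ) :', 'return x + 1', 'def bar ( y ) :', 'return y * 2']
    tokens = {}
    lock = threading.Lock()

    def worker(chunk):
        for line in chunk:
            for token in line.split():          # stand-in tokenizer
                if token in tokens:
                    continue
                with lock:
                    # re-checked under the lock, so each token gets one index
                    tokens[token] = tokens.get(token, len(tokens))

    threads = [threading.Thread(target=worker, args=(lines[i::2],)) for i in range(2)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return tokens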
def threaded_tokenizer(lines, tokenize_lock, tokens, max_line_sz, filepath):
    # variant that counts token frequencies instead of assigning indices;
    # tokenize_lock, max_line_sz and filepath are kept for API compatibility,
    # but the locking and max-line-size tracking are currently disabled
    for line in lines:
        line_tokens = tokenize_fine_grained(line)
        line_tokens = line_tokens[:MAX_TOKENS]
        for token in line_tokens:
            tokens[token] = tokens.get(token, 0) + 1
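# Because the locking above is disabled, concurrent `tokens.get(token, 0) + 1` updates
# from different threads can overwrite each other and drop counts. One common
# alternative (not what the code above does) is to give each thread its own
# collections.Counter and merge them after joining; the sketch below uses a stand-in
# whitespace tokenizer and hypothetical names.
def _sketch_per_thread_counters():
    import threading
    from collections import Counter

    lines = ['a b a', 'b c', 'a c c']
    per_thread = [Counter(), Counter()]

    def worker(chunk, counter):
        for line in chunk:
            counter.update(line.split())        # stand-in tokenizer

    threads = [threading.Thread(target=worker, args=(lines[i::2], per_thread[i]))
               for i in range(2)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return sum(per_thread, Counter())           # merged counts: {'a': 3, 'c': 3, 'b': 2}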
def examples_from_file(data_paths, seq_length_limit, fname):
    examples = {}
    MAX_LINE_LENGTH = 128
    name = '{}.pickle'.format(fname)
    file = pathlib2.Path.cwd() / 'github_data' / 'processed_repo_pkl' / name

    # count total lines before loading
    num_direct = len(data_paths)
    for line in verboserate(data_paths, desc='Reading data file.', total=num_direct):
        df = pd.read_csv(line, skiprows=2, header=None, names=[0, 1],
                         dtype=str).fillna(NO_CONTEXT_WORD)
        df[0] = df[0].apply(lambda x: tokenize_fine_grained(x))
        df[1] = df[1].apply(lambda x: tokenize_fine_grained(x))
        try:
            ex = []
            for i, row in df.iterrows():
                try:
                    ex.append(EditExample(row[0], row[1]))
                except Exception:
                    # badly formatted row: skip it
                    continue
            # skip sequences that are too long, because they use up memory
            ex = list(ifilterfalse(lambda x: max_seq_length(x) > seq_length_limit, ex))

            # pickle this file's examples under processed_repo_pkl/<fname>/<repo>.pickle
            file = pathlib2.Path.cwd() / 'github_data' / 'processed_repo_pkl' / fname
            k = str(line).split('/')[-1].split('.')[0]
            pick_obj = {(str(line).split('/')[-1], len(ex)): ex}
            obj_name = str(file / k) + '.pickle'
            with open(obj_name, 'wb') as f:
                pickle.dump(pick_obj, f)
        except Exception as e:
            print e
            print 'bad formatting in file ' + str(line).split('/')[-1]
            print line
    # examples is only populated when loading from an existing pickle (currently
    # disabled), so this returns an empty list after writing the per-file pickles above
    return list(examples.values())
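# A small sketch of the length filter applied above: examples whose longest input
# sequence or whose target sequence exceeds the limit are dropped before pickling.
# The namedtuple stands in for EditExample, and the limit of 5 is illustrative.
def _sketch_length_filter():
    from collections import namedtuple
    try:
        from itertools import ifilterfalse                   # Python 2
    except ImportError:
        from itertools import filterfalse as ifilterfalse    # Python 3

    Example = namedtuple('Example', ['input_words', 'target_words'])
    max_seq_length = lambda ex: max(max(len(seq) for seq in ex.input_words),
                                    len(ex.target_words))

    examples = [
        Example(input_words=[['a', 'b']], target_words=['c']),      # max length 2: kept
        Example(input_words=[['a'] * 9], target_words=['c', 'd']),  # max length 9: dropped
    ]
    kept = list(ifilterfalse(lambda ex: max_seq_length(ex) > 5, examples))
    return kept   # only the first example survives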