def _build_indices(cls, seq_embedder, sentences, n_trees, save_dir):
    makedirs(save_dir)  # make sure directory exists
    batch_size = 128  # number of sentences to embed at a time
    batches_per_index = 4096  # number of batches per index
    # 128 * 4096 = 524288 sentences per index
    batches = list(chunks(sentences, n=batch_size))
    sharded_batches = list(chunks(batches, n=batches_per_index))
    num_shards = len(sharded_batches)
    embed_dim = seq_embedder.embed_dim
    for s, batches_s in enumerate(sharded_batches):
        print 'Building shard {}/{}'.format(s + 1, num_shards)
        index = cls._init_index(embed_dim)
        i = 0
        for batch in verboserate(batches_s, desc='Embedding sentences (batch_size={})'.format(batch_size)):
            sent_embeds = seq_embedder.embed(batch)
            sent_embeds = sent_embeds.data.cpu().numpy()  # (batch_size, embed_dim)
            for sent_embed in sent_embeds:
                # sent_embed has shape (embed_dim,)
                index.add_item(i, sent_embed)
                i += 1
        with timer('Constructing trees'):
            index.build(n_trees)
        with timer('Saving shard to disk'):
            index.save(cls._shard_path(save_dir, s))
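# `chunks` (imported from gtd.utils elsewhere in this repo) is used throughout this
# code. A minimal sketch of its presumed contract -- yield successive batches of at
# most n items -- for readers without the library (`chunks_sketch` is illustrative,
# not the actual gtd.utils implementation):
def chunks_sketch(iterable, n):
    """Yield successive lists of up to n items from iterable."""
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == n:
            yield batch
            batch = []
    if batch:
        yield batch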
def similar_size_batches(examples, batch_size, size=lambda x: len(x.target_words)):
    """Create similar-sized batches of EditExamples.

    By default, elements with similar len(target_words) are batched together.
    See editor.py / EditExample.

    Args:
        examples (list[EditExample])
        batch_size (int)
        size (Callable[[EditExample], int])

    Returns:
        list[list[EditExample]]
    """
    assert batch_size >= 1
    sorted_examples = sorted(examples, key=size)
    batches = list(chunks(sorted_examples, batch_size))
    random.shuffle(batches)  # in-place

    # report savings
    suboptimal_batches = list(chunks(examples, batch_size))
    total_cost = lambda batches: batch_size * sum(max(size(b) for b in batch) for batch in batches)
    naive_cost = total_cost(suboptimal_batches)
    improved_cost = total_cost(batches)
    optimal_cost = sum(size(ex) for ex in examples)
    print 'Optimized batches: reduced cost from {naive} (naive) to {improved} ({reduction:.1f}% reduction).\n' \
          'Optimal (batch_size=1) would be {optimal}.'.format(
              naive=naive_cost, improved=improved_cost,
              reduction=100.0 * (naive_cost - improved_cost) / naive_cost,
              optimal=optimal_cost)
    return batches
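# Hypothetical usage sketch: toy examples whose only field is target_words, so the
# default size function applies (ToyExample is illustrative, not part of the codebase).
from collections import namedtuple
ToyExample = namedtuple('ToyExample', ['target_words'])
toy = [ToyExample(['w'] * n) for n in [3, 17, 4, 16, 5, 15]]
toy_batches = similar_size_batches(toy, batch_size=2)
# lengths within each batch are close, e.g. [[3, 4], [5, 15], [16, 17]] up to shuffling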
def _compute_metrics(cls, model, ts, examples, eval_size=1000, batch_size=256):
    examples_ = sample_if_large(examples, max_size=eval_size)

    # compute loss (weighted average over batches)
    losses, weights = [], []
    for batch in chunks(examples_, batch_size):
        batch_loss = model.loss(batch, ts)
        losses.append(batch_loss.data[0])
        weights.append(len(batch))
    losses, weights = np.array(losses), np.array(weights)
    loss = np.sum(losses * weights) / np.sum(weights)

    # compute perplexity
    entropy = 0.0
    num_words = 0
    for batch in chunks(examples_, batch_size):
        losses = model.per_instance_losses(batch)  # -log_e p(x)
        losses = losses.data.cpu().numpy()
        losses_log_2 = losses / np.log(2.0)  # change base from e to 2
        # accumulate total entropy and word count (+1 per sentence for the stop token)
        lengths = np.array([len(ex) + 1 for ex in batch])
        entropy += np.sum(losses_log_2)
        num_words += sum(lengths)
    pp = 2.0 ** (entropy / num_words)
    return round(loss, 5), round(pp, 5)
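# The perplexity above is standard per-word perplexity: with total cross-entropy
# H = sum_i -log2 p(x_i) over N evaluated tokens, pp = 2 ** (H / N).
# Tiny numeric sanity check (illustrative values only):
import numpy as np
neg_log2_probs = np.array([1.0, 2.0, 3.0])  # -log2 p for three tokens
assert 2.0 ** (neg_log2_probs.sum() / len(neg_log2_probs)) == 4.0  # 2 ** 2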
def edit(self, examples, max_seq_length=150, beam_size=5, batch_size=64, constrain_vocab=False, verbose=False):
    """Performs edits on a batch of source sentences.

    Args:
        examples (list[EditExample])
        max_seq_length (int): max # timesteps to generate for
        beam_size (int): for beam decoding
        batch_size (int): max number of examples to pass into the RNN decoder at a time.
            The total # examples decoded in parallel = batch_size / beam_size.
        constrain_vocab (bool): default is False
        verbose (bool): show a progress bar while decoding. Default is False.

    Returns:
        beam_list (list[list[list[unicode]]]): a batch of beams
        edit_traces (list[EditTrace])
    """
    self.eval()  # set to evaluation mode, so dropout behaves correctly
    beam_list = []
    edit_traces = []
    batches = chunks(examples, batch_size / beam_size)
    batches = verboserate(batches, desc='Decoding examples') if verbose else batches
    for batch in batches:
        beams, traces = self._edit_batch(batch, max_seq_length, beam_size, constrain_vocab)
        beam_list.extend(beams)
        edit_traces.extend(traces)
    self.train()  # set back to train mode
    return beam_list, edit_traces
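# Hypothetical usage sketch (editor and dev_examples are illustrative names):
# beams, traces = editor.edit(dev_examples, beam_size=5, batch_size=64, verbose=True)
# top_hypotheses = [beam[0] for beam in beams]  # best token sequence per example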
def _compute_metrics(cls, editor, examples, num_evaluate_examples, noiser, batch_size=256, edit_dropout=False, draw_samples=False):
    with random_seed(0):
        sample = sample_if_large(examples, num_evaluate_examples, replace=False)
    if edit_dropout:
        noised_sample = noiser(sample)
    else:
        noised_sample = sample

    # compute loss (the caller logs it to TensorBoard)
    # need to break the sample into batches, in case the sample is too large to fit in GPU memory
    losses, weights = [], []
    for batch in chunks(noised_sample, batch_size):
        weights.append(len(batch))
        loss_var, _, _ = editor.loss(batch, draw_samples)
        losses.append(loss_var.data[0])
    losses, weights = np.array(losses), np.array(weights)
    loss = np.sum(losses * weights) / np.sum(weights)  # weighted average

    # compute BLEU score
    outputs, edit_traces = editor.edit(noised_sample)
    bleus = []
    for ex, output in izip(noised_sample, outputs):
        # outputs is a list (over examples) of beams; each beam is a list (over
        # hypotheses) of token lists (list[unicode]). output[0] is the top hypothesis.
        bleus.append(bleu(ex.target_words, output[0]))
    avg_bleu = np.mean(bleus)
    return loss, avg_bleu, edit_traces
def output_file(pickle_path):
    # each input file corresponds to one source file; the earlier pickled form was
    # {(name_of_file, total_line_num): [ExampleLines]}
    write_dir = pathlib2.Path.cwd() / 'github_data' / 'neural_ret_files' / 'train'
    df = pd.read_csv(pickle_path, skiprows=2, header=None, names=[0, 1], dtype=str).fillna(NO_CONTEXT_WORD)
    df[0] = df[0].apply(lambda x: tokenize_fine_grained(x))
    df[1] = df[1].apply(lambda x: tokenize_fine_grained(x))
    max_seq_length = lambda ex: max(max(len(seq) for seq in ex.input_words), len(ex.target_words))
    try:
        ex = list(map(lambda x: EditExample(x[0], x[1]), zip(df[0].tolist(), df[1].tolist())))
        # skip sequences that are too long, because they use up memory
        ex = list(ifilterfalse(lambda x: max_seq_length(x) > 150, ex))
        result = {(str(pickle_path).split('/')[-1], len(ex)): ex}
        k = list(result.keys())
        val = ex
        name, l = k[0]

        # embed the file's lines in order, batch by batch; badly formatted lines were
        # filtered out above, so row idx in new_vecs matches index idx in val below
        new_vecs = None
        for batch in chunks(val, 32):
            encin = ret_model.encode(batch, train_mode=False).data.cpu().numpy()
            new_vecs = np.vstack([new_vecs, encin]) if new_vecs is not None else encin

        # for each embedded line x_i, find its closest neighbors among all lines X of the same file
        ne = NearestNeighbors(10, n_jobs=32, metric='minkowski')
        ne.fit(new_vecs)
        neighbors = ne.kneighbors()[1]

        new_repo = pd.DataFrame(np.array([int(l)] + [None] * 11).reshape(1, -1))
        for idx, row in enumerate(neighbors):
            # drop neighbors within 2 lines of the query line, then keep the closest 5
            filtered_idx = row[np.where((row < (idx - 2)) | (row > (idx + 2)))[0]][:5]
            retrieved_lines = list(pd.DataFrame([(' '.join(val[ret_idx].input_words[0]),
                                                  ' '.join(val[ret_idx].target_words))
                                                 for ret_idx in filtered_idx]).values.flatten())
            full_line = pd.DataFrame(np.array(
                [' '.join(val[idx].input_words[0]), ' '.join(val[idx].target_words)] + retrieved_lines).reshape(1, -1))
            new_repo = pd.concat([new_repo, full_line], axis=0)
        new_repo.to_csv(str(write_dir / pickle_path), header=None, index=None)
    except Exception as e:
        print e
        print 'bad formatting in file ' + str(pickle_path).split('/')[-1]
        print pickle_path
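# Note on the kneighbors() call above: with no arguments, sklearn's
# NearestNeighbors.kneighbors() queries the fitted points against themselves and
# excludes each point from its own neighbor list; [1] selects the index array
# (element [0] would be the distances).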
def ret_and_make_ex(self, input, lsh, ex_list, startat, train_mode=True):
    ret_list = []
    for batch in chunks(input, 128):
        idxlist = self.ret_idx(batch, lsh, train_mode=train_mode)
        ret_tmp = [ex_list[idx[startat]] for idx in idxlist]
        ret_list.extend(ret_tmp)
    return self.make_editexamples(ret_list, input)
def batch_embed(self, exes, train_mode=True):
    ret_list = []
    for batch in chunks(exes, 128):
        encin = self.encode(batch, train_mode).data.cpu().numpy()
        for vec in encin:
            ret_list.append(vec)
    return ret_list
def per_instance_losses(self, examples, draw_samples=False, batch_size=128):
    """Compute per-instance losses."""
    per_instance_loss_list = []
    for batch in chunks(examples, batch_size):
        editor_input = self.preprocess(batch)
        encoder_output = self.encoder(editor_input.encoder_input, draw_samples)
        ilosses = self.train_decoder.per_instance_losses(encoder_output, editor_input.train_decoder_input)
        per_instance_loss_list.extend([loss.data.cpu().numpy()[0] for loss in ilosses])
    return per_instance_loss_list
def edit(self, examples, max_seq_length=35, beam_size=5, batch_size=1024):
    """Add one argument, random_edit_vector, which enforces editing with a random edit vector."""
    logging.debug("Performing an edit on {} examples:\n {}".format(len(examples), examples))
    beam_list = []
    edit_traces = []
    for batch in chunks(examples, batch_size / beam_size):
        beams, traces = self._edit_batch(batch, max_seq_length, beam_size)
        beam_list.extend(beams)
        edit_traces.extend(traces)
    return beam_list, edit_traces
def _recover_sequences(cls, states_over_time, beam_size, top_k):
    # create decoder traces
    ex_idx_to_beam_traces = defaultdict(list)
    for t, states in enumerate(states_over_time):
        assert len(states) % beam_size == 0
        beams = list(chunks(states, beam_size))
        for ex_idx, beam in enumerate(beams):
            trace = BeamTrace(beam, top_k)
            ex_idx_to_beam_traces[ex_idx].append(trace)

    decoder_traces = []
    for ex_idx in range(max(ex_idx_to_beam_traces.keys()) + 1):
        beam_traces = ex_idx_to_beam_traces[ex_idx]
        decoder_traces.append(BeamDecoderTrace(beam_traces))

    final_state_beams = list(chunks(states_over_time[-1], beam_size))
    output_beams = [[state.token_sequence for state in state_beam] for state_beam in final_state_beams]
    return output_beams, decoder_traces
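# Layout assumed above: `states` is flattened example-major, beam-minor. E.g. with
# beam_size=2 and two examples, states = [s_00, s_01, s_10, s_11], so
# chunks(states, 2) yields one beam per example: [[s_00, s_01], [s_10, s_11]].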
def ret_and_make_ex(self, input, lsh, ex_list, startat, train_mode=True):
    ret_list = []
    dist_list = []
    for batch in chunks(input, 128):
        idxlist, dist = self.ret_idx(batch, lsh, train_mode=train_mode)
        ret_tmp = [ex_list[idx[startat]] for idx in idxlist]
        dist_tmp = [d[0] for d in dist]
        ret_list.extend(ret_tmp)
        dist_list.extend(dist_tmp)
    edit_examples = self.make_editexamples(ret_list, input)
    for i, ex in enumerate(edit_examples):
        ex.dist = dist_list[i]
    return edit_examples
def launch(self, example_uids):
    """Launch task.

    Args:
        example_uids (list[str]): list of example_uids to launch the task with
    """
    batches = list(chunks(example_uids, self._batch_size))
    total_hits = len(batches)
    assert isinstance(self._price_per_hit, float)
    total_cost = total_hits * self._price_per_hit
    print('Launching {} HITs (${}). Type Enter to continue.'.format(total_hits, total_cost))
    raw_input()  # wait for user confirmation (Python 2)
    parallel_call(self.create_hit, batches)
def get_vectors(self, tset):
    """
    :param tset: list of training examples
    :return: vec_list (joint encoding) and vec_list_in (context encoding)
    """
    vec_list = []
    vec_list_in = []
    for titem in chunks(tset, 128):
        edit_proc = self.preprocess(titem, volatile=True)
        agenda_out = self.encoder.target_out(edit_proc.encoder_input)
        agenda_in, _ = self.encoder.ctx_code_out(edit_proc.encoder_input)
        amat = agenda_out.data.cpu().numpy()
        amat_in = agenda_in.data.cpu().numpy()
        for i in range(amat.shape[0]):
            avec = amat[i] + amat_in[i]
            anorm = np.linalg.norm(avec)
            vec_list.append(avec / anorm)
            vec_list_in.append(amat_in[i] / np.linalg.norm(amat_in[i]))
    return vec_list, vec_list_in
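# Note: both returned encodings are L2-normalized above, presumably so that inner
# products between vectors equal cosine similarity, which is what downstream
# LSH / nearest-neighbor retrieval over these vectors expects.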
def edit(self, examples, max_seq_length=35, beam_size=10, batch_size=256):
    """Performs edits on a batch of source sentences.

    Args:
        examples (list[EditExample])
        max_seq_length (int): max # timesteps to generate for
        beam_size (int): for beam decoding
        batch_size (int): max number of examples to pass into the RNN decoder at a time.
            The total # examples decoded in parallel = batch_size / beam_size.

    Returns:
        beam_list (list[list[list[unicode]]]): a batch of beams
        edit_traces (list[EditTrace])
    """
    beam_list = []
    edit_traces = []
    for batch in chunks(examples, batch_size / beam_size):
        beams, traces = self._edit_batch(batch, max_seq_length, beam_size)
        beam_list.extend(beams)
        edit_traces.extend(traces)
    return beam_list, edit_traces
def from_sentences(self, query_sentences, k):
    query_embeds = self.seq_embedder.embed(query_sentences)  # (num_queries, embed_dim)
    query_embeds_normed = self.normalize(query_embeds)

    neighbors_dict = defaultdict(list)
    batch_size = 128
    target_batches = list(chunks(self.sentences, n=batch_size))
    for target_batch in verboserate(target_batches, desc='Embedding target sentences (batched)'):
        target_embeds = self.seq_embedder.embed(target_batch)  # (batch_size, embed_dim)
        target_embeds_normed = self.normalize(target_embeds)
        # NOTE: we are actually computing sqrt(2 - 2 * cos(theta)), not theta
        # <a, b> = ||a|| ||b|| cos(theta) = cos(theta) for unit vectors
        cos_thetas_batch = torch.mm(query_embeds_normed, target_embeds_normed.transpose(0, 1))  # (num_queries, batch_size)
        scores_batch = torch.sqrt(2 - 2 * cos_thetas_batch)
        scores_batch = scores_batch.data.cpu().numpy()

        for i, query in enumerate(query_sentences):
            for j, target in enumerate(target_batch):
                score = scores_batch[i, j]
                neighbors_dict[tuple(query)].append((target, score))

    neighbors_batch = []
    for query in query_sentences:
        neighbors = neighbors_dict[tuple(query)]
        # the score is an angular distance, so smaller means more similar:
        # sort ascending and keep the k closest targets
        neighbors = sorted(neighbors, key=lambda pair: pair[1])
        neighbors = neighbors[:k]
        neighbors_batch.append(neighbors)
    return neighbors_batch
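# Sanity check of the distance identity above: for unit vectors a and b,
# ||a - b||^2 = 2 - 2<a, b>, so sqrt(2 - 2*cos(theta)) is their Euclidean distance.
import numpy as np
a = np.array([1.0, 0.0])
b = np.array([0.6, 0.8])  # both unit norm
assert np.isclose(np.linalg.norm(a - b), np.sqrt(2 - 2 * np.dot(a, b)))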
def _compute_metrics(cls, editor, examples, num_evaluate_examples, batch_size):
    """
    Args:
        editor (Editor)
        examples (list[EditExample])
        num_evaluate_examples (int)
        batch_size (int)

    Returns:
        stats (dict[str, float])
        edit_traces (list[EditTrace]): of length num_evaluate_examples
        loss_traces (list[LossTrace]): of length num_evaluate_examples
    """
    sample = sample_if_large(examples, num_evaluate_examples, replace=False)

    # compute loss
    # need to break the sample into batches, in case the sample is too large to fit in GPU memory
    losses, loss_traces, weights, enc_losses = [], [], [], []
    for batch in verboserate(chunks(sample, batch_size), desc='Computing loss on examples'):
        weights.append(len(batch))
        loss_var, loss_trace_batch, enc_loss = editor.loss(batch)
        # convert loss Variable into float
        loss_val = loss_var.data[0]
        assert isinstance(loss_val, float)
        losses.append(loss_val)
        enc_losses.append(enc_loss)
        loss_traces.extend(loss_trace_batch)
    losses, weights = np.array(losses), np.array(weights)
    loss = np.sum(losses * weights) / np.sum(weights)  # weighted average
    enc_loss = np.sum(np.array(enc_losses) * weights) / np.sum(weights)

    # build a translation table that strips all Unicode punctuation
    punct_table = dict.fromkeys(
        i for i in xrange(sys.maxunicode)
        if unicodedata.category(unichr(i)).startswith('P'))

    def remove_punct(s):
        new_s = []
        for t in s:
            t = unicode(t).translate(punct_table)
            if t != '':
                new_s.append(t)
        return new_s

    # metric name -> (scoring function, how to pick the best score within a beam)
    metrics = {
        'bleu': (bleu, max),
        'edit_dist': (lambda s, t: edit_dist(s, t)[0] / len(s) if len(s) > 0 else len(t), min),
        'exact_match': (lambda s, t: 1.0 if remove_punct(s) == remove_punct(t) else 0.0, max),
    }
    top_results = defaultdict(list)
    top5_results = defaultdict(list)

    # compute predictions
    beams, edit_traces = editor.edit(sample, batch_size=batch_size, max_seq_length=150, verbose=True)
    for ex, beam in izip(sample, beams):
        top = beam[0]
        top5 = beam[:5]
        target = ex.target_words
        for name, (fxn, best) in metrics.items():
            top_results[name].append(fxn(top, target))
            top5_results[name].append(best(fxn(predict, target) for predict in top5))

    # compute averages
    stats_top = {name: np.mean(vals) for name, vals in top_results.items()}
    stats_top5 = {'{}_top5'.format(name): np.mean(vals) for name, vals in top5_results.items()}

    # combine into a single stats object
    stats = {'loss': loss, 'enc_loss': enc_loss}
    stats.update(stats_top)
    stats.update(stats_top5)
    return stats, edit_traces, loss_traces
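# Illustration of remove_punct above: in Python 2, unicode.translate with a table
# mapping punctuation code points to None deletes those characters, and tokens that
# were all punctuation are dropped entirely:
# remove_punct([u'hello,', u'world', u'!!']) -> [u'hello', u'world']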
src_dir = os.environ['COPY_EDIT_DATA'] + 'edit_runs/7'  # for codalab
load_expt = RetrieveEditTrainingRun(config, src_dir)

###
# retedit model
import numpy as np

ret_model = load_expt.editor.ret_model
# edit_model = load_expt.editor.edit_model  # not needed: we only care about the retriever here
examples = load_expt._examples

from gtd.utils import chunks
from tqdm import tqdm

new_vecs = []
for batch in tqdm(chunks(examples.train, 32), total=len(examples.train) / 32):
    encin = ret_model.encode(batch, train_mode=False).data.cpu().numpy()
    for vec in encin:
        new_vecs.append(vec)
    del encin

new_lsh = ret_model.make_lsh(new_vecs)

eval_num = 500
validation_files = list((pathlib2.Path.cwd() / 'github_data' / 'processed_repo_pkl').glob('*.pickle'))

# valid_eval = ret_model.ret_and_make_ex(examples.valid[0:eval_num], new_lsh, examples.train, 0, train_mode=False)
# beam_list, edit_traces = edit_model.edit(valid_eval)  # skipped, since we only care about the retriever

###
# other
config = Config.from_file('editor_code/configs/editor/github.txt')
src_dir = os.environ['COPY_EDIT_DATA'] + '/edit_runs/0'  # for codalab
load_expt = RetrieveEditTrainingRun(config, src_dir)

import numpy as np

vae_editor = load_expt.editor.vae_model
ret_model = load_expt.editor.ret_model
edit_model = load_expt.editor.edit_model
examples = load_expt._examples

from gtd.utils import chunks
from tqdm import tqdm

new_vecs = []
for batch in tqdm(chunks(examples.train, 32), total=len(examples.train) / 32):
    encin = ret_model.encode(batch, train_mode=False).data.cpu().numpy()
    for vec in encin:
        new_vecs.append(vec)
    del encin

new_lsh = ret_model.make_lsh(new_vecs)

eval_num = 500
valid_eval = ret_model.ret_and_make_ex(examples.test[0:eval_num], new_lsh, examples.train, 0, train_mode=False)
beam_list, edit_traces = edit_model.edit(valid_eval)
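# Illustrative follow-up (hypothetical): inspect the top hypothesis for each example.
# for ex, beam in zip(valid_eval, beam_list):
#     print ' '.join(beam[0])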
        # (tail of eval_batch_noret; the start of the function falls outside this excerpt)
        voc_vec_rest[in_vocab_id[i]] = 0
        if in_vocab_id[i] == unk_idx:
            gold_rank = np.sum(voc_vec_rest >= voc_vec[copy_token_id[i]])
        else:
            gold_rank = np.sum(voc_vec_rest >= voc_vec[copy_token_id[i]] + voc_vec[in_vocab_id[i]])
        if target_mask[i] == 1.0:
            all_ranks_noret[i].append(gold_rank)
        position += 1
    del token_list
    del vocab_probs
    return all_ranks_noret

all_ranks_noret = []
for chunk in tqdm(chunks(examples.test[0:eval_num], 16), total=eval_num / 16):
    all_ranks_noret.extend(eval_batch_noret(chunk))

###
# base retriever
import gtd.retrieval_func as rf

lsh, ex_dict = rf.make_hash(examples.train)  # renamed from `dict` to avoid shadowing the builtin
output_index = rf.grab_nbs(examples.test[0:eval_num], lsh, ex_dict)
ret_pred = rf.generate_predictions(examples.train, output_index)

def agree_vec(ref, targ):
    rank_vec = []
    for i in range(max(len(ref), len(targ))):
        if i < len(targ) and i < len(ref):
            agree_ind = ref[i] == targ[i]