def test_collate():
    arrs = [[random() for _ in range(100)] for _ in range(3)]
    collated = collate(arrs)

    assert list(iter(collated)) == list(zip(*arrs))
    assert [collated[i] for i in range(len(collated))] == list(zip(*arrs))
    assert list(collated[:-5]) == list(zip(*arrs))[:-5]

    arrs.append([])
    with pytest.raises(ValueError):
        collate(arrs)
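# Minimal usage sketch of the collate view exercised by the test above
# (toy data; assumes the seqtools package): element i of the collated view is
# the tuple of the i-th items of each input sequence, evaluated lazily.
import seqtools

xs = [0, 1, 2, 3]
ys = ["a", "b", "c", "d"]
pairs = seqtools.collate([xs, ys])
assert len(pairs) == 4
assert pairs[1] == (1, "b")
assert list(pairs) == list(zip(xs, ys))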
def _preload_data(self, file_path, batch_size=1000, max_len=None):
    seqs = seq_all(file_path)
    codes, docs = seqs["codes"], seqs["docs"]
    tok_codes = sq.smap(tokenize_plus(self.tokenizer, max_len, True), codes)
    tok_docs = sq.smap(tokenize_plus(self.tokenizer, max_len, True), docs)
    return sq.collate([tok_codes, tok_docs])
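# Minimal sketch of the lazy element-wise mapping used above (toy data;
# assumes seqtools, aliased `sq` as elsewhere in this file): smap wraps the
# sequence so the function only runs when an item is accessed.
import seqtools as sq

words = ["foo", "bar", "quux"]
lengths = sq.smap(len, words)
assert len(lengths) == 3
assert lengths[2] == 4
assert list(lengths) == [3, 3, 4]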
def _preload_data(self, file_path, label_file_path, batch_size=1000, max_len=None):
    seqs = seq_all(file_path)
    codes = seqs["codes"]

    label_index_df = pd.read_pickle(label_file_path)
    piece_labels = label_index_df["label"]
    indexes = label_index_df["index"]
    sample_ids = label_index_df["sample_id"]
    code_pieces = fetch_code_pieces(codes, sample_ids, indexes)

    ast_label_seqs = seq_from_code_ast(seqs)
    sub_codes, labels = (
        ast_label_seqs["sub_code_pieces"],
        ast_label_seqs[self.hparams["snake_params"].label_type],
    )
    # TODO: try a different sampling strategy for sub_codes.
    # code_pieces, piece_labels = sq.concatenate(sub_codes), sq.concatenate(labels)
    # code_pieces = sq.smap(utf8decode, code_pieces)
    tok_codes = sq.smap(tokenize_plus(self.tokenizer, max_len, True), code_pieces)
    tok_piece_labels = sq.smap(label_tokenize(self.label_tokenizer), piece_labels)
    return sq.collate([tok_codes, tok_piece_labels])
def _preload_data(self, file_path, batch_size=1000, max_len=None):
    seqs = seq_all(file_path)
    codes = seqs["codes"]
    docs = seqs["docs"]
    # TODO: try a different sampling strategy for sub_codes.
    tok_both = sq.smap(tokenize_pair_plus(self.tokenizer, max_len, True), docs, codes)
    # FIXME: <PAD> tokens are included among the candidates for random masking here.
    tok_only = sq.smap(lambda x: x["input_ids"], tok_both)
    tok_piece_labels = sq.smap(
        random_mask(list(self.tokenizer.get_vocab().values())), tok_only)
    return sq.collate([tok_both, tok_piece_labels])
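# Sketch of mapping over two sequences at once, mirroring how
# tokenize_pair_plus is applied to docs and codes above (toy lambda; assumes
# seqtools aliased as `sq`): the mapped function receives one item from each
# input sequence per access.
import seqtools as sq

docs = ["a doc", "another doc"]
codes = ["print(1)", "print(22)"]
paired = sq.smap(lambda d, c: (len(d), len(c)), docs, codes)
assert len(paired) == 2
assert paired[0] == (5, 8)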
def predict_fn(feature_sequences):
    durations = np.array([len(s) for s in feature_sequences[0]])
    step = max_time - 2 * warmup

    # turn sequences into fixed-size overlapping chunks
    chunks = [(i, k, min(d, k + max_time))
              for i, d in enumerate(durations)
              for k in range(0, d - warmup, step)]

    chunked_sequences = []
    for feat in feature_sequences:
        def get_chunk(i, t1, t2, feat_=feat):
            return adjust_length(feat_[i][t1:t2], size=max_time, pad=0)

        chunked_sequences.append(seqtools.starmap(get_chunk, chunks))
    chunked_sequences.append([np.int32(t2 - t1) for _, t1, t2 in chunks])
    chunked_sequences = seqtools.collate(chunked_sequences)

    # turn into minibatches
    null_sample = chunked_sequences[0]
    n_features = len(null_sample)

    def collate(b):
        return [np.array([b[i][c] for i in range(batch_size)])
                for c in range(n_features)]

    minibatches = seqtools.batch(chunked_sequences, batch_size,
                                 pad=null_sample, collate_fn=collate)
    # minibatches = seqtools.prefetch(
    #     minibatches, max_cached=nworkers * 5, nworkers=nworkers)

    # process
    batched_predictions = seqtools.starmap(predict_batch_fn, minibatches)
    batched_predictions = seqtools.add_cache(batched_predictions)
    chunked_predictions = seqtools.unbatch(batched_predictions, batch_size)

    # recompose
    out = [np.empty((d,) + l_out.output_shape[2:], dtype=np.float32)
           for d in durations]
    for v, (s, start, stop) in zip(chunked_predictions, chunks):
        skip = warmup if start > 0 else 0
        out[s][start + skip:stop] = v[skip:stop - start]

    return out
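# Sketch of the batch/unbatch round trip used in the pipeline above (toy
# data; assumes the seqtools package): batch groups consecutive items and
# pads the last group to full size, unbatch exposes the items again.
import numpy as np
import seqtools

items = list(range(10))
batches = seqtools.batch(items, 4, pad=-1, collate_fn=np.array)
assert len(batches) == 3
assert batches[2].tolist() == [8, 9, -1, -1]
flat = seqtools.unbatch(batches, 4)
assert list(flat)[:10] == items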
code_tokenized = sq.smap(tokenize(code_token_counter), code_tokens)
doc_tokenized = sq.smap(tokenize(doc_token_counter), doc_tokens)
code_pad = sq.smap(pad_to_1d(code_token_counter, 200), code_tokenized)
doc_pad = sq.smap(pad_to_1d(doc_token_counter, 200), doc_tokenized)
# batch and pad dataset.

model = CodeQuerySoftmaxBertModel(code_token_counter, doc_token_counter)
opt = torch.optim.Adam(model.parameters())

from tqdm import trange, tqdm

train_data = sq.collate([doc_pad, code_pad])
for epoch in trange(50):
    for query_batch, code_batch in tqdm(
            torch.utils.data.DataLoader(train_data)):
        opt.zero_grad()
        query_embeddings, code_embeddings, losses, reciprocal_rank, mrr = model(
            query_batch, code_batch)
        losses.mean().backward()
        opt.step()

# TODO: add validation and early stopping.
# TODO: add test via test dataset.
test_code_pad, test_doc_pad = ...  # previous
def transfer_feat_seqs(transfer_from, freeze_at):
    import theano
    import theano.tensor as T
    import lasagne
    from sltools.nn_utils import adjust_length
    from experiments.utils import reload_best_hmm, reload_best_rnn

    report = shelve.open(os.path.join(cachedir, transfer_from))

    if report['meta']['modality'] == "skel":
        source_feat_seqs = [skel_feat_seqs]
    elif report['meta']['modality'] == "bgr":
        source_feat_seqs = [bgr_feat_seqs]
    elif report['meta']['modality'] == "fusion":
        source_feat_seqs = [skel_feat_seqs, bgr_feat_seqs]
    else:
        raise ValueError()

    # no computation required
    if freeze_at == "inputs":
        return source_feat_seqs

    # reuse cached features
    dump_file = os.path.join(
        cachedir, report['meta']['experiment_name'] + "_" + freeze_at + "feats.npy")
    if os.path.exists(dump_file):
        boundaries = np.stack(
            (np.cumsum(durations) - durations, np.cumsum(durations)), axis=1)
        return [split_seq(np.load(dump_file, mmap_mode='r'), boundaries)]

    # reload model
    if report['meta']['model'] == "hmm":
        _, recognizer, _ = reload_best_hmm(report)
        l_in = recognizer.posterior.l_in
        if freeze_at == "embedding":
            l_feats = recognizer.posterior.l_feats
        elif freeze_at == "logits":
            l_feats = recognizer.posterior.l_raw
        elif freeze_at == "posteriors":
            l_feats = lasagne.layers.NonlinearityLayer(
                recognizer.posterior.l_out, T.exp)
        else:
            raise ValueError()
        batch_size, max_time, *_ = l_in[0].output_shape  # TODO: fragile
        warmup = recognizer.posterior.warmup
    else:
        _, model_dict, _ = reload_best_rnn(report)
        l_in = model_dict['l_in']
        l_feats = model_dict['l_feats']
        batch_size, max_time, *_ = l_in[0].output_shape  # TODO: fragile
        warmup = model_dict['warmup']

    feats_var = lasagne.layers.get_output(l_feats, deterministic=True)
    predict_batch_fn = theano.function([l.input_var for l in l_in], feats_var)

    step = max_time - 2 * warmup

    # turn sequences into chunks
    chunks = [(i, k, min(d, k + max_time))
              for i, d in enumerate(durations)
              for k in range(0, d - warmup, step)]

    chunked_sequences = []
    for feat in source_feat_seqs:
        def get_chunk(i, t1, t2, feat_=feat):
            return adjust_length(feat_[i][t1:t2], size=max_time, pad=0)

        chunked_sequences.append(seqtools.starmap(get_chunk, chunks))
    chunked_sequences = seqtools.collate(chunked_sequences)

    # turn into minibatches
    null_sample = chunked_sequences[0]
    n_features = len(null_sample)

    def collate(b):
        return [np.array([b[i][c] for i in range(batch_size)])
                for c in range(n_features)]

    minibatches = seqtools.batch(chunked_sequences, batch_size,
                                 pad=null_sample, collate_fn=collate)
    # minibatches = seqtools.prefetch(minibatches, nworkers=2, max_buffered=10)

    # process
    batched_predictions = seqtools.starmap(predict_batch_fn, minibatches)
    batched_predictions = seqtools.add_cache(batched_predictions)
    chunked_predictions = seqtools.unbatch(batched_predictions, batch_size)

    # recompose
    feat_size = l_feats.output_shape[2:]
    storage = open_memmap(dump_file, 'w+', dtype=np.float32,
                          shape=(sum(durations),) + feat_size)
    subsequences = np.stack(
        [np.cumsum(durations) - durations, np.cumsum(durations)], axis=1)
    out_view = seqtools.split(storage, subsequences)

    for v, (s, start, stop) in zip(chunked_predictions, chunks):
        skip = warmup if start > 0 else 0
        out_view[s][start + skip:stop] = v[skip:stop - start]

    return [out_view]
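# Sketch of the split view used for the final recomposition above (toy data;
# assumes the seqtools package): split cuts a flat storage array into
# per-sequence views along (start, stop) boundaries.
import numpy as np
import seqtools

durations = [3, 2, 4]
flat = np.arange(sum(durations))
bounds = np.stack([np.cumsum(durations) - durations, np.cumsum(durations)], axis=1)
views = seqtools.split(flat, bounds)
assert len(views) == 3
assert [len(v) for v in views] == durations
assert list(views[1]) == [3, 4]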