Example #1
from random import random

import pytest

from seqtools import collate  # the collate view under test


def test_collate():
    arrs = [[random() for _ in range(100)] for _ in range(3)]
    collated = collate(arrs)
    assert list(iter(collated)) == list(zip(*arrs))
    assert [collated[i] for i in range(len(collated))] == list(zip(*arrs))
    assert list(collated[:-5]) == list(zip(*arrs))[:-5]

    # appending a sequence of a different length must raise
    arrs.append([])
    with pytest.raises(ValueError):
        collate(arrs)
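For reference, collate builds an indexable, zip-like view over equally long sequences; the behaviour exercised by the test above can be summarised with a minimal sketch on toy data:

from seqtools import collate

a = [1, 2, 3]
b = ["x", "y", "z"]

zipped = collate([a, b])
assert len(zipped) == 3                                 # same length as the inputs
assert zipped[1] == (2, "y")                            # random access by index
assert list(zipped) == [(1, "x"), (2, "y"), (3, "z")]   # iterates like zip(a, b)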
Example #2
    def _preload_data(self, file_path, batch_size=1000, max_len=None):
        seqs = seq_all(file_path)
        codes, docs = seqs["codes"], seqs["docs"]
        tok_codes = sq.smap(tokenize_plus(self.tokenizer, max_len, True),
                            codes)
        tok_docs = sq.smap(tokenize_plus(self.tokenizer, max_len, True), docs)
        return sq.collate([tok_codes, tok_docs])
Example #3
    def _preload_data(self,
                      file_path,
                      label_file_path,
                      batch_size=1000,
                      max_len=None):
        seqs = seq_all(file_path)
        codes = seqs["codes"]
        label_index_df = pd.read_pickle(label_file_path)
        piece_labels = label_index_df["label"]
        indexes = label_index_df["index"]
        sample_ids = label_index_df["sample_id"]
        code_pieces = fetch_code_pieces(codes, sample_ids, indexes)

        ast_label_seqs = seq_from_code_ast(seqs)
        sub_codes = ast_label_seqs["sub_code_pieces"]
        labels = ast_label_seqs[self.hparams["snake_params"].label_type]

        # TODO: try a different sampling strategy for sub_codes.
        # code_pieces, piece_labels = sq.concatenate(sub_codes), sq.concatenate(labels)
        # code_pieces = sq.smap(utf8decode, code_pieces)
        tok_codes = sq.smap(tokenize_plus(self.tokenizer, max_len, True),
                            code_pieces)
        tok_piece_labels = sq.smap(label_tokenize(self.label_tokenizer),
                                   piece_labels)
        return sq.collate([tok_codes, tok_piece_labels])
Example #4
    def _preload_data(self, file_path, batch_size=1000, max_len=None):
        seqs = seq_all(file_path)
        codes = seqs["codes"]
        docs = seqs["docs"]
        # TODO: try a different sampling strategy for sub_codes.
        tok_both = sq.smap(tokenize_pair_plus(self.tokenizer, max_len, True),
                           docs, codes)
        # FIXME: the <PAD> token is included among the random-mask candidates here.
        tok_only = sq.smap(lambda x: x["input_ids"], tok_both)
        tok_piece_labels = sq.smap(
            random_mask(list(self.tokenizer.get_vocab().values())), tok_only)
        return sq.collate([tok_both, tok_piece_labels])
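Examples #2 to #4 share the same preloading pattern: map a tokenizer lazily over each field with sq.smap, then pair the fields with sq.collate. A minimal sketch of that pattern (fake_tokenize and the toy codes/docs lists below are illustrative stand-ins, not taken from the examples):

import seqtools as sq

def fake_tokenize(text):
    # Toy stand-in for tokenize_plus(...) above.
    return text.split()

codes = ["def f(): pass", "x = 1"]
docs = ["defines f", "assigns x"]

tok_codes = sq.smap(fake_tokenize, codes)    # lazy element-wise mapping
tok_docs = sq.smap(fake_tokenize, docs)
dataset = sq.collate([tok_codes, tok_docs])  # dataset[i] == (tok_codes[i], tok_docs[i])

assert dataset[0] == (["def", "f():", "pass"], ["defines", "f"])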
Example #5
    def predict_fn(feature_sequences):
        durations = np.array([len(s) for s in feature_sequences[0]])
        step = max_time - 2 * warmup

        # turn sequences into chunks
        chunks = [(i, k, min(d, k + max_time)) for i, d in enumerate(durations)
                  for k in range(0, d - warmup, step)]
        chunked_sequences = []
        for feat in feature_sequences:

            def get_chunk(i, t1, t2, feat_=feat):
                return adjust_length(feat_[i][t1:t2], size=max_time, pad=0)

            chunked_sequences.append(seqtools.starmap(get_chunk, chunks))
        chunked_sequences.append([np.int32(t2 - t1) for _, t1, t2 in chunks])
        chunked_sequences = seqtools.collate(chunked_sequences)

        # turn into minibatches
        null_sample = chunked_sequences[0]
        n_features = len(null_sample)

        def collate(b):
            return [
                np.array([b[i][c] for i in range(batch_size)])
                for c in range(n_features)
            ]

        minibatches = seqtools.batch(chunked_sequences,
                                     batch_size,
                                     pad=null_sample,
                                     collate_fn=collate)
        # minibatches = seqtools.prefetch(
        #     minibatches, max_cached=nworkers * 5, nworkers=nworkers)

        # process
        batched_predictions = seqtools.starmap(predict_batch_fn, minibatches)
        batched_predictions = seqtools.add_cache(batched_predictions)
        chunked_predictions = seqtools.unbatch(batched_predictions, batch_size)

        # recompose
        out = [
            np.empty((d, ) + l_out.output_shape[2:], dtype=np.float32)
            for d in durations
        ]

        for v, (s, start, stop) in zip(chunked_predictions, chunks):
            skip = warmup if start > 0 else 0
            out[s][start + skip:stop] = v[skip:stop - start]

        return out
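The chunk / batch / predict / unbatch round trip used here (and again in Example #7) is easier to see on toy data; a minimal sketch with a doubling lambda standing in for predict_batch_fn:

import numpy as np
import seqtools

chunks = list(np.arange(10, dtype=np.float32))          # 10 toy "chunks"

def to_array(items):                                    # same role as collate() above
    return np.array(list(items))

minibatches = seqtools.batch(chunks, 4, pad=np.float32(0), collate_fn=to_array)
predictions = seqtools.smap(lambda b: b * 2, minibatches)  # lazy per-batch compute
flat = seqtools.unbatch(predictions, 4)                    # back to per-chunk values

# the first len(chunks) entries are real, the rest come from padding
assert list(flat)[:len(chunks)] == [2 * c for c in chunks]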
Example #6
code_tokenized = sq.smap(tokenize(code_token_counter), code_tokens)
doc_tokenized = sq.smap(tokenize(doc_token_counter), doc_tokens)

code_pad = sq.smap(pad_to_1d(code_token_counter, 200), code_tokenized)
doc_pad = sq.smap(pad_to_1d(doc_token_counter, 200), doc_tokenized)

# batch and pad the dataset.

model = CodeQuerySoftmaxBertModel(code_token_counter, doc_token_counter)

opt = torch.optim.Adam(model.parameters())

from tqdm import trange, tqdm

train_data = sq.collate([doc_pad, code_pad])

for epoch in trange(50):
    for query_batch, code_batch in tqdm(
            torch.utils.data.DataLoader(train_data)):
        opt.zero_grad()
        query_embeddings, code_embeddings, losses, reciprocal_rank, mrr = model(
            query_batch, code_batch)
        losses.mean().backward()
        opt.step()

# TODO: add validation and early stopping.

# TODO: add evaluation on the test dataset.

test_code_pad, test_doc_pad = ...  # previous
Example #7
def transfer_feat_seqs(transfer_from, freeze_at):
    import theano
    import theano.tensor as T
    import lasagne
    from sltools.nn_utils import adjust_length
    from experiments.utils import reload_best_hmm, reload_best_rnn

    report = shelve.open(os.path.join(cachedir, transfer_from))

    if report['meta']['modality'] == "skel":
        source_feat_seqs = [skel_feat_seqs]
    elif report['meta']['modality'] == "bgr":
        source_feat_seqs = [bgr_feat_seqs]
    elif report['meta']['modality'] == "fusion":
        source_feat_seqs = [skel_feat_seqs, bgr_feat_seqs]
    else:
        raise ValueError("unexpected modality: " + report['meta']['modality'])

    # no computation required
    if freeze_at == "inputs":
        return source_feat_seqs

    # reuse cached features
    dump_file = os.path.join(
        cachedir,
        report['meta']['experiment_name'] + "_" + freeze_at + "feats.npy")
    if os.path.exists(dump_file):
        boundaries = np.stack(
            (np.cumsum(durations) - durations, np.cumsum(durations)), axis=1)
        return [split_seq(np.load(dump_file, mmap_mode='r'), boundaries)]

    # reload model
    if report['meta']['model'] == "hmm":
        _, recognizer, _ = reload_best_hmm(report)
        l_in = recognizer.posterior.l_in
        if freeze_at == "embedding":
            l_feats = recognizer.posterior.l_feats
        elif freeze_at == "logits":
            l_feats = recognizer.posterior.l_raw
        elif freeze_at == "posteriors":
            l_feats = lasagne.layers.NonlinearityLayer(
                recognizer.posterior.l_out, T.exp)
        else:
            raise ValueError("unexpected freeze_at value: " + freeze_at)
        batch_size, max_time, *_ = l_in[0].output_shape  # TODO: fragile
        warmup = recognizer.posterior.warmup

    else:
        _, model_dict, _ = reload_best_rnn(report)
        l_in = model_dict['l_in']
        l_feats = model_dict['l_feats']
        batch_size, max_time, *_ = l_in[0].output_shape  # TODO: fragile
        warmup = model_dict['warmup']

    feats_var = lasagne.layers.get_output(l_feats, deterministic=True)
    predict_batch_fn = theano.function([l.input_var for l in l_in], feats_var)

    step = max_time - 2 * warmup

    # turn sequences into chunks
    chunks = [(i, k, min(d, k + max_time)) for i, d in enumerate(durations)
              for k in range(0, d - warmup, step)]
    chunked_sequences = []
    for feat in source_feat_seqs:

        def get_chunk(i, t1, t2, feat_=feat):
            return adjust_length(feat_[i][t1:t2], size=max_time, pad=0)

        chunked_sequences.append(seqtools.starmap(get_chunk, chunks))
    chunked_sequences = seqtools.collate(chunked_sequences)

    # turn into minibatches
    null_sample = chunked_sequences[0]
    n_features = len(null_sample)

    def collate(b):
        return [
            np.array([b[i][c] for i in range(batch_size)])
            for c in range(n_features)
        ]

    minibatches = seqtools.batch(chunked_sequences,
                                 batch_size,
                                 pad=null_sample,
                                 collate_fn=collate)
    # minibatches = seqtools.prefetch(minibatches, nworkers=2, max_buffered=10)

    # process
    batched_predictions = seqtools.starmap(predict_batch_fn, minibatches)
    batched_predictions = seqtools.add_cache(batched_predictions)
    chunked_predictions = seqtools.unbatch(batched_predictions, batch_size)

    # recompose
    feat_size = l_feats.output_shape[2:]
    storage = open_memmap(dump_file,
                          'w+',
                          dtype=np.float32,
                          shape=(sum(durations), ) + feat_size)
    subsequences = np.stack(
        [np.cumsum(durations) - durations,
         np.cumsum(durations)], axis=1)
    out_view = seqtools.split(storage, subsequences)

    for v, (s, start, stop) in zip(chunked_predictions, chunks):
        skip = warmup if start > 0 else 0
        out_view[s][start + skip:stop] = v[skip:stop - start]

    return [out_view]
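As a side note, the recomposition step above relies on seqtools.split accepting (start, stop) boundary pairs; a minimal sketch of that behaviour on toy durations:

import numpy as np
import seqtools

durations = [3, 2, 4]
flat = np.arange(sum(durations))
boundaries = np.stack(
    (np.cumsum(durations) - durations, np.cumsum(durations)), axis=1)

views = seqtools.split(flat, boundaries)
assert [len(v) for v in views] == durations
assert list(views[1]) == [3, 4]          # second sequence spans flat[3:5]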