Example #1
# `text` and `data` are project-local modules of the ARCT code base; `np` is numpy.
def vectorize(sent):
    # Tokenize the sentence and map each token to its vocab index.
    tokens = text.tokenize(sent)
    vocab = data.vocab()
    ixs = [vocab[t] for t in tokens]
    # Look up the GloVe embedding for each index and sum the rows into a
    # single fixed-size sentence vector.
    embeddings = data.glove()
    _vectors = [embeddings[ix].reshape(1, -1) for ix in ixs]
    return np.concatenate(_vectors, axis=0).sum(axis=0)
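
A minimal usage sketch for `vectorize`, assuming the ARCT project's `text` and `data` modules are importable and that `data.glove()` returns 300-dimensional GloVe vectors (the dimensionality is an assumption):

vec = vectorize('Cats are better than dogs.')
print(vec.shape)  # e.g. (300,): a single summed embedding for the whole sentence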
Example #2
# A method of the project's batching class: `text` is a project-local module,
# `self.vocab` maps tokens to indices, and `self.pad` presumably pads `ixs`
# in place to a common length so the lists form a rectangular batch.
def tokenize_and_lookup(self, sents):
    # Tokenize each sentence and look up the vocab index of every token.
    sents = [text.tokenize(s) for s in sents]
    ixs = [[self.vocab[t] for t in s] for s in sents]
    # Record the original (unpadded) lengths, then pad before tensorizing.
    lens = [len(s) for s in sents]
    self.pad(ixs)
    return torch.LongTensor(ixs), torch.LongTensor(lens)
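
A usage sketch, assuming `batcher` is an instance of the class this method belongs to, with a populated `vocab` dictionary and a `pad` method as described above (both assumptions about the surrounding class):

sents = ['Cats are better than dogs.', 'Dogs drool.']
ixs, lens = batcher.tokenize_and_lookup(sents)
# ixs:  LongTensor of shape (batch_size, max_len) with padded token indices
# lens: LongTensor of shape (batch_size,) with the original sentence lengths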
Example #3
def flatten(list_of_lists):
    # Remove one level of nesting, e.g. [[1, 2], [3]] -> [1, 2, 3].
    return [x for sublist in list_of_lists for x in sublist]


if __name__ == '__main__':
    print('Building ARCT vocab...')

    # grab all sents from all data subsets
    datasets = ['train', 'dev', 'test']
    sent_cols = ['claim', 'reason', 'warrant0', 'warrant1']
    sents = []
    for dataset in datasets:
        df = data.load(dataset)
        for _, row in df.iterrows():
            for col in sent_cols:
                sents.append(row[col])

    # tokenize
    tokens = set(flatten([text.tokenize(s) for s in sents]))

    # build the vocab dictionary
    vocab = dict(zip(tokens, range(len(tokens))))
    rev_vocab = {v: k for k, v in vocab.items()}

    # save the vocab dictionary
    vocab_path = os.path.join(glovar.ARCT_DIR, 'vocab.json')
    rev_vocab_path = os.path.join(glovar.ARCT_DIR, 'rev_vocab.json')
    with open(vocab_path, 'w') as f:
        f.write(json.dumps(vocab))
    with open(rev_vocab_path, 'w') as f:
        f.write(json.dumps(rev_vocab))
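
For completeness, a sketch of reading the saved vocab files back, assuming the same `glovar.ARCT_DIR` setting as in the script above; note that JSON serializes the reverse vocab's integer keys as strings, so they need to be cast back to int:

import json
import os

# `glovar` is the project's globals module, as used in the script above.
with open(os.path.join(glovar.ARCT_DIR, 'vocab.json')) as f:
    vocab = json.load(f)  # token -> index
with open(os.path.join(glovar.ARCT_DIR, 'rev_vocab.json')) as f:
    rev_vocab = {int(k): v for k, v in json.load(f).items()}  # index -> token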