Example #1
    def make_iterator(self, train):
        args = self.args
        t1, t2, t3 = [], [], []

        # Segregate training data by its output length
        for row in train:
            out_len = len(row.out)
            if out_len < 100:
                t1.append(row)
            elif out_len < 220:
                t2.append(row)
            else:
                t3.append(row)
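        # One Dataset per output-length bucket, so each bucket can get its own iterator and batch size below.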
        t1_dataset = data.Dataset(t1, self.fields)
        t2_dataset = data.Dataset(t2, self.fields)
        t3_dataset = data.Dataset(t3, self.fields)
        valid = data.TabularDataset(path=args.path.replace("train", "val"),
                                    format='tsv',
                                    fields=self.fields)

        print("Dataset Sizes (t1, t2, t3, valid):", end=' ')
        for dataset in [t1_dataset, t2_dataset, t3_dataset, valid]:
            print(len(dataset.examples), end=' ')

            for row in dataset:
                row.rawent = row.ent.split(" ; ")
                row.ent = self.vectorize_entity(row.ent, self.ENT)
                # row.ent: tuple of ((# of entities in x, max entity len), (# of entities))

                row.rel = self.make_graph(row.rel, len(row.ent[1]))

                row.tgt = row.out
                row.out = [
                    y.split("_")[0] + ">" if "_" in y else y for y in row.out
                ]
                # row.out: removes tag indices from out (e.g. <method_0> => <method>)

            dataset.fields["tgt"] = self.TARGET
            dataset.fields["rawent"] = data.RawField()
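            # explicit is_target so newer torchtext versions treat this raw field as an input, not a target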
            dataset.fields["rawent"].is_target = False

        self.t1_iter = data.Iterator(t1_dataset,
                                     args.t1size,
                                     device=args.device,
                                     sort_key=lambda x: len(x.out),
                                     repeat=False,
                                     train=True)
        self.t2_iter = data.Iterator(t2_dataset,
                                     args.t2size,
                                     device=args.device,
                                     sort_key=lambda x: len(x.out),
                                     repeat=False,
                                     train=True)
        self.t3_iter = data.Iterator(t3_dataset,
                                     args.t3size,
                                     device=args.device,
                                     sort_key=lambda x: len(x.out),
                                     repeat=False,
                                     train=True)
        self.val_iter = data.Iterator(valid,
                                      args.t3size,
                                      device=args.device,
                                      sort_key=lambda x: len(x.out),
                                      sort=False,
                                      repeat=False,
                                      train=False)
Example #2
def load_jester(load_text=True,
                batch_size=1,
                subsample_rate=1.0,
                repeat=False,
                shuffle=True,
                ratings_path='jester_ratings.dat.gz',
                jokes_path='jester_items.clean.dat.gz',
                max_vocab_size=150,
                gpu=False):
    DEV = 0 if gpu else -1
    assert os.path.exists(
        jokes_path), "jokes file %s does not exist!" % jokes_path
    assert os.path.exists(
        ratings_path), "ratings file %s does not exist!" % ratings_path
    text_field = data.Field(lower=True, include_lengths=True, batch_first=True)
    rating_field = data.Field(sequential=False, use_vocab=False)
    user_field = data.Field(sequential=False, use_vocab=False)
    joke_field = data.Field(sequential=False, use_vocab=False)
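    # Only the joke text is tokenized and gets a vocabulary; ratings, user ids and joke ids stay raw integers.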
    if load_text:
        fields = [('text', text_field), ('ratings', rating_field),
                  ('users', user_field), ('jokes', joke_field)]
    else:
        fields = [('ratings', rating_field), ('users', user_field),
                  ('jokes', joke_field)]
    jokes_text = {}
    joke = -1
    all_tokens = []
    with gzip.open(jokes_path) as f:
        for i, line in enumerate(f):
            l = line.decode('utf-8')
            if len(l.strip()) == 0:
                continue
            if l.strip()[-1] == ':':
                joke = int(l.strip().strip(':'))
            else:
                joke_text = l.strip()
                tokens = l.strip().split()
                all_tokens.extend(tokens)
                jokes_text[joke] = joke_text
    counts = Counter(all_tokens)
    most_common = counts.most_common(max_vocab_size)
    most_common = set([item[0] for item in most_common])
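    # Keep only the max_vocab_size most frequent tokens; rarer tokens are dropped from the joke text below.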

    print('Loading Data, this might take several minutes')
    if subsample_rate < 1.0:
        print('Subsampling rate set to %f' % subsample_rate)

    train, val, test = [], [], []
    with gzip.open(ratings_path) as f:
        for i, l in enumerate(f):
            if i % 100000 == 0:
                print('%d lines read' % i)
            user, joke, rating = l.split()
            user = int(user)
            joke = int(joke)
            rating = int(rating)
            if load_text:
                assert joke in jokes_text
                example = Example.fromlist([
                    ' '.join([
                        item for item in jokes_text[joke].split()
                        if item in most_common
                    ]), rating, user, joke
                ], fields)
            else:
                example = Example.fromlist([rating, user, joke], fields)
            p = random.random()
            q = random.random()
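            # Roughly 98/1/1 train/val/test split; subsampling thins only the training portion.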
            if p < 0.98:
                if q < subsample_rate:
                    train.append(example)
            elif p < 0.99:
                val.append(example)
            elif p < 1.0:
                test.append(example)
        train = data.Dataset(train, fields)
        val = data.Dataset(val, fields)
        test = data.Dataset(test, fields)
        train_iter, val_iter, test_iter = data.BucketIterator.splits(
            (train, val, test),
            batch_size=batch_size,
            device=DEV,
            repeat=repeat,
            shuffle=shuffle)
        train_iter.sort_key = lambda p: len(p.text) if hasattr(p, 'text') else 0
        val_iter.sort_key = lambda p: len(p.text) if hasattr(p, 'text') else 0
        test_iter.sort_key = lambda p: len(p.text) if hasattr(p, 'text') else 0

    print('Data Loaded')

    if load_text:
        text_field.build_vocab(train)
        return train_iter, val_iter, test_iter, text_field
    else:
        return train_iter, val_iter, test_iter
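
A minimal usage sketch for the loader above (not part of the original snippet; the batch size, subsample rate, and the presence of the two gzipped Jester files are assumptions):

train_iter, val_iter, test_iter, text_field = load_jester(batch_size=32,
                                                          subsample_rate=0.1,
                                                          gpu=False)
for batch in train_iter:
    text, lengths = batch.text   # include_lengths=True gives (token ids, lengths)
    ratings = batch.ratings
    break                        # inspect a single batch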
Example #3
Tweet = data.Field(sequential=True,
                   tokenize='spacy',
                   batch_first=True,
                   include_lengths=True)
Label = data.LabelField(tokenize='spacy',
                        is_target=True,
                        batch_first=True,
                        sequential=False)
col = df.columns
fields = [(col[0], Tweet), (col[1], Label)]
example = [
    data.Example.fromlist([df.tweets[i], df.labels[i]], fields)
    for i in range(df.shape[0])
]
twitterDataset = data.Dataset(example, fields)
(train, valid) = twitterDataset.split(split_ratio=[0.85, 0.15],
                                      random_state=random.seed(SEED))
# print((len(train), len(valid)))

# print(vars(train.examples[10]))

Tweet.build_vocab(train)
Label.build_vocab(train)

# print('Size of input vocab : ', len(Tweet.vocab))
# print('Size of label vocab : ', len(Label.vocab))
# print('Top 10 words appreared repeatedly :', list(Tweet.vocab.freqs.most_common(10)))
# print('Labels : ', Label.vocab.stoi)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
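
A short follow-up sketch (an assumption, not part of the original snippet) showing how the train/valid splits above would typically be wrapped in BucketIterators; the batch size of 32 is illustrative:

train_iter, valid_iter = data.BucketIterator.splits(
    (train, valid),
    batch_size=32,                                   # illustrative value
    sort_key=lambda x: len(getattr(x, col[0])),      # sort by tweet length
    sort_within_batch=True,
    device=device)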
Example #4
    def __init__(self, args):
        path = '.data/squad'
        train_examples_paths = []
        dev_examples_paths = []
        test_examples_paths = []
        for i in args.dev_files:
            dataset_path = path + '/torchtext/' + i.replace("/", "_") + "/"
            train_examples_paths.append(dataset_path + 'train_examples.pt')
            dev_examples_paths.append(dataset_path + 'dev_examples.pt')
            test_examples_paths.append(dataset_path + 'test_examples.pt')

        print("preprocessing data files...")
        for i in args.dev_files:
            if not os.path.exists('{}/{}l'.format(path, i)):
                self.preprocess_file('{}/{}'.format(path, i))
        for i in args.train_files:
            if not os.path.exists('{}/{}l'.format(path, i)):
                self.preprocess_file('{}/{}'.format(path, i))
        for i in args.test_files:
            if not os.path.exists('{}/{}l'.format(path, i)):
                self.preprocess_file('{}/{}'.format(path, i))

        self.RAW = data.RawField()
        # explicit declaration for torchtext compatibility
        self.RAW.is_target = False
        self.CHAR_NESTING = data.Field(batch_first=True,
                                       tokenize=list,
                                       lower=True)
        self.CHAR = data.NestedField(self.CHAR_NESTING, tokenize=word_tokenize)
        self.WORD = data.Field(batch_first=True,
                               tokenize=word_tokenize,
                               lower=True,
                               include_lengths=True)
        self.LABEL = data.Field(sequential=False,
                                unk_token=None,
                                use_vocab=False)

        dict_fields = {
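            # maps JSON keys to (attribute name, Field); context and question each feed a word-level and a char-level field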
            'id': ('id', self.RAW),
            's_idx': ('s_idx', self.LABEL),
            'e_idx': ('e_idx', self.LABEL),
            'context': [('c_word', self.WORD), ('c_char', self.CHAR)],
            'question': [('q_word', self.WORD), ('q_char', self.CHAR)]
        }

        list_fields = [('id', self.RAW), ('s_idx', self.LABEL),
                       ('e_idx', self.LABEL), ('c_word', self.WORD),
                       ('c_char', self.CHAR), ('q_word', self.WORD),
                       ('q_char', self.CHAR)]

        self.train = []
        self.dev = []
        self.test = []

        if all([
                os.path.exists(i) for i in train_examples_paths +
                dev_examples_paths + test_examples_paths
        ]):
            print("loading splits...")
            for i in train_examples_paths:
                examples = torch.load(i)
                print(i, ":", len(examples))
                self.train.append(
                    data.Dataset(examples=examples, fields=list_fields))
            for i in dev_examples_paths:
                examples = torch.load(i)
                print(i, ":", len(examples))
                self.dev.append(
                    data.Dataset(examples=examples, fields=list_fields))
            for i in test_examples_paths:
                examples = torch.load(i)
                print(i, ":", len(examples))
                self.test.append(
                    data.Dataset(examples=examples, fields=list_fields))
        else:
            print("building splits...")
            for train_path, dev_path, test_path, i in zip(
                    args.train_files, args.dev_files, args.test_files,
                    range(0, len(args.train_files))):
                train, dev, test = data.TabularDataset.splits(
                    path=path,
                    train='{}l'.format(train_path),
                    validation='{}l'.format(dev_path),
                    test='{}l'.format(test_path),
                    format='json',
                    fields=dict_fields)

                try:
                    os.makedirs(os.path.dirname(train_examples_paths[i]))
                except FileExistsError:
                    pass
                torch.save(train.examples, train_examples_paths[i])
                torch.save(dev.examples, dev_examples_paths[i])
                torch.save(test.examples, test_examples_paths[i])
                self.train.append(train)
                self.dev.append(dev)
                self.test.append(test)

        #cut overly long contexts from the training set for efficiency.
        if args.context_threshold > 0:
            for i in range(0, len(self.train)):
                print(len(self.train[i].examples))
                self.train[i].examples = [
                    e for e in self.train[i].examples
                    if len(e.c_word) <= args.context_threshold
                ]
                print(len(self.train[i].examples))
            # self.other_train.examples = [e for e in self.other_train.examples if len(e.c_word) <= args.context_threshold]

        print("building vocab...")
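        # Vocabularies are built over every train/dev/test split; GloVe vectors are attached to the word vocabulary only.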
        self.CHAR.build_vocab(*self.train, *self.dev, *self.test)
        self.WORD.build_vocab(*self.train,
                              *self.dev,
                              *self.test,
                              vectors=GloVe(name='6B', dim=args.word_dim))
        print("CHAR SIZE", len(self.CHAR.vocab))
        print("WORD SIZE", len(self.WORD.vocab))
        print("building iterators...")
        device = torch.device(
            "cuda:{}".format(args.gpu) if torch.cuda.is_available() else "cpu")

        def train_bucket_iter(train):
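            # pop(0) releases each Dataset from the list as soon as its iterator is built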
            for i in range(0, len(train)):
                yield data.BucketIterator(train.pop(0),
                                          batch_size=args.train_batch_size,
                                          device=device,
                                          repeat=True,
                                          shuffle=True,
                                          sort_key=lambda x: len(x.c_word))

        self.train_iter = train_bucket_iter(self.train)

        def dev_bucket_iter(dev):
            for i in range(0, len(dev)):
                yield data.BucketIterator(dev.pop(0),
                                          batch_size=args.dev_batch_size,
                                          device=device,
                                          repeat=False,
                                          sort_key=lambda x: len(x.c_word))

        self.dev_iter = dev_bucket_iter(self.dev)

        def test_bucket_iter(test):
            for i in range(0, len(test)):
                yield data.BucketIterator(test.pop(0),
                                          batch_size=args.test_batch_size,
                                          device=device,
                                          repeat=False,
                                          sort_key=lambda x: len(x.c_word))

        self.test_iter = test_bucket_iter(self.test)
Example #5
def get_input_processor_words(inputs, type_model, vocab_word, vocab_char):
    if "one_sequence" in type_model:

        inputs_word = data.Field(init_token="<bos>",
                                 eos_token="<eos>",
                                 batch_first=True,
                                 include_lengths=True)

        inputs_char_nesting = data.Field(tokenize=list,
                                         init_token="<bos>",
                                         eos_token="<eos>",
                                         batch_first=True)

        inputs_char = data.NestedField(inputs_char_nesting,
                                       init_token="<bos>",
                                       eos_token="<eos>")

        inputs_word.vocab = vocab_word
        inputs_char.vocab = inputs_char_nesting.vocab = vocab_char
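        # A tuple of names paired with a tuple of fields makes both fields consume the same input column.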
        fields = [(('inputs_word', 'inputs_char'), (inputs_word, inputs_char))]

        if not isinstance(inputs, list):
            inputs = [inputs]

        examples = []

        for line in inputs:
            examples.append(data.Example.fromlist([line], fields))

        dataset = data.Dataset(examples, fields)
        batchs = data.Batch(
            data=dataset,
            dataset=dataset,
            device=torch.device(
                "cuda:0" if torch.cuda.is_available() else "cpu"))

    else:
        inputs_word_query = data.Field(init_token="<bos>",
                                       eos_token="<eos>",
                                       batch_first=True,
                                       include_lengths=True)
        inputs_char_query_nesting = data.Field(tokenize=list,
                                               init_token="<bos>",
                                               eos_token="<eos>",
                                               batch_first=True)
        inputs_char_query = data.NestedField(inputs_char_query_nesting,
                                             init_token="<bos>",
                                             eos_token="<eos>")

        inputs_word_document = data.Field(init_token="<bos>",
                                          eos_token="<eos>",
                                          batch_first=True,
                                          include_lengths=True)
        inputs_char_document_nesting = data.Field(tokenize=list,
                                                  init_token="<bos>",
                                                  eos_token="<eos>",
                                                  batch_first=True)
        inputs_char_document = data.NestedField(inputs_char_document_nesting,
                                                init_token="<bos>",
                                                eos_token="<eos>")

        fields = ([(('inputs_word_query', 'inputs_char_query'),
                    (inputs_word_query, inputs_char_query)),
                   (('inputs_word_document', 'inputs_char_document'),
                    (inputs_word_document, inputs_char_document))])

        inputs_word_query.vocab = inputs_word_document.vocab = vocab_word
        inputs_char_query.vocab = inputs_char_query_nesting.vocab = \
            inputs_char_document_nesting.vocab = inputs_char_document.vocab = vocab_char

        # print(vocab_word.stoi)
        # print(vocab_char.stoi)

        if not isinstance(inputs, list):
            inputs = [inputs]

        examples = []

        for line in inputs:
            tuple_line = line.split("\t")
            example = data.Example.fromlist(tuple_line, fields)
            examples.append(example)

        dataset = data.Dataset(examples, fields)
        batchs = data.Batch(
            data=dataset,
            dataset=dataset,
            device=torch.device(
                "cuda:0" if torch.cuda.is_available() else "cpu"))
    # Entire input in one batch
    return batchs
Example #6
def word_vector_features():
    from torchtext import data

    def word_mover_distance(model, row, granularity='w'):
        if granularity == 'w':
            q1, q2 = row.q1_wid, row.q2_wid
            return model.wmdistance(q1.split(), q2.split())
        else:
            q1, q2 = row.q1_cid, row.q2_cid
            return model.wmdistance(q1.split(), q2.split())

    # Word Mover Distance
    filepath = f'../data/word_vectors.txt'
    tmppath = f'../data/gensim_tmp_word_vector.txt'
    if not os.path.exists(tmppath):
        glove2word2vec(filepath, tmppath)
    word_model = KeyedVectors.load_word2vec_format(tmppath)

    filepath = f'../data/char_vectors.txt'
    tmppath = f'../data/gensim_tmp_char_vector.txt'
    if not os.path.exists(tmppath):
        glove2word2vec(filepath, tmppath)
    char_model = KeyedVectors.load_word2vec_format(tmppath)

    word_wmd = [
        word_mover_distance(word_model, row, 'w')
        for row in concat_df.itertuples(index=False)
    ]
    char_wmd = [
        word_mover_distance(char_model, row, 'c')
        for row in concat_df.itertuples(index=False)
    ]

    # tf-idf weighted word vector as sentence representation
    # then calculate cosine similarity, l1-norm, l2-norm
    word_embedding_path = '../data/word_vectors.txt'
    char_embedding_path = '../data/char_vectors.txt'
    cache = '../cache'
    word_vectors = Vectors(word_embedding_path, cache)
    char_vectors = Vectors(char_embedding_path, cache)
    word_vectors.unk_init = lambda x: init.uniform_(x, -0.05, 0.05)
    char_vectors.unk_init = lambda x: init.uniform_(x, -0.05, 0.05)
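    # Out-of-vocabulary tokens are initialized with small uniform random vectors instead of zeros.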
    wordTEXT = data.Field(batch_first=True)
    charTEXT = data.Field(batch_first=True)

    fields = [('q1_word', wordTEXT), ('q2_word', wordTEXT),
              ('q1_char', charTEXT), ('q2_char', charTEXT)]
    examples = [
        data.Example.fromlist(row, fields)
        for row in concat_df.itertuples(index=False)
    ]
    dataset = data.Dataset(examples, fields)

    wordTEXT.build_vocab(dataset, min_freq=1, vectors=word_vectors)
    charTEXT.build_vocab(dataset, min_freq=1, vectors=char_vectors)

    word_embedding = wordTEXT.vocab.vectors
    char_embedding = charTEXT.vocab.vectors

    num_word = word_embedding.size(0)
    num_char = char_embedding.size(0)

    word_index2idf = np.zeros(num_word)
    char_index2idf = np.zeros(num_char)

    word_counter = Counter()
    char_counter = Counter()
    for wid in question_df['wid']:
        word_counter.update(wid.split())
    for cid in question_df['cid']:
        char_counter.update(cid.split())

    N = len(concat_df)

    # 0 --> <unk>
    # 1 --> <pad>
    # start from 2
    for i in range(2, num_word):
        word = wordTEXT.vocab.itos[i]
        idf = np.log(N / word_counter[word])
        word_index2idf[i] = idf

    for i in range(2, num_char):
        char = charTEXT.vocab.itos[i]
        idf = np.log(N / char_counter[char])
        char_index2idf[i] = idf

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    word_idf = torch.tensor(word_index2idf, dtype=torch.float32).to(device)
    char_idf = torch.tensor(char_index2idf, dtype=torch.float32).to(device)

    word_embedder = nn.Embedding.from_pretrained(word_embedding).to(device)
    char_embedder = nn.Embedding.from_pretrained(char_embedding).to(device)

    word_similarity, char_similarity = [], []
    word_l1, char_l1 = [], []
    word_l2, char_l2 = [], []

    batch_iter = data.BucketIterator(dataset,
                                     1024,
                                     sort_key=None,
                                     shuffle=False,
                                     device=device,
                                     sort_within_batch=False)

    for batch in batch_iter:
        # [batch, seq_len]
        q1_word, q2_word, q1_char, q2_char = batch.q1_word, batch.q2_word, batch.q1_char, batch.q2_char

        q1_word_embed = word_embedder(q1_word)  # [batch, seq_len, 300]
        q2_word_embed = word_embedder(q2_word)
        q1_char_embed = char_embedder(q1_char)  # [batch, seq_len, 300]
        q2_char_embed = char_embedder(q2_char)

        batch = q1_word_embed.size(0)

        q1_word_flat = q1_word.view(-1)  # [batch * seq_len]
        q2_word_flat = q2_word.view(-1)
        q1_char_flat = q1_char.view(-1)
        q2_char_flat = q2_char.view(-1)

        q1_word_idfs = word_idf.index_select(0, index=q1_word_flat).view(
            batch, -1)  # [batch, seq_len]
        q2_word_idfs = word_idf.index_select(0, index=q2_word_flat).view(
            batch, -1)
        q1_char_idfs = char_idf.index_select(0, index=q1_char_flat).view(
            batch, -1)
        q2_char_idfs = char_idf.index_select(0, index=q2_char_flat).view(
            batch, -1)

        # q1_word_idfs = F.softmax(q1_word_idfs, dim=1).unsqueeze(-1)  # [batch, seq_len, 1]
        # q2_word_idfs = F.softmax(q2_word_idfs, dim=1).unsqueeze(-1)
        # q1_char_idfs = F.softmax(q1_char_idfs, dim=1).unsqueeze(-1)
        # q2_char_idfs = F.softmax(q2_char_idfs, dim=1).unsqueeze(-1)

        q1_word_idfs = (q1_word_idfs /
                        q1_word_idfs.sum(dim=1, keepdim=True)).unsqueeze(
                            -1)  # [batch, seq_len, 1]
        q2_word_idfs = (q2_word_idfs /
                        q2_word_idfs.sum(dim=1, keepdim=True)).unsqueeze(-1)
        q1_char_idfs = (q1_char_idfs /
                        q1_char_idfs.sum(dim=1, keepdim=True)).unsqueeze(-1)
        q2_char_idfs = (q2_char_idfs /
                        q2_char_idfs.sum(dim=1, keepdim=True)).unsqueeze(-1)
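        # The bmm below computes an idf-weighted average of the token embeddings, one vector per question.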

        q1_word_repre = torch.bmm(q1_word_embed.transpose(1, 2),
                                  q1_word_idfs).squeeze()  # [batch, 300]
        q2_word_repre = torch.bmm(q2_word_embed.transpose(1, 2),
                                  q2_word_idfs).squeeze()
        q1_char_repre = torch.bmm(q1_char_embed.transpose(1, 2),
                                  q1_char_idfs).squeeze()
        q2_char_repre = torch.bmm(q2_char_embed.transpose(1, 2),
                                  q2_char_idfs).squeeze()

        word_cos_sim = F.cosine_similarity(q1_word_repre, q2_word_repre, dim=1)
        char_cos_sim = F.cosine_similarity(q1_char_repre, q2_char_repre, dim=1)
        word_l1_norm = torch.norm(q1_word_repre - q2_word_repre, p=1, dim=-1)
        char_l1_norm = torch.norm(q1_char_repre - q2_char_repre, p=1, dim=-1)
        word_l2_norm = torch.norm(q1_word_repre - q2_word_repre, p=2, dim=-1)
        char_l2_norm = torch.norm(q1_char_repre - q2_char_repre, p=2, dim=-1)

        word_similarity.append(word_cos_sim)
        char_similarity.append(char_cos_sim)
        word_l1.append(word_l1_norm)
        char_l1.append(char_l1_norm)
        word_l2.append(word_l2_norm)
        char_l2.append(char_l2_norm)

    word_similarity = torch.cat(word_similarity).cpu().numpy()
    char_similarity = torch.cat(char_similarity).cpu().numpy()
    word_l1 = torch.cat(word_l1).cpu().numpy()
    char_l1 = torch.cat(char_l1).cpu().numpy()
    word_l2 = torch.cat(word_l2).cpu().numpy()
    char_l2 = torch.cat(char_l2).cpu().numpy()

    rt = pd.DataFrame({
        'word_wmd': word_wmd,
        'word_wv_cos_sim': word_similarity,
        'word_wv_l1': word_l1,
        'word_wv_l2': word_l2,
        'char_wmd': char_wmd,
        'char_wv_cos_sim': char_similarity,
        'char_wv_l1': char_l1,
        'char_wv_l2': char_l2
    })
    return rt
Example #7
    def __init__(self, args):
        path = '.data/squad'
        dataset_path = path + '/torchtext/'
        train_examples_path = dataset_path + 'train_examples.pt'
        dev_examples_path = dataset_path + 'dev_examples.pt'

        print("preprocessing data files...")
        if not os.path.exists(f'{path}/{args.train_file}l'):
            self.preprocess_file(f'{path}/{args.train_file}')
        if not os.path.exists(f'{path}/{args.dev_file}l'):
            self.preprocess_file(f'{path}/{args.dev_file}')

        self.RAW = data.RawField()
        
        self.CHAR_NESTING = data.Field(batch_first=True, tokenize=list, lower=True)
        self.CHAR = data.NestedField(self.CHAR_NESTING, tokenize=word_tokenize)
        self.WORD = data.Field(batch_first=True, tokenize=word_tokenize, lower=True, include_lengths=True)
        self.LABEL = data.Field(sequential=False, unk_token=None, use_vocab=False)

        dict_fields = {'id': ('id', self.RAW),
                       's_idx': ('s_idx', self.LABEL),
                       'e_idx': ('e_idx', self.LABEL),
                       'context': [('c_word', self.WORD), ('c_char', self.CHAR)],
                       'question': [('q_word', self.WORD), ('q_char', self.CHAR)]}

        list_fields = [('id', self.RAW), ('s_idx', self.LABEL), ('e_idx', self.LABEL),
                       ('c_word', self.WORD), ('c_char', self.CHAR),
                       ('q_word', self.WORD), ('q_char', self.CHAR)]

        if os.path.exists(dataset_path):
            print("loading splits...")
            train_examples = torch.load(train_examples_path)
            dev_examples = torch.load(dev_examples_path)

            self.train = data.Dataset(examples=train_examples, fields=list_fields)
            self.dev = data.Dataset(examples=dev_examples, fields=list_fields)
        else:
            print("building splits...")
            self.train, self.dev = data.TabularDataset.splits(
                path=path,
                train=f'{args.train_file}l',
                validation=f'{args.dev_file}l',
                format='json',
                fields=dict_fields)

            os.makedirs(dataset_path)
            torch.save(self.train.examples, train_examples_path)
            torch.save(self.dev.examples, dev_examples_path)

        #cut overly long contexts from the training set for efficiency.
        if args.context_threshold > 0:
            self.train.examples = [e for e in self.train.examples if len(e.c_word) <= args.context_threshold]

        print("building vocab...")
        self.CHAR.build_vocab(self.train, self.dev)
        self.WORD.build_vocab(self.train, self.dev, vectors=FastText(language='ja'))

        print("building iterators...")
        self.train_iter, self.dev_iter = \
            data.BucketIterator.splits((self.train, self.dev),
                                       batch_sizes=[args.train_batch_size, args.dev_batch_size],
                                       device=args.gpu,
                                       sort_key=lambda x: len(x.c_word))
Example #8
    def __init__(self,
                 path,
                 train_file,
                 dev_file,
                 vocab_max_size,
                 train_samples,
                 dev_samples,
                 train_batch_size,
                 dev_batch_size,
                 word_dim=100,
                 glove_tokens='840B'):
        #path = '..data/squad'
        dataset_path = path + '/torchtext/'
        train_examples_path = dataset_path + 'train_examples.pt'
        dev_examples_path = dataset_path + 'dev_examples.pt'

        self.train_file = train_file
        self.dev_file = dev_file
        self.context_threshold = 400
        self.word_dim = word_dim
        self.gpu = 0
        self.train_batch_size = train_batch_size
        self.dev_batch_size = dev_batch_size
        self.train_samples = train_samples
        self.dev_samples = dev_samples
        self.glove_tokens = glove_tokens

        print("preprocessing data files...")
        if not os.path.exists(path + '/' + self.train_file + 'l'):
            self.preprocess_file(path + '/' + self.train_file,
                                 self.train_samples)
        if not os.path.exists(path + '/' + self.dev_file + 'l'):
            self.preprocess_file(path + '/' + self.dev_file, self.dev_samples)

        self.RAW = data.RawField()
        # explicit declaration for torchtext compatibility
        self.RAW.is_target = False
        self.CHAR_NESTING = data.Field(batch_first=True,
                                       tokenize=list,
                                       lower=True)
        self.CHAR = data.NestedField(self.CHAR_NESTING, tokenize=word_tokenize)
        self.WORD = data.Field(batch_first=True,
                               tokenize=word_tokenize,
                               lower=True,
                               include_lengths=True)
        self.LABEL = data.Field(sequential=False,
                                unk_token=None,
                                use_vocab=False)

        dict_fields = {
            'id': ('id', self.RAW),
            's_idx': ('s_idx', self.LABEL),
            'e_idx': ('e_idx', self.LABEL),
            'context': [('c_word', self.WORD), ('c_char', self.CHAR)],
            'question': [('q_word', self.WORD), ('q_char', self.CHAR)]
        }

        list_fields = [('id', self.RAW), ('s_idx', self.LABEL),
                       ('e_idx', self.LABEL), ('c_word', self.WORD),
                       ('c_char', self.CHAR), ('q_word', self.WORD),
                       ('q_char', self.CHAR)]

        if os.path.exists(dataset_path):
            print("loading splits...")
            train_examples = torch.load(train_examples_path)
            dev_examples = torch.load(dev_examples_path)

            self.train = data.Dataset(examples=train_examples,
                                      fields=list_fields)
            self.dev = data.Dataset(examples=dev_examples, fields=list_fields)
        else:
            print("building splits...")
            self.train, self.dev = data.TabularDataset.splits(
                path=path,
                train=f'{self.train_file}l',
                validation=f'{self.dev_file}l',
                format='json',
                fields=dict_fields)

            os.makedirs(dataset_path)
            torch.save(self.train.examples, train_examples_path)
            torch.save(self.dev.examples, dev_examples_path)

        #cut overly long contexts from the training set for efficiency.
        if self.context_threshold > 0:
            self.train.examples = [
                e for e in self.train.examples
                if len(e.c_word) <= self.context_threshold
            ]

        print("building vocab...")
        self.CHAR.build_vocab(self.train, self.dev)
        self.WORD.build_vocab(self.train,
                              self.dev,
                              max_size=vocab_max_size,
                              vectors=GloVe(name=self.glove_tokens,
                                            dim=self.word_dim))

        print("building iterators...")
        device = torch.device(
            f"cuda:{self.gpu}" if torch.cuda.is_available() else "cpu")
        self.train_iter, self.dev_iter = \
            data.BucketIterator.splits((self.train, self.dev),
                                       batch_sizes=[self.train_batch_size, self.dev_batch_size],
                                       device=device,
                                       sort_key=lambda x: len(x.c_word))
Example #9
f_text = data.Field(sequential=True, use_vocab=True)
f_pos_tag = data.Field(sequential=True,
                       use_vocab=False,
                       pad_token=1,
                       unk_token=0)
f_lemma = data.Field(sequential=True, use_vocab=True)
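# tensor_type is the pre-0.3 torchtext keyword; later releases use dtype (e.g. dtype=torch.float) instead.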
f_label = data.LabelField(tensor_type=torch.FloatTensor)
fields = [('text', f_text), ('pos', f_pos_tag), ('lemma', f_lemma),
          ('label', f_label)]

pipe = TwitterPipeline()

full_examples = pipe.process_data(IN_FILE, fields)[0]
# test_examples = pipe.process_data(
#    IN_FILE_TEST, fields)[0]
full_ds = data.Dataset(full_examples, fields)
# tst_ds = data.Dataset(test_examples, fields)

# do the splitting for trn/val with torchtext
trn_ds, val_ds, tst_ds = full_ds.split(split_ratio=[0.8, 0.1, 0.1],
                                       stratified=True,
                                       random_state=random.seed(SEED))

print(f'train len {len(trn_ds.examples)}')
print(f'val len {len(val_ds.examples)}')
print(f'test len {len(tst_ds.examples)}')

vec = torchtext.vocab.Vectors('embed_tweets_de_100D_fasttext',
                              cache='/Users/michel/Downloads/')
# validation + test data should by no means influence the model, so build the vocab just on trn
f_text.build_vocab(trn_ds, vectors=vec)
Example #10
    def load_data_cls_wo_valid(self, train_file, test_file=None, val_file=None):
        '''
        Loads the data from files.
        Sets up iterators for training and validation (the test file also serves
        as the validation set; val_file is accepted but currently unused).
        Also builds the vocabulary from the combined train and test data.

        Inputs:
            train_file (String): path to training file
            test_file (String): path to test file
            val_file (String): path to validation file (unused)
        '''

        NLP = spacy.load('en_core_web_sm')
        tokenizer = lambda sent: [x.text for x in NLP.tokenizer(sent) if x.text != " "]

        # Creating Field for data
        # TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True, fix_length=self.config.max_sen_len)
        TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True)
        LABEL = data.Field(sequential=False, use_vocab=False)
        datafields = [("text", TEXT), ("label", LABEL)]

        # Load data from pd.DataFrame into torchtext.data.Dataset
        train_df = self.get_pandas_df(train_file)

        # train_examples = [data.Example.fromlist(i, datafields) for i in train_df.values.tolist()]
        train_test_examples = []
        train_examples = []
        for i in train_df.values.tolist():
            label = i[1]
            text = i[0]
            text = text.split(' ')
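            # The last five tokens carry the category; everything before them is the text body.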
            category = text[-5::]
            text_str = text[0:-5]
            text_str = (' ').join(text_str)
            text_str = clean_str(text_str)
            text_str = text_str.split(' ')
            text = ['<cls>'] + text_str + ['<sep>'] + category
            text = (' ').join(text)
            example = data.Example.fromlist([text, label], datafields)
            train_examples.append(example)
            train_test_examples.append(example)

        train_data = data.Dataset(train_examples, datafields)

        test_df = self.get_pandas_df(test_file)
        # test_examples = [data.Example.fromlist(i, datafields) for i in test_df.values.tolist()]
        test_examples = []
        for i in test_df.values.tolist():
            label = i[1]
            text = i[0]
            text = text.split(' ')
            category = text[-5::]
            text_str = text[0:-5]
            text_str = (' ').join(text_str)
            text_str = clean_str(text_str)
            text_str = text_str.split(' ')
            text = ['<cls>'] + text_str + ['<sep>'] + category
            text = (' ').join(text)
            example = data.Example.fromlist([text, label], datafields)
            test_examples.append(example)
            train_test_examples.append(example)

        val_data = data.Dataset(test_examples, datafields)

        train_test_data = data.Dataset(train_test_examples, datafields)

        # # If validation file exists, load it. Otherwise get validation data from training data
        # if val_file:
        #     val_df = self.get_pandas_df(val_file)
        #     # val_examples = [data.Example.fromlist(i, datafields) for i in val_df.values.tolist()]
        #     val_examples = []
        #     for i in val_df.values.tolist():
        #         label = i[1]
        #         text = i[0]
        #         text = text.split(' ')
        #         text = ['<cls>'] + text[0:-5] + ['<sep>'] + text[-5::]
        #         text = (' ').join(text)
        #         example = data.Example.fromlist([text, label],datafields)
        #         val_examples.append(example)
        #     val_data = data.Dataset(val_examples, datafields)
        # else:
        #     train_data, val_data = train_data.split(split_ratio=0.8)

        TEXT.build_vocab(train_test_data)
        self.vocab = TEXT.vocab
        # print('toprecreationclimbing' in TEXT.vocab.itos)

        self.train_iterator = data.BucketIterator(
            (train_data),
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=True)

        self.val_iterator = data.BucketIterator(
            (val_data),
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=False)

        # self.val_iterator, self.test_iterator = data.BucketIterator.splits(
        #     (val_data, test_data),
        #     batch_size=self.config.batch_size,
        #     sort_key=lambda x: len(x.text),
        #     repeat=False,
        #     shuffle=False)

        print("Loaded {} training examples".format(len(train_data)))
        print("Loaded {} validation examples".format(len(val_data)))
Example #11
    def load_data(self, w2v_file, train_file, test_file, val_file=None):
        '''
        Loads the data from files.
        Sets up iterators for training, validation and test data.
        Also creates the vocabulary and word embeddings based on the data.
        
        Inputs:
            w2v_file (String): absolute path to file containing word embeddings (GloVe/Word2Vec)
            train_file (String): absolute path to training file
            test_file (String): absolute path to test file
            val_file (String): absolute path to validation file
        '''

        NLP = spacy.load('en_core_web_sm')
        tokenizer = lambda sent: [x.text for x in NLP.tokenizer(sent) if x.text != " "]

        # Creating Field for data
        TEXT = data.Field(sequential=True, tokenize=tokenizer,
                          lower=True)  #, fix_length=self.config.max_sen_len)
        LABEL = data.LabelField(dtype=torch.float)  # Field dtype expects a torch dtype
        datafields = [("text", TEXT), ("label", LABEL)]

        # Load data from pd.DataFrame into torchtext.data.Dataset
        train_df = self.get_pandas_df(train_file)
        train_examples = [
            data.Example.fromlist(i, datafields)
            for i in train_df.values.tolist()
        ]
        train_data = data.Dataset(train_examples, datafields)

        test_df = self.get_pandas_df(test_file)
        test_examples = [
            data.Example.fromlist(i, datafields)
            for i in test_df.values.tolist()
        ]
        test_data = data.Dataset(test_examples, datafields)

        # If validation file exists, load it. Otherwise get validation data from training data
        if val_file:
            val_df = self.get_pandas_df(val_file)
            val_examples = [
                data.Example.fromlist(i, datafields)
                for i in val_df.values.tolist()
            ]
            val_data = data.Dataset(val_examples, datafields)
        else:
            train_data, val_data = train_data.split(split_ratio=0.8)

        TEXT.build_vocab(train_data, vectors=Vectors(w2v_file))
        self.word_embeddings = TEXT.vocab.vectors
        self.vocab = TEXT.vocab

        self.train_iterator = data.BucketIterator(
            (train_data),
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=True)

        self.val_iterator, self.test_iterator = data.BucketIterator.splits(
            (val_data, test_data),
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=False)

        print("Loaded {} training examples".format(len(train_data)))
        print("Loaded {} test examples".format(len(test_data)))
        print("Loaded {} validation examples".format(len(val_data)))
Example #12
def main():

    args = handleInputs()
    setRNGSeed(args.rng_seed)
    use_pyro = args.model_type != 'nmt'

    if args.combine_results is not None:
        if os.path.isdir('./.results'):
            combineResults(args.combine_results)
            return
        else:
            raise ValueError(
                ".results/ does not exist, assumed no experiments previously ran"
            )
    #create directory to store experiments
    if not os.path.isdir('./.results'):
        os.mkdir('./.results')

    #create directory for dataset source to target language pair
    exp_dir = './.results/{}_{}-{}/'.format(args.dataset, args.source,
                                            args.target)
    if not os.path.isdir(exp_dir):
        try:
            os.mkdir(exp_dir)
        except FileExistsError as e:
            logging.warning(
                "You might be trying to create {} twice (are you running several runs?)"
                .format(exp_dir))

    if use_pyro:
        args_name = 'kl-anneal_{}_{}_latents_{}_particles_{}_attn_{}/'.format(
            args.kl_anneal, args.to_anneal, args.z_dim, args.num_particles,
            args.use_attention)
        if args.use_flows:
            args_name = '{}_{}_'.format(args.flow_type,
                                        args.num_flows) + args_name

        exp_dir = exp_dir + '{}_'.format(args.model_type) + args_name
    else:
        exp_dir = exp_dir + 'RNNSearch/'

    #flag on whether this is an experiment continuation or not
    if args.opt == 'test' or args.opt == 'validate':
        #if we are testing or validating, it is assumed the experiment was run first
        args.load_latest_epoch = True
        args.load_epoch = 1

    args.load_latest_epoch = args.load_epoch >= 0 and args.load_latest_epoch
    cont_exp = args.load_epoch >= 0 or args.load_latest_epoch
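    # cont_exp: this run resumes an earlier experiment (a specific epoch or the latest checkpoint was requested)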

    if not os.path.isdir(exp_dir):
        os.mkdir(exp_dir)
    else:
        #there's a logic gate for this...but can't remember what it is
        if not cont_exp:
            if not args.debug:
                raise ValueError(
                    "{} already exists; if you changed other parameters, please rename the existing directory"
                    .format(exp_dir))
    #keep track of all parameters used
    log_file = exp_dir + 'experiment.log'
    init_logger(log_file, cont_exp)
    if cont_exp:
        logging.info(
            "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
        )
        logging.info(
            "load_epoch ({}) set. Loading exp config (seems silly otherwise)".
            format(args.load_epoch))
        try:
            #to_pop is set to things we may want to actually update on the experiment.
            to_pop = [
                "load_epoch", "epochs", "print_every", "decode_alg", "k",
                "length_norm", "load_latest_epoch", "opt", "bleu_score"
            ]
            args = load_args(args, exp_dir, to_pop=to_pop)
        except FileNotFoundError as e:
            logging.error(
                "could not load previous arguments, are you sure you used the same parameters as the experiment?"
            )
            logging.error(
                "Starting experiment over and setting load_epoch = -1")
            args.load_epoch = -1
            args.load_latest_epoch = False
            cont_exp = False

    #whether or not we loaded arguments, presumably should also make sure things are the same
    write_args(args, exp_dir)

    optimization_dict = get_optimization_dict(args)

    if args.use_bpe:
        logging.info("Using BPE models : {} -> {}".format(
            args.src_bpe, args.trg_bpe))
        tokenize_src, tokenize_trg = getBPE(args.src_bpe, args.trg_bpe)
    else:
        logging.info("Using Tokenizer: {} -> {}".format(
            args.source, args.target))
        tokenize_src, tokenize_trg = getTokenizer(args.source, args.target,
                                                  args.on_whitespace)

    # we include lengths to provide to the RNNs

    data_save_path = './.data/{}_data_{}_to_{}.pth'.format(
        args.dataset, args.source, args.target)

    datahandler = DataHandler(tokenize_src, tokenize_trg, LOWER, EOS_TOKEN,
                              SOS_TOKEN, PAD_TOKEN, UNK_TOKEN, args.min_freq,
                              DEVICE)
    fields = [('src', datahandler.getSRCField()),
              ('trg', datahandler.getTRGField())]
    try:
        #TODO...figure out how to make this work if possible since...loading is expensive
        f = torch.load(data_save_path, pickle_module=dill)
        logging.info(
            'found previous saved train and valid data, delete if undesired')
        datahandler.load_vocabs(f['src_vocab'], f['trg_vocab'])
        train_data = data.Dataset(f['train_examples'],
                                  fields=fields,
                                  filter_pred=None)
        valid_data = data.Dataset(f['valid_examples'],
                                  fields=fields,
                                  filter_pred=None)
        test_data = data.Dataset(f['test_examples'],
                                 fields=fields,
                                 filter_pred=None)
    except FileNotFoundError as e:
        logging.warning('could not find previous saved file, building new one')
        if args.dataset == 'tabular':
            logging.info("Using Tabular file, assumes no header in files")
            max_len = args.max_len
            train_data, valid_data, test_data = data.TabularDataset.splits(
                path='./.data/',
                format='tsv',
                train='train-{}-{}.tsv'.format(args.source, args.target),
                validation='dev-{}-{}.tsv'.format(args.source, args.target),
                test='test-{}-{}.tsv'.format(args.source, args.target),
                skip_header=False,
                fields=fields,
                filter_pred=lambda x: filter_fn(x, max_len))
        elif args.dataset == 'IWSLT':
            logging.warning(
                "You need to create val.de-en.* and test.de-en.* by merging files before"
            )
            train_data, valid_data, test_data = datasets.IWSLT.splits(
                exts=('.' + args.source, '.' + args.target),
                fields=(datahandler.getSRCField(), datahandler.getTRGField()),
                filter_pred=lambda x: filter_fn(x, args.max_len),
                validation='val',
                test='test')
        elif args.dataset == 'WMT14':
            train_data, valid_data, test_data = datasets.WMT14.splits(
                exts=('.' + args.source, '.' + args.target),
                fields=(datahandler.getSRCField(), datahandler.getTRGField()))

        datahandler.build_vocabs(train_data, args.custom_vocab_src,
                                 args.custom_vocab_trg)
        to_save = {
            'train_examples': train_data.examples,
            'valid_examples': valid_data.examples,
            'test_examples': test_data.examples,
            'src_vocab': datahandler.getSRCVocab(),
            'trg_vocab': datahandler.getTRGVocab()
        }
        torch.save(to_save, data_save_path, pickle_module=dill)

    logging.info('Vocab Sizes: {} (SRC) {} (TRG)'.format(
        len(datahandler.getSRCVocab()), len(datahandler.getTRGVocab())))
    logging.info('Train dataset Size: {}, Validation dataset Size: {}'.format(
        len(train_data), len(valid_data)))
    train_iter = datahandler.getBucketIter(train_data,
                                           batch_size=args.batch_size,
                                           train=True,
                                           sort_within_batch=True,
                                           sort_key=lambda x:
                                           (len(x.src), len(x.trg)),
                                           repeat=False)

    valid_iter = datahandler.getIter(valid_data,
                                     batch_size=1,
                                     train=False,
                                     sort=False,
                                     repeat=False)
    test_iter = datahandler.getIter(test_data,
                                    batch_size=1,
                                    train=False,
                                    sort=False,
                                    repeat=False)

    if args.use_bpe:
        trg_bpe = spm.SentencePieceProcessor()
        trg_bpe.Load(args.trg_bpe)
        src_bpe = spm.SentencePieceProcessor()
        src_bpe.Load(args.src_bpe)
    else:
        trg_bpe = None
        src_bpe = None

    if args.bleu_score == 'raw':
        bleu_func = rawBLEU
    elif args.bleu_score == 'multi':
        bleu_func = get_moses_multi_bleu

    #this is where the magic starts (hopefully)
    modelfactory = ModelFactory(len(datahandler.getSRCVocab()),
                                len(datahandler.getTRGVocab()),
                                emb_size=args.emb_size,
                                hidden_size=args.hidden_size,
                                num_layers=args.num_layers,
                                dropout=args.dropout,
                                z_layer=args.z_dim,
                                pool_size=args.max_out_dim,
                                use_projection=args.use_projection)

    model = modelfactory.getModel(args.model_type,
                                  use_attention=args.use_attention)

    cond_flow_scale = 2
    if args.use_flows and args.model_type != 'nmt':
        if args.flow_type == 'planar':
            model.loadPlanarFlows(args.num_flows, z_dim=args.z_dim)
        elif args.flow_type == 'iaf':
            model.loadIAFs(args.num_flows, z_dim=args.z_dim)
        elif args.flow_type == 'cond-planar':
            model.loadConditionalPlanarFlows(args.num_flows,
                                             args.hidden_size *
                                             cond_flow_scale,
                                             z_dim=args.z_dim)
        elif args.flow_type == 'cond-planar-v2':
            model.loadConditionalPlanarFlows_v2(args.num_flows,
                                                args.hidden_size *
                                                cond_flow_scale,
                                                z_dim=args.z_dim)
        elif args.flow_type == 'cond-iaf':
            model.loadConditionalIAFFlows(args.num_flows,
                                          args.hidden_size * cond_flow_scale,
                                          z_dim=args.z_dim)

    if not cont_exp:
        logging.info(
            "Initializing Model parameters randomly with {} scheme".format(
                args.init_type))
        model.initParameters(args.init_type)

    if not cont_exp:
        logging.info(model)
    if USE_CUDA:
        model = model.cuda()

    #some internal hacky stuff to let me do hacky things....
    model.setTrainDataSize(len(train_data))
    model.setUnkTokenIndex(datahandler.getTRGVocab().stoi[UNK_TOKEN])
    model.setSOSTokenIndex(
        datahandler.getSRCVocab().stoi[SOS_TOKEN])  #for gnmt
    model.setPadIndex(datahandler.getSRCVocab().stoi[PAD_TOKEN])
    model.setWordDropout(args.word_dropout)
    model.setUseMeanField("Mean" in args.elbo_type)
    model.setToAnneal(args.to_anneal)
    if 'q' not in args.to_anneal and "Mean" in args.elbo_type and args.kl_anneal > 1.0:
        msg = "You are not annealing the variational distribution even though you request to anneal and are using mean field...which would use analytic form and needs to anneal q"
        logging.warning(msg)
        print(msg)

    if args.model_pth is not None:
        #model.load('./model_final.pth')
        model.load(args.model_pth)

    train_translator = Translator(
        valid_data,
        valid_iter,
        model,
        max_len=args.max_len,
        sos_index=datahandler.getTRGVocab().stoi[SOS_TOKEN],
        eos_index=datahandler.getTRGVocab().stoi[EOS_TOKEN],
        pad_index=datahandler.getPadIndex(),
        use_cuda=USE_CUDA)

    trainer = Trainer(model,
                      train_iter,
                      valid_iter,
                      use_pyro,
                      datahandler.getPadIndex(),
                      train_translator,
                      bleu_func,
                      datahandler.getTRGVocab(),
                      bpe_model=trg_bpe,
                      use_cuda=USE_CUDA,
                      savedir=exp_dir,
                      optim_dict=optimization_dict,
                      kl_anneal=args.kl_anneal,
                      use_aux_loss=args.use_aux_loss,
                      load_epoch=args.load_epoch,
                      use_latest_epoch=args.load_latest_epoch)

    if args.opt == 'all' or args.opt == 'train':
        dev_perplexities = trainer.train(num_epochs=args.epochs,
                                         print_every=args.print_every)
        torch.save(dev_perplexities, exp_dir + 'perplexities.pth')
    elif args.model_pth is None:
        # get best performing model
        logging.info("No model path provided, using best model for evaluation")
        dev_perplexities = trainer.initDevPerplexities()
        #if dev perplexities are not in the order they were trained, this will not work
        best = {'i': -1, 'val_bleu': 0.0}
        for i, p in enumerate(dev_perplexities):
            cur_bleu = p['val_bleu']
            if cur_bleu > best['val_bleu']:
                best['i'] = i
                best['val_bleu'] = cur_bleu
        args.model_pth = trainer.getCheckpointPth(best['i'])
        try:
            check_pt = torch.load(args.model_pth)
            model.load(check_pt['model'])
            #with mutation...this is probably not necessary, but just in case....
            trainer.setModel(model)
        except Exception as e:
            logging.warning(
                "Failed to load a model...you do know you requested evaluation, right?"
            )
    else:
        model.load(args.model_pth)

    val_or_test = args.opt == 'all' or args.opt == 'validate' or args.opt == 'test' or args.opt == 'test_lengths'

    if val_or_test:
        if args.opt == 'test' or args.opt == 'test_lengths':
            dataset = test_data
            data_iter = test_iter
        else:
            dataset = valid_data
            data_iter = valid_iter
        scores = {}

        debug = True
        if val_or_test and use_pyro and debug:
            #Test utility of latent variable
            #Another way to see how useful z is: zero it out at translation time so that it gets no weight
            #This sort of test only makes sense if z is concatenated as input at each step of decoding
            model.setUseLatent(False)
            translator = Translator(
                dataset,
                data_iter,
                model,
                max_len=args.max_len,
                sos_index=datahandler.getTRGVocab().stoi[SOS_TOKEN],
                eos_index=datahandler.getTRGVocab().stoi[EOS_TOKEN],
                pad_index=datahandler.getPadIndex(),
                use_cuda=USE_CUDA,
                k=args.k,
                length_norm=args.length_norm)

            no_latent_bleu, hypotheses, references = translator.FullEvalTranslate(
                datahandler.getTRGVocab(),
                bleu_func,
                decodefn=args.decode_alg,
                bpe_model=trg_bpe)

            #store information
            no_latent_name = exp_dir + 'no-latent-{}.tsv'.format(args.opt)
            write_translations(no_latent_name, hypotheses, references)
            scores['{}-no_latent'.format(args.opt)] = no_latent_bleu
            # subtle: re-enable the latent variable, since it is needed after this test
            model.setUseLatent(True)

        # TODO: since the model is mutated in place, constructing another Translator is probably unnecessary.
        # Do this after the no-latent test so this Translator can also be reused below for testing lengths.
        if debug:
            translator = Translator(
                dataset,
                data_iter,
                model,
                max_len=args.max_len,
                sos_index=datahandler.getTRGVocab().stoi[SOS_TOKEN],
                eos_index=datahandler.getTRGVocab().stoi[EOS_TOKEN],
                pad_index=datahandler.getPadIndex(),
                use_cuda=USE_CUDA,
                k=args.k,
                length_norm=args.length_norm)

            bleu, hypotheses, references = translator.FullEvalTranslate(
                datahandler.getTRGVocab(),
                bleu_func,
                decodefn=args.decode_alg,
                bpe_model=trg_bpe)
            logging.info("{} BLEU score: {} which was ran using {} opt".format(
                args.bleu_score, bleu, args.opt))
            scores[args.opt] = bleu
            translation_name = exp_dir + '{}.tsv'.format(args.opt)
            write_translations(translation_name, hypotheses, references)

        joint_modeling = isinstance(model,
                                    GenerativeEncoderDecoder) or isinstance(
                                        model, VanillaJointEncoderDecoder)

        if joint_modeling and debug:
            model.setDecodeTarget(False)
            lm_translator = Translator(
                dataset,
                data_iter,
                model,
                max_len=args.max_len,
                sos_index=datahandler.getSRCVocab().stoi[SOS_TOKEN],
                eos_index=datahandler.getSRCVocab().stoi[EOS_TOKEN],
                pad_index=datahandler.getPadIndex(),
                use_cuda=USE_CUDA,
                k=args.k,
                length_norm=args.length_norm,
                do_lang_model=True)
            # Use greedy decoding only for the language model; with these parameters, performance is not expected to be impressive.
            bleu, hypotheses, references = lm_translator.FullEvalTranslate(
                datahandler.getSRCVocab(),
                bleu_func,
                decodefn='greedy',
                bpe_model=src_bpe)
            scores["lm-{}".format(args.opt)] = bleu
            translation_name = exp_dir + 'lm-{}.tsv'.format(args.opt)
            write_translations(translation_name, hypotheses, references)

        #collect validation "perplexity" for models, mostly for the ELBO
        if joint_modeling and debug:

            def get_lm_toks():
                return trainer.model.getSRCTokCount()

            eval_perplexity = trainer.run_lvnmt_eval(
                trainer.rebatch_iter(data_iter),
                custom_tok_count=get_lm_toks,
                count_both=True)
            #calculate perplexity of language model
            model.setTrainMT(False)
            model.setTrainLM(True)

            lm_eval_perplexity = trainer.run_lvnmt_eval(
                trainer.rebatch_iter(data_iter), custom_tok_count=get_lm_toks)
            torch.save(lm_eval_perplexity,
                       exp_dir + '{}-lm_perplexity.pth'.format(args.opt))
        else:
            eval_perplexity = trainer.run_lvnmt_eval(
                trainer.rebatch_iter(data_iter))

        torch.save(eval_perplexity,
                   exp_dir + '{}-eval_perplexity.pth'.format(args.opt))

        flow_samples = generate_flow_samples(trainer.model,
                                             trainer.rebatch_iter(data_iter),
                                             datahandler.getSRCVocab(),
                                             datahandler.getTRGVocab(),
                                             src_bpe=src_bpe,
                                             trg_bpe=trg_bpe)
        torch.save(flow_samples,
                   exp_dir + '{}-latent_spaces.pth'.format(args.opt))

        try:
            with open(exp_dir + 'bleus-{}.json'.format(args.opt),
                      'r') as bleu_scores:
                prev_bleus = json.load(bleu_scores)
        except Exception as e:
            prev_bleus = {}

        with open(exp_dir + 'bleus-{}.json'.format(args.opt),
                  'w') as bleu_scores:
            prev_bleus[len(prev_bleus)] = scores
            json.dump(prev_bleus, bleu_scores)

        if args.opt == 'test_lengths':
            logging.info("Calculating BLEU score based on sentence lengths")
            BLEUS = {}
            for length in range(5, 70, 5):
                references_of_length = []
                hypotheses_of_length = []
                # TODO: inefficient; bucket the reference/hypothesis pairs by length in one pass (see the sketch after this example)
                for i in range(len(references)):
                    count = len(references[i].split())
                    if (length - 4) <= count <= length:
                        references_of_length.append(references[i])
                        hypotheses_of_length.append(hypotheses[i])
                bleu = [bleu_func(hypotheses_of_length, references_of_length)]
                BLEUS['length={}'.format(length)] = bleu
            save_name = exp_dir + args.model_pth.split(
                '/')[-1] + "_lengths.tsv"
            pd.DataFrame.from_dict(BLEUS).to_csv(save_name,
                                                 sep='\t',
                                                 index=False)

    if args.opt == 'tuning':
        BLEUS = {}
        BLEUS_list = []
        for i in range(0, args.epochs):
            load_pth = exp_dir + 'checkpoints/epoch_{}.pth'.format(i)
            model.load(load_pth)
            translator = Translator(
                valid_data,
                valid_iter,
                model,
                max_len=60,
                sos_index=datahandler.getTRGVocab().stoi[SOS_TOKEN],
                eos_index=datahandler.getTRGVocab().stoi[EOS_TOKEN],
                pad_index=datahandler.getPadIndex(),
                use_cuda=USE_CUDA)

            bleu, hypotheses, references = translator.FullEvalTranslate(
                datahandler.getTRGVocab(),
                bleu_func,
                decodefn='greedy',
                bpe_model=trg_bpe)
            BLEUS['epoch_{}'.format(i)] = [bleu]
            BLEUS_list.append(bleu)
            logging.info(load_pth)
            logging.info('{} BLEU score {}'.format(args.bleu_score, bleu))
        logging.info("Best model for {} was {} with {} BLEU: {}".format(
            exp_dir, np.argmax(BLEUS_list), args.bleu_score, max(BLEUS_list)))
        save_name = exp_dir + "BLEU_scores.tsv"
        pd.DataFrame.from_dict(BLEUS).to_csv(save_name, sep='\t', index=False)
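
A minimal sketch of the single-pass bucketing suggested by the TODO in the test_lengths branch above, assuming the same references, hypotheses and bleu_func objects; bleu_by_length and its width parameter are hypothetical names introduced here for illustration.

from collections import defaultdict


def bleu_by_length(hypotheses, references, bleu_func, width=5):
    # Assign each (hypothesis, reference) pair to a width-token bucket in one pass,
    # keyed by the bucket's upper bound (counts 1-5 -> bucket 5, 6-10 -> bucket 10, ...).
    buckets = defaultdict(lambda: ([], []))
    for hyp, ref in zip(hypotheses, references):
        count = len(ref.split())
        upper = ((count - 1) // width + 1) * width
        buckets[upper][0].append(hyp)
        buckets[upper][1].append(ref)
    # Score each bucket once, mirroring the BLEUS dict built above.
    return {'length={}'.format(upper): [bleu_func(hyps, refs)]
            for upper, (hyps, refs) in sorted(buckets.items())}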
Exemple #13
0
    def load_my_data(self, word_embedding_pkl, pairs_pkl):
        """
        Loads the data from file
        :param word_embedding_pkl: absolute path to word_embeddings {Glove/Word2Vec}
        :param pairs_pkl:       # pkl file save data
        :param context_flag:    # 0: bairly include pairs
                                # 1: include pairs and local context
                                # 2: include pairs and global context
                                # 3: include pairs, local context and global context
        :return:
        """
        tokenizer = lambda text: [x for x in text]

        TEXT = data.Field(sequential=True,
                          tokenize=tokenizer,
                          fix_length=self.config.max_sen_len)
        LABEL = data.Field(sequential=False, use_vocab=False)
        datafields = [("text", TEXT), ("label", LABEL)]

        # Load data from pd.DataFrame into torchtext.data.Dataset
        train_df, test_df, val_df = self.get_my_pandas_df(
            pairs_pkl, self.config.context_flag)

        train_examples = [
            data.Example.fromlist(i, datafields)
            for i in train_df.values.tolist()
        ]
        train_data = data.Dataset(train_examples, datafields)

        test_examples = [
            data.Example.fromlist(i, datafields)
            for i in test_df.values.tolist()
        ]
        test_data = data.Dataset(test_examples, datafields)

        val_examples = [
            data.Example.fromlist(i, datafields)
            for i in val_df.values.tolist()
        ]
        val_data = data.Dataset(val_examples, datafields)

        TEXT.build_vocab(train_data, vectors=Vectors(name=word_embedding_pkl))
        self.word_embeddings = TEXT.vocab.vectors
        self.vocab = TEXT.vocab

        self.train_iterator = data.BucketIterator(
            (train_data),
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=True)

        self.val_iterator, self.test_iterator = data.BucketIterator.splits(
            (val_data, test_data),
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=True)

        print('Loaded %d training examples' % len(train_data))
        print('Loaded %d test examples' % len(test_data))
        print('Loaded %d validation examples' % len(val_data))
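
A minimal, self-contained sketch of the same Example.fromlist -> Dataset -> BucketIterator pipeline used above, on inline toy rows instead of the pickled DataFrames (legacy torchtext API assumed, i.e. torchtext<=0.8 or torchtext.legacy); batch.text comes out as (fix_length, batch_size) because batch_first is left at its default.

from torchtext import data  # torchtext.legacy.data on newer releases

TEXT = data.Field(sequential=True, tokenize=lambda text: [x for x in text], fix_length=20)
LABEL = data.Field(sequential=False, use_vocab=False)
datafields = [("text", TEXT), ("label", LABEL)]

rows = [["some example pair", 1], ["another pair", 0]]          # toy stand-ins for DataFrame rows
examples = [data.Example.fromlist(row, datafields) for row in rows]
dataset = data.Dataset(examples, datafields)
TEXT.build_vocab(dataset)

iterator = data.BucketIterator(dataset, batch_size=2,
                               sort_key=lambda x: len(x.text),
                               repeat=False, shuffle=True)
for batch in iterator:
    print(batch.text.shape, batch.label.shape)   # (fix_length, batch_size) and (batch_size,)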
Exemple #14
0
def process_data(hparams, predict_sentences=None):
    train_fp, dev_fp, test_fp = hparams.train_fp, hparams.dev_fp, hparams.test_fp
    hparams.bos_token_id, hparams.eos_token_id = 101, 102

    do_lower_case = 'uncased' in hparams.model_str
    tokenizer = AutoTokenizer.from_pretrained(
        hparams.model_str,
        do_lower_case=do_lower_case,
        use_fast=True,
        data_dir='data/pretrained_cache',
        add_special_tokens=False,
        additional_special_tokens=['[unused1]', '[unused2]', '[unused3]'])

    nlp = spacy.load("en_core_web_sm")
    pad_index = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

    TEXT = data.Field(use_vocab=False, batch_first=True, pad_token=pad_index)
    WORD_STARTS = data.Field(use_vocab=False, batch_first=True, pad_token=0)
    POS = data.Field(use_vocab=False, batch_first=True, pad_token=0)
    POS_INDEX = data.Field(use_vocab=False, batch_first=True, pad_token=0)
    VERB = data.Field(use_vocab=False, batch_first=True, pad_token=0)
    VERB_INDEX = data.Field(use_vocab=False, batch_first=True, pad_token=0)
    META_DATA = data.Field(sequential=False)
    VERB_WORDS = data.Field(sequential=False)
    POS_WORDS = data.Field(sequential=False)
    LABELS = data.NestedField(data.Field(use_vocab=False,
                                         batch_first=True,
                                         pad_token=-100),
                              use_vocab=False)

    fields = {
        'text': ('text', TEXT),
        'labels': ('labels', LABELS),
        'word_starts': ('word_starts', WORD_STARTS),
        'meta_data': ('meta_data', META_DATA)
    }
    if 'predict' not in hparams.mode:
        fields['pos'] = ('pos', POS)
        fields['pos_index'] = ('pos_index', POS_INDEX)
        fields['verb'] = ('verb', VERB)
        fields['verb_index'] = ('verb_index', VERB_INDEX)

    if hparams.task == 'oie':
        label_dict = {
            'NONE': 0,
            'ARG1': 1,
            'REL': 2,
            'ARG2': 3,
            'LOC': 4,
            'TIME': 4,
            'TYPE': 5,
            'ARGS': 3
        }
    else:  # hparams.task == 'conj':
        label_dict = {
            'CP_START': 2,
            'CP': 1,
            'CC': 3,
            'SEP': 4,
            'OTHERS': 5,
            'NONE': 0
        }

    model_tag = hparams.model_str.replace("/", "_")
    cached_train_fp = f'{train_fp}.{model_tag}.pkl'
    cached_dev_fp = f'{dev_fp}.{model_tag}.pkl'
    cached_test_fp = f'{test_fp}.{model_tag}.pkl'

    all_sentences = []
    if 'predict' in hparams.mode:
        # no caching used in predict mode
        if predict_sentences is None:  # predict
            predict_path = hparams.inp if hparams.inp is not None else hparams.predict_fp
            with open(predict_path, 'r') as predict_f:
                predict_lines = predict_f.readlines()
            fullstops = []
            predict_sentences = []
            for line in predict_lines:
                # Normalize the quotes - similar to that in training data
                line = line.replace('’', '\'')
                line = line.replace('”', '\'\'')
                line = line.replace('“', '\'\'')

                tokenized_line = line.split()
                predict_sentences.append(' '.join(tokenized_line) +
                                         ' [unused1] [unused2] [unused3]')
                predict_sentences.append('\n')

        predict_examples, all_sentences = _process_data(
            predict_sentences, hparams, fields, tokenizer, label_dict, None)
        META_DATA.build_vocab(
            data.Dataset(predict_examples, fields=fields.values()))

        predict_dataset = [(len(ex.text), idx, ex, fields)
                           for idx, ex in enumerate(predict_examples)]
        train_dataset, dev_dataset, test_dataset = predict_dataset, predict_dataset, predict_dataset
    else:
        if not os.path.exists(cached_train_fp) or hparams.build_cache:
            train_examples, _ = _process_data(train_fp, hparams, fields,
                                              tokenizer, label_dict, nlp)
            pickle.dump(train_examples, open(cached_train_fp, 'wb'))
        else:
            train_examples = pickle.load(open(cached_train_fp, 'rb'))

        if not os.path.exists(cached_dev_fp) or hparams.build_cache:
            dev_examples, _ = _process_data(dev_fp, hparams, fields, tokenizer,
                                            label_dict, nlp)
            pickle.dump(dev_examples, open(cached_dev_fp, 'wb'))
        else:
            dev_examples = pickle.load(open(cached_dev_fp, 'rb'))

        if not os.path.exists(cached_test_fp) or hparams.build_cache:
            test_examples, _ = _process_data(test_fp, hparams, fields,
                                             tokenizer, label_dict, nlp)
            pickle.dump(test_examples, open(cached_test_fp, 'wb'))
        else:
            test_examples = pickle.load(open(cached_test_fp, 'rb'))

        META_DATA.build_vocab(
            data.Dataset(train_examples, fields=fields.values()),
            data.Dataset(dev_examples, fields=fields.values()),
            data.Dataset(test_examples, fields=fields.values()))

        train_dataset = [(len(ex.text), idx, ex, fields)
                         for idx, ex in enumerate(train_examples)]
        dev_dataset = [(len(ex.text), idx, ex, fields)
                       for idx, ex in enumerate(dev_examples)]
        test_dataset = [(len(ex.text), idx, ex, fields)
                        for idx, ex in enumerate(test_examples)]
        train_dataset.sort()  # to simulate bucket sort (along with pad_data)

    return train_dataset, dev_dataset, test_dataset, META_DATA.vocab, all_sentences
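
A minimal sketch of a padding collate for the (length, index, example, fields) tuples returned above; the pad_data referenced in the bucket-sort comment is not shown in this snippet, so the attribute access (ex.text as a list of token ids) and the pad_batch/DataLoader wiring below are assumptions for illustration.

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader


def pad_batch(batch, pad_token_id=0):
    # batch: list of (length, idx, example, fields) tuples as built in process_data
    lengths = torch.tensor([length for length, _, _, _ in batch])
    texts = [torch.tensor(ex.text, dtype=torch.long) for _, _, ex, _ in batch]
    padded = pad_sequence(texts, batch_first=True, padding_value=pad_token_id)
    return padded, lengths

# loader = DataLoader(train_dataset, batch_size=32, shuffle=False, collate_fn=pad_batch)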
    def load_data(self, dataset, more=False, examples=None, already_read=True):
        print("Preparing Data Loaders")
        self.sentence_field = data.Field(
            sequential=True,
            use_vocab=True,
            init_token='<BOS>',
            eos_token='<EOS>',
            #function to preprocess
            preprocessing=data.Pipeline(convert_token=preprocess),
            tensor_type=torch.LongTensor,
            lower=True,
            tokenize='spacy')
        fields = [('text', self.sentence_field)]

        if not already_read:

            datapath = None
            trainpath, validpath, testpath = None, None, None

            if dataset == 'wikitext':
                datapath = WIKI_PATH

                paths = [datapath + 'wiki.' + s + '.tokens' for s \
                                     in ['train', 'valid', 'test']]

                trainpath, validpath, testpath = paths[0], paths[1], paths[2]

            elif dataset == 'ptb':
                datapath = PTB_PATH
                paths = [
                    datapath + s + '.txt' for s in ['train', 'valid', 'test']
                ]

                trainpath, validpath, testpath = paths[0], paths[1], paths[2]

            elif dataset == 'gigaword':
                datapath = GIGA_PATH
                trainpath = datapath + 'thread5.txt'

            elif dataset == 'gigasmall':
                datapath = GIGA_PATH
                trainpath = datapath + 'gigaword_small_train.txt'
                validpath = datapath + 'gigaword_small_val.txt'
                testpath = datapath + 'gigaword_small_test.txt'

            elif dataset == 'reviews':
                trainpath = 'data/reviews/reviews.txt'

            print("Retrieving Train Data from file: {}...".format(trainpath))
            start = time()
            self.train_sentences = datasets.LanguageModelingDataset(trainpath,\
                    self.sentence_field, newline_eos = False)
            finish = time() - start
            print("Downloaded in {} minutes".format(finish / 60))
            print("Got Train Dataset with {n_tokens} words".format(n_tokens =\
                    len(self.train_sentences.examples[0].text)))

            if validpath is not None:

                print(
                    "Retrieving Valid Data from file: {}...".format(validpath))
                self.valid_sentences = datasets.LanguageModelingDataset(validpath,\
                        self.sentence_field, newline_eos = False)
            else:
                self.valid_sentences = None

            if testpath is not None:

                print("Retrieving Test Data from file: {}...".format(testpath))
                self.test_sentences = datasets.LanguageModelingDataset(testpath,\
                        self.sentence_field, newline_eos = False)
            else:
                self.test_sentences = None
        elif more:

            if examples is None:
                examples = []

            already_split = False
            if already_split:
                for i, fold in enumerate(dataset):
                    print('Reading fold:{}'.format(i))
                    pid = os.getpid()
                    py = psutil.Process(pid)
                    memoryUse = py.memory_info()[0] / 2.**30  # resident set size in GiB
                    print('memory use:', memoryUse)
                    examples.append(data.Example.fromlist([fold], fields))
                self.train_sentences = data.Dataset(examples, fields)
                self.sentence_field.build_vocab(self.train_sentences)
                return self.sentence_field.vocab.freqs
                '''
                print("EXAMPLES")
                print(len(examples))
                print([ex.text[:100] for ex in examples])
                print("EXAMPLES END")
                '''
            else:
                examples.append(data.Example.fromlist(
                    [dataset], fields))  #[i*fold_size:(i+1)*fold_size

            one = time()
            #print("READ EXAMPLES IN {}".format(one - start))
            self.train_sentences = data.Dataset(examples, fields)
def get_dataset(load_data, text_field, label_field):
    fields = [('text', text_field), ('label', label_field)]
    examples = []

    for data in load_data:
        content = words_after_jieba(data['content'])
        label = trans_labels(data['label'])
        examples.append(Data.Example.fromlist([content, label], fields))

    return examples, fields

train_examples, train_fields = get_dataset(load_train, TEXT, LABEL)
valid_examples, valid_fields = get_dataset(load_valid, TEXT, LABEL)
test_examples, test_fields = get_dataset(load_test, TEXT, LABEL)

train_set = Data.Dataset(train_examples, train_fields)
valid_set = Data.Dataset(valid_examples, valid_fields)
test_set = Data.Dataset(test_examples, test_fields)

# Build the vocabulary from the pretrained word vectors
TEXT.build_vocab(train_set, vectors=Vectors('w2vformat.txt'))
LABEL.build_vocab(train_set)

train_iter = Data.BucketIterator(train_set, batch_size, sort=False, device=device)
valid_iter = Data.BucketIterator(valid_set, batch_size, sort=False, device=device)
test_iter = Data.BucketIterator(test_set, batch_size, sort=False, device=device)

# print(next(iter(train_iter)).text.shape)     
# print(next(iter(train_iter)).label.shape)

    def __init__(self, config):
        # logger
        self.logger = logging.getLogger('MC')
        # params
        self.config = config["data_loader"]["args"]
        # set path (for raw data)
        data_path = self.config["data_path"]

        # get data_path_l (for processed data (.jsonl and .pt))
        if "search" in self.config["train_file"]:
            data_path_process = os.path.join(data_path, "search")
        elif "zhidao" in self.config["train_file"]:
            data_path_process = os.path.join(data_path, "zhidao")
        else:
            raise Exception("not supported data set now!")
        data_path_process = os.path.join(data_path_process,
                                         self.config["process_info"])
        ensure_dir(data_path_process)
        # (for .pt)
        processed_dataset_path = data_path_process + "/torchtext/"
        train_examples_path = processed_dataset_path + f'{self.config["train_file"]}.pt'
        dev_examples_path = processed_dataset_path + f'{self.config["dev_file"]}.pt'
        test_examples_path = processed_dataset_path + f'{self.config["test_file"]}.pt'

        # define Field
        self.logger.info("construct data loader....")
        self.RAW = data.RawField()
        self.RAW.is_target = False  # used to read the raw id value
        self.Q_WORD = data.Field(sequential=True,
                                 use_vocab=True,
                                 batch_first=True,
                                 tokenize=lambda x: x,
                                 lower=False,
                                 include_lengths=True)

        self.T_WORD = data.Field(sequential=True,
                                 use_vocab=True,
                                 batch_first=True,
                                 tokenize=lambda x: x,
                                 lower=False,
                                 include_lengths=False)
        # for multi para  [b, para_num, seq_len] or [b, para_num, seq_len, w_len]
        self.PARAS = data.NestedField(self.T_WORD,
                                      use_vocab=True,
                                      tokenize=lambda x: x,
                                      include_lengths=True)

        self.LABEL = data.Field(sequential=False,
                                use_vocab=False,
                                unk_token=None)
        self.ALL_LABELS = data.NestedField(self.LABEL,
                                           use_vocab=False,
                                           pad_token=0,
                                           dtype=torch.long)

        dict_fields = {
            'question_id': ('id', self.RAW),
            'question': ('q_word', self.Q_WORD),
            'question_type': ('question_type', self.RAW),
            'yesno_answers': ('yesno_answers', self.RAW),
            'paragraphs': ('paras_word', self.PARAS),
            's_idxs': ('s_idxs', self.ALL_LABELS),
            'e_idxs': ('e_idxs', self.ALL_LABELS),
            'answer_para_idxs': ('answer_para_idxs', self.ALL_LABELS),
            'match_scores': ('match_scores', self.RAW)
        }

        list_fields = [('id', self.RAW), ('q_word', self.Q_WORD),
                       ('question_type', self.RAW),
                       ('yesno_answers', self.RAW), ('paras_word', self.PARAS),
                       ('s_idxs', self.ALL_LABELS),
                       ('e_idxs', self.ALL_LABELS),
                       ('answer_para_idxs', self.ALL_LABELS),
                       ('match_scores', self.RAW)]

        test_dict_fields = {
            'question_id': ('id', self.RAW),
            'question': ('q_word', self.Q_WORD),
            'question_type': ('question_type', self.RAW),
            'yesno_answers': ('yesno_answers', self.RAW),
            'paragraphs': ('paras_word', self.PARAS),
        }

        test_list_fields = [
            ('id', self.RAW),
            ('q_word', self.Q_WORD),
            ('question_type', self.RAW),
            ('yesno_answers', self.RAW),
            ('paras_word', self.PARAS),
        ]

        # check whether the train/dev datasets need to be built
        if not os.path.exists(train_examples_path) or not os.path.exists(
                dev_examples_path):
            self.logger.info("build train dataSet....")
            self.train, self.dev = data.TabularDataset.splits(
                path=f'{data_path_process}',
                train=f'{self.config["train_file"]}l',
                validation=f'{self.config["dev_file"]}l',
                format='json',
                fields=dict_fields)
            # save preprocessed data
            ensure_dir(processed_dataset_path)
            torch.save(self.train.examples, train_examples_path)
            torch.save(self.dev.examples, dev_examples_path)
        else:
            self.logger.info("loading train dataSet.....")
            train_examples = torch.load(train_examples_path)
            dev_examples = torch.load(dev_examples_path)
            self.train = data.Dataset(examples=train_examples,
                                      fields=list_fields)
            self.dev = data.Dataset(examples=dev_examples, fields=list_fields)

        # for test data
        if not os.path.exists(test_examples_path):
            self.logger.info("build test dataSet....")
            self.test = data.TabularDataset(
                path=f'{data_path_process}/{self.config["test_file"]}l',
                format='json',
                fields=test_dict_fields)
            # save preprocessed data
            ensure_dir(processed_dataset_path)
            torch.save(self.test.examples, test_examples_path)
        else:
            self.logger.info("loading test dataSet......")
            test_examples = torch.load(test_examples_path)
            self.test = data.Dataset(examples=test_examples,
                                     fields=test_list_fields)

        # build vocab
        # vocab_cache_path = f"{data_path}/{self.config['vocab_cache']}"
        # if not os.path.exists(vocab_cache_path):
        self.logger.info("build vocab....")
        # self.CHAR.build_vocab(self.train, self.dev)
        self.PARAS.build_vocab(self.train.paras_word, self.train.q_word,
                               self.dev.paras_word, self.dev.q_word)
        self.Q_WORD.vocab = self.PARAS.vocab

        # load pretrained embeddings
        Vectors = vocab.Vectors(self.config["pretrain_emd_file"])
        self.PARAS.vocab.load_vectors(Vectors)

        #     # save vocab cache
        #     self.logger.info("save vocab....")
        #     with open(vocab_cache_path, 'wb') as fout:
        #         pickle.dump(self.PARAS.vocab, fout)
        # else:
        #     # load vocab
        #     self.logger.info(f"load vocab from {vocab_cache_path} ....")
        #     with open(vocab_cache_path, 'rb') as fin:
        #         self.PARAS.vocab = pickle.load(fin)
        #         self.WORD.vocab = self.PARAS.vocab
        #         self.Q_WORD.vocab = self.PARAS.vocab

        # just for call easy
        self.vocab_vectors = self.PARAS.vocab.vectors
        self.vocab = self.PARAS.vocab

        # build iterators
        self.logger.info("building iterators....")

        self.train_iter = data.BucketIterator(
            dataset=self.train,
            batch_size=self.config["train_batch_size"],
            device=self.config["device"],
            shuffle=True)

        self.eval_iter = data.BucketIterator(
            dataset=self.dev,
            batch_size=self.config["dev_batch_size"],
            device=self.config["device"],
            sort_key=lambda x: max(
                [max(para_len) for para_len in x.paras_word[2]]),
            sort_within_batch=False,
            shuffle=False)

        self.test_iter = data.BucketIterator(
            dataset=self.test,
            batch_size=self.config["dev_batch_size"],
            sort_key=lambda x: max(
                [max(para_len) for para_len in x.paras_word[2]]),
            sort_within_batch=False,
            device=self.config["device"],
            shuffle=False)
    def read_one(self, data_file, dataset_type="train"):
        pkl_data = pickle.load(Path(data_file).open('rb'))
        examples = [Example.fromdict(x, self.fields1) for x in pkl_data]
        dataset = data.Dataset(examples, fields=self.fields2)
        return dataset
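
A minimal sketch of the two field containers read_one relies on, assuming the legacy torchtext API: Example.fromdict expects a dict keyed by the raw keys of the pickled records, while Dataset expects a flat list of (name, Field) pairs. The keys and names below ('source'/'src', 'target'/'tgt') are placeholders, not the actual fields of this loader.

from torchtext import data  # torchtext.legacy.data on newer releases
from torchtext.data import Example

SRC = data.Field(batch_first=True)
TGT = data.Field(batch_first=True)

fields1 = {'source': ('src', SRC), 'target': ('tgt', TGT)}   # mapping used by Example.fromdict
fields2 = [('src', SRC), ('tgt', TGT)]                       # flat list used by data.Dataset

record = {'source': 'a b c'.split(), 'target': 'x y'.split()}
example = Example.fromdict(record, fields1)
dataset = data.Dataset([example], fields=fields2)
print(dataset[0].src, dataset[0].tgt)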
Exemple #19
0
    def load_data(self, w2v_file, train_file, test_file, val_file=None):
        '''
            Reads the data from files and builds the iterators, vocabulary and embeddings.
            Inputs:
                w2v_file(String): pretrained word vector file (Glove/Word2Vec)
                train_file(String): path to the training data
                test_file(String): path to the test data
                val_file(String): path to the validation data
        '''

        NLP = spacy.load('en')
        tokenizer = lambda sent: [
            x.text for x in NLP.tokenizer(sent) if x.text != " "
        ]

        # Create the Field objects
        TEXT = data.Field(sequential=True,
                          tokenize=tokenizer,
                          lower=True,
                          fix_length=self.config.max_sen_len)
        # sequential must be set to False for LABEL
        LABEL = data.Field(
            sequential=False, use_vocab=False
        )  # the labels are already integers, so no numericalization is needed and use_vocab=False
        datafields = [("text", TEXT), ("label", LABEL)]

        # Load data from pd.DataFrame into torchtext.data.Dataset
        train_df = self.get_pandas_df(train_file)
        train_examples = [
            data.Example.fromlist(i, datafields)
            for i in train_df.values.tolist()
        ]  # build training examples
        train_data = data.Dataset(train_examples, datafields)

        test_df = self.get_pandas_df(test_file)
        test_examples = [
            data.Example.fromlist(i, datafields)
            for i in test_df.values.tolist()
        ]  # build test examples
        test_data = data.Dataset(test_examples, datafields)

        # Split off a validation set
        if val_file:
            val_df = self.get_pandas_df(val_file)
            val_example = [
                data.Example.fromlist(i, datafields)
                for i in val_df.values.tolist()
            ]
            val_data = data.Dataset(val_example, datafields)
        else:
            train_data, val_data = train_data.split(
                split_ratio=0.8)  # 80/20 split using Dataset.split

        # Load the pretrained word embeddings
        TEXT.build_vocab(train_data, vectors=Vectors(w2v_file))
        self.word_embeddings = TEXT.vocab.vectors
        self.vocab = TEXT.vocab

        # Build the training data iterator
        self.train_iterator = data.BucketIterator(
            (train_data),
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=True)

        # Build the validation and test data iterators
        self.val_iterator, self.test_iterator = data.BucketIterator.splits(
            (val_data, test_data),
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=False)
        print("Local {} train examples".format(len(train_data)))
        print("Local {} test examples".format(len(test_data)))
        print("Local {} validation examples".format(len(val_data)))
Exemple #20
0
vectors = vocab.Vectors(name='glove.840B.300d.txt', cache='content/drive/')

id.build_vocab(dataset)
label.build_vocab(dataset)
sent.build_vocab(dataset, vectors=vectors)
#[protects the non-nested fields(.py#629) from flattening ]

embedding_vectors = sent.vocab.vectors
vocab_size = len(sent.vocab)

train = np.array(dataset.examples[:87170])
valid = np.array(dataset.examples[87170:95239])
test = np.array(dataset.examples[95239:])

train_ds = data.Dataset(train, fields)
valid_ds = data.Dataset(valid, fields)
test_ds = data.Dataset(test, fields)

train_loader = BucketIterator(
    train_ds,
    train=True,
    batch_size=200,
    shuffle=True,
    # sort_key=lambda x: x.id,
    # device=torch.device(0),
)

valid_loader = BucketIterator(
    valid_ds,
    batch_size=200,
)
def load_data(embeddings=None,
              device='cpu',
              batch_size=32,
              bptt_len=35,
              path_to_data="../data",
              train="02-21.10way.clean",
              valid="22.auto.clean",
              test="23.auto.clean",
              bos_token='<bos>'):
    """
    Function that loads and prepares the Penn Treebank data in two different ways.
    The first takes fixed length pieces of the entire training text.
    The second takes each sentence as is.

    Args:
        embeddings: the pre-trained word embeddings to use.
        batch_size: size of batches.
        bptt_len: the length of the sequences in the batches
        path_to_data: where the Penn Treebank data is located.
        train, valid, test: the files to use as train/valid/test.
    """

    # Already tokenized so use identity function.
    TEXT = data.Field(lower=True, tokenize=lambda x: x)
    SENTENCE = data.Field(lower=True,
                          tokenize=lambda x: x,
                          include_lengths=True)

    lm_fields = [("text", TEXT)]
    s_fields = [("text", SENTENCE), ("target", SENTENCE)]

    print("Loading data...")

    # Extract sentences from files; turn into examples.
    splits_langmodel = []
    splits_sentences = []
    for f in [train, valid, test]:
        path = os.path.join(path_to_data, f)

        # Remove POS tags and concatenate into one list for language modelling.
        nr_lines = 0
        total_tokens = 0
        lm_example = []
        s_examples = []
        with io.open(path, encoding='utf-8') as f:
            for line in f:
                nr_lines += 1
                # remove POS tags and tree structure.
                tokens = [bos_token] + re.sub(r"\([0-9] |\)", "", line).split()
                tokens = [
                    token for token in tokens if not token.startswith('(')
                ]
                total_tokens += len(tokens)

                lm_example.extend(tokens)
                s_examples.append([tokens, tokens[1:] + [bos_token]])

        avg_length = total_tokens / nr_lines
        print("Average Sentence Length: {}".format(avg_length))

        # The language model datasets are one big Example with all sentences.
        lm_example = data.Example.fromlist([lm_example], lm_fields)
        dataset = data.Dataset([lm_example], lm_fields)
        splits_langmodel.append(dataset)

        # the sentence datasets contain each sentence as a separate Example.
        examples = [
            data.Example.fromlist(example, s_fields) for example in s_examples
        ]
        dataset = data.Dataset(examples, s_fields)
        splits_sentences.append(dataset)

    print("Done loading.")

    # To reduce the vocabulary to roughly 22,000 words.
    MIN_FREQ = 2

    specials = ['<unk>', '<pad>', bos_token]
    if embeddings:
        TEXT.build_vocab(*splits_langmodel,
                         min_freq=MIN_FREQ,
                         vectors=embeddings,
                         specials=specials)
    else:
        TEXT.build_vocab(*splits_langmodel,
                         min_freq=MIN_FREQ,
                         specials=specials)

    # Use BPTTIterator for LM variant.
    train, valid, test = splits_langmodel
    lm_train_iter, lm_valid_iter, lm_test_iter = data.BPTTIterator.splits(
        (train, valid, test),
        batch_size=batch_size,
        bptt_len=bptt_len,
        shuffle=True,
        device=device)

    # Keep validation/test batches small enough to fit in memory (multi-sample estimates require a bit more).
    VALID_TEST_BATCH_SIZE = 16

    train, valid, test = splits_sentences
    s_train_iter = data.BucketIterator(train,
                                       batch_size=batch_size,
                                       sort_key=lambda x: x,
                                       shuffle=True,
                                       sort=False,
                                       device=device)
    s_valid_iter, s_test_iter = data.BucketIterator.splits(
        (valid, test),
        batch_size=VALID_TEST_BATCH_SIZE,
        shuffle=True,
        sort=False,
        device=device)

    SENTENCE.vocab = TEXT.vocab

    return (lm_train_iter, lm_valid_iter, lm_test_iter, TEXT), \
           (s_train_iter, s_valid_iter, s_test_iter, SENTENCE)
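
A minimal consumption sketch for the two iterator families returned above, assuming the Penn Treebank files exist at the default paths; because SENTENCE is built with include_lengths=True, its batches carry (padded tensor, lengths) tuples, while the BPTT batches expose .text and .target tensors directly.

(lm_train_iter, lm_valid_iter, lm_test_iter, TEXT), \
    (s_train_iter, s_valid_iter, s_test_iter, SENTENCE) = load_data()

for batch in lm_train_iter:
    tokens, targets = batch.text, batch.target    # both (bptt_len, batch_size)
    break

for batch in s_train_iter:
    sentences, lengths = batch.text               # padded sentences plus their true lengths
    shifted_targets, _ = batch.target             # the shifted copies built in load_data
    break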
Exemple #22
0
def preprocess(question, equation, sni_model, fields, use_sni=True):
    """
    Returns preprocessed version of question and equation using sni_model and
    fields
    """
    # handle %'s
    question = question.replace('%', ' % ')

    # handle fractions
    parser = Parser()
    fractions = re.findall(r'\(\d+\)/\(\d+\)', question)
    fractions = np.append(fractions, re.findall(r'\(\d+/\d+\)', question))
    for i, fraction in enumerate(fractions):
        #question = question.replace(fraction, str(sys.maxsize - i))
        #equation = equation.replace(fraction, str(sys.maxsize - i))
        question = question.replace(
            fraction, str(parser.evaluate(fraction, variables=None)))
        equation = equation.replace(
            fraction, str(parser.evaluate(fraction, variables=None)))

    # handle numbers with units
    question = re.sub(r'(\d+)([A-z]{1,2})', r'\1 \2', question)

    # separate equation at operators
    equation = equation.replace('[', ' ( ')
    equation = equation.replace(']', ' ) ')
    equation = equation.replace('+', ' + ')
    equation = equation.replace('-', ' - ')
    equation = equation.replace('*', ' * ')
    equation = equation.replace('/', ' / ')
    equation = equation.replace('(', ' ( ')
    equation = equation.replace(')', ' ) ')
    equation = equation.replace('=', ' = ')
    equation = equation.replace('^', ' ^ ')

    # reduce %'s
    #equation = equation.replace('%', ' / 100 ')
    equation = re.sub(r'(\d*\.?\d+)%', r'(\1 / 100 )', equation)

    # preprocess question
    equation = equation.split()
    question = question.split()

    # prepend and append null tokens to the question to allow for an SNI window
    # size of three
    question = ['null', 'null', 'null'] + question + ['null', 'null', 'null']

    # prevent inplace changes on question
    question_copy = [t for t in question]

    #print('question_copy:', question_copy)
    # replace significant numbers in question and equation
    i = 0
    variable_values = dict()
    for j, token in enumerate(question):
        if isFloat(token):
            example = question_copy[j - 3:j + 4]
            ex = data.Example.fromlist([' '.join(example), ''], fields)
            dataset = data.Dataset([ex], fields)
            inp = None
            iterator = data.Iterator(dataset, batch_size=1)
            iterator.repeat = False
            for batch in iterator:
                inp = batch.text.t()

            if (not use_sni) or (use_sni and isSignificant(inp, sni_model)):
                #if (use_sni and isSignificant(inp, sni_model)) or (not use_sni):
                for symbol in equation:
                    if symbol == token:
                        equation[equation.index(symbol)] = '[' + chr(97 +
                                                                     i) + ']'
                character = '[' + chr(97 + i) + ']'
                variable_values[character] = token
                for q in question:
                    if q == token:
                        question[question.index(q)] = '[' + chr(97 + i) + ']'
                i += 1

    # remove pre/postpended null tokens from question
    question = question[3:-3]

    question = ' '.join(question) + '\n'
    equation = ' '.join(equation) + '\n'
    return question, equation, variable_values
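
A minimal standalone sketch of the significant-number substitution performed in the loop above, without the SNI window/model (so every number counts as significant); substitute_numbers and its isFloat helper are illustrative names, the latter mirroring the isFloat used by preprocess.

def isFloat(token):
    try:
        float(token)
        return True
    except ValueError:
        return False


def substitute_numbers(question_tokens, equation_tokens):
    # Replace each distinct number with [a], [b], ... in both token lists,
    # recording the placeholder-to-value mapping as preprocess does.
    variable_values, i = {}, 0
    for token in list(question_tokens):
        if isFloat(token) and token not in variable_values.values():
            placeholder = '[' + chr(97 + i) + ']'
            variable_values[placeholder] = token
            question_tokens = [placeholder if t == token else t for t in question_tokens]
            equation_tokens = [placeholder if t == token else t for t in equation_tokens]
            i += 1
    return question_tokens, equation_tokens, variable_values

# substitute_numbers('what is 3 plus 5'.split(), 'x = 3 + 5'.split())
# -> (['what', 'is', '[a]', 'plus', '[b]'], ['x', '=', '[a]', '+', '[b]'], {'[a]': '3', '[b]': '5'})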
Exemple #23
0
        rel_label += y_hat.tolist()
    net.train()  # switch back to training mode
    with open('./test_rel.txt', 'w') as fp:
        for label in rel_label:
            label = idx_to_label[label]
            fp.write(str(label) + '\n')
    


# In[36]:


test_path = './snli.test'
examples = []
fields = [('seq1',sequence), ('seq2', sequence)]
with open(test_path, 'r') as fp:
    contents = fp.readlines()
    for content in contents:
        seqs = content.strip().split('|||')
        examples.append(data.Example.fromlist(seqs, fields))
test_dataset = data.Dataset(examples, fields)
print(len(test_dataset))
test_iter = data.Iterator(test_dataset, batch_size= args.batch_size, shuffle = False)


# In[37]:


test(net, test_iter)

Exemple #24
0
def repeat_augment_and_train(dir_to_save, iter_func, model_wrapper, data_source, aug_algo, encoder_model, sim_measure, datasets, text_field, label_field, frac, num_classes, classifier_params, k, learning_type):
    """
    Runs k trials of augmentation & repeat-classification for a given fraction of labeled training data.
    Args:
        dir_to_save (str): directory to save models created/loaded during this process
        aug_algo (str): which augmentation algorithm to use
        encoder_model (str): encoder model to use for augmentation (with a similarity measure between the encodings)
        sim_measure (str): which similarity measure to use
        datasets (list(Dataset)): train/val/test torchtext datasets
        text_field (Field): torchtext field for sentences
        label_field (LabelField): torchtext LabelField for class labels
        frac (float): Fraction of labeled training data to use
        classifier_params (dict): params for intent classifier to use on augmented data.
        k (int): Number of times to repeat augmentation-classifier training process
        learning_type (str): inductive|transductive
    Returns:
        11 statistical measures (means and standard deviations) of the results of these trials
    """
    train_ds, val_ds, test_ds = datasets
    class_accs, aug_accs, aug_fracs = [], [], []
    ps, rs, fs = [], [], []

    # FOR ENTROPY HEURISTIC
    # mst_sigmas, entropies, sigmas, accs, fracs = [], [], [], [], []

    # # ABLATION STUDY
    # sigmas, f1_means, f1_stds, aug_acc_means, aug_acc_stds, frac_used_means, frac_used_stds = [],[],[],[],[],[],[]
    # for sigma in np.arange(0.035, 0.155, 0.005):
    #     sigmas.append(sigma)

    for i in tqdm(range(k), total=k):
        examples = train_ds.examples
        np.random.shuffle(examples)
        cutoff = int(frac*len(examples))
        if learning_type == "transductive":
            labeled_examples = train_ds.examples
            unlabeled_examples = test_ds.examples
        elif frac == 0: # 1 labeled eg from each class
            classes_seen = {i: 0 for i in range(num_classes)}
            labeled_examples, unlabeled_examples = [], []
            for eg in examples:
                if classes_seen[eg.y] == 0:
                    labeled_examples.append(eg)
                    classes_seen[eg.y] += 1
                else:
                    unlabeled_examples.append(eg)
        else: # at least one labeled eg from each class
            while True:
                labeled_examples = examples[:cutoff]
                unlabeled_examples = examples[cutoff:]
                if len(set([eg.y for eg in labeled_examples])) == num_classes:
                    break
                np.random.shuffle(examples)

        ##################################################################################################################
        # PROPAGATION PROCESS VISUALISATION (FOR DEMO)
        # from matplotlib import pyplot as plt
        # from pandas import DataFrame
        # from sklearn.decomposition import PCA
        # from sklearn.manifold import TSNE
        # import matplotlib.transforms as transforms

        # # EXTRACT DATA & COMPUTE DIM_REDUCED EMBEDDINGS
        # pickle.dump(labeled_examples, Path(f'./paper/{frac}_labeled_egs.pkl').open('wb'))
        # pickle.dump(unlabeled_examples, Path(f'./paper/{frac}_unlabeled_egs.pkl').open('wb'))
        # labeled_examples = pickle.load(Path(f'./paper/{frac}_labeled_egs.pkl').open('rb'))
        # unlabeled_examples = pickle.load(Path(f'./paper/{frac}_unlabeled_egs.pkl').open('rb'))
        # intents = pickle.load(Path(f'./data/ic/{data_source}/intents.pkl').open('rb'))
        # res = encode_data_with_pretrained(data_source, train_ds, test_ds, text_field, encoder_model, labeled_examples, unlabeled_examples)
        # x_l, y_l, x_u, y_u, _ = res
        # X = np.concatenate([x_l, x_u])
        # Y = np.concatenate([y_l, y_u])
        # pca = PCA(n_components=100)
        # pca_res = pca.fit_transform(X)
        # tsne = TSNE(n_components=2, verbose=0, perplexity=30, n_iter=1000)
        # tsne_pca_res = tsne.fit_transform(pca_res)
        # ts1, ts2 = tsne_pca_res[:,0], tsne_pca_res[:,1]
        # df_tsne_pca = DataFrame([{
        #     'intent': intents[y],
        #     'x-tsne-pca': t1,
        #     'y-tsne-pca': t2,
        #     'og_idx': idx
        # } for idx, (y,t1,t2) in enumerate(zip(Y,ts1,ts2))])
        # df_tsne_pca.to_pickle(f'./paper/{frac}_dataframe.pkl')
        # df_tsne_pca = pd.read_pickle(f'./paper/{frac}_dataframe.pkl')

        # # PLOT INITIAL DATASET
        # fig, ax = plt.subplots()
        # n_l = len(labeled_examples)
        # for idx, intent in enumerate(set(df_tsne_pca['intent'].values)):
        #     values = [v for v in df_tsne_pca.loc[df_tsne_pca['intent']==intent].drop(columns=['intent']).values]
        #     for i, v in enumerate(values):
        #         if v[0] < n_l:
        #             ax.scatter(v[1], v[2], color=f'C{idx}', s=100, alpha=1, label=intent)
        #         else:
        #             ax.scatter(v[1], v[2], color='black', s=100, alpha=0.2)
        # title = 'propagation_initial_labeled_only'
        # for idx, intent in enumerate(set(df_tsne_pca['intent'].values)):
        #     values = [v for v in df_tsne_pca.loc[df_tsne_pca['intent']==intent].drop(columns=['intent']).values]
        #     ax.scatter([v[1] for v in values], [v[2] for v in values], color=f'C{idx}', s=100, alpha=1, label=intent)
        # title = 'propagation_initial_all'
        # ax.grid(b=False)
        # ax.set_ylim(-7.6, 12.5)
        # ax.set_xlim(-10.5, 5.2)
        # fig.set_size_inches(15, 10)
        # plt.legend(loc='lower right', frameon=True, fancybox=True, shadow=True, fontsize='large')
        # plt.savefig(f'./paper/{title}.pdf', format='pdf', dpi=100)
        # plt.show()
        # assert(False)
        
        # # PRELIMINARY DATA FOR MAIN PLOT
        # dim_reduced_points = [0 for _ in range(100)]
        # for idx, intent in enumerate(set(df_tsne_pca['intent'].values)):
        #     values = [v for v in df_tsne_pca.loc[df_tsne_pca['intent']==intent].drop(columns=['intent']).values]
        #     for v in values:
        #         dim_reduced_points[int(v[0])] = (v[1:],intent)
        # data = pickle.load(Path('./paper/propagation_data.pkl').open('rb'))
        # indices = pickle.load(Path('./paper/indices_data.pkl').open('rb'))
        # classifications = pickle.load(Path('./paper/classifications_data.pkl').open('rb'))
        # colors = {'findconnection': 'C1', 'departuretime': 'C0'}
        # intent_map = {0: 'findconnection', 1: 'departuretime'}
        # classified_indices = [0, 1]
        # classified_true_labels = ['findconnection', 'departuretime']
        # classified_intents = ['findconnection', 'departuretime']
        # classified_xs = [dim_reduced_points[i][0][0] for i in classified_indices]
        # classified_ys = [dim_reduced_points[i][0][1] for i in classified_indices]

        # # PLOT EACH RECURSION & PROPAGATION ITERATION
        # with plt.style.context('seaborn-whitegrid'):
        #     plt.rcParams['font.family'] = 'serif'
        #     plt.rcParams['mathtext.fontset'] = 'dejavuserif'

        #     # starting point plot
        #     title = '0_final'
        #     fig, ax = plt.subplots()
        #     unclassified_indices = [i for i in range(100) if i not in classified_indices]
        #     unclassified_xs = [dim_reduced_points[i][0][0] for i in unclassified_indices]
        #     unclassified_ys = [dim_reduced_points[i][0][1] for i in unclassified_indices]
        #     ax.scatter(unclassified_xs, unclassified_ys, color='black', s=100, alpha=0.2)
        #     ax.scatter(classified_xs[1], classified_ys[1], color=colors[classified_intents[1]], marker='s', s=200, alpha=1, label=classified_intents[1])
        #     ax.scatter(classified_xs[0], classified_ys[0], color=colors[classified_intents[0]], marker='s', s=200, alpha=1, label=classified_intents[0])
        #     ax.text(2, 10, 'Recursion 0 -- complete', fontsize=15, color='black', ha="center", va="center")
        #     ax.grid(b=False)
        #     ax.set_ylim(-7.6, 12.5)
        #     ax.set_xlim(-10.5, 5.2)
        #     fig.set_size_inches(15, 10)
        #     plt.legend(loc='lower right', frameon=True, fancybox=True, shadow=True, fontsize='large')
        #     plt.savefig(f'./paper/prop_plots_2/{title}.png', format='png', dpi=150)
        #     plt.close()

        #     for recursion_idx, prop_data in tqdm(enumerate(data), total=len(data)):
        #         # plot results during propagation
        #         Y_us = [prop_data[0]] if len(prop_data) == 1 else np.array(prop_data)[range(0, len(prop_data), 100)]
        #         for prop_idx, Y_u in enumerate(Y_us):
        #             title = f'{recursion_idx+1}_{(prop_idx+1)*100}'
        #             fig, ax = plt.subplots()
        #             for idx, row in enumerate(Y_u):
        #                 color = colors[intent_map[np.argmax(row)]]
        #                 prob = np.max(row)
        #                 ax.scatter(unclassified_xs[idx], unclassified_ys[idx], color=color, s=100, alpha=prob*0.75)
        #             for (x, y, intent, true_label) in zip(classified_xs[2:], classified_ys[2:], classified_intents[2:], classified_true_labels[2:]):
        #                 ax.scatter(x, y, color=colors[intent], marker='s', s=100, alpha=1)
        #                 if intent != true_label:
        #                     ax.scatter(x, y, color='black', marker='x', s=150, alpha=1)
        #             ax.scatter(classified_xs[1], classified_ys[1], color=colors[classified_intents[1]], marker='s', s=200, alpha=1, label=classified_intents[1])
        #             ax.scatter(classified_xs[0], classified_ys[0], color=colors[classified_intents[0]], marker='s', s=200, alpha=1, label=classified_intents[0])
        #             ax.text(2, 10, f'Recursion {recursion_idx+1} -- iterating...', fontsize=15, color='black', ha="center", va="center")
        #             ax.grid(b=False)
        #             ax.set_ylim(-7.6, 12.5)
        #             ax.set_xlim(-10.5, 5.2)
        #             fig.set_size_inches(15, 10)
        #             plt.legend(loc='lower right', frameon=True, fancybox=True, shadow=True, fontsize='large')
        #             plt.savefig(f'./paper/prop_plots_2/{title}.png', format='png', dpi=150)
        #             plt.close()
                
        #         # plot the end result of each recursion - i.e. new ground truth classifications
        #         classified_indices += [i + 2 for i in indices[recursion_idx]]
        #         classified_xs = [dim_reduced_points[i][0][0] for i in classified_indices]
        #         classified_ys = [dim_reduced_points[i][0][1] for i in classified_indices]
        #         classified_true_labels = [dim_reduced_points[i][1] for i in classified_indices]
        #         classified_intents += [intent_map[intent_class] for intent_class in classifications[recursion_idx]]
        #         unclassified_indices = [i for i in range(100) if i not in classified_indices]
        #         unclassified_xs = [dim_reduced_points[i][0][0] for i in unclassified_indices]
        #         unclassified_ys = [dim_reduced_points[i][0][1] for i in unclassified_indices]
        #         title = f'{recursion_idx+1}_final'
        #         fig, ax = plt.subplots()
        #         ax.scatter(unclassified_xs, unclassified_ys, color='black', s=100, alpha=0.2)
        #         for (x, y, intent, true_label) in zip(classified_xs[2:], classified_ys[2:], classified_intents[2:], classified_true_labels[2:]):
        #             ax.scatter(x, y, color=colors[intent], marker='s', s=100, alpha=1)
        #             if intent != true_label:
        #                 ax.scatter(x, y, color='black', marker='x', s=150, alpha=1)
        #         ax.scatter(classified_xs[1], classified_ys[1], color=colors[classified_intents[1]], marker='s', s=200, alpha=1, label=classified_intents[1])
        #         ax.scatter(classified_xs[0], classified_ys[0], color=colors[classified_intents[0]], marker='s', s=200, alpha=1, label=classified_intents[0])
        #         ax.text(2, 10, f'Recursion {recursion_idx+1} -- complete', fontsize=15, color='black', ha="center", va="center")
        #         ax.grid(b=False)
        #         ax.set_ylim(-7.6, 12.5)
        #         ax.set_xlim(-10.5, 5.2)
        #         fig.set_size_inches(15, 10)
        #         plt.legend(loc='lower right', frameon=True, fancybox=True, shadow=True, fontsize='large')
        #         plt.savefig(f'./paper/prop_plots_2/{title}.png', format='png', dpi=150)
        #         plt.close()
    
        # assert(False)
        ##################################################################################################################

        # # ENTROPY HEURISTIC
        # res = encode_data_with_pretrained(data_source, train_ds, test_ds, text_field, encoder_model, labeled_examples, unlabeled_examples)
        # x_l, y_l, x_u, y_u, _ = res
        # mst_sigma, entropy, sigma, acc, frac_used = sigma_fit(x_l, y_l, x_u, y_u, num_classes, data_source)
        # mst_sigmas.append(mst_sigma); entropies.append(entropy); sigmas.append(sigma); accs.append(acc); fracs.append(frac_used)
        # continue

        if aug_algo == "eda":
            x_l, y_l = [eg.x for eg in labeled_examples], [eg.y for eg in labeled_examples]
            augmented_x_l, augmented_y_l = eda_corpus(x_l, y_l)
            new_labeled_data = [{'x': x, 'y': y} for x,y in zip(augmented_x_l, augmented_y_l)]
            augmented_train_examples = [Example.fromdict(x, {'x': ('x', text_field), 'y': ('y', label_field)}) for x in new_labeled_data]
            aug_acc = 1; frac_used = 0
        elif aug_algo == "none":
            augmented_train_examples = labeled_examples
            aug_acc = 1; frac_used = 0
        elif aug_algo == "self_feed":
            sf_thresh = 0.7
            augmented_train_examples, aug_acc, frac_used = self_feed(data_source, dir_to_save, iter_func, model_wrapper, labeled_examples, unlabeled_examples, val_ds, test_ds, text_field, label_field, classifier_params, thresh=sf_thresh)
        else:
            augmented_train_examples, aug_acc, frac_used = augment(data_source, aug_algo, encoder_model, sim_measure, labeled_examples, unlabeled_examples, train_ds, test_ds, text_field, label_field, num_classes, sigma=None)
        
        aug_accs.append(aug_acc); aug_fracs.append(frac_used)
        new_train_ds = data.Dataset(augmented_train_examples, {'x': text_field, 'y': label_field})
        new_datasets = (new_train_ds, val_ds, test_ds)

        if learning_type == "inductive":
            acc, p, r, f = do_basic_train_and_classify(new_train_ds, test_ds, classifier_params, data_source)
        else: # transductive
            predictions = [eg.y for eg in augmented_train_examples[len(train_ds.examples):]]
            test_Y = [eg.y for eg in test_ds.examples]
            # sklearn metrics expect (y_true, y_pred), so the gold labels go first.
            acc = accuracy_score(test_Y, predictions)
            avg = "macro avg" if data_source == "chat" else "weighted avg"
            report = classification_report(test_Y, predictions, output_dict=True)[avg]
            p, r, f = report['precision'], report['recall'], report['f1-score']
        
        class_accs.append(acc); ps.append(p); rs.append(r); fs.append(f)

    # # ENTROPY HEURISTIC
    # print(np.mean(entropies), np.std(entropies))
    # print(np.mean(mst_sigmas), np.std(mst_sigmas))
    # print(np.mean(sigmas), np.std(sigmas))
    # print(np.mean(accs), np.std(accs))
    # print(np.mean(fracs), np.std(fracs))
    # assert(False)

    # # ABLATION STUDY
    # print(f"SIGMA: {sigma}")
    # f1_means.append(np.mean(class_accs)); f1_stds.append(np.std(class_accs))
    # aug_acc_means.append(np.mean(aug_accs)); aug_acc_stds.append(np.std(aug_accs))
    # frac_used_means.append(np.mean(aug_fracs)); frac_used_stds.append(np.std(aug_fracs))
    # assert(False)

    print(f"FRAC '{frac}' Results Below:")
    print(f'classification acc --> mean: {np.mean(class_accs)}; std: {np.std(class_accs)}')
    print(f'augmentation acc --> mean: {np.mean(aug_accs)}; std: {np.std(aug_accs)}\t (average frac used: {np.mean(aug_fracs)})')
    print(f'p/r/f1 means --> precision mean: {np.mean(ps)}; recall mean: {np.mean(rs)}; f1 mean: {np.mean(fs)}')
    print(f'p/r/f1 stds --> precision std: {np.std(ps)}; recall std: {np.std(rs)}; f1 std: {np.std(fs)}')

    class_acc_mean, class_acc_std = np.mean(class_accs), np.std(class_accs)
    aug_acc_mean, aug_acc_std, aug_frac_mean = np.mean(aug_accs), np.std(aug_accs), np.mean(aug_fracs)
    p_mean, r_mean, f_mean = np.mean(ps), np.mean(rs), np.mean(fs)
    p_std, r_std, f_std = np.std(ps), np.std(rs), np.std(fs)
    
    # # ABLATION STUDY
    # print([round(s, 3) for s in sigmas])
    # print(f1_means)
    # print(f1_stds)
    # print(aug_acc_means)
    # print(aug_acc_stds)
    # print(frac_used_means)
    # print(frac_used_stds)
    # assert(False)

    return class_acc_mean, class_acc_std, aug_acc_mean, aug_acc_std, aug_frac_mean, p_mean, p_std, r_mean, r_std, f_mean, f_std
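# A minimal, self-contained sketch of the transductive scoring used above, with
# toy labels (illustrative only) to show sklearn's (y_true, y_pred) convention:
from sklearn.metrics import accuracy_score, classification_report

y_true = ["greet", "bye", "greet", "help"]
y_pred = ["greet", "bye", "help", "help"]
acc = accuracy_score(y_true, y_pred)
report = classification_report(y_true, y_pred, output_dict=True)["macro avg"]
print(acc, report["precision"], report["recall"], report["f1-score"])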
    def load_data(self, train_file, test_file=None, val_file=None):
        '''
        Loads the data from files.
        Sets up iterators for the training and validation data (the test set is
        reused as the validation set below, since no separate split is made).
        Also builds the vocabulary from the training data and writes it to vocab.txt.

        Inputs:
            train_file (String): path to training file
            test_file (String): path to test file
            val_file (String): path to validation file (currently unused)
        '''

        NLP = spacy.load('en_core_web_sm')
        tokenizer = lambda sent: [
            x.text for x in NLP.tokenizer(sent) if x.text != " "
        ]

        # Creating Field for data
        TEXT = data.Field(sequential=True,
                          tokenize=tokenizer,
                          lower=True,
                          fix_length=self.config.max_sen_len)
        LABEL = data.Field(sequential=False, use_vocab=False)
        datafields = [("text", TEXT), ("label", LABEL)]

        # Load data from pd.DataFrame into torchtext.data.Dataset
        train_df = self.get_pandas_df(train_file)
        # print(train_df)
        # exit()
        train_examples = [
            data.Example.fromlist(i, datafields)
            for i in train_df.values.tolist()
        ]
        train_data = data.Dataset(train_examples, datafields)

        test_df = self.get_pandas_df(test_file)
        test_examples = [
            data.Example.fromlist(i, datafields)
            for i in test_df.values.tolist()
        ]
        test_data = data.Dataset(test_examples, datafields)

        # If a validation file existed we would load it here; the fallback split from
        # the training data is disabled below, so the test set is reused as the
        # validation set by self.val_iterator.

        # train_data, val_data = train_data.split(split_ratio=0.9)

        TEXT.build_vocab(train_data)
        self.vocab = TEXT.vocab
        # Dump the vocabulary tokens to disk, one per line.
        with open("vocab.txt", "w") as f:
            for token in self.vocab.stoi.keys():
                f.write(token + '\n')

        self.train_iterator = data.BucketIterator(
            train_data,
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=True)

        self.val_iterator = data.BucketIterator(
            test_data,
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=False)

        print("Loaded {} training examples".format(len(train_data)))
        print("Loaded {} test examples".format(len(test_data)))
Exemple #26
0
    def translate_list(self,
                       src_sequences,
                       show_progbar=True,
                       n_jobs=1,
                       debug=False):
        """
        Given a list of sequences in the source language to
        translate to the target language, run them through the model and
        translate them.

        Parameters
        ----------
        src_sequences: List of str
            A list of str sequences in the source language to
            translate to the target language.

        show_progbar: boolean, optional (default=True)
            Whether or not to show a progress bar during translation.

        Returns
        -------
        target_sequences: List of str
            A list of str with the translation predictions for each sequence
            in src_sequences.
        """
        self.model.eval()
        # Convert the list of src_sequences to a list of Examples.
        src_examples = [
            data.Example.fromlist([src_sequence],
                                  [("source", self.source_field)])
            for src_sequence in src_sequences
        ]
        # Instantiate a Dataset object
        src_dataset = data.Dataset(src_examples, {"source": self.source_field})

        # Create batches on current GPU if CUDA is available, else CPU
        device = None if torch.cuda.is_available() else -1

        # Create an iterator over the source data
        src_iter = data.Iterator(dataset=src_dataset,
                                 batch_size=self.batch_size,
                                 device=device,
                                 sort=False,
                                 repeat=False,
                                 shuffle=False,
                                 train=False)
        # Run the data through the model to predict translations
        all_predicted_indices = []
        if show_progbar:
            predict_iter = tqdm(enumerate(src_iter),
                                total=len(src_iter),
                                file=sys.stdout,
                                desc="Prediction Batches")
        else:
            predict_iter = enumerate(src_iter)

        for predict_batch_idx, predict_batch in predict_iter:
            # Sort the source data and lengths by length, and translate it.
            source_data, source_lengths = predict_batch.source
            source_lengths, sort_indices = torch.sort(source_lengths,
                                                      -1,
                                                      descending=True)
            source_data = Variable(
                source_data.data.gather(1,
                                        sort_indices.expand_as(source_data)))

            predicted_indices, _ = self._translate_batch(
                source_data, source_lengths)
            # Reverse the sorting we did for compatibility with the model
            # to restore the original input ordering
            _, reverse_sort_indices = torch.sort(sort_indices, -1)
            predicted_indices = Variable(
                predicted_indices.data.gather(
                    1, reverse_sort_indices.expand_as(predicted_indices)))
            # Originally shape (seq_len, batch_size)
            # Transpose to shape (batch_size, seq_len), and then split into
            # tuple of length batch_size, with each element of shape (1, seq_len)
            all_predicted_indices.extend(
                predicted_indices.transpose(0, 1).split(1))

        self.model.train()

        # Convert the predicted indices to tokens with the target side vocab
        final_strings = []
        for seq_predicted_indices in all_predicted_indices:
            final_string = []
            for tok_idx in seq_predicted_indices.squeeze(0).data:
                # Stop decoding at the first padding or end-of-sequence index.
                if tok_idx == self.target_padding_idx or tok_idx == self.target_eos_idx:
                    break
                final_string.append(self.target_field.vocab.itos[tok_idx])
            # Drop the leading start-of-sequence token; defensively strip a trailing
            # EOS token if one slipped past the break above. Guard against empty output.
            if final_string and final_string[-1] == self.target_field.eos_token:
                final_strings.append(self._format_output(final_string[1:-1]))
            else:
                final_strings.append(self._format_output(final_string[1:]))
        return final_strings
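# A minimal, self-contained sketch of the sort-by-length / restore-order pattern
# used in translate_list above (the tensor values are illustrative only):
import torch

lengths = torch.tensor([3, 5, 2])
sorted_lengths, sort_indices = torch.sort(lengths, -1, descending=True)
# ... the model would consume the length-sorted batch here ...
_, reverse_sort_indices = torch.sort(sort_indices, -1)
restored = sorted_lengths.gather(-1, reverse_sort_indices)
assert torch.equal(restored, lengths)  # original input ordering recovered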
Exemple #27
0
def prep_data(fake_headlines, real_headlines, embedding_length):
    random.seed(0)
    split_ratio = 0.15

    sentence = data.Field(sequential=True,
                          fix_length=embedding_length,
                          tokenize=data_processor.clean,
                          pad_first=True,
                          tensor_type=torch.LongTensor,
                          lower=True)

    label = data.Field(sequential=False,
                       use_vocab=False,
                       tensor_type=torch.ByteTensor)

    fields = [('sentence_text', sentence), ('label', label)]

    examples = []

    headlines = fake_headlines + real_headlines
    labels = [0] * len(fake_headlines) + [1] * len(real_headlines)

    for item in zip(headlines, labels):
        example = data.Example.fromlist(item, fields)
        examples.append(example)

    # random.shuffle(examples)

    sentence.build_vocab(data.Dataset(examples, fields),
                         min_freq=3,
                         vectors="glove.6B.100d")

    vocab = sentence.vocab

    embedding = torch.nn.Embedding(
        num_embeddings=len(vocab),
        embedding_dim=100,  # TODO: change to match the word vectors actually used
    )
    embedding.weight.data.copy_(vocab.vectors)

    temp = list(zip(headlines, labels))
    random.shuffle(temp)
    headlines, labels = zip(*temp)

    # First 15% of the shuffled data becomes the test set, the next 15% the
    # validation set, and the remainder the training set.
    test_split = int(len(headlines) * split_ratio)
    val_split = 2 * test_split

    train = headlines[val_split:]
    val = headlines[test_split:val_split]
    test = headlines[:test_split]

    train_labels = labels[val_split:]
    val_labels = labels[test_split:val_split]
    test_labels = labels[:test_split]

    train = sentence.process(train, -1, True)
    val = sentence.process(val, -1, False)
    test = sentence.process(test, -1, False)

    with open('embedding_layer.pkl', 'wb') as f:
        pickle.dump(embedding, f, pickle.HIGHEST_PROTOCOL)

    with open('vocabstoi.pkl', 'wb') as f:
        pickle.dump(vocab.stoi, f, pickle.HIGHEST_PROTOCOL)

    return train, val, test, train_labels, val_labels, test_labels, embedding, vocab.stoi
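# A minimal sketch of re-loading the artifacts that prep_data pickles above
# (the file names follow the code; the lookup below is an illustrative assumption):
import pickle
import torch

with open('embedding_layer.pkl', 'rb') as f:
    embedding = pickle.load(f)
with open('vocabstoi.pkl', 'rb') as f:
    stoi = pickle.load(f)

tokens = ['breaking', 'news']
indices = torch.LongTensor([[stoi.get(tok, 0) for tok in tokens]])
vectors = embedding(indices)  # shape: (1, 2, 100), matching the 100-dim GloVe vectors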
Exemple #28
0
    def __init__(self, args):
        path = 'data/test_path'
        dataset_path = path + '/Medmentions/'
        train_examples_path = dataset_path + 'train_examples.pt'
        dev_examples_path = dataset_path + 'dev_examples.pt'

        print("preprocessing data files...")
        if not os.path.exists('{}/{}l'.format(path, args.train_file)):
            self.preprocess_file('{}/{}'.format(path, args.train_file))
        if not os.path.exists('{}/{}l'.format(path, args.dev_file)):
            self.preprocess_file('{}/{}'.format(path, args.dev_file))

        self.RAW = data.RawField()
        # explicit declaration for torchtext compatibility
        self.RAW.is_target = False
        self.CHAR_NESTING = data.Field(batch_first=True,
                                       tokenize=list,
                                       lower=True)
        self.CHAR = data.NestedField(self.CHAR_NESTING, tokenize=word_tokenize)
        self.WORD = data.Field(batch_first=True,
                               tokenize=word_tokenize,
                               lower=True,
                               include_lengths=True)
        self.LABEL = data.Field(sequential=False,
                                unk_token=None,
                                use_vocab=False)

        dict_fields = {
            'id': ('id', self.RAW),
            'p_label': ('p_label', self.LABEL),
            'n_label': ('n_label', self.LABEL),
            'context': [('c_word', self.WORD), ('c_char', self.CHAR)],
            'positive': [('p_word', self.WORD), ('p_char', self.CHAR)],
            'negative': [('n_word', self.WORD), ('n_char', self.CHAR)]
        }

        list_fields = [('id', self.RAW), ('p_label', self.LABEL),
                       ('n_label', self.LABEL), ('c_word', self.WORD),
                       ('c_char', self.CHAR), ('p_word', self.WORD),
                       ('p_char', self.CHAR), ('n_word', self.WORD),
                       ('n_char', self.CHAR)]

        if os.path.exists(dataset_path):
            print("loading splits...")
            train_examples = torch.load(train_examples_path)
            dev_examples = torch.load(dev_examples_path)

            self.train = data.Dataset(examples=train_examples,
                                      fields=list_fields)
            self.dev = data.Dataset(examples=dev_examples, fields=list_fields)
        else:
            print("building splits...")
            self.train, self.dev = data.TabularDataset.splits(
                path=path,
                train='{}l'.format(args.train_file),
                validation='{}l'.format(args.dev_file),
                format='json',
                fields=dict_fields)

            os.makedirs(dataset_path)
            torch.save(self.train.examples, train_examples_path)
            torch.save(self.dev.examples, dev_examples_path)

        # Cut overly long contexts from the training set for efficiency.
        if args.context_threshold > 0:
            self.train.examples = [
                e for e in self.train.examples
                if len(e.c_word) <= args.context_threshold
            ]

        print("building vocab...")
        self.CHAR.build_vocab(self.train, self.dev)
        self.WORD.build_vocab(self.train,
                              self.dev,
                              vectors=GloVe(name='6B', dim=args.word_dim))

        print("building iterators...")
        device = torch.device(
            "cuda:{}".format(args.gpu) if torch.cuda.is_available() else "cpu")
        self.train_iter = data.BucketIterator(self.train,
                                              batch_size=args.train_batch_size,
                                              device=device,
                                              repeat=True,
                                              shuffle=True,
                                              sort_key=lambda x: len(x.c_word))

        self.dev_iter = data.BucketIterator(self.dev,
                                            batch_size=args.dev_batch_size,
                                            device=device,
                                            repeat=False,
                                            sort_key=lambda x: len(x.c_word))
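# A minimal sketch of what the WORD / CHAR field pair above produces at the
# preprocessing stage (assumes the legacy torchtext API, torchtext <= 0.8, and
# that nltk's punkt data is available for word_tokenize; outputs are indicative):
from torchtext import data
from nltk import word_tokenize

CHAR_NESTING = data.Field(batch_first=True, tokenize=list, lower=True)
CHAR = data.NestedField(CHAR_NESTING, tokenize=word_tokenize)
WORD = data.Field(batch_first=True, tokenize=word_tokenize, lower=True)

print(WORD.preprocess("Aspirin relieves pain"))
# e.g. ['aspirin', 'relieves', 'pain']
print(CHAR.preprocess("Aspirin relieves pain"))
# e.g. [['a', 's', 'p', 'i', 'r', 'i', 'n'], ['r', 'e', 'l', 'i', 'e', 'v', 'e', 's'], ['p', 'a', 'i', 'n']]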
Exemple #29
0
    # print(f'\n{model_name} test_acc={test_acc:.4f} oov_ratio={oov_ratio:.4f} oov_acc={oov_acc:.4f}')

    """ ORI'S MAIN: """

    # train_data_fn = 'en-ud-train.upos.tsv'
    # test_data_fn = 'en-ud-dev.upos.tsv'

    train_data_fn = 'train_small.tsv'
    test_data_fn = 'test_small.tsv'

    TEXT, TAGS = data.Field(lower=True), data.Field(unk_token=None)
    fields = [('text', TEXT), ('tags', TAGS)]

    # TRAIN HMM MODEL
    corpus = load_annotated_corpus(train_data_fn)
    hmm_model = learn_params(corpus)
    test = "You are such a good boy!"
    A = hmm_model[4]
    B = hmm_model[5]
    tagged_base = baseline_tag_sentence(word_tokenize(test), hmm_model[1], hmm_model[0])
    tagged_hmm = hmm_tag_sentence(word_tokenize(test), A, B)
    print(tagged_base)
    print(tagged_hmm)

    # GET TEST PERFORMANCE
    test_examples = get_examples_from_data(load_annotated_corpus(test_data_fn), fields)
    test_data = data.Dataset(test_examples, fields)
    test_hmm(hmm_model, test_data)

    print('\ndone')
Exemple #30
0
torch.manual_seed(config.RANDOM_SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Text field: use the custom tokenizer, lowercase the content, fix the maximum length, etc.
TEXT = data.Field(tokenize=utils.en_seg, lower=True, fix_length=config.MAX_SENTENCE_SIZE, batch_first=True)
# The label associated with each text
LABEL = data.LabelField(dtype=torch.float)

# Build the raw example data
pos_examples, pos_fields = dataloader.get_dataset(config.POS_CORPUS_PATH, TEXT, LABEL, 'pos')
neg_examples, neg_fields = dataloader.get_dataset(config.NEG_CORPUS_PATH, TEXT, LABEL, 'neg')
all_examples, all_fields = pos_examples + neg_examples, pos_fields + neg_fields

# Build a torchtext Dataset
total_data = data.Dataset(all_examples, all_fields)

# Split the dataset into train and test sets
train_data, test_data = total_data.split(random_state=random.seed(config.RANDOM_SEED), split_ratio=0.8)

# Inspect the split data
# # Check the dataset sizes
print('len of train data: %r' % len(train_data))  # len of train data: 8530
print('len of test data: %r' % len(test_data))  # len of test data: 2132

# # Inspect a single example
print(train_data.examples[100].text)
# ['never', 'engaging', ',', 'utterly', 'predictable', 'and', 'completely', 'void', 'of', 'anything', 'remotely',
# 'interesting', 'or', 'suspenseful']
print(train_data.examples[100].label)
# 0
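
# A minimal sketch of the usual next steps after the split above; the vocab size
# and batch size are assumptions, not taken from the original config:
TEXT.build_vocab(train_data, max_size=25000)
LABEL.build_vocab(train_data)
train_iter, test_iter = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size=64,
    sort_key=lambda x: len(x.text),
    device=device)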