Example #1
    def ptb(cls,
            text_field,
            batch_size=16,
            device=-1,
            vector: Optional[str] = None,
            **kwargs):

        train, valid, test = cls.splits(text_field, **kwargs)
        train_iter, valid_iter, test_iter = data.BucketIterator.splits((train, valid, test),
                                                                       batch_sizes=(batch_size, batch_size, batch_size),
                                                                       device=device,
                                                                       shuffle=True,
                                                                       repeat=True,
                                                                       sort_key=lambda x:len(x.text),
                                                                       **kwargs)
        if vector == 'glove_6B':
            vectors = GloVe('6B', dim=300)
        elif vector == 'glove_840B':
            vectors = GloVe('840B', dim=300)
        elif vector == 'glove_42B':
            vectors = GloVe('42B', dim=300)

        try:
            text_field.build_vocab(train, valid, test, vectors=vectors)
        except UnboundLocalError:
            print('No word embedding loaded.')
            text_field.build_vocab(train, valid, test)

        return (iter(train_iter), iter(valid_iter), iter(test_iter)), text_field
Example #2
    def test_vectors_get_vecs(self):
        vec = GloVe(name='twitter.27B', dim='25')
        self.assertEqual(vec.vectors.shape[0], len(vec))

        tokens = ['chip', 'baby', 'Beautiful']
        token_vecs = vec.get_vecs_by_tokens(tokens).numpy()
        self.assertEqual(token_vecs.shape[0], len(tokens))
        self.assertEqual(token_vecs.shape[1], vec.dim)
        assert_allclose(vec[tokens[0]].numpy(), token_vecs[0])
        assert_allclose(vec[tokens[1]].numpy(), token_vecs[1])
        assert_allclose(vec['<unk>'].numpy(), token_vecs[2])

        token_one_vec = vec.get_vecs_by_tokens(tokens[0],
                                               lower_case_backup=True).numpy()
        self.assertEqual(token_one_vec.shape[0], vec.dim)
        assert_allclose(vec[tokens[0].lower()].numpy(), token_one_vec)

        # Delete the vectors after we're done to save disk space on CI
        if os.environ.get("TRAVIS") == "true":
            zip_file = os.path.join(self.project_root, ".vector_cache",
                                    "glove.twitter.27B.zip")
            conditional_remove(zip_file)
            for dim in ["25", "50", "100", "200"]:
                conditional_remove(
                    os.path.join(self.project_root, ".vector_cache",
                                 "glove.twitter.27B.{}d.txt".format(dim)))
Example #3
    def __init__(self,
                 data_path,
                 alphabet_path,
                 is_labeled=True,
                 l0=501,
                 l1=131,
                 max_samples=None,
                 word_emb_name="twitter.27B",
                 word_emb_dim=200,
                 vector_cache_path=None):
        """A dataset object whose samples consist of *both*
            - the (padded) concatenation of the word vectors of a tweet, and
            - the per-character one-hot encoding of the same tweet.

        Arguments:
            data_path: path of (label and) data file in csv.
            alphabet_path: path of alphabet json file.
            is_labeled: whether the data_path file contains labels, or only the tweets.
            l0: max length of a sample, in number of characters.
            l1: max length of a sample, in number of words.
            max_samples: (for development) keep only the first max_samples samples of the data.

            word_emb_name: name of the word embedding to use, used by torchtext.GloVe.
            word_emb_dim: dimension of the word embedding to use, used by torchtext.GloVe.
            vector_cache_path: path to cache directory, used by torchtext.GloVe.
        """
        self.glove = GloVe(name=word_emb_name,
                           dim=word_emb_dim,
                           cache=vector_cache_path)
        print("loaded pretrained GloVe word-embeddings.")
        self.data_path = data_path
        self.alphabet_path = alphabet_path
        self.is_labeled = is_labeled
        self.l0 = l0
        self.l1 = l1
        with open(alphabet_path) as f:
            self.alphabet = ''.join(json.load(f))
        self.raw_nb_feats = len(self.alphabet)
        self.pro_nb_feats = word_emb_dim
        # TODO: setting max_samples only makes sense if the csv itself was shuffled
        # X_txt = pd.read_csv(data_path, nrows=max_samples) # only keep max_samples first samples, or keep all if None
        X_txt = pd.read_csv(data_path)
        if max_samples:
            assert is_labeled, "must not use `max_samples` for unlabeled (assumed test-) data, as shuffling would modify the samples' ordering"
            X_txt = X_txt.sample(frac=1).reset_index(
                drop=True
            ).iloc[:max_samples]  # shuffle then select max_samples first
        self.y = X_txt['label'].to_numpy().astype(
            np.integer, copy=False) if is_labeled else None
        self.X_pro = X_txt['preprocessed_segmented_tweet'].to_numpy()
        self.X_raw = X_txt['raw_tweet'].to_numpy()
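The `__init__` above only loads the raw tweets and the GloVe vectors; the two per-sample representations described in the docstring (padded word vectors plus a per-character one-hot encoding) would typically be assembled later, e.g. in a `__getitem__` that is not shown here. A minimal, hypothetical sketch of those two encodings, assuming whitespace tokenization and the same `l0`/`l1` conventions:

import numpy as np
import torch

def one_hot_chars(text, alphabet, l0):
    # Per-character one-hot encoding, truncated or zero-padded to l0 characters.
    enc = np.zeros((l0, len(alphabet)), dtype=np.float32)
    for pos, ch in enumerate(text[:l0]):
        idx = alphabet.find(ch)
        if idx >= 0:
            enc[pos, idx] = 1.0
    return torch.from_numpy(enc)

def word_vectors(text, glove, l1):
    # Padded matrix of the GloVe vectors of the first l1 whitespace tokens.
    out = torch.zeros(l1, glove.dim)
    tokens = text.split()[:l1]
    if tokens:
        out[:len(tokens)] = glove.get_vecs_by_tokens(tokens, lower_case_backup=True)
    return out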
Example #4
    def _process_movie_fea(self):
        """

        Parameters
        ----------
        movie_info : pd.DataFrame
        name :  str

        Returns
        -------
        movie_features : np.ndarray
            Generate movie features by concatenating embedding and the year

        """
        if self._name == 'ml-100k':
            GENRES = GENRES_ML_100K
        elif self._name == 'ml-1m':
            GENRES = GENRES_ML_1M
        elif self._name == 'ml-10m':
            GENRES = GENRES_ML_10M
        else:
            raise NotImplementedError

        TEXT = data.Field(tokenize='spacy')
        embedding = GloVe(name='840B', dim=300)

        title_embedding = np.zeros(shape=(self.movie_info.shape[0], 300),
                                   dtype=np.float32)
        release_years = np.zeros(shape=(self.movie_info.shape[0], 1),
                                 dtype=np.float32)
        p = re.compile(r'(.+)\s*\((\d+)\)')
        for i, title in enumerate(self.movie_info['title']):
            match_res = p.match(title)
            if match_res is None:
                print('{} cannot be matched, index={}, name={}'.format(
                    title, i, self._name))
                title_context, year = title, 1950
            else:
                title_context, year = match_res.groups()
            # Use the average of the GloVe vectors of the title tokens
            title_embedding[i, :] = embedding.get_vecs_by_tokens(
                TEXT.tokenize(title_context)).numpy().mean(axis=0)
            release_years[i] = float(year)
        movie_features = np.concatenate(
            (title_embedding,
             (release_years - 1950.0) / 100.0, self.movie_info[GENRES]),
            axis=1)
        return movie_features
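The key step above is representing a title as the average of the GloVe vectors of its tokens. A minimal sketch of that step in isolation, using the smaller 6B/50d vectors purely to illustrate the call (the example itself uses 840B/300d and the spaCy tokenizer):

from torchtext.vocab import GloVe

emb = GloVe(name='6B', dim=50)
tokens = ['toy', 'story']                                # a tokenized movie title
title_vec = emb.get_vecs_by_tokens(tokens).mean(dim=0)   # average over the tokens
print(title_vec.shape)                                   # torch.Size([50])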
Example #5
    def __init__(self, emb_dim=50, mbsize=32):
        self.TEXT = data.Field(init_token='<start>',
                               eos_token='<eos>',
                               lower=True,
                               tokenize='spacy',
                               fix_length=None)
        self.LABEL = data.Field(sequential=False, unk_token=None)

        # Only take sentences with length <= 15
        f = lambda ex: len(ex.text) <= 15

        train, test = bookreader.BookReader.splits(self.TEXT,
                                                   self.LABEL,
                                                   filter_pred=f)

        self.TEXT.build_vocab(train, vectors=GloVe('6B', dim=emb_dim))
        self.LABEL.build_vocab(train)

        self.n_vocab = len(self.TEXT.vocab.itos)
        self.emb_dim = emb_dim

        self.train_iter, _ = data.BucketIterator.splits((train, test),
                                                        batch_size=mbsize,
                                                        device=-1,
                                                        shuffle=True)
Example #6
def load_TREC_data(batch_size=32, embedding_length=100, fix_length=10):
    # set up fields
    tokenize = lambda x: x.split()
    TEXT = data.Field(sequential=True,
                      tokenize=tokenize,
                      lower=True,
                      include_lengths=True,
                      batch_first=True,
                      fix_length=fix_length)
    # LABEL = data.LabelField()
    LABEL = data.LabelField(dtype=torch.float)

    # make splits for data
    train, test = datasets.TREC.splits(TEXT, LABEL)

    # build the vocabulary
    TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=embedding_length))
    LABEL.build_vocab(train)

    # make iterator for splits
    train_iter, test_iter = data.BucketIterator.splits((train, test),
                                                       batch_size=batch_size,
                                                       device=0)

    word_embeddings = TEXT.vocab.vectors
    vocab_size = len(TEXT.vocab)

    return TEXT, vocab_size, word_embeddings, train_iter, test_iter
Example #7
def load_datasets(test_sen=None):

    tokenize = lambda x: x.split()
    TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length=200)
    LABEL = data.LabelField(dtype=torch.float)
    train = data.TabularDataset(path='/content/query_classifier_data.csv', 
                        format='tsv', 
                        fields=[("question",TEXT),
                                ("label",LABEL)],  
                        skip_header=True)
    train_data, test_data = train.split(random_state=random.getstate())
    TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(train_data)

    word_embeddings = TEXT.vocab.vectors
    print ("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
    print ("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print ("Label Length: " + str(len(LABEL.vocab)))

    # Further splitting of training_data to create new training_data & validation_data
    train_data, valid_data = train_data.split()
    train_iter, valid_iter, test_iter = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=32,
        sort_key=lambda x: len(x.question),
        repeat=False,
        shuffle=True)

    vocab_size = len(TEXT.vocab)

    return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter
Example #8
    def _initialize_glove(self) -> torch.Tensor:
        r"""
        Initialize embeddings of all the tokens in a given
        :class:`~allennlp.data.vocabulary.Vocabulary` by their GloVe vectors.

        Extended Summary
        ----------------
        It is recommended to train an :class:`~updown.models.updown_captioner.UpDownCaptioner` with
        frozen word embeddings when one wishes to perform Constrained Beam Search decoding during
        inference. This is because the constraint words may not appear in caption vocabulary (out of
        domain), and their embeddings will never be updated during training. Initializing with frozen
        GloVe embeddings is helpful, because they capture more meaningful semantics than randomly
        initialized embeddings.

        Returns
        -------
        torch.Tensor
            GloVe Embeddings corresponding to tokens.
        """
        glove = GloVe(name="42B", dim=300)
        glove_vectors = torch.zeros(self._vocabulary.get_vocab_size(), 300)

        for word, i in self._vocabulary.get_token_to_index_vocabulary().items():
            if word in glove.stoi:
                glove_vectors[i] = glove.vectors[glove.stoi[word]]
            elif word != self._pad_index:
                # Initialize by random vector.
                glove_vectors[i] = 2 * torch.randn(300) - 1

        return glove_vectors
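The tensor returned by `_initialize_glove` is typically loaded into an embedding layer that is then kept frozen during training, as the docstring recommends. A minimal sketch of that idea (the captioner's actual wiring is not part of this snippet; the random tensor below merely stands in for the real GloVe rows):

import torch
import torch.nn as nn

vocab_size, emb_dim = 10000, 300
glove_vectors = torch.randn(vocab_size, emb_dim)   # stand-in for _initialize_glove()

# freeze=True keeps the pre-trained rows fixed during training.
embedding = nn.Embedding.from_pretrained(glove_vectors, freeze=True, padding_idx=0)
word_embs = embedding(torch.tensor([[1, 5, 42]]))  # shape: (1, 3, 300)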
Example #9
    def __init__(self, vocab_size, max_len, embedding_size, hidden_size, use_glove,
            sos_id, eos_id, model_name, n_layers=1, rnn_cell='lstm', bidirectional=False,
            input_dropout_p=0, dropout_p=0, use_attention=False
        ):
        super(DecoderRNN, self).__init__(vocab_size, max_len, hidden_size,
                input_dropout_p, dropout_p,
                n_layers, rnn_cell)

        self.bidirectional_encoder = bidirectional   
        self.rnn = self.rnn_cell(embedding_size, hidden_size, n_layers, batch_first=True, dropout=dropout_p, bidirectional=bidirectional)

        self.output_size = vocab_size
        self.max_length = max_len
        self.use_attention = use_attention
        self.eos_id = eos_id
        self.sos_id = sos_id

        self.init_input = None
        self.use_glove = use_glove
        
        if self.use_glove:
            embedding_glove = GloVe(name="twitter.27B", dim=200)
            self.embedding = embedding_glove.vectors
            self.rnn = self.rnn_cell(200, hidden_size, n_layers, batch_first=True, dropout=dropout_p, bidirectional=bidirectional)
        else:          
            self.embedding = nn.Embedding(self.output_size, embedding_size)
        
        if use_attention:
            self.attention = Attention(self.hidden_size, model_name)

        self.out = nn.Linear(self.hidden_size, self.output_size)
Example #10
    def __init__(self, emb_dim=50, mbsize=32, main=True, dataset2=None,
                 **kwargs):
        self.TEXT = data.Field(init_token='<start>', eos_token='<eos>',
                               lower=True, tokenize='spacy', fix_length=None)
        self.LABEL = data.Field(sequential=False, unk_token=None)

        train, test = datasets.IMDB.splits(
            self.TEXT, self.LABEL, filter_pred=utils.filter(6)
        )

        self.train = train

        if main:
            train_datasets = [train.text, dataset2.get_train().text] \
                             if dataset2 else [train]
            self.TEXT.build_vocab(*train_datasets,
                                  vectors=GloVe('6B', dim=emb_dim))
            self.LABEL.build_vocab(train)

            self.n_vocab = len(self.TEXT.vocab.itos)
            print(self.n_vocab)
            self.emb_dim = emb_dim

            self.train_iter, _ = data.BucketIterator.splits(
                (train, test), batch_size=mbsize, device=-1, shuffle=True,
                repeat=True
            )

            self.train_iter = iter(self.train_iter)
Example #11
    def _initialize_glove(self):
        assert self.embedding_size == 300
        glove = GloVe(name="42B", dim=self.embedding_size)

        caption_oov = 0
        glove_caption_tokens = torch.zeros(self._vocabulary.get_vocab_size(),
                                           self.embedding_size)
        for word, i in self._vocabulary.get_token_to_index_vocabulary().items():
            if word in glove.stoi:
                glove_caption_tokens[i] = glove.vectors[glove.stoi[word]]
            else:  # use a random vector instead
                caption_oov += 1
                glove_caption_tokens[i] = 2 * torch.randn(
                    self.embedding_size) - 1
        print("Caption OOV: %d / %d = %.2f" %
              (caption_oov, self.vocab_size,
               100 * caption_oov / self.vocab_size))

        for p in self._output_layer.parameters():
            p.requires_grad = False
        self._output_layer.weight.copy_(glove_caption_tokens)

        for p in self._embedding_layer.parameters():
            p.requires_grad = False
        self._embedding_layer.weight.copy_(glove_caption_tokens)
Example #12
    def iters(cls, batch_size=32, device=-1):
        TEXT = data.Field(include_lengths=True)
        LABEL = data.Field(sequential=False, use_vocab=False)
        ID = data.Field(sequential=False)

        train, val, test = cls.splits(path='.',
                                      train='train.csv',
                                      skip_header=True,
                                      validation='val.csv',
                                      test='dev.csv',
                                      format='csv',
                                      fields=[
                                          ('id', ID), ('project_title', TEXT),
                                          ('project_resource_summary', TEXT),
                                          ('project_essay_1', TEXT),
                                          ('project_essay_2', TEXT),
                                          ('project_is_approved', LABEL)
                                      ])

        #vocab is shared across all the text fields
        #CAUTION: GloVe will download all embeddings locally (862 MB).  If not interested, remove "vectors"
        TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
        ID.build_vocab(train)

        return data.BucketIterator.splits((train, val, test),
                                          batch_size=batch_size,
                                          device=device)
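As the CAUTION above says, GloVe(name='6B', dim=300) downloads the full 862 MB archive on first use. Several other examples here pass a `cache` directory instead, which makes torchtext reuse vectors already on disk; a minimal sketch (the path is an assumption):

from torchtext.vocab import GloVe

vectors = GloVe(name='6B', dim=300, cache='/path/to/.vector_cache')  # reuse a local copy
print(len(vectors.itos), vectors.vectors.shape)                      # vocab size, (vocab, 300)
# Then, as above: TEXT.build_vocab(train, vectors=vectors)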
Example #13
def load_data(path, batch_size, max_seq_length, glove="840B", emb_size=300):
    TEXT = Field(sequential=True, fix_length=max_seq_length, lower=True)
    LABEL = Field(sequential=False, use_vocab=False)
    ID = Field(sequential=False, use_vocab=False)

    data_fields = [("id", ID), ("sent", TEXT), ("label", LABEL)]
    train_path = os.path.join(path, "train.csv")
    train = TabularDataset(path=train_path,
                           format="csv",
                           skip_header=False,
                           fields=data_fields)
    test_path = os.path.join(path, "dev.csv")
    test = TabularDataset(path=test_path,
                          format="csv",
                          skip_header=False,
                          fields=data_fields)

    TEXT.build_vocab(train, vectors=GloVe(name=glove, dim=emb_size))
    LABEL.build_vocab(train)

    vocab_size = len(TEXT.vocab)
    vocab_weights = TEXT.vocab.vectors

    train_iter = BucketIterator(dataset=train,
                                batch_size=batch_size,
                                sort_key=lambda x: x.id,
                                shuffle=True,
                                repeat=False)
    test_iter = BucketIterator(dataset=test,
                               batch_size=batch_size,
                               sort_key=lambda x: x.id,
                               shuffle=False,
                               repeat=False)

    return train_iter, test_iter, vocab_size, vocab_weights
Example #14
def load_imdb(args):
    TEXT = data.Field(lower=True,
                      tokenize=tokenize,
                      batch_first=True,
                      fix_length=args.max_seq_len)
    LABEL = data.LabelField(dtype=torch.long)

    train_data, test_data = datasets.IMDB.splits(TEXT,
                                                 LABEL,
                                                 root=args.data_dir)

    # build a vocabulary
    TEXT.build_vocab(train_data,
                     max_size=args.max_vocab_size - 2,
                     vectors=GloVe(name='6B', dim=args.embedding_size))
    LABEL.build_vocab(train_data)

    train_iter, test_iter = data.BucketIterator.splits(
        (train_data, test_data),
        batch_size=args.batch_size,
        sort_key=lambda x: len(x.text))

    n_token = len(TEXT.vocab)
    n_classes = len(LABEL.vocab)
    print("{} unique tokens in TEXT vocabulary".format(n_token))
    print("{} class labels".format(n_classes))

    return train_iter, test_iter, n_token, n_classes, TEXT.vocab.vectors
Example #15
    def __init__(self, args):
        super(ARC, self).__init__(args)
        self.LABEL = data.Field(sequential=False, unk_token=None, tensor_type=torch.FloatTensor)

        self.train, self.dev, self.test = data.TabularDataset.splits(
            path='.data/arc/preprocessed/single',
            train='train.txt',
            validation='dev.txt',
            test='test.txt',
            format='tsv',
            skip_header=True,
            fields=[('id', self.RAW),
                    ('warrant', self.TEXT),
                    ('label', self.LABEL),
                    ('reason', self.TEXT),
                    ('claim', self.TEXT),
                    ('debateTitle', self.TEXT),
                    ('debateInfo', self.TEXT)])

        self.TEXT.build_vocab(self.train, self.dev, self.test, vectors=GloVe(name='840B', dim=300))
        self.LABEL.build_vocab(self.train)

        self.sort_key = lambda x: len(x.warrant) + len(x.reason) + len(x.claim)
        self.train_iter, self.dev_iter, self.test_iter = data.Iterator.splits(
            (self.train, self.dev, self.test),
            batch_sizes=[self.args.batch_size, 256, 256],
            device=self.args.gpu,
            sort_key=self.sort_key)

        self.dev_iter.sort = False
        self.dev_iter.sort_within_batch = False
        self.test_iter.sort = False
        self.test_iter.sort_within_batch = False
Example #16
    def __init__(self, emb_dim=50, mbsize=32):
        # self.TEXT = data.Field(tokenize = 'spacy', include_lengths = True)
        # self.LABEL = data.LabelField(dtype = torch.float)

        self.TEXT = data.Field(init_token='<start>', eos_token='<eos>', lower=True, tokenize='spacy', fix_length=16)
        self.LABEL = data.Field(sequential=False, unk_token=None)

        # Filter predicate: sentences with length <= 15 and a non-neutral label
        # (defined here but not passed to IMDB.splits below)
        f = lambda ex: len(ex.text) <= 15 and ex.label != 'neutral'

        train, test = datasets.IMDB.splits(
            self.TEXT, self.LABEL
        )

        self.TEXT.build_vocab(train, vectors=GloVe('6B', dim=emb_dim))
        self.LABEL.build_vocab(train)

        self.n_vocab = len(self.TEXT.vocab.itos)
        self.emb_dim = emb_dim

        self.train_iter, self.test_iter = data.BucketIterator.splits(
            (train, test), batch_size=mbsize, device=-1,
            shuffle=True, repeat=True
        )
        self.train_iter = iter(self.train_iter)
        self.test_iter = iter(self.test_iter)
Example #17
def load_data(batch_size=16, embedding_length=100):
    tokenize = lambda x: x.split()
    TEXT = data.Field(sequential=True,
                      tokenize=tokenize,
                      lower=True,
                      include_lengths=True,
                      batch_first=True,
                      fix_length=30)
    LABELS = data.LabelField(batch_first=True, dtype=torch.float)

    train, val, test = data.TabularDataset.splits(
        path='app/data/sentiment_data/',
        train='train.tsv',
        validation='dev.tsv',
        test='test.tsv',
        format='tsv',
        fields=[('text', TEXT), ('labels', LABELS)])

    # train_iter, val_iter, test_iter = data.BucketIterator.splits(
    #   (train, val, test), batch_sizes=(batch_size, batch_size, batch_size), sort_key=lambda x: len(x.text), device=0)

    # # build the vocabulary
    TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=embedding_length))
    # LABELS.build_vocab(train)
    # print(LABELS.vocab.__dict__)

    # word_embeddings = TEXT.vocab.vectors
    # vocab_size = len(TEXT.vocab)

    return TEXT
Example #18
def load_dataset(test_sen=None):
    """
    tokenizer : Breaks sentences into a list of words. If sequential=False, no tokenization is applied
    Field : A class that stores information about the way of preprocessing
    fix_length : An important property of TorchText is that we can let the input to be variable length, and TorchText will
                 dynamically pad each sequence to the longest sequence in that "batch". But here we are using fi_length which
                 will pad each sequence to have a fix length of 40.
                 
    build_vocab : It will first make a vocabulary or dictionary mapping all the unique words present in the train_data to an
                  idx and then after it will use GloVe word embedding to map the index to the corresponding word embedding.
                  
    vocab.vectors : This returns a torch tensor of shape (vocab_size x embedding_dim) containing the pre-trained word embeddings.
    BucketIterator : Defines an iterator that batches examples of similar lengths together to minimize the amount of padding needed.
    
    """

    #     tokenize = lambda x: x.split()
    TEXT = data.Field(sequential=True,
                      tokenize=tokenize_en,
                      lower=True,
                      include_lengths=True,
                      batch_first=True,
                      fix_length=40)
    LABEL = data.LabelField(dtype=torch.float)
    fields = [(None, None), (None, None), ('text', TEXT), ('label', LABEL)]
    train_data, valid_data, test_data = data.TabularDataset.splits(
        path='',
        train='V1.4_Training_original.csv',
        validation='SubtaskA_Trial_Test_Labeled - Copy.csv',
        test='SubtaskA_EvaluationData_labeled.csv',
        #                                         train = 'train_spacy.csv',
        #                                         validation = 'valid_spacy.csv',
        #                                         test = 'test_spacy.csv',
        #                                         #sort_key=lambda x: len(x.Text),
        format='csv',
        fields=fields,
        skip_header=True)
    print(vars(train_data[0]))
    TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=100))
    LABEL.build_vocab(train_data)

    word_embeddings = TEXT.vocab.vectors
    print("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
    print("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print("Label Length: " + str(len(LABEL.vocab)))

    #     train_data, valid_data = train_data.split() # Further splitting of training_data to create new training_data & validation_data
    train_iter, valid_iter, test_iter = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=64,
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=True,
        device=device)
    '''Alternatively we can also use the default configurations'''
    # train_iter, test_iter = datasets.IMDB.iters(batch_size=32)

    vocab_size = len(TEXT.vocab)

    return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter
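A minimal sketch of how the returned objects are usually consumed; only the unpacking below follows from the snippet (include_lengths=True yields (ids, lengths) pairs and LabelField(dtype=torch.float) yields float labels), while the model call is hypothetical:

TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter = load_dataset()

for batch in train_iter:
    text, text_lengths = batch.text        # padded token ids and the true lengths
    labels = batch.label                   # float labels
    # logits = model(text, text_lengths)   # hypothetical model forward pass
    break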
Example #19
    def __init__(self, args):

        super().__init__(args)

        fix_length = args.max_sent_len if args.max_sent_len >= 0 else None

        self.TEXT = data.Field(batch_first=True,
                               init_token="<s>",
                               eos_token="</s>",
                               preprocessing=preprocessor,
                               fix_length=fix_length,
                               include_lengths=True,
                               tokenize="spacy")
        self.LABEL = data.Field(sequential=False, unk_token=None)

        self.train, self.dev, self.test = datasets.SNLI.splits(
            self.TEXT, self.LABEL)

        self.TEXT.build_vocab(self.train,
                              self.dev,
                              self.test,
                              vectors=GloVe(name='840B', dim=300))
        self.build_char_vocab()
        self.LABEL.build_vocab(self.train)

        self.train_iter, self.dev_iter, self.test_iter = \
            data.BucketIterator.splits((self.train, self.dev, self.test),
                                       batch_sizes=[args.batch_size] * 3,
                                       device=torch.device('cuda', args.gpu) if args.gpu >= 0 else torch.device('cpu'),
                                       repeat=False)
Example #20
    def __init__(self, batch_size, word_dim):
        self.RAW = data.RawField()
        self.TEXT = data.Field(batch_first=True)
        self.LABEL = data.Field(sequential=False, unk_token=None)

        self.train, self.dev, self.test = data.TabularDataset.splits(
            path='.data/quora',
            train='train.tsv',
            validation='dev.tsv',
            test='test.tsv',
            format='tsv',
            fields=[('label', self.LABEL), ('q1', self.TEXT),
                    ('q2', self.TEXT), ('id', self.RAW)])

        self.TEXT.build_vocab(self.train,
                              self.dev,
                              self.test,
                              vectors=GloVe(name='6B', dim=word_dim))
        self.LABEL.build_vocab(self.train)

        sort_key = lambda x: data.interleave_keys(len(x.q1), len(x.q2))

        self.train_iter, self.dev_iter, self.test_iter = \
            data.BucketIterator.splits((self.train, self.dev, self.test),
                                       device=-1,
                                       batch_sizes=[batch_size] * 3,
                                       sort_key=sort_key)

        self.max_word_len = max([len(w) for w in self.TEXT.vocab.itos])
Example #21
    def __init__(self, emb_dim=50, mbsize=32):
        self.TEXT = data.Field(init_token='<start>',
                               eos_token='<eos>',
                               lower=True,
                               tokenize='spacy',
                               fix_length=16)
        self.LABEL = data.Field(sequential=False, unk_token=None)

        # Only take sentences with length <= 15
        f = lambda ex: len(ex.text) <= 15 and ex.label != 'neutral'

        train, val, test = datasets.SST.splits(self.TEXT,
                                               self.LABEL,
                                               fine_grained=False,
                                               train_subtrees=False,
                                               filter_pred=f)

        self.TEXT.build_vocab(train, vectors=GloVe('6B', dim=emb_dim))
        self.LABEL.build_vocab(train)

        self.n_vocab = len(self.TEXT.vocab.itos)
        self.emb_dim = emb_dim

        self.train_iter, self.val_iter, _ = data.BucketIterator.splits(
            (train, val, test),
            batch_size=mbsize,
            device=-1,
            shuffle=True,
            repeat=True)
        self.train_iter = iter(self.train_iter)
        self.val_iter = iter(self.val_iter)
Example #22
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', default=None, type=str, required=True)
    parser.add_argument('--w2v_path', default=None, type=str, required=True)
    parser.add_argument('--labels', default=None, type=str, required=True)
    parser.add_argument('--batch_size', default=32, type=int)
    parser.add_argument('--epochs', default=3, type=int)
    parser.add_argument('--logging_steps', default=20, type=int)
    parser.add_argument('--learning_rate', default=5e-3, type=float)
    args = parser.parse_args()

    args.device = torch.device('cuda')

    labels = get_labels(args.labels)
    glove = GloVe(cache=args.w2v_path)

    # model
    model = LstmCrf(w2v=glove, num_tags=len(labels), hidden_dim=512)
    model.to(args.device)

    # dataset
    train_dataset = NerDataset(args.data_dir, labels, glove, mode='train')
    eval_dataset = NerDataset(args.data_dir, labels, glove, mode='dev')

    # train
    train(args, model, train_dataset)

    # eval
    result = eval(args, model, eval_dataset, labels)

    print(result)
Example #23
    def vocab_builder(self):
        #self.eid_field = Field(sequential=False,tokenize)

        print('Build Vocabulary')
        tokenize = BiGraphTextDataset.tokenize_text
        TEXT = Field(sequential=True,
                     tokenize=tokenize,
                     lower=True,
                     include_lengths=True,
                     batch_first=True,
                     fix_length=35,
                     use_vocab=True)

        datafields = [('eid', None), ('idxP', None), ('idxC', None),
                      ('MaxDegree', None), ('MaxL', None), ('text', TEXT)]
        path = '/data1/home2/AgainstRumor/data/Pheme/data.text.txt'
        train_data = TabularDataset(path=path,
                                    format='tsv',
                                    skip_header=False,
                                    fields=datafields)
        TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))

        #train_iter = BucketIterator(train_data, batch_size=32, sort_key=lambda x: len(x.text), repeat=False, shuffle=True)
        self.stoi_dict = TEXT.vocab.stoi
        self.vocab_vectors = TEXT.vocab.vectors
Example #24
def get_all_vectors(pretrained_model):
    emb_vectors = []

    if pretrained_model == "":
        return emb_vectors

    emb_vector_names = pretrained_model.split(",")
    for emb_vector_name in emb_vector_names:
        emb_info = emb_vector_name.split("_")
        if len(emb_info) == 3:
            emb_name, emb_set, emb_size = emb_info[0], emb_info[1], emb_info[2]
        else:
            emb_name, emb_set = emb_info[0], emb_info[1]

        if emb_name == "glove":  # glove_640B_300
            print("glove")
            emb_vectors.append(GloVe(name=emb_set, dim=emb_size))
        elif emb_name == "fasttext":
            if emb_set == "subwordcc":  # fasttext_subwordcc
                print("fasttext_subwordcc")
                emb_vectors.append(FastTextSubwordCC())
            elif emb_set == "wiki":  # fasttext_wiki_en
                print("fasttext_wiki")
                emb_vectors.append(FastText(language=emb_size))
            elif emb_set == "cc":  # fasttext_cc_en
                print("fasttext_cc")
                emb_vectors.append(FastTextCC(language=emb_size))
        elif emb_name == "char":  # char_ngram
            if emb_set == "ngram":
                print("char_ngram")
                emb_vectors.append(CharNGram())
    return emb_vectors
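A minimal usage sketch of the spec format the function parses, "<name>_<set>[_<dim>]" with several specs separated by commas (FastTextSubwordCC and FastTextCC are wrappers defined elsewhere in this code base; the spec below only exercises GloVe and CharNGram):

vectors = get_all_vectors("glove_6B_100,char_ngram")
print([type(v).__name__ for v in vectors])   # ['GloVe', 'CharNGram']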
Example #25
    def __init__(self, path, text_field, label_field, pad_length, eval):
        """Create an IMDB dataset instance given a path and fields.

        Arguments:
            path: Path to the dataset's highest level directory
            text_field: The field that will be used for text data.
            label_field: The field that will be used for label data.
            Remaining keyword arguments: Passed to the constructor of
                data.Dataset.
        """

        self.eval = eval
        self.dataset = []

        dataframe = pd.read_csv(path, sep="\t", encoding="utf-8")

        for i in range(dataframe.shape[0]):
            self.dataset.append({
                'text': text_field.preprocess(dataframe['text'][i]),
                'label': dataframe['label'][i]
            })

        if not self.eval:
            text_field.build_vocab(
                [t['text'] for t in self.dataset],
                vectors=GloVe(name='6B', dim=args.embedding_dims)
            )
            label_field.build_vocab(t['label'] for t in self.dataset)

        self.TEXT = text_field
        self.LABEL = label_field
        self.pad_length = pad_length
Example #26
def get_data():
    # set up fields
    TEXT = data.Field(lower=True,
                      include_lengths=True,
                      batch_first=True,
                      tokenize='spacy')
    LABEL = data.Field(sequential=False)

    # make splits for data
    print("Accessing raw input and preprocessing")
    train, val, test = datasets.SNLI.splits(TEXT,
                                            LABEL,
                                            root='.data',
                                            train='snli_1.0_train.jsonl',
                                            validation='snli_1.0_dev.jsonl',
                                            test='snli_1.0_test.jsonl')
    print("done")

    # build the vocabulary
    print("Building vocabulary with GloVe")
    TEXT.build_vocab(train, vectors=GloVe(name='840B', dim=300))
    LABEL.build_vocab(train)
    print("done")

    # make iterator for splits
    print("Loading data into iterables")
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test), batch_size=64, device="cuda")
    print("done, returning data")
    # TEXT contains metadata (the vocab and vectors), so return it along with LABEL
    return train_iter, val_iter, test_iter, TEXT, LABEL
Example #27
def imdb_detail_get():
    # set up fields
    TEXT = data.Field(lower=True,
                      include_lengths=True,
                      batch_first=True,
                      tokenize=(lambda x: x))
    # TEXT = data.Field(sequential=False)
    LABEL = data.Field(sequential=False)

    # make splits for data
    train, test = datasets.IMDB.splits(TEXT, LABEL)

    # print information about the data
    print('>>> train.fields', train.fields)
    print('>>> len(train)', len(train))
    print('>>> vars(train[0])', vars(train[0]))
    print('>>> vars(test[0])', vars(test[0]))

    # build the vocabulary
    TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(train)

    # print vocab information
    print('>>> len(TEXT.vocab)', len(TEXT.vocab))
    print('>>> TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

    return train, test
Example #28
def get_dataset(train_df, valid_df, batch_size, cache=None, gpus=1, vectors=None):
    TEXT = data.Field(init_token='<START>', eos_token='<END>', tokenize=None, tokenizer_language='en',
                      batch_first=True, lower=True, stop_words=set(string.punctuation))
    LABEL = data.Field(dtype=torch.float, is_target=True, unk_token=None, sequential=False, use_vocab=False)

    train_dataset = DataFrameDataset(train_df, {
        'text': TEXT,
        'label': LABEL
    })

    val_dataset = DataFrameDataset(valid_df, {
        'text': TEXT,
        'label': LABEL
    })

    train_loader, val_loader = BucketIterator.splits(
        (train_dataset, val_dataset),
        batch_size=batch_size,
        sort_key=lambda x: len(x.text),
        device='cuda' if torch.cuda.is_available() and gpus else 'cpu'
    )

    embeddings = vectors if vectors is not None else GloVe('42B', cache=cache)
    TEXT.build_vocab(train_dataset.text, vectors=embeddings)

    return TEXT, LABEL, train_loader, val_loader
Example #29
    def __init__(self, args):
        self.TEXT = data.Field(batch_first=True,
                               tokenize=word_tokenize,
                               lower=True)
        self.LABEL = data.Field(sequential=False, unk_token=None)

        self.train, self.dev, self.test = datasets.SNLI.splits(
            self.TEXT, self.LABEL)

        self.TEXT.build_vocab(self.train,
                              self.dev,
                              self.test,
                              vectors=GloVe(name='840B', dim=300))
        self.LABEL.build_vocab(self.train)

        self.train_iter, self.dev_iter, self.test_iter = \
            data.BucketIterator.splits((self.train, self.dev, self.test),
                                       batch_sizes=[args.batch_size] * 3,
                                       device=args.gpu)

        self.max_word_len = max([len(w) for w in self.TEXT.vocab.itos])
        # for <pad>
        self.char_vocab = {'': 0}
        # for <unk> and <pad>
        self.characterized_words = [[0] * self.max_word_len,
                                    [0] * self.max_word_len]

        if args.use_char_emb:
            self.build_char_vocab()
Example #30
    def _data_reader(self):

        if self.datatype == "test":
            dataset = data.TabularDataset(path=self.data_path,
                                          format='csv',
                                          fields=[('label', self.LABEL),
                                                  ('text', self.TEXT)])
            self.LABEL.build_vocab(dataset)
            self.itol = self.LABEL.vocab.itos
        elif self.datatype == "train":
            dataset = data.TabularDataset(path=self.data_path,
                                          format='csv',
                                          fields=[('text', self.TEXT)])
        else:
            raise Exception("datatype other than train or test...")

        self.TEXT.build_vocab(dataset,
                              vectors=GloVe(name=self.glove,
                                            dim=self.embed_dim,
                                            cache=self.vectors_path))

        self.dataset = dataset
        self.data_size = len(dataset)
        self.embed_matrix = self.TEXT.vocab.vectors
        self.word_size = len(self.embed_matrix)
        self.stoi = self.TEXT.vocab.stoi
        self.itos = self.TEXT.vocab.itos