Example #1
def get_all_vectors(pretrained_model):
    emb_vectors = []

    if pretrained_model == "":
        return emb_vectors

    emb_vector_names = pretrained_model.split(",")
    for emb_vector_name in emb_vector_names:
        emb_info = emb_vector_name.split("_")
        if len(emb_info) == 3:
            emb_name, emb_set, emb_size = emb_info[0], emb_info[1], emb_info[2]
        else:
            emb_name, emb_set = emb_info[0], emb_info[1]

        if emb_name == "glove":  # glove_640B_300
            print("glove")
            emb_vectors.append(GloVe(name=emb_set, dim=emb_size))
        elif emb_name == "fasttext":
            if emb_set == "subwordcc":  # fasttext_subwordcc
                print("fasttext_subwordcc")
                emb_vectors.append(FastTextSubwordCC())
            elif emb_set == "wiki":  # fasttext_wiki_en
                print("fasttext_wiki")
                emb_vectors.append(FastText(language=emb_size))
            elif emb_set == "cc":  # fasttext_cc_en
                print("fasttext_cc")
                emb_vectors.append(FastTextCC(language=emb_size))
        elif emb_name == "char":  # char_ngram
            if emb_set == "ngram":
                print("char_ngram")
                emb_vectors.append(CharNGram())
    return emb_vectors
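
A minimal usage sketch for the helper above, assuming it is importable, that GloVe, FastText and CharNGram come from torchtext.vocab, and that FastTextSubwordCC and FastTextCC resolve to project-local wrapper classes; the spec string is hypothetical:

# Hypothetical spec: GloVe 840B/300d plus character n-grams.
vectors = get_all_vectors("glove_840B_300,char_ngram")
print(len(vectors))  # -> 2 vector objects, ready to pass as `vectors=` when building a vocab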
Example #2
def main(params):
    # build dataset
    train_data = pd.read_csv('./data/train_final.csv')
    tokenizer = get_tokenizer('spacy', language='en')

    if params.emb_type == "GloVe":
        embedding = GloVe(
            name=params.emb_data, dim=params.emb_dim
        )  # use glove embedding with default option(name='840B', dim=300)
    elif params.emb_type == "CharNGram":
        embedding = CharNGram()
    elif params.emb_type == "FastText":
        embedding = FastText(name=params.emb_data, dim=params.emb_dim)
    else:
        print("Wrong embedding type")
        exit()

    train_data, val_data = train_data[1000:], train_data[:1000]
    train_dataset = SentimentDataset(train_data, tokenizer, embedding)
    val_dataset = SentimentDataset(val_data, tokenizer, embedding)

    # batch_size is assumed to be defined elsewhere (e.g. params.batch_size).
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True)
    val_dataloader = DataLoader(dataset=val_dataset,
                                batch_size=batch_size,
                                shuffle=False)

    model = SentimentClassificationModel(params.emb_dim, params.hidden_dim,
                                         params.dropout).to(device)
    crit = nn.CrossEntropyLoss().to(device)
    optim = torch.optim.Adam(params=model.parameters(), lr=1e-3)

    best_val_acc = 0
    early_stop_cnt = 0
    epoch = 0
    train_loss_list = []
    train_acc_list = []
    val_acc_list = []
    while early_stop_cnt != 5:
        loss_list, train_acc = train.trainer(epoch, model, train_dataloader,
                                             crit, optim, device)
        val_acc = train.eval(epoch, model, val_dataloader, device, False)
        if val_acc > best_val_acc and epoch > 0:
            torch.save(model.state_dict(), './model/lstm_best.pt')
            best_val_acc = val_acc
            early_stop_cnt = 0

        early_stop_cnt += 1
        epoch += 1
        train_loss_list.extend(loss_list)
        train_acc_list.append(train_acc)
        val_acc_list.append(val_acc)

    print("Early stopping condition satisfied")
    plotting("train_loss", "steps", "loss", train_loss_list)
    plotting("train_accuracy", "epoch", "accuracy", train_acc_list)
    plotting('validation_accuracy', "epoch", "accuracy", val_acc_list)
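
The script above relies on a plotting helper that is not shown. A minimal sketch of what it could look like, assuming the (title, xlabel, ylabel, values) signature implied by the calls above and matplotlib as the backend; the output path is an assumption:

import matplotlib.pyplot as plt

def plotting(title, xlabel, ylabel, values):
    # Plot a single metric curve and save it to disk (filename is an assumption).
    plt.figure()
    plt.plot(values)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.savefig('{}.png'.format(title))
    plt.close()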
Example #3
    def get_vectors(self, vocab):
        sources = None
        if self.wordvec_source == 'glove':
            sources = ['GloVe']
        elif self.wordvec_source == 'charlevel':
            sources = ['GloVe', 'charLevel']
        elif self.wordvec_source == 'google':
            sources = ['googlenews']
        elif self.wordvec_source == 'gigavec':
            sources = ['gigavec']
        else:
            sources = []

        print('Building Vocab...')

        if isinstance(vocab, Vocab):
            print("Using Pretrained Vocab")
            self.sentence_field.vocab = vocab
            print(len(self.sentence_field.vocab.itos))
        else:
            print('No pretrained vocab supplied; building one from scratch')
            vecs = []
            print('Loading Vectors From Memory...')
            if self.pretrained_vecs:
                print('Using these vectors: ' + str(self.wordvec_source))
                for source in sources:
                    if source == 'GloVe':
                        glove = Vectors(name='glove.6B.{}d.txt'.format(
                            self.glove_dim),
                                        cache=self.vector_cache)
                        vecs.append(glove)
                        self.wordvec_dim += self.glove_dim
                    if source == 'charLevel':
                        charVec = CharNGram()
                        vecs.append(charVec)
                        self.wordvec_dim += 100
                    if source == 'googlenews':
                        googlenews = Vectors(name='googlenews.txt',
                                             cache=self.vector_cache)
                        vecs.append(googlenews)
                        self.wordvec_dim += 300
                    if source == 'gigavec':
                        gigavec = Vectors(name='gigamodel.vec',
                                          cache=self.vector_cache)
                        vecs.append(gigavec)
                        self.wordvec_dim += 300

            if isinstance(vocab, Counter):
                self.sentence_field.vocab = Vocab(vocab,
                                                  vectors=vecs,
                                                  max_size=self.max_vocab)
            else:
                self.sentence_field.build_vocab(self.train_sentences,
                                                vectors=vecs,
                                                max_size=self.max_vocab,
                                                min_freq=MIN_FREQ)
                print('Found {} tokens'.format(len(self.sentence_field.vocab)))

        if self.tie_weights:
            self.hidden_size = self.wordvec_dim
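
Passing several pretrained Vectors objects to a torchtext Vocab concatenates them per token, which is what the wordvec_dim bookkeeping above tracks. A small sketch against the legacy torchtext.vocab API used throughout these examples:

from collections import Counter
from torchtext.vocab import Vocab, GloVe, CharNGram

# GloVe 6B.100d (100 dims) + CharNGram (100 dims) -> 200-dimensional rows.
counter = Counter({'hello': 3, 'world': 2})
v = Vocab(counter, vectors=[GloVe(name='6B', dim=100), CharNGram()])
print(v.vectors.size())  # torch.Size([len(v), 200])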
Example #4
    def get_vectors(self):
        if self.glove:
            print('Downloading GloVe Vectors...')
            glove = GloVe(name='6B', cache='vectors')
            print('Done.')

        if self.charngram:
            print('Downloading CharNGram Vectors...')
            charVec = CharNGram(cache='vectors')
            print('Done.')
Example #5
    def test_vocab_download_charngram_vectors(self):
        c = Counter({
            'hello': 4,
            'world': 3,
            'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5,
            'freq_too_low': 2
        })
        # Build a vocab and get vectors twice to test caching, then once more
        # to test string aliases.
        for i in range(3):
            if i == 2:
                vectors = "charngram.100d"
            else:
                vectors = CharNGram()
            v = vocab.Vocab(c,
                            min_freq=3,
                            specials=['<unk>', '<pad>', '<bos>'],
                            vectors=vectors)
            expected_itos = [
                '<unk>', '<pad>', '<bos>', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world'
            ]
            expected_stoi = {x: index for index, x in enumerate(expected_itos)}
            self.assertEqual(v.itos, expected_itos)
            self.assertEqual(dict(v.stoi), expected_stoi)
            vectors = v.vectors.numpy()

            # The first 5 entries in each vector.
            expected_charngram = {
                'hello': [
                    -0.44782442, -0.08937783, -0.34227219, -0.16233221,
                    -0.39343098
                ],
                'world': [
                    -0.29590717, -0.05275926, -0.37334684, 0.27117205,
                    -0.3868292
                ],
            }

            for word in expected_charngram:
                assert_allclose(vectors[v.stoi[word], :5],
                                expected_charngram[word])

            assert_allclose(vectors[v.stoi['<unk>']], np.zeros(100))
            # 'OOV token' is not in the vocab, so stoi falls back to the <unk> index.
            assert_allclose(vectors[v.stoi['OOV token']], np.zeros(100))
        # Delete the vectors after we're done to save disk space on CI
        if os.environ.get("TRAVIS") == "true":
            conditional_remove(
                os.path.join(self.project_root, ".vector_cache",
                             "charNgram.txt"))
            conditional_remove(
                os.path.join(self.project_root, ".vector_cache",
                             "jmt_pre-trained_embeddings.tar.gz"))
Example #6
print('train.fields', train.fields)
print('len(train)', len(train))
print('vars(train[0])', vars(train[0]))

# build the vocabulary
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
LABEL.build_vocab(train)

# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

# make iterator for splits
train_iter, test_iter = data.BucketIterator.splits((train, test), batch_size=3)

# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)

# Approach 2:
TEXT.build_vocab(train, vectors=[GloVe(name='840B', dim='300'), CharNGram()])
LABEL.build_vocab(train)

train_iter, test_iter = datasets.TREC.iters(batch_size=4)

# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)
Example #7
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

# make iterator for splits
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train, val, test), batch_size=3)

# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)

# Approach 2:
TEXT.build_vocab(
    train, vectors=[GloVe(name='840B', dim='300'),
                    CharNGram(),
                    FastText()])
LABEL.build_vocab(train)

# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

train_iter, val_iter, test_iter = datasets.SST.iters(batch_size=4)

# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)

# Approach 3:
Example #8
def nyt_ingredients_ner_dataset(batch_size,
                                use_local=False,
                                root='.data/nyt_ingredients_ner',
                                train_file='train.txt',
                                validation_file='valid.txt',
                                test_file='test.txt',
                                convert_digits=True):
    """
    nyt_ingredients_ner: New York Times Ingredient tagging dataset
    Extract NYT ingredients dataset using torchtext. Applies GloVe 6B.200d and Char N-gram
    pretrained vectors. Also sets up per word character Field
    Parameters:
        batch_size: Batch size to return from iterator
        use_local: If True use local provided files (default False)
        root: Dataset root directory
        train_file: Train filename
        validation_file: Validation filename
        test_file: Test filename
        convert_digits: If True will convert numbers to single 0's

    Returns:
        A dict containing:
            task: 'nyt_ingredients.ner'
            iters: (train iter, validation iter, test iter)
            vocabs: (Inputs word vocabulary, Inputs character vocabulary, 
                    Tag vocabulary )
    """

    # Setup fields with batch dimension first
    inputs_word = data.Field(
        init_token="<bos>",
        eos_token="<eos>",
        batch_first=True,
        lower=True,
        preprocessing=data.Pipeline(lambda w: '0'
                                    if convert_digits and w.isdigit() else w))

    inputs_char_nesting = data.Field(tokenize=list,
                                     init_token="<bos>",
                                     eos_token="<eos>",
                                     batch_first=True)

    inputs_char = data.NestedField(inputs_char_nesting,
                                   init_token="<bos>",
                                   eos_token="<eos>")

    labels = data.Field(init_token="<bos>",
                        eos_token="<eos>",
                        batch_first=True)

    fields = ([(('inputs_word', 'inputs_char'), (inputs_word, inputs_char)),
               ('labels', labels)])

    # Load the data
    if use_local:
        train, val, test = SequenceTaggingDataset.splits(
            path=root,
            train=train_file,
            validation=validation_file,
            test=test_file,
            fields=tuple(fields))
    else:
        train, val, test = Ingredients.splits(fields=tuple(fields))

    logger.info('---------- NYT INGREDIENTS NER ---------')
    logger.info('Train size: %d' % (len(train)))
    logger.info('Validation size: %d' % (len(val)))
    logger.info('Test size: %d' % (len(test)))

    # Build vocab
    inputs_char.build_vocab(train.inputs_char, val.inputs_char,
                            test.inputs_char)
    inputs_word.build_vocab(train.inputs_word,
                            val.inputs_word,
                            test.inputs_word,
                            max_size=50000,
                            vectors=[GloVe(name='6B', dim='200'),
                                     CharNGram()])

    labels.build_vocab(train.labels)
    logger.info('Input vocab size:%d' % (len(inputs_word.vocab)))
    logger.info('Tagset size: %d' % (len(labels.vocab)))

    # Get iterators
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test),
        batch_size=batch_size,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))
    train_iter.repeat = False

    return {
        'task': 'nyt_ingredients.ner',
        'iters': (train_iter, val_iter, test_iter),
        'vocabs': (inputs_word.vocab, inputs_char.vocab, labels.vocab)
    }
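
A short usage sketch for the loader above; the batch size and the unpacking are illustrative, not from the source:

data_dict = nyt_ingredients_ner_dataset(batch_size=32)
train_iter, val_iter, test_iter = data_dict['iters']
word_vocab, char_vocab, tag_vocab = data_dict['vocabs']
batch = next(iter(train_iter))
# With batch_first=True: batch.inputs_word is [batch, seq_len],
# batch.inputs_char is [batch, seq_len, max_word_len], batch.labels is [batch, seq_len].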
Example #9
def conll2003_dataset(tag_type, batch_size, root='./conll2003',
                      train_file='eng.train.txt',
                      validation_file='eng.testa.txt',
                      test_file='eng.testb.txt',
                      convert_digits=True):
    """
    conll2003: Conll 2003 (Parser only. You must place the files)
    Extract Conll2003 dataset using torchtext. Applies GloVe 6B.200d and Char N-gram
    pretrained vectors. Also sets up per word character Field
    Parameters:
        tag_type: Type of tag to pick as task [pos, chunk, ner]
        batch_size: Batch size to return from iterator
        root: Dataset root directory
        train_file: Train filename
        validation_file: Validation filename
        test_file: Test filename
        convert_digits: If True will convert numbers to single 0's

    Returns:
        A dict containing:
            task: 'conll2003.' + tag_type
            iters: (train iter, validation iter, test iter)
            vocabs: (Inputs word vocabulary, Inputs character vocabulary,
                    Tag vocabulary )
    """

    # Setup fields with batch dimension first
    inputs_word = data.Field(init_token="<bos>", eos_token="<eos>", batch_first=True, lower=True,
                             preprocessing=data.Pipeline(
                                 lambda w: '0' if convert_digits and w.isdigit() else w))

    inputs_char_nesting = data.Field(tokenize=list, init_token="<bos>", eos_token="<eos>",
                                     batch_first=True)

    inputs_char = data.NestedField(inputs_char_nesting,
                                   init_token="<bos>", eos_token="<eos>")

    labels = data.Field(init_token="<bos>", eos_token="<eos>", batch_first=True)

    fields = ([(('inputs_word', 'inputs_char'), (inputs_word, inputs_char))] +
              [('labels', labels) if label == tag_type else (None, None)
               for label in ['pos', 'chunk', 'ner']])

    # Load the data
    train, val, test = SequenceTaggingDataset.splits(
        path=root,
        train=train_file,
        validation=validation_file,
        test=test_file,
        separator=' ',
        fields=tuple(fields))

    logger.info('---------- CONLL 2003 %s ---------' % tag_type)
    logger.info('Train size: %d' % (len(train)))
    logger.info('Validation size: %d' % (len(val)))
    logger.info('Test size: %d' % (len(test)))

    # Build vocab
    inputs_char.build_vocab(train.inputs_char, val.inputs_char, test.inputs_char)
    inputs_word.build_vocab(train.inputs_word, val.inputs_word, test.inputs_word, max_size=50000,
                            vectors=[GloVe(name='6B', dim='200'), CharNGram()])

    labels.build_vocab(train.labels)
    logger.info('Input vocab size:%d' % (len(inputs_word.vocab)))
    logger.info('Tagset size: %d' % (len(labels.vocab)))

    # Get iterators
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test), batch_size=batch_size,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))
    train_iter.repeat = False

    return {
        'task': 'conll2003.%s' % tag_type,
        'iters': (train_iter, val_iter, test_iter),
        'vocabs': (inputs_word.vocab, inputs_char.vocab, labels.vocab)
    }
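
The conll2003 loader is used the same way as the previous one, except that tag_type selects which column becomes the label Field; for example (values are illustrative):

ner_data = conll2003_dataset(tag_type='ner', batch_size=32, root='./conll2003')
train_iter, val_iter, test_iter = ner_data['iters']
print(ner_data['task'])  # 'conll2003.ner'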
Example #10
# build the vocabulary
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
LABEL.build_vocab(train)

# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

# make iterator for splits
train_iter, test_iter = data.BucketIterator.splits((train, test),
                                                   batch_size=3,
                                                   device=device)

# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)

# Approach 2:
TEXT.build_vocab(train,
                 vectors=[GloVe(name='840B', dim='300'),
                          CharNGram()])
LABEL.build_vocab(train)

train_iter, test_iter = datasets.TREC.iters(batch_size=4)

# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)
Example #11
def conll2000_dataset(batch_size,
                      use_local=False,
                      root='.data/conll2000',
                      train_file='train.txt',
                      test_file='test.txt',
                      validation_frac=0.1,
                      convert_digits=True):
    """
    conll2000: Conll 2000 (Chunking)
    Extract Conll2000 Chunking dataset using torchtext. By default will fetch
    data files from online repository.
    Applies GloVe 6B.200d and Char N-gram pretrained vectors. Also sets 
    up per word character Field
    Parameters:
        batch_size: Batch size to return from iterator
        use_local: If True use local provided files (default False)
        root (optional): Dataset root directory (needed only if use_local is True)
        train_file (optional): Train filename (needed only if use_local is True)
        test_file (optional): Test filename (needed only if use_local is True)
        validation_frac (optional): Fraction of train dataset to use for validation
        convert_digits (optional): If True will convert numbers to single 0's
    NOTE: Since there is only a train and test set we use 10% of the train set as
        validation
    Returns:
        A dict containing:
            task: 'conll2000.' + tag_type
            iters: (train iter, validation iter, None)
            vocabs: (Inputs word vocabulary, Inputs character vocabulary, 
                    Tag vocabulary )
    """

    # Setup fields with batch dimension first
    inputs_word = data.Field(
        init_token="<bos>",
        eos_token="<eos>",
        batch_first=True,
        lower=True,
        preprocessing=data.Pipeline(lambda w: '0'
                                    if convert_digits and w.isdigit() else w))

    inputs_char_nesting = data.Field(tokenize=list,
                                     init_token="<bos>",
                                     eos_token="<eos>",
                                     batch_first=True)

    inputs_char = data.NestedField(inputs_char_nesting,
                                   init_token="<bos>",
                                   eos_token="<eos>")

    labels = data.Field(init_token="<bos>",
                        eos_token="<eos>",
                        batch_first=True)

    fields = [(('inputs_word', 'inputs_char'), (inputs_word, inputs_char)),
              (None, None), ('labels', labels)]

    if use_local:
        # Load the data
        train, test = SequenceTaggingDataset.splits(path=root,
                                                    train=train_file,
                                                    test=test_file,
                                                    fields=tuple(fields))

        # HACK: Saving the sort key function as the split() call removes it
        sort_key = train.sort_key
        # To make the split deterministic
        random.seed(0)
        train, val = train.split(1 - validation_frac,
                                 random_state=random.getstate())
        # Reset the seed
        random.seed()

        # HACK: Set the sort key
        train.sort_key = sort_key
        val.sort_key = sort_key
    else:
        train, val, test = CoNLL2000Chunking.splits(
            fields=tuple(fields), validation_frac=validation_frac)

    logger.info('---------- CONLL 2000 Chunking ---------')
    logger.info('Train size: %d' % (len(train)))
    logger.info('Validation size: %d' % (len(val)))
    logger.info('Test size: %d' % (len(test)))

    # Build vocab
    inputs_char.build_vocab(train.inputs_char, val.inputs_char,
                            test.inputs_char)
    inputs_word.build_vocab(train.inputs_word,
                            val.inputs_word,
                            test.inputs_word,
                            max_size=50000,
                            vectors=[GloVe(name='6B', dim='200'),
                                     CharNGram()])

    labels.build_vocab(train.labels)
    logger.info('Input vocab size:%d' % (len(inputs_word.vocab)))
    logger.info('Tagset size: %d' % (len(labels.vocab)))

    # Get iterators
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test),
        batch_size=batch_size,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))
    train_iter.repeat = False

    return {
        'task': 'conll2000.chunk',
        'iters': (train_iter, val_iter, test_iter),
        'vocabs': (inputs_word.vocab, inputs_char.vocab, labels.vocab)
    }