Example #1
def generate_iterators(BATCH_SIZE=32,
                       MAX_LEN=20,
                       load_data=False,
                       embedding=None):
    if not load_data:
        spacy_de = spacy.load('de')  # on newer spaCy versions: 'de_core_news_sm'
        spacy_en = spacy.load('en')  # on newer spaCy versions: 'en_core_web_sm'

        def tokenize_de(text):
            return [tok.text for tok in spacy_de.tokenizer(text)]

        def tokenize_en(text):
            return [tok.text for tok in spacy_en.tokenizer(text)]

        BOS_WORD = '<s>'
        EOS_WORD = '</s>'
        DE = data.Field(tokenize=tokenize_de)
        EN = data.Field(tokenize=tokenize_en,
                        init_token=BOS_WORD,
                        eos_token=EOS_WORD)  # only target needs BOS/EOS

        train, val, test = datasets.IWSLT.splits(
            exts=('.de', '.en'),
            fields=(DE, EN),
            filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and len(
                vars(x)['trg']) <= MAX_LEN)
        MIN_FREQ = 5
        DE.build_vocab(train.src, min_freq=MIN_FREQ)
        EN.build_vocab(train.trg, min_freq=MIN_FREQ)
        if embedding is not None:
            if embedding in ['FastText', 'fasttext']:
                EN.vocab.load_vectors(vectors=FastText(language='en'))
                DE.vocab.load_vectors(vectors=FastText(language='de'))
            else:
                raise ValueError("Only fasttext is supported at the moment")
        train_iter, val_iter = data.BucketIterator.splits(
            (train, val),
            batch_size=BATCH_SIZE,
            device=-1,
            repeat=False,
            sort_key=lambda x: len(x.src))

        return train_iter, val_iter, EN, DE
    else:  # does not work...
        with open('train.pkl', 'rb') as f:
            train = pickle.load(f)
        with open('val.pkl', 'rb') as f:
            val = pickle.load(f)
        with open('DE.torchtext.Field.pkl', 'rb') as f:
            DE = pickle.load(f)
        with open('EN.torchtext.Field.pkl', 'rb') as f:
            EN = pickle.load(f)
        BATCH_SIZE = 32
        train_iter, val_iter = data.BucketIterator.splits(
            (train, val),
            batch_size=BATCH_SIZE,
            device=-1,
            repeat=False,
            sort_key=lambda x: len(x.src))
        return train_iter, val_iter, EN, DE
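A minimal consumption sketch for the objects returned above (not part of the original snippet; it assumes the legacy torchtext API used in the example): each batch carries .src and .trg tensors, and the Field vocabularies supply the padding index for the loss.

# Hypothetical usage of generate_iterators() (legacy torchtext API assumed).
train_iter, val_iter, EN, DE = generate_iterators(BATCH_SIZE=32, MAX_LEN=20)
pad_idx = EN.vocab.stoi['<pad>']   # default pad token of torchtext Fields
for batch in train_iter:
    src = batch.src                # (src_len, batch) LongTensor of German token ids
    trg = batch.trg                # (trg_len, batch) LongTensor of English token ids
    # a seq2seq model and a loss with ignore_index=pad_idx would be applied here
    break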
Example #2
    def prepare_data(self):
        e, m, x = read_conll(self.param.training_file_path)

        max_token_length = 0
        for i in range(len(e)):
            current_sentence_len = len(e[i].sentence.tokens)
            if current_sentence_len > max_token_length:
                max_token_length = current_sentence_len

        # reverse int to tokens
        sentences = list()
        sentences_postags = list()
        sentences_lemmas = list()
        labels = list()

        for i in range(int(x)):
            sentences.append(
                [VOCDICT.getstr(token) for token in e[i].sentence.tokens])
            sentences_postags.append(
                [POSDICT.getstr(postag) for postag in e[i].sentence.postags])
            sentences_lemmas.append(
                [LEMDICT.getstr(lemma) for lemma in e[i].sentence.lemmas])
            labels.append(list(e[i].targetframedict.keys()))

        tokens_field = Field(sequential=True, fix_length=max_token_length)
        postags_field = Field(sequential=True, fix_length=max_token_length)
        lemmas_field = Field(sequential=True, fix_length=max_token_length)

        tokens_field.build_vocab(sentences, vectors=FastText('simple'))
        postags_field.build_vocab(sentences_postags)
        lemmas_field.build_vocab(sentences_lemmas, vectors=FastText('simple'))

        self.pretrained_embedding = tokens_field.vocab.vectors

        def _preprocess_field(l: list) -> list:
            return [1 if j in l else 0 for j in range(max_token_length)]

        labels_field = Field(sequential=False,
                             use_vocab=False,
                             preprocessing=_preprocess_field,
                             is_target=True)

        train, val = FrameTargetDataset(sentences,
                                        sentences_postags,
                                        sentences_lemmas,
                                        labels,
                                        fields=[
                                            ('tokens', tokens_field),
                                            ('postags', postags_field),
                                            ('lemmas', lemmas_field),
                                            ('labels', labels_field),
                                        ]).split()

        self.train_iter, self.val_iter = BucketIterator.splits(
            datasets=(train, val),
            batch_sizes=(self.batch_size, self.batch_size),
            device=self._d,
            sort=False)
Example #3
    def __init__(self, max_len, batch_size, max_epochs, device, pretrained):
        text_field = data.Field(lower=True,
                                batch_first=True,
                                fix_length=max_len,
                                init_token='<go>',
                                eos_token='<eos>',
                                unk_token='<unk>',
                                pad_token='<pad>')
        label_field = data.Field(fix_length=max_len - 1, batch_first=True)

        # make splits for data
        unsup_train, unsup_val, unsup_test = NLIGen.splits(text_field)
        train, val, test = datasets.UDPOS.splits(
            (('text', text_field), ('label', label_field)))

        # build the vocabulary
        text_field.build_vocab(
            unsup_train)  # , vectors="fasttext.simple.300d")
        label_field.build_vocab(train)

        # make iterator for splits
        self.train_iter, _, _ = data.BucketIterator.splits(
            (unsup_train, unsup_val, unsup_test),
            batch_size=batch_size,
            device=device,
            shuffle=True,
            sort=False)
        _, self.unsup_val_iter, _ = data.BucketIterator.splits(
            (unsup_train, unsup_val, unsup_test),
            batch_size=int(batch_size / 10),
            device=device,
            shuffle=True,
            sort=False)
        self.sup_iter, _, _ = data.BucketIterator.splits((train, val, test),
                                                         batch_size=batch_size,
                                                         device=device,
                                                         shuffle=False,
                                                         sort=False)
        _, self.val_iter, self.test_iter = data.BucketIterator.splits(
            (train, unsup_val, unsup_test),
            batch_size=int(batch_size),
            device=device,
            shuffle=False,
            sort=False)

        self.vocab = text_field.vocab
        self.tags = label_field.vocab
        self.text_field = text_field
        self.label_field = label_field
        self.device = device
        self.batch_size = batch_size
        self.n_epochs = 0
        self.max_epochs = max_epochs
        if pretrained:
            ftxt = FastText()
            self.wvs = ftxt.get_vecs_by_tokens(self.vocab.itos)
        else:
            self.wvs = None
Example #4
    def __init__(self, emb_dim=50, mbsize=32, main=True, dataset2=None,
                 **kwargs):
        self.TEXT = data.Field(init_token='<start>', eos_token='<eos>',
                               lower=True, tokenize='spacy', fix_length=16)
        self.LABEL = data.Field(sequential=False, unk_token=None)

        train, val, test = datasets.SST.splits(
            self.TEXT, self.LABEL, fine_grained=False, train_subtrees=False,
            filter_pred=utils.filter(6)
        )

        self.train = train

        if main:
            train_datasets = [train.text, dataset2.get_train().text] \
                             if dataset2 else [train]
            self.TEXT.build_vocab(*train_datasets, vectors=FastText('en'))
            self.LABEL.build_vocab(train)

            self.n_vocab = len(self.TEXT.vocab.itos)
            self.emb_dim = emb_dim

            self.train_iter, self.val_iter, _ = data.BucketIterator.splits(
                (train, val, test), batch_size=mbsize, device=-1, shuffle=True,
                repeat=True
            )

            self.train_iter = iter(self.train_iter)
            self.val_iter = iter(self.val_iter)
Example #5
def benchmark_experimental_vectors():
    def _run_benchmark(tokens, vector):
        t0 = time.monotonic()
        for token in tokens:
            vector[token]
        print("Time:", time.monotonic() - t0)

    train, = AG_NEWS(data_select='train')
    vocab = train.get_vocab()
    tokens = []
    for (label, text) in train:
        for id in text.tolist():
            tokens.append(vocab.itos[id])

    # existing FastText
    fast_text = FastText()

    print("FastText - Not Jit Mode")
    _run_benchmark(tokens, fast_text)

    # experimental FastText
    fast_text_experimental = FastTextExperimental()
    jit_fast_text_experimental = torch.jit.script(fast_text_experimental)

    print("FastText Experimental - Not Jit Mode")
    _run_benchmark(tokens, fast_text_experimental)
    print("FastText Experimental - Jit Mode")
    _run_benchmark(tokens, jit_fast_text_experimental)
Example #6
def get_text_metadata():
    """
        Returns word embeddings for glove/fasttext text embeddings, None for use model
    """
    if embed_type == 'use':
        return None, None, None
    text_field = data.Field(sequential=True,
                            use_vocab=True,
                            tokenize=tokenize,
                            lower=True)
    captions = get_caption_list()
    preprocessed_caption = pd.DataFrame(captions, columns=[
        'caption'
    ])['caption'].apply(lambda x: text_field.preprocess(x))
    if embed_type == 'glove':
        text_field.build_vocab(preprocessed_caption,
                               vectors=GloVe(name='6B', dim=300))
    elif embed_type == 'fasttext':
        text_field.build_vocab(preprocessed_caption,
                               vectors=FastText(language='en'))
    word_embeddings = text_field.vocab.vectors
    vocab_size = len(text_field.vocab)
    print("Length of Text Vocabulary: " + str(vocab_size))
    print("Unique Word Vectors",
          torch.unique(text_field.vocab.vectors, dim=0).shape)
    print("Vector size of Text Vocabulary: ", word_embeddings.size())
    return text_field, word_embeddings, vocab_size
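A short follow-up sketch (an assumption, not in the original script) showing how the returned embedding matrix could seed a PyTorch embedding layer:

# Hypothetical usage of get_text_metadata().
import torch.nn as nn

text_field, word_embeddings, vocab_size = get_text_metadata()
if word_embeddings is not None:
    embedding_layer = nn.Embedding.from_pretrained(
        word_embeddings,
        freeze=False,
        padding_idx=text_field.vocab.stoi['<pad>'])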
Example #7
def load_fasttext_embedding(_log):
    _log.info('Loading fasttext pretrained embedding')
    ft = FastText(language='id',
                  cache=os.path.join(os.getenv('HOME'), '.vectors_cache'))
    _log.info('Read %d pretrained words with embedding size of %d',
              len(ft.itos), ft.dim)
    return ft
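For context, a hedged sketch of how the returned torchtext Vectors object is typically queried; the Indonesian tokens are placeholders:

# Hypothetical lookups on an Indonesian FastText vectors object as loaded above.
ft = FastText(language='id')
vec = ft['rumah']                                        # 300-dim FloatTensor; OOV tokens get zeros by default
mat = ft.get_vecs_by_tokens(['saya', 'makan', 'nasi'])   # FloatTensor of shape (3, ft.dim)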
Example #8
    def test_vocab_extend(self):
        c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
        # Build a vocab and get vectors twice to test caching.
        for i in range(2):
            f = FastText(language='simple')
            v = vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'],
                            vectors=f)
            n_vocab = len(v)
            v.extend(f)  # extend the vocab with the words contained in f.itos
            self.assertGreater(len(v), n_vocab)

            self.assertEqual(v.itos[:6], ['<unk>', '<pad>', '<bos>',
                             'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world'])
            vectors = v.vectors.numpy()

            # The first 5 entries in each vector.
            expected_fasttext_simple_en = {
                'hello': [0.39567, 0.21454, -0.035389, -0.24299, -0.095645],
                'world': [0.10444, -0.10858, 0.27212, 0.13299, -0.33165],
            }

            for word in expected_fasttext_simple_en:
                assert_allclose(vectors[v.stoi[word], :5],
                                expected_fasttext_simple_en[word])

            assert_allclose(vectors[v.stoi['<unk>']], np.zeros(300))
        # Delete the vectors after we're done to save disk space on CI
        if os.environ.get("TRAVIS") == "true":
            vec_file = os.path.join(self.project_root, ".vector_cache", "wiki.simple.vec")
            conditional_remove(vec_file)
Example #9
def load_word_vectors(word_vectors_name, embedding_size, word_vectors_cache='../data/word_vectors_cache'):

    implemented_vector_embeddings = ('GloVe_6B', 'GloVe_42B', 'GloVe_840B', 'GloVe_twitter.27B', 'FastText_en')
    assert word_vectors_name in implemented_vector_embeddings

    word_vectors = None

    if word_vectors_name == 'GloVe_6B':
        assert embedding_size in (50, 100, 200, 300)
        word_vectors = GloVe(name='6B', dim=embedding_size, cache=word_vectors_cache)

    if word_vectors_name == 'GloVe_42B':
        embedding_size = 300
        word_vectors = GloVe(name='42B', cache=word_vectors_cache)

    if word_vectors_name == 'GloVe_840B':
        embedding_size = 300
        word_vectors = GloVe(name='840B', cache=word_vectors_cache)

    if word_vectors_name == 'GloVe_twitter.27B':
        assert embedding_size in (25, 50, 100, 200)
        word_vectors = GloVe(name='twitter.27B', dim=embedding_size, cache=word_vectors_cache)

    if word_vectors_name == 'FastText_en':
        embedding_size = 300
        word_vectors = FastText(language='en', cache=word_vectors_cache)

    return word_vectors, embedding_size
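A brief, hypothetical usage of load_word_vectors() together with a torchtext Field (TEXT and train_dataset are assumed to exist in the surrounding project):

# Hypothetical usage of load_word_vectors().
word_vectors, embedding_size = load_word_vectors('FastText_en', 300)
TEXT.build_vocab(train_dataset, vectors=word_vectors)    # TEXT: torchtext data.Field
assert TEXT.vocab.vectors.size(1) == embedding_size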
Example #10
def get_all_vectors(pretrained_model):
    emb_vectors = []

    if pretrained_model == "":
        return emb_vectors

    emb_vector_names = pretrained_model.split(",")
    for emb_vector_name in emb_vector_names:
        emb_info = emb_vector_name.split("_")
        if len(emb_info) == 3:
            emb_name, emb_set, emb_size = emb_info[0], emb_info[1], emb_info[2]
        else:
            emb_name, emb_set = emb_info[0], emb_info[1]

        if emb_name == "glove":  # glove_840B_300
            print("glove")
            emb_vectors.append(GloVe(name=emb_set, dim=emb_size))
        elif emb_name == "fasttext":
            if emb_set == "subwordcc":  # fasttext_subwordcc
                print("fasttext_subwordcc")
                emb_vectors.append(FastTextSubwordCC())
            elif emb_set == "wiki":  # fasttext_wiki_en
                print("fasttext_wiki")
                emb_vectors.append(FastText(language=emb_size))
            elif emb_set == "cc":  # fasttext_cc_en
                print("fasttext_cc")
                emb_vectors.append(FastTextCC(language=emb_size))
        elif emb_name == "char":  # char_ngram
            if emb_set == "ngram":
                print("char_ngram")
                emb_vectors.append(CharNGram())
    return emb_vectors
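Field.build_vocab accepts a list of Vectors objects and concatenates them per token, so the list returned by get_all_vectors() can be passed straight through; a hedged sketch (TEXT and train are assumed project objects):

# Hypothetical usage: concatenated pretrained vectors for one vocabulary.
emb_vectors = get_all_vectors("glove_840B_300,fasttext_wiki_en")
TEXT.build_vocab(train, vectors=emb_vectors)
print(TEXT.vocab.vectors.size())   # second dim = sum of the individual embedding dims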
Example #11
def benchmark_experimental_vectors():
    def _run_benchmark_lookup(tokens, vector):
        t0 = time.monotonic()
        for token in tokens:
            vector[token]
        print("Lookup time:", time.monotonic() - t0)

    train, = AG_NEWS(data_select='train')
    vocab = train.get_vocab()
    tokens = []
    for (label, text) in train:
        for id in text.tolist():
            tokens.append(vocab.itos[id])

    # existing FastText construction
    print("Existing FastText - Not Jit Mode")
    t0 = time.monotonic()
    fast_text = FastText()
    print("Construction time:", time.monotonic() - t0)
    _run_benchmark_lookup(tokens, fast_text)

    # experimental FastText construction
    print("FastText Experimental")
    t0 = time.monotonic()
    fast_text_experimental = FastTextExperimental(validate_file=False)
    print("Construction time:", time.monotonic() - t0)

    # not jit lookup
    print("FastText Experimental - Not Jit Mode")
    _run_benchmark_lookup(tokens, fast_text_experimental)

    # jit lookup
    print("FastText Experimental - Jit Mode")
    jit_fast_text_experimental = torch.jit.script(fast_text_experimental)
    _run_benchmark_lookup(tokens, jit_fast_text_experimental)
Example #12
def main(params):
    # build dataset
    train_data = pd.read_csv('./data/train_final.csv')
    tokenizer = get_tokenizer('spacy', language='en')

    if params.emb_type == "GloVe":
        embedding = GloVe(
            name=params.emb_data, dim=params.emb_dim
        )  # use glove embedding with default option(name='840B', dim=300)
    elif params.emb_type == "CharNGram":
        embedding = CharNGram()
    elif params.emb_type == "FastText":
        # torchtext's FastText takes a language code and serves 300-d vectors;
        # it has no GloVe-style `name`/`dim` arguments.
        embedding = FastText(language=params.emb_data)
    else:
        print("Wrong embedding type")
        exit()

    train_data, val_data = train_data[1000:], train_data[:1000]
    train_dataset = SentimentDataset(train_data, tokenizer, embedding)
    val_dataset = SentimentDataset(val_data, tokenizer, embedding)

    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True)
    val_dataloader = DataLoader(dataset=val_dataset,
                                batch_size=batch_size,
                                shuffle=False)

    model = SentimentClassificationModel(params.emb_dim, params.hidden_dim,
                                         params.dropout).to(device)
    crit = nn.CrossEntropyLoss().to(device)
    optim = torch.optim.Adam(params=model.parameters(), lr=1e-3)

    best_val_acc = 0
    early_stop_cnt = 0
    epoch = 0
    train_loss_list = []
    train_acc_list = []
    val_acc_list = []
    while early_stop_cnt != 5:
        loss_list, train_acc = train.trainer(epoch, model, train_dataloader,
                                             crit, optim, device)
        val_acc = train.eval(epoch, model, val_dataloader, device, False)
        if val_acc > best_val_acc and epoch > 0:
            torch.save(model.state_dict(), './model/lstm_best.pt')
            best_val_acc = val_acc
            early_stop_cnt = 0

        early_stop_cnt += 1
        epoch += 1
        train_loss_list.extend(loss_list)
        train_acc_list.append(train_acc)
        val_acc_list.append(val_acc)

    print("Early stopping condition satisfied")
    plotting("train_loss", "steps", "loss", train_loss_list)
    plotting("train_accuracy", "epoch", "accuracy", train_acc_list)
    plotting('validation_accuracy', "epoch", "accuracy", val_acc_list)
Example #13
    def __init__(self,
                 df: pd.DataFrame,
                 preprocess: bool = True,
                 translation_dict: Optional[Dict[str, str]] = None):
        index: List[PIDTitleRecord] = []
        for _, row in df.iterrows():
            title = preprocess_title(
                row['title'], translation_dict) if preprocess else row['title']
            index.append(PIDTitleRecord(pid=row['posting_id'], title=title))
        self._index = index
        self._vocab = FastText()
Example #14
    def __init__(self,
                 df: pd.DataFrame,
                 preprocess: bool = True,
                 translation_dict: Optional[Dict[str, str]] = None):
        index: List[TitleLabelRecord] = []
        for _, row in df.iterrows():
            title = preprocess_title(
                row['title'], translation_dict) if preprocess else row['title']
            index.append(
                TitleLabelRecord(title=title, label_group=row['label_group']))
        self._index = index
        self._vocab = FastText()
Example #15
    def __init__(self, batch_size=128):
        self.batch_size = batch_size
        self.TEXT = data.Field()
        self.LABEL = data.Field(sequential=False)

        self.train, self.val, self.test = datasets.SST.splits(
            self.TEXT, self.LABEL, fine_grained=True, train_subtrees=True)

        f = FastText()
        self.TEXT.build_vocab(self.train, vectors=f)
        self.TEXT.vocab.extend(f)
        self.LABEL.build_vocab(self.train)
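The constructor above stops after building the vocabularies; a minimal, assumed continuation that creates batch iterators over the SST splits could look like this:

        # Hypothetical continuation (not in the original snippet).
        self.train_iter, self.val_iter, self.test_iter = data.BucketIterator.splits(
            (self.train, self.val, self.test),
            batch_size=self.batch_size,
            sort_key=lambda x: len(x.text))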
Example #16
    def __init__(self, args):
        if args.datastories:
            tokenizer = SocialTokenizer(lowercase=True)
        else:
            tokenizer = TweetTokenizer()
        self.RAW = data.RawField()
        self.TEXT = data.Field(batch_first=True,
                               include_lengths=True,
                               lower=True,
                               tokenize=tokenizer.tokenize)
        self.LABEL = data.Field(sequential=False, unk_token=None)

        self.train, self.dev, self.test = datasets.EMO.splits(
            args, self.RAW, self.TEXT, self.LABEL, args.train_data_path,
            args.valid_data_path, args.test_data_path)

        self.TEXT.build_vocab(self.train,
                              self.dev,
                              self.test,
                              vectors=GloVe(name='840B', dim=300))

        if args.fasttext:
            self.FASTTEXT = data.Field(batch_first=True,
                                       include_lengths=True,
                                       lower=True,
                                       tokenize=tokenizer.tokenize)
            self.FASTTEXT.vocab = copy.deepcopy(self.TEXT.vocab)
            self.FASTTEXT.vocab.set_vectors(self.FASTTEXT.vocab.stoi,
                                            vectors=FastText(language='en'),
                                            dim=300)
        self.LABEL.build_vocab(self.train)

        self.train_iter, self.dev_iter, self.test_iter = \
            data.BucketIterator.splits((self.train, self.dev, self.test),
                                       batch_size=args.batch_size,
                                       device=args.device,
                                       repeat=False)

        self.max_word_len = max([len(w) for w in self.TEXT.vocab.itos])
        # for <pad>
        self.char_vocab = {'': 0}
        # for <unk> and <pad>
        self.characterized_words = [[0] * self.max_word_len,
                                    [0] * self.max_word_len]

        if args.char_emb:
            self.build_char_vocab()

        with open('./data/vocab.obj', 'wb') as filehandler:
            pickle.dump(self.TEXT.vocab, filehandler)
        with open('./data/label.obj', 'wb') as filehandler:
            pickle.dump(self.LABEL.vocab, filehandler)
Example #17
    def test_vocab_download_fasttext_vectors(self):
        c = Counter({
            'hello': 4,
            'world': 3,
            'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5,
            'freq_too_low': 2
        })
        # Build a vocab and get vectors twice to test caching, then once more
        # to test string aliases.
        for i in range(3):
            if i == 2:
                vectors = str("fasttext.simple.300d")  # must handle str on Py2
            else:
                vectors = FastText(language='simple')

            v = vocab.Vocab(c,
                            min_freq=3,
                            specials=['<unk>', '<pad>', '<bos>'],
                            vectors=vectors)

            expected_itos = [
                '<unk>', '<pad>', '<bos>', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world'
            ]
            expected_stoi = {x: index for index, x in enumerate(expected_itos)}
            self.assertEqual(v.itos, expected_itos)
            self.assertEqual(dict(v.stoi), expected_stoi)
            vectors = v.vectors.numpy()

            # The first 5 entries in each vector.
            expected_fasttext_simple_en = {
                'hello': [0.39567, 0.21454, -0.035389, -0.24299, -0.095645],
                'world': [0.10444, -0.10858, 0.27212, 0.13299, -0.33165],
            }

            for word in expected_fasttext_simple_en:
                assert_allclose(vectors[v.stoi[word], :5],
                                expected_fasttext_simple_en[word])

            assert_allclose(vectors[v.stoi['<unk>']], np.zeros(300))
            assert_allclose(vectors[v.stoi['OOV token']], np.zeros(300))
        # Delete the vectors after we're done to save disk space on CI
        if os.environ.get("TRAVIS") == "true":
            vec_file = os.path.join(self.project_root, ".vector_cache",
                                    "wiki.simple.vec")
            conditional_remove(vec_file)
Example #18
    def process_text(self, text):
        """Transform each description into vectors."""
        # filter text
        text = text.apply(lambda doc: self.filter_text(doc))
        tokenizer = get_tokenizer('spacy', 'en_core_web_sm')
        # get idf (inverse document frequency)
        print('Calculating tf-idf...')
        warnings.filterwarnings("ignore")
        tfidf = TfidfVectorizer(tokenizer=tokenizer)
        tfidf.fit(text.dropna())
        idf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
        print('Converting text to document embedding...')
        # get document embedding
        w2v = FastText(language='en')
        self.text_dim = w2v.dim
        text = text.apply(lambda doc: self.doc2vec(doc, tokenizer, idf, w2v))
        return text
Example #19
def load_embedding(embed_corpus):
    corpora = [
        'glove_twitter', 'glove_commoncrawl', 'fasttext_wiki',
        'fasttext_commoncrawl', 'word2vec'
    ]
    dim = 300

    os.makedirs('data/glove', exist_ok=True)
    os.makedirs('data/fast_text', exist_ok=True)
    os.makedirs('data/word2vec', exist_ok=True)

    if embed_corpus == 'glove_twitter':
        # GloVe trained on Twitter corpus
        embedding = GloVe(name='twitter.27B', dim=200, cache='data/glove/')
        dim = 200
    elif embed_corpus == 'glove_commoncrawl':
        # GloVe trained on Common Crawl corpus
        embedding = GloVe(name='42B', dim=300, cache='data/glove/')
    elif embed_corpus == 'fasttext_wiki':
        # FastText trained on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset
        embedding = FastText(language='en', cache='data/fast_text/')
    elif embed_corpus == 'fasttext_commoncrawl':
        # FastText trained on Common Crawl corpus
        embedding = Vectors(
            name='crawl-300d-2M.vec',
            url=
            'https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip',
            cache='data/fast_text/')
    elif embed_corpus == 'word2vec':
        # Word2Vec trained on Google News corpus
        name = 'GoogleNews-vectors-negative300.txt'
        if os.path.isfile(f'data/word2vec/{name}.pt'):
            embedding = Vectors(name=name, cache='data/word2vec/')
        else:
            raise FileNotFoundError((
                'No torchtext formatted word2vec vectors file found. '
                'See load_word2vec.py to create the necessary pt file. Requires gensim.'
            ))
    else:
        raise ValueError(
            f'Invalid pre-trained word embedding vectors. Options are {"/".join(corpora)}.'
        )

    return embedding, dim
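A short, assumed usage of load_embedding() with a torchtext Field (TEXT and train_data are placeholders):

# Hypothetical usage of load_embedding().
embedding, dim = load_embedding('fasttext_wiki')
TEXT.build_vocab(train_data, vectors=embedding)
print(TEXT.vocab.vectors.size())   # (len(TEXT.vocab), dim)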
Example #20
    def iters(cls,
              path,
              vectors_name,
              vectors_cache,
              batch_size=64,
              shuffle=True,
              device=0,
              vectors=None,
              unk_init=torch.Tensor.zero_):
        """
        :param path: directory containing train, test, dev files
        :param vectors_name: name of word vectors file
        :param vectors_cache: path to directory containing word vectors file
        :param batch_size: batch size
        :param shuffle: whether to shuffle batches within each epoch
        :param device: GPU device
        :param vectors: custom vectors - either predefined torchtext vectors or your own custom Vector classes
        :param unk_init: function used to generate vector for OOV words
        :return: train, validation and test BucketIterators
        """

        print("loading vectors")
        if vectors_name == "fasttext":
            vectors = FastText()
        elif "B" in vectors_name:
            vectors = GloVe(vectors_name)
        elif vectors is None:
            vectors = Vectors(name=vectors_name,
                              cache=vectors_cache,
                              unk_init=unk_init)

        print("completed vectors loading")
        train, val, test = cls.splits(path)

        cls.TEXT_FIELD.build_vocab(train, val, test, vectors=vectors)
        return BucketIterator.splits((train, val, test),
                                     batch_size=batch_size,
                                     repeat=False,
                                     shuffle=shuffle,
                                     sort_within_batch=True,
                                     device=device)
Example #21
    def create_embedding_matrix(self):
        """ currently only supports fasttext.
        Returns the weight matrix for the current vocab"""
        import torch
        import torch.nn as nn
        target_vocab = self.label_encoder.classes_
        embedding = FastText('en')
        emb_dim = int(embedding.dim)
        matrix_len = len(target_vocab)
        weights_matrix = np.zeros((matrix_len, emb_dim))
        words_found = 0
        for i, word in enumerate(target_vocab):
            try:
                weights_matrix[i] = embedding[word]
                words_found += 1
            except KeyError:
                weights_matrix[i] = np.random.normal(scale=0.6,
                                                     size=(emb_dim, ))

        weights_matrix = torch.from_numpy(weights_matrix)

        return weights_matrix
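The returned weight matrix is typically wrapped in an nn.Embedding; a hedged sketch, where model_builder stands in for an instance of the class defining create_embedding_matrix():

# Hypothetical usage of create_embedding_matrix().
import torch.nn as nn

weights_matrix = model_builder.create_embedding_matrix()
embedding_layer = nn.Embedding.from_pretrained(weights_matrix.float(), freeze=False)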
Example #22
def get_vectors(model_name, emb_folder):
    if model_name not in available_models:
        raise AttributeError(
            f'Model name {model_name} is not in model list: {available_models}'
        )

    if not os.path.exists(emb_folder):
        os.mkdir(emb_folder)

    model_type = available_models[model_name]
    if model_type == 'fasttext':
        lang = model_name.split('_')[0]
        fasttext_emb_folder = os.path.join(os.getcwd(), emb_folder, 'fasttext')
        if not os.path.exists(fasttext_emb_folder):
            os.mkdir(fasttext_emb_folder)
        vectors = FastText(language=lang, cache=fasttext_emb_folder)
        fasttext_model_name = os.path.join(fasttext_emb_folder,
                                           f'wiki.{lang}.vec')
        os.remove(fasttext_model_name)
        return vectors
    elif model_type == 'gensim':
        glove_emb_folder = os.path.join(os.getcwd(), emb_folder, 'glove')
        if not os.path.exists(glove_emb_folder):
            os.mkdir(glove_emb_folder)
        api.BASE_DIR = glove_emb_folder

        raw_model_name = model_name.split('_')[1]
        w2v_model_name = raw_model_name + '.txt'
        full_w2v_model_name = os.path.join(glove_emb_folder, w2v_model_name)
        if not os.path.exists(full_w2v_model_name + '.pt'):
            model_gensim = api.load(raw_model_name)
            model_gensim.save_word2vec_format(full_w2v_model_name)
            shutil.rmtree(os.path.join(glove_emb_folder, raw_model_name))
            vectors = Vectors(w2v_model_name, cache=glove_emb_folder)
            os.remove(full_w2v_model_name)
        else:
            vectors = Vectors(w2v_model_name, cache=glove_emb_folder)
        return vectors
Example #23
    def __init__(self,
                 df: pd.DataFrame,
                 preprocess: bool = True,
                 translation_dict: Optional[Dict[str, str]] = None):
        index: List[TitleTripletRecord] = []

        for _, row in df.iterrows():
            title_a = preprocess_title(
                row['title_a'],
                translation_dict) if preprocess else row['title_a']
            title_p = preprocess_title(
                row['title_p'],
                translation_dict) if preprocess else row['title_p']
            title_n = preprocess_title(
                row['title_n'],
                translation_dict) if preprocess else row['title_n']
            index.append(
                TitleTripletRecord(title_a=title_a,
                                   title_p=title_p,
                                   title_n=title_n))

        self._index = index
        self._vocab = FastText()
Example #24
    def __init__(self,
                 max_len,
                 batch_size,
                 max_epochs,
                 device,
                 unsup_proportion,
                 sup_proportion,
                 dev_index=1,
                 pretrained=False):
        text_field = data.Field(
            lower=True,
            batch_first=True,
            fix_length=max_len,
            pad_token='<pad>',
            init_token='<go>',
            is_target=True
        )  #init_token='<go>', eos_token='<eos>', unk_token='<unk>', pad_token='<unk>')
        label_field = data.Field(fix_length=max_len - 1, batch_first=True)

        # make splits for data
        #unsup_train, unsup_val, unsup_test = MyPennTreebank.splits(text_field)
        #unsup_train, unsup_val, unsup_test = datasets.PennTreebank.splits(text_field)
        #unsup_train, unsup_val, unsup_test = datasets.WikiText2.splits(text_field)
        unsup_train, unsup_val, unsup_test = datasets.UDPOS.splits(
            (('text', text_field), ('label', label_field)))
        #unsup_train, unsup_val, unsup_test = YahooLM.splits(text_field)
        train, val, test = datasets.UDPOS.splits(
            (('text', text_field), ('label', label_field)))

        # build the vocabulary
        text_field.build_vocab(
            unsup_train,
            max_size=VOCAB_LIMIT)  # , vectors="fasttext.simple.300d")
        label_field.build_vocab(train)
        # self.train_iter, _,  _ = data.BPTTIterator.splits((unsup_train, unsup_val, unsup_test),
        #                                                                     batch_size=batch_size, bptt_len=max_len,
        #                                                                     device=device, repeat=False, shuffle=False,
        #                                                                     sort=False)
        # _, self.unsup_val_iter,  _ = data.BPTTIterator.splits((unsup_train, unsup_val, unsup_test),
        #                                                                     batch_size=int(batch_size/10), bptt_len=max_len,
        #                                                                     device=device, repeat=False, shuffle=False,
        #                                                                     sort=False)
        # Remaking splits according to supervision proportions
        exlist = [ex for ex in train + val]
        train = Dataset(exlist, {'text': text_field, 'label': label_field})
        dev_start, dev_end = int(len(train) / 5 * (dev_index - 1)), \
                             int(len(train) / 5 * (dev_index))
        train_start1, train_start2, train_end1, train_end2 = 0, dev_end, int(dev_start * sup_proportion), \
                                                             int(dev_end + (len(train) - dev_end) * sup_proportion)
        unsup_start, unsup_end = 0, int(len(unsup_train) * unsup_proportion)
        val = Dataset(train[dev_start:dev_end], {
            'text': text_field,
            'label': label_field
        })
        train = Dataset(
            train[train_start1:train_end1] + train[train_start2:train_end2], {
                'text': text_field,
                'label': label_field
            })
        unsup_train = Dataset(unsup_train[unsup_start:unsup_end],
                              {'text': text_field})

        # make iterator for splits

        self.train_iter, _, _ = data.BucketIterator.splits(
            (unsup_train, unsup_val, unsup_test),
            batch_size=batch_size,
            device=device,
            shuffle=True,
            sort=False)
        _, self.unsup_val_iter, _ = data.BucketIterator.splits(
            (unsup_train, unsup_val, unsup_test),
            batch_size=int(batch_size / 10),
            device=device,
            shuffle=False,
            sort=False)
        self.sup_iter, _, _ = data.BucketIterator.splits((train, val, test),
                                                         batch_size=batch_size,
                                                         device=device,
                                                         shuffle=False,
                                                         sort=False)
        _, self.val_iter, self.test_iter = data.BucketIterator.splits(
            (train, val, test),
            batch_size=int(batch_size),
            device=device,
            shuffle=False,
            sort=False)

        self.vocab = text_field.vocab
        self.tags = label_field.vocab
        self.text_field = text_field
        self.label_field = label_field
        self.device = device
        self.batch_size = batch_size
        self.n_epochs = 0
        self.max_epochs = max_epochs
        if pretrained:
            ftxt = FastText()
            self.wvs = ftxt.get_vecs_by_tokens(self.vocab.itos)
        else:
            self.wvs = None
Example #25
def load_img_samples(orig_dir, dest_dir):
    for label in sorted(os.listdir(orig_dir)):
        class_path = f'{orig_dir}/{label}'
        with os.scandir(class_path) as it:
            for _, path in tqdm(enumerate(it)):
                with open(path, 'rb') as f:
                    try:
                        img = Image.open(f)
                        img = img.convert('RGB')
                        img = img.resize((384, 384))
                        if not os.path.exists(f'{dest_dir}/{label}'):
                            os.mkdir(f'{dest_dir}/{label}')
                        img.save(f'{dest_dir}/{label}/{"".join(path.name.split(".")[:-1])}.jpg', "JPEG", quality=100)
                    except UnidentifiedImageError:
                        pass


if __name__ == '__main__':
    fasttext_model = FastText()
    glove_model = GloVe()
    load_img_samples('../data/original/Tobacco3482-jpg',
                     '../data/Tobacco3482-jpg')
    load_txt_samples('../data/original/QS-OCR-small',
                     '../data/QS-OCR-small', fasttext_model)
    for s in ['val', 'test', 'train']:
        load_img_samples(f'../data/original/RVL-CDIP/{s}',
                         f'../data/RVL-CDIP/{s}')
        load_txt_samples(f'../data/original/QS-OCR-Large/{s}',
                         f'../data/QS-OCR-Large/{s}', fasttext_model)
Example #26
def build_legacy_fasttext_vector_pipeline():
    tokenizer = get_tokenizer("basic_english")
    vector = FastText()

    pipeline = sequential_transforms(tokenizer, vector.get_vecs_by_tokens)
    return pipeline, None, None
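A tiny usage sketch of the returned pipeline (assuming sequential_transforms composes the tokenizer with the vector lookup, as the example suggests):

# Hypothetical usage of build_legacy_fasttext_vector_pipeline().
pipeline, _, _ = build_legacy_fasttext_vector_pipeline()
vecs = pipeline("the quick brown fox")   # FloatTensor of shape (num_tokens, 300)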
Example #27
    def __init__(self, glove=True, device=device):
        self.device = device

        nlp = spacy.load("en_core_web_sm")

        char_nesting = Field(batch_first=True, tokenize=list, lower=True)
        char = NestedField(char_nesting,
                           init_token="<sos>",
                           eos_token="<eos>",
                           tokenize="spacy")
        word = Field(init_token="<sos>",
                     eos_token="<eos>",
                     lower=True,
                     tokenize="spacy")
        label = Field(sequential=False, is_target=True, use_vocab=False)

        self.fields = [("question_char", char), ("question_word", word),
                       ("context_char", char), ("context_word", word),
                       ("answer", label)]

        self.dict_fields = {
            "question": [("question_char", char), ("question_word", word)],
            "context": [("context_char", char), ("context_word", word)],
            "answer": ("answer", label)
        }

        self.train_data = self._get_data("../data/train.jsonl")
        self.dev_data = self._get_data("../data/dev.jsonl")

        char.build_vocab(self.train_data)
        if glove:
            word.build_vocab(self.train_data,
                             vectors=GloVe(name="6B", dim=100))
        else:
            word.build_vocab(self.train_data,
                             vectors=FastText(language='en',
                                              max_vectors=30000))

        self.char_vocab = char.vocab
        self.word_vocab = word.vocab

        pos = []
        ner = []

        ind2pos = []
        ind2ner = []

        for data in tqdm(self.train_data):
            doc = nlp(' '.join(data.question_word + data.context_word))

            # t - token
            pos.extend([t.pos_ for t in doc])
            ner.extend([t.label_ for t in doc.ents])

            ind2pos.extend([[self.word_vocab.stoi[str(t)], t.pos_]
                            for t in doc])
            ind2ner.extend([[self.word_vocab.stoi[str(t)], t.label_]
                            for t in doc.ents])

        self.pos_voc = {tag: i for i, tag in enumerate(set(pos))}
        self.ner_voc = {tag: i + 1 for i, tag in enumerate(set(ner))}
        self.ner_voc['None'] = 0

        # default values, used in DrQA model
        self.ind2pos = defaultdict(lambda: self.pos_voc['X'])  # returns 14
        self.ind2ner = defaultdict(lambda: self.ner_voc['None'])  # returns 0

        self.ind2pos.update({tag[0]: self.pos_voc[tag[1]] for tag in ind2pos})
        self.ind2ner.update({tag[0]: self.ner_voc[tag[1]] for tag in ind2ner})
Example #28
FILE = data.LabelField(sequential=False)

# load the data
dataset = data.TabularDataset(path='./document.tsv',
                              format='tsv',
                              fields=[('Text', TEXT), ('Label', LABEL),
                                      ('File', FILE)],
                              skip_header=True)

LABEL.build_vocab(dataset)
FILE.build_vocab(dataset)

train, val, test = dataset.split(split_ratio=[0.7, 0.1, 0.2],
                                 random_state=random.getstate())

TEXT.build_vocab(train, vectors=FastText(language="ja"), min_freq=2)

#size
print(TEXT.vocab.vectors.size())

# device = torch.device('cpu')
device = torch.device('cuda:0')
train_iter, val_iter, test_iter = data.Iterator.splits((train, val, test),
                                                       batch_sizes=(16, 16, 1),
                                                       device=device,
                                                       repeat=False,
                                                       sort=False)

batch = next(iter(train_iter))
print(batch.Text)
print(batch.Label)
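A hedged continuation of the script above, feeding the FastText-initialised vocabulary into an embedding layer (TEXT is the Field assumed to be defined earlier in the original script):

# Hypothetical continuation.
import torch.nn as nn

embedding = nn.Embedding.from_pretrained(TEXT.vocab.vectors, freeze=False).to(device)
embedded = embedding(batch.Text)   # (seq_len, batch_size, 300)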
Example #29
def main(language, hidden_dim, dropout, proc, letter_proc, objective, operator,
         alpha, lr, momentum, optimizer, batch_size, n_epochs,
         pretrained_embeddings, letter_hidden_dim, letter_embedding_dim,
         n_samples, pad_edge, augment, _seed, _run, _log):
    if objective not in ['erm', 'nll']:
        raise ValueError("`objective` should be in ['erm', 'nll'],"
                         "got %s" % objective)

    # Technical
    device = init_system()

    if pad_edge:
        init_token = '<init>'
        eos_token = '<end>'
    else:
        init_token = None
        eos_token = None
    # Data loading using torchtext abstraction
    tags = ttdata.Field(sequential=True,
                        include_lengths=True,
                        preprocessing=iob1_iobes,
                        init_token=init_token,
                        eos_token=eos_token,
                        pad_token=None,
                        unk_token=None,
                        batch_first=True)
    sentences = ttdata.Field(sequential=True,
                             include_lengths=False,
                             batch_first=True,
                             init_token=init_token,
                             eos_token=eos_token,
                             preprocessing=zero_num)
    letter = ttdata.Field(sequential=True,
                          tokenize=list,
                          include_lengths=True,
                          init_token=None,
                          eos_token=None,
                          preprocessing=zero_num,
                          batch_first=True)
    letters = NestedField(
        letter,
        use_vocab=True,
        tensor_type=torch.FloatTensor,
        init_token=init_token,
        eos_token=eos_token,
    )

    if language == 'en':
        fields = [[('sentences', sentences), ('letters', letters)], ('', None),
                  ('', None), ('tags', tags)]
    elif language == 'de':
        fields = [[('sentences', sentences), ('letters', letters)], ('', None),
                  ('', None), ('', None), ('tags', tags)]
    elif language in ['es', 'nl']:
        fields = [[('sentences', sentences), ('letters', letters)], ('', None),
                  ('tags', tags)]
    else:
        raise ValueError('Wrong language')

    tagger_languages = {'en': 'eng', 'nl': 'ned', 'de': 'deu', 'es': 'esp'}

    train_data, val_data, test_data = SequenceTaggingDataset.splits(
        path=expanduser('~/data/sdtw_data/conll'),
        train='%s.train' % tagger_languages[language],
        validation='%s.testa' % tagger_languages[language],
        test='%s.testb' % tagger_languages[language],
        n_samples=n_samples,
        fields=fields)

    letters.build_vocab(train_data, val_data, test_data)
    tags.build_vocab(train_data)
    tag_itos = tags.vocab.itos
    if pad_edge:
        eos_idx = tags.vocab.stoi[tags.eos_token]
        init_idx = tags.vocab.stoi[tags.init_token]
        tag_itos[eos_idx] = 'O'
        tag_itos[init_idx] = 'O'
    else:
        eos_idx = None
        init_idx = None

    if isinstance(pretrained_embeddings, int):
        sentences.build_vocab(train_data, val_data, test_data)
        embedding_dim = pretrained_embeddings
    else:
        if pretrained_embeddings == 'ner':
            vectors = CaseInsensitiveVectors(
                expanduser('~/data/sdtw_data/ner/%s' %
                           tagger_languages[language]),
                unk_init=lambda x: x.normal_(0, 1),
                cache=expanduser('~/cache'))
        elif 'glove' in pretrained_embeddings:
            _, name, dim = pretrained_embeddings.split('.')
            dim = dim[:-1]
            GloVe.__getitem__ = CaseInsensitiveVectors.__getitem__
            vectors = GloVe(name=name, dim=dim, cache=expanduser('~/cache'))
        elif pretrained_embeddings == 'fasttext':
            FastText.__getitem__ = CaseInsensitiveVectors.__getitem__
            FastText.cache = CaseInsensitiveVectors.cache
            vectors = FastText(language=language, cache=expanduser('~/cache'))
        # extend vocab with words of test/val set that has embeddings in
        # pre-trained embedding
        # A prod-version would do it dynamically at inference time
        counter = Counter()
        sentences.build_vocab(val_data, test_data)
        for word in sentences.vocab.stoi:
            if word in vectors.stoi or word.lower() in vectors.stoi or \
                    re.sub(r'\d', '0', word.lower()) in vectors.stoi:
                counter[word] = 1
        eval_vocab = Vocab(counter)
        print("%i/%i eval/test word in pretrained" %
              (len(counter), len(sentences.vocab.stoi)))
        sentences.build_vocab(train_data)
        prev_vocab_size = len(sentences.vocab.stoi)
        sentences.vocab.extend(eval_vocab)
        new_vocab_size = len(sentences.vocab.stoi)
        print('New vocab size: %i (was %i)' %
              (new_vocab_size, prev_vocab_size))
        sentences.vocab.load_vectors(vectors)
        embedding_dim = sentences.vocab.vectors.shape[1]
    artifact_dir = _run.info['artifact_dir']
    vocab_dict = {
        'sentences': sentences.vocab,
        'tags': tags.vocab,
        'letters': letter.vocab
    }
    torch.save(vocab_dict, open(join(artifact_dir, 'vocab.pt'), 'wb+'))

    unk_idx = sentences.vocab.stoi[sentences.unk_token]
    padding_idx = sentences.vocab.stoi[sentences.pad_token]
    singleton_idx = [
        tags.vocab.stoi[singleton] for singleton in tags.vocab.stoi
        if 'S-' in singleton
    ]
    tagset_size = len(tags.vocab)
    vocab_size = len(sentences.vocab)
    letter_size = len(letters.vocab)

    device_iter = -1 if device.type == 'cpu' else device.index
    train_iter, val_iter, test_iter = Iterator.splits(
        (train_data, val_data, test_data),
        sort_within_batch=True,
        batch_sizes=(batch_size, 512, 512),
        device=device_iter)
    train_test_iter = Iterator(train_data,
                               sort_within_batch=True,
                               batch_size=512,
                               shuffle=True,
                               device=device_iter)
    eval_iter = {
        'val': val_iter,
        'test': test_iter,
        'train_test': [next(iter(train_test_iter))]
    }

    model = Tagger(embedding_dim,
                   vocab_size,
                   tagset_size,
                   hidden_dim=hidden_dim,
                   proc=proc,
                   padding_idx=padding_idx,
                   letter_proc=letter_proc,
                   letter_embedding_dim=letter_embedding_dim,
                   letter_hidden_dim=letter_hidden_dim,
                   letter_size=letter_size,
                   dropout=dropout,
                   eos_idx=eos_idx,
                   init_idx=init_idx,
                   alpha=alpha,
                   operator=operator)

    # Load vectors
    if hasattr(sentences.vocab, 'vectors'):
        model.embedder.word_embeddings.weight.data = sentences.vocab.vectors
        model.embedder.word_embeddings.weight.data[padding_idx].fill_(0.)

    model = model.to(device=device)

    if operator == 'softmax':
        loss_function = OurNLLLoss()
    else:
        loss_function = BinaryMSELoss()

    score_function = functools.partial(ner_score,
                                       tag_itos=tag_itos,
                                       format='iobes')

    if optimizer == 'sgd':
        optimizer = torch.optim.SGD(params=model.parameters(),
                                    lr=lr * batch_size,
                                    momentum=momentum)
    elif optimizer == 'adam':
        optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
    else:
        raise ValueError()
    scheduler = ReduceLROnPlateau(optimizer,
                                  mode='min',
                                  factor=0.5,
                                  patience=5,
                                  threshold=1e-3,
                                  cooldown=2)

    for fold in eval_iter:
        _run.info['%s_loss' % fold] = []
        _run.info['%s_prec' % fold] = []
        _run.info['%s_recall' % fold] = []
        _run.info['%s_f1' % fold] = []
    _run.info['epochs'] = []
    _run.info['time'] = []
    last_epoch = floor(train_iter.epoch)
    t0 = time.perf_counter()  # time.clock() was removed in Python 3.8
    total_time = 0

    for batch in train_iter:
        epoch = floor(train_iter.epoch)
        if epoch > last_epoch:
            t1 = time.perf_counter()
            elapsed = t1 - t0
            total_time += elapsed
            model.eval()
            _log.info("epoch %i, time/epoch %.3f s" % (epoch, elapsed))
            if epoch % 10 == 0:
                dump_model(model, 'model_%i.pt' % epoch)
            for fold in eval_iter:
                this_iter = eval_iter[fold]
                this_iter = iter(this_iter)
                loss, prec, recall, f1 = validate(model, this_iter,
                                                  score_function, objective,
                                                  loss_function)
                if fold == 'val':
                    scheduler.step(loss.item(), epoch=epoch)
                _log.info("%s: loss %.4f, prec %.4f, recall %.4f, f1 %.4f" %
                          (fold, loss, prec, recall, f1))
                _run.info['%s_loss' % fold].append(loss.item())
                _run.info['%s_prec' % fold].append(prec)
                _run.info['%s_recall' % fold].append(recall)
                _run.info['%s_f1' % fold].append(f1)
            _run.info['time'].append(total_time)
            _run.info['epochs'].append(epoch)
            if epoch > n_epochs:
                break
            t0 = time.perf_counter()
        data = make_data(batch,
                         augment=augment,
                         unk_idx=unk_idx,
                         singleton_idx=singleton_idx)
        model.train()
        model.zero_grad()
        loss = compute_loss(model, data, objective, loss_function)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5, norm_type=2)
        optimizer.step()
        last_epoch = epoch
    dump_model(model, 'model_final.pt')

    return _run.info['test_f1'][-1]
Example #30
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

# make iterator for splits
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train, val, test), batch_size=3)

# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)

# Approach 2:
TEXT.build_vocab(
    train, vectors=[GloVe(name='840B', dim='300'),
                    CharNGram(),
                    FastText()])
LABEL.build_vocab(train)

# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

train_iter, val_iter, test_iter = datasets.SST.iters(batch_size=4)

# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)

# Approach 3:
f = FastText()