Example #1
def get_pbg(qids, emb_path, emb_file):
    if os.path.isfile(os.path.join(emb_path, emb_file + '.pt')):
        return Vectors(name=emb_file, cache=emb_path)

    # create temporary file with only the required embeddings and return
    with open(os.path.join(emb_path, emb_file), 'w',
              encoding='utf-8') as new_file:
        pbg_file = open(os.path.join(emb_path,
                                     'wikidata_embeddings_tranlation_v1.tsv'),
                        'r',
                        encoding='utf-8')
        index_file = open(os.path.join(emb_path,
                                       'wikidata_qcodes_pointers_v1.json'),
                          'r',
                          encoding='utf-8')
        index = json.load(index_file)
        found = 0
        for qid in qids:
            pos = index.get(qid, None)
            if pos is not None:  # an offset of 0 is valid
                found += 1
                pbg_file.seek(pos)
                line = ' '.join(pbg_file.readline().strip().split('\t'))
                new_file.write(line + '\n')
        index_file.close()
        pbg_file.close()
    print('Created smaller PBG embedding file with {}/{} QIDs...'.format(
        found, len(qids)))
    return Vectors(name=emb_file, cache=emb_path)
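
A hedged usage sketch for get_pbg; the QIDs, directory, and file name below are placeholders, and the full PBG .tsv plus the QID-to-offset .json index are assumed to already sit in emb_path:

qids = ['Q42', 'Q64', 'Q90']  # placeholder Wikidata QIDs
vectors = get_pbg(qids, emb_path='embeddings', emb_file='pbg_subset.vec')
print(vectors.dim, len(vectors.itos))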
Example #2
    def iters(cls,
              path,
              batch_size=64,
              shuffle=True,
              device=0,
              vectors_path=None):
        assert vectors_path is not None, (
            "should generate initial vectors first, by: "
            "python get_save_vectors.py")
        train, val, test = cls.splits(path)
        if isinstance(cls.TEXT_FIELD, NestedField):
            cls.TEXT_FIELD.nesting_field.vocab = Vectors(
                name='digital_text_200', cache=vectors_path)
        else:
            cls.TEXT_FIELD.vocab = Vectors(name='digital_text',
                                           cache=vectors_path)
        cls.USR_FIELD.vocab = Vectors(name='digital_usr', cache=vectors_path)
        cls.PRD_FIELD.vocab = Vectors(name='digital_prd', cache=vectors_path)

        return BucketIterator.splits((train, val, test),
                                     batch_size=batch_size,
                                     repeat=False,
                                     shuffle=shuffle,
                                     sort_within_batch=True,
                                     device=device)
Example #3
    def __init__(self, model_path: str, train_path: str, wordemb_path: str,
                 charemb_path: str, hidden_size: int):
        """
        :param model_path: path to the trained model file (.pth)
        :param train_path: path to the file used for training
        :param wordemb_path: path to the word embeddings used for training
        :param charemb_path: path to the char embeddings used for training
        :param hidden_size: size of the hidden layer
        """

        self.mecab = MeCab.Tagger('-Owakati')
        self.WORD = data.Field(batch_first=True)
        self.CHAR = data.Field(batch_first=True)
        self.LABEL = data.Field(batch_first=True)
        self.fields = [('char', self.CHAR), ('word', self.WORD),
                       ('label', self.LABEL)]
        self.dataset = datasets.SequenceTaggingDataset(path=train_path,
                                                       fields=self.fields,
                                                       separator='\t')
        self.CHAR.build_vocab(self.dataset, vectors=Vectors(charemb_path))
        self.WORD.build_vocab(self.dataset, vectors=Vectors(wordemb_path))
        self.LABEL.build_vocab(self.dataset)
        self.model = BLSTMCRF(len(self.LABEL.vocab.itos), hidden_size, 0.0,
                              self.WORD.vocab.vectors.size()[1],
                              self.CHAR.vocab.vectors.size()[1])
        self.model.load(model_path)
Example #4
    def __init__(self, text_path: str, wordemb_path: str, charemb_path: str,
                 device: str):
        """
        Expected format of the dataset (char, word, label separated by tabs),
        shown for the example sentence below:
        私は白い恋人を食べました
        私  私  O
        は  は  O
        白  白い    B-PRO
        い  白い    I-PRO
        恋  恋人    I-PRO
        人  恋人    I-PRO
        を  を  O
        食  食べ    O
        べ  食べ    O
        ま  まし    O
        し  まし    O
        た  た  O
        """

        self.WORD = data.Field(batch_first=True)
        self.CHAR = data.Field(batch_first=True)
        self.LABEL = data.Field(batch_first=True)
        self.fields = [('char', self.CHAR), ('word', self.WORD),
                       ('label', self.LABEL)]
        self.dataset = datasets.SequenceTaggingDataset(path=text_path,
                                                       fields=self.fields,
                                                       separator='\t')
        self.CHAR.build_vocab(self.dataset, vectors=Vectors(charemb_path))
        self.WORD.build_vocab(self.dataset, vectors=Vectors(wordemb_path))
        self.LABEL.build_vocab(self.dataset)
        self.device = device
Example #5
    def get_vectors(self, vocab):
        sources = None
        if self.wordvec_source == 'glove':
            sources = ['GloVe']
        elif self.wordvec_source == 'charlevel':
            sources = ['GloVe', 'charLevel']
        elif self.wordvec_source == 'google':
            sources = ['googlenews']
        elif self.wordvec_source == 'gigavec':
            sources = ['gigavec']
        else:
            sources = []

        print('Building Vocab...')

        if isinstance(vocab, Vocab):
            print("Using Pretrained Vocab")
            self.sentence_field.vocab = vocab
            print(len(self.sentence_field.vocab.itos))
        else:
            print('No pretrained vocab provided; building a new one')
            vecs = []
            print('Loading Vectors From Memory...')
            if self.pretrained_vecs:
                print('Using these vectors: ' + str(self.wordvec_source))
                for source in sources:
                    if source == 'GloVe':
                        glove = Vectors(name='glove.6B.{}d.txt'.format(
                            self.glove_dim),
                                        cache=self.vector_cache)
                        vecs.append(glove)
                        self.wordvec_dim += self.glove_dim
                    if source == 'charLevel':
                        charVec = CharNGram()
                        vecs.append(charVec)  # append so the vectors match wordvec_dim below
                        self.wordvec_dim += 100
                    if source == 'googlenews':
                        googlenews = Vectors(name='googlenews.txt',
                                             cache=self.vector_cache)
                        vecs.append(googlenews)
                        self.wordvec_dim += 300
                    if source == 'gigavec':
                        gigavec = Vectors(name='gigamodel.vec',
                                          cache=self.vector_cache)
                        vecs.append(gigavec)
                        self.wordvec_dim += 300

            if isinstance(vocab, Counter):
                self.sentence_field.vocab = Vocab(vocab,
                                                  vectors=vecs,
                                                  max_size=self.max_vocab)
            else:
                self.sentence_field.build_vocab(self.train_sentences,
                                                vectors=vecs,
                                                max_size=self.max_vocab,
                                                min_freq=MIN_FREQ)
                print('Found {} tokens'.format(len(self.sentence_field.vocab)))

        if self.tie_weights:
            self.hidden_size = self.wordvec_dim
Example #6
def load_data(args):

    # First, define Fields ('columns') with the pre-processing baked in.
    # Then call TranslationDataset.splits, which splits the raw data into train/val/test datasets.
    # Finally call Field.build_vocab, which builds the vocabulary from the training data:
    # a mapping from words to ids (including SOS and EOS) stored on each Field object.

    SRC = data.Field(tokenize=tokenize,
                     init_token='SOS',
                     eos_token='EOS',
                     include_lengths=True,
                     fix_length=args.max_sentence_length)

    TRG = data.Field(tokenize=tokenize,
                     init_token='SOS',
                     eos_token='EOS',
                     lower=True,
                     include_lengths=True,
                     fix_length=args.max_sentence_length)

    train, val, test = TranslationDataset.splits(path=args.data,
                                                 train=args.train_prefix,
                                                 validation=args.val_prefix,
                                                 test=args.test_prefix,
                                                 exts=(args.src_ext,
                                                       args.trg_ext),
                                                 fields=(SRC, TRG))

    if hasattr(args, 'fasttext'):
        src_vecs = Vectors(name=args.fasttext_src_dir,
                           max_vectors=args.max_vocab_size)
        trg_vecs = Vectors(name=args.fasttext_trg_dir,
                           max_vectors=args.max_vocab_size)

        SRC.build_vocab(train.src,
                        min_freq=args.min_freq,
                        max_size=args.max_vocab_size)
        TRG.build_vocab(train.trg,
                        min_freq=args.min_freq,
                        max_size=args.max_vocab_size)

        SRC.vocab.set_vectors(src_vecs.stoi, src_vecs.vectors, src_vecs.dim)
        TRG.vocab.set_vectors(trg_vecs.stoi, trg_vecs.vectors, trg_vecs.dim)
    else:
        SRC.build_vocab(train.src,
                        min_freq=args.min_freq,
                        max_size=args.max_vocab_size)
        TRG.build_vocab(train.trg,
                        min_freq=args.min_freq,
                        max_size=args.max_vocab_size)

    print("most common source vocabs:", SRC.vocab.freqs.most_common(10))
    print("source vocab size:", len(SRC.vocab))
    print("most common english vocabs:", TRG.vocab.freqs.most_common(10))
    print("english vocab size:", len(TRG.vocab))

    return train, val, test, SRC, TRG
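
For reference, a hedged sketch of the argument namespace this load_data expects; the paths, prefixes, and extensions are placeholders, and the fasttext* attributes only need to exist when pre-trained vectors should be attached:

import argparse

args = argparse.Namespace(
    data='data/translation',            # placeholder dataset directory
    train_prefix='train', val_prefix='valid', test_prefix='test',
    src_ext='.de', trg_ext='.en',       # placeholder file extensions
    max_sentence_length=50,
    min_freq=2,
    max_vocab_size=30000,
    fasttext=True,                      # delete this attribute to skip pre-trained vectors
    fasttext_src_dir='cc.de.300.vec',   # placeholder vector files
    fasttext_trg_dir='cc.en.300.vec')
train, val, test, SRC, TRG = load_data(args)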
Example #7
def create_data(data, lang):
    source_text = Field(tokenize=MosesTokenizer('en'),
                        init_token='<sos>',
                        eos_token='<eos>',
                        lower=True,
                        pad_token='<pad>',
                        unk_token='<unk>')
    target_text = Field(tokenize=MosesTokenizer(lang),
                        init_token='<sos>',
                        eos_token='<eos>',
                        lower=True,
                        pad_token='<pad>',
                        unk_token='<unk>')

    train = TranslationDataset(path=data,
                               exts=('.en', '.' + lang),
                               fields=(source_text, target_text))

    # Load the word vectors from the embedding directory
    print('Loading en word vectors')
    en_vectors = Vectors(name='cc.en.300.vec', cache=emb_dir)
    print('Loaded.')
    print('Loading {} word vectors'.format(lang))
    if lang == 'fr':
        target_vectors = Vectors(name='cc.fr.300.vec', cache=emb_dir)
    elif lang == 'de':
        target_vectors = Vectors(name='embed_tweets_de_100D_fasttext',
                                 cache=emb_dir)
    else:
        raise NotImplementedError
    print('Loaded.')

    # Build vocabulary
    print('Building en vocab')
    source_text.build_vocab(train,
                            max_size=15000,
                            min_freq=1,
                            vectors=en_vectors)
    print('Building {} vocab'.format(lang))
    target_text.build_vocab(train,
                            max_size=15000,
                            min_freq=1,
                            vectors=target_vectors)
    #source_text.build_vocab(train, min_freq = 30000, vectors="glove.6B.200d")
    #target_text.build_vocab(train, min_freq = 30000, vectors="glove.6B.200d")

    pad_idx = target_text.vocab.stoi['<pad>']
    print('pad_idx', pad_idx)
    eos_idx = target_text.vocab.stoi['<eos>']
    print('eos_idx', eos_idx)

    return train, source_text, target_text
Example #8
    def get_splits(self, device, batch_size):
        train_dataset = torchtext.data.Dataset(self.train_examples,
                                               self.fields)
        train_dataset.sort_key = lambda example: len(example.input)

        dev_dataset = torchtext.data.Dataset(self.dev_examples, self.fields)
        dev_dataset.sort_key = lambda example: len(example.input)

        test_dataset = torchtext.data.Dataset(self.test_examples, self.fields)
        test_dataset.sort_key = lambda example: len(example.input)

        vectors = Vectors(name=WORD2VEC_EMBEDDING_FILE,
                          cache=WORD2VEC_EMBEDDING_DIR,
                          unk_init=torch.Tensor.zero_)
        self.input_field.build_vocab(train_dataset,
                                     dev_dataset,
                                     test_dataset,
                                     vectors=vectors)
        self.query_field.build_vocab(train_dataset,
                                     dev_dataset,
                                     test_dataset,
                                     vectors=vectors)

        return BucketIterator.splits(
            (train_dataset, dev_dataset, test_dataset),
            batch_size=batch_size,
            repeat=False,
            shuffle=True,
            sort_within_batch=True,
            device=device)
Example #9
def main():
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    TEXT = data.Field()
    LABEL = data.Field(sequential=False, dtype=torch.long)

    train, val, test = datasets.SST.splits(TEXT,
                                           LABEL,
                                           fine_grained=True,
                                           train_subtrees=False)

    LABEL.build_vocab(train)

    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test), batch_size=8, device=device)
    TEXT.build_vocab(train, vectors=Vectors(name="vector.txt", cache="./data"))
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)
    INPUT_DIM = len(TEXT.vocab)
    EMBEDDING_DIM = 300
    HIDDEN_DIM = 768
    OUTPUT_DIM = 5
    for file in os.listdir("./trained_models"):
        print(file)
        checkpoint = torch.load("./trained_models/" + file)
        print(checkpoint)
        for k in checkpoint:
            print(k)
        model = md.model(file[:-9])(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM,
                                    OUTPUT_DIM)
        model.load_state_dict(checkpoint)  # load_state_dict does not return the model
        model = model.to(device)
        # Test
        test_loss, test_acc = solver.evaluate(model, test_iter, criterion)
        print(file[:-9] +
              f" Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%")
Example #10
    def __init__(self, args):
        super(SharedLSTM, self).__init__(args)

        self.args = args
        self.hidden_dim = 300
        self.class_num = args.class_num
        self.batch_size = args.batch_size
        self.bidirectional = True
        self.num_layers = args.num_layers
        self.pad_index = args.pad_index
        self.dropout = args.dropout
        self.save_path = args.save_path

        vocabulary_size = args.vocabulary_size
        embedding_dimension = args.embedding_dim

        self.embedding = nn.Embedding(vocabulary_size,
                                      embedding_dimension).to(DEVICE)
        if args.static:
            logger.info('loading word vectors from {}'.format(
                args.vector_path))
            vectors = Vectors(args.vector_path).vectors
            self.embedding = self.embedding.from_pretrained(
                vectors, freeze=not args.non_static).to(DEVICE)

        self.lstm = nn.LSTM(embedding_dimension,
                            self.hidden_dim // 2,
                            bidirectional=self.bidirectional,
                            num_layers=self.num_layers,
                            dropout=self.dropout).to(DEVICE)
        self.dropout_layer = nn.Dropout(self.dropout).to(DEVICE)
        self.batch_norm = nn.BatchNorm1d(self.hidden_dim * 2).to(DEVICE)
        self.hidden2label = nn.Linear(self.hidden_dim * 2,
                                      self.class_num).to(DEVICE)
Example #11
    def load_my_data(self, word_embedding_pkl, pairs_pkl):
        """
        Loads the data from file
        :param word_embedding_pkl: absolute path to word_embeddings {Glove/Word2Vec}
        :param pairs_pkl:       # pkl file save data
        context_flag            # 0: bairly include pairs
                                # 1: include pairs and local context
                                # 2: include pairs and global context
                                # 3: include pairs, local context and global context
        :return:
        """
        tokenizer = lambda text: [x for x in text]

        TEXT = data.Field(sequential=True, tokenize=tokenizer)
        LABEL = data.Field(sequential=False, use_vocab=False)
        datafields = [("text", TEXT), ("label", LABEL)]

        # Load data from pd.DataFrame into torchtext.data.Dataset
        train_df, test_df, val_df = self.get_my_pandas_df(
            pairs_pkl, self.config.context_flag)

        train_examples = [
            data.Example.fromlist(i, datafields)
            for i in train_df.values.tolist()
        ]
        train_data = data.Dataset(train_examples, datafields)

        test_examples = [
            data.Example.fromlist(i, datafields)
            for i in test_df.values.tolist()
        ]
        test_data = data.Dataset(test_examples, datafields)

        val_examples = [
            data.Example.fromlist(i, datafields)
            for i in val_df.values.tolist()
        ]
        val_data = data.Dataset(val_examples, datafields)

        TEXT.build_vocab(train_data, vectors=Vectors(name=word_embedding_pkl))
        self.word_embeddings = TEXT.vocab.vectors
        self.vocab = TEXT.vocab

        self.train_iterator = data.BucketIterator(
            train_data,
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=True)

        self.val_iterator, self.test_iterator = data.BucketIterator.splits(
            (val_data, test_data),
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=True)

        print('Loaded %d training examples' % len(train_data))
        print('Loaded %d test examples' % len(test_data))
        print('Loaded %d validation examples' % len(val_data))
Example #12
def torchLoad(config):

    TEXT = data.Field(sequential=True, fix_length=config.kwargs['padding_size'])
    LABEL = data.Field(sequential=True, use_vocab=False)

    train = GrandDataset(config.kwargs['raw_train_path'], text_field=TEXT,
                         label_field=LABEL, config=config, test=False)
    val = GrandDataset(config.kwargs['raw_vali_path'], text_field=TEXT,
                       label_field=LABEL, config=config, test=False)
    test = GrandDataset(config.kwargs['raw_test_path'], text_field=TEXT,
                        label_field=None, config=config, test=True)

    cache = '../cache/'
    # load the pre-trained word2vec vectors
    embedding_path = config.kwargs['embedding_path']
    print('load word2vec vectors from {}'.format(embedding_path))
    vectors = Vectors(name=embedding_path, cache=cache)
    vectors.unk_init = init.xavier_uniform_
    print('building {} vocabulary......'.format('Word'))
    TEXT.build_vocab(train, val, test, min_freq=1, vectors=vectors)


    train_iter = data.Iterator(dataset=train, batch_size=config.kwargs['batch_size'],
                               sort=False, shuffle=True, repeat=False, device=-1)
    val_iter = data.Iterator(dataset=val, batch_size=config.kwargs['batch_size'],
                             shuffle=False, sort=False, repeat=False, device=-1)

    test_iter = data.Iterator(dataset=test, batch_size=config.kwargs['batch_size'],
                              shuffle=False, sort=False, repeat=False, device=-1)

    numerical_dict = TEXT.vocab.stoi

    return train_iter, val_iter, test_iter, TEXT.vocab.vectors, numerical_dict



# torchLoad(config)
Example #13
    def test_extend_vectors_1(self):
        vectors_cache_dir = '.cache'
        if os.path.exists(vectors_cache_dir):
            shutil.rmtree(vectors_cache_dir)

        pathdir = os.path.abspath(os.path.join(test_dir_path, 'test_datasets'))
        filename = 'fasttext_sample.vec'
        file = os.path.join(pathdir, filename)
        url_base = urljoin('file:', pathname2url(file))
        vecs = Vectors(name=filename, cache=vectors_cache_dir, url=url_base)
        self.assertIsInstance(vecs, Vectors)

        vec_data = MatchingField._get_vector_data(vecs, vectors_cache_dir)
        v = MatchingVocab(Counter())
        v.vectors = torch.Tensor(1, vec_data[0].dim)
        v.unk_init = torch.Tensor.zero_
        tokens = {'hello', 'world'}
        v.extend_vectors(tokens, vec_data)
        self.assertEqual(len(v.itos), 4)
        self.assertEqual(v.vectors.size(), torch.Size([4, 300]))
        self.assertEqual(list(v.vectors[2][0:10]), [0.0] * 10)
        self.assertEqual(list(v.vectors[3][0:10]), [0.0] * 10)

        if os.path.exists(vectors_cache_dir):
            shutil.rmtree(vectors_cache_dir)
Example #14
def load_word_vector(train_data,
                     test_data,
                     train_type=None,
                     used_unlabeled_data=None):

    label_vector = pd.Series([
        'background', 'compares', 'contrasts', 'extension', 'future',
        'motivation', 'uses'
    ])
    # Download word vector
    print('Loading word vectors')
    path = os.path.join('/home/g19tka13/wordvector', 'wiki.en.vec')
    if not os.path.exists(path):
        print('Download word vectors')
        import urllib.request
        urllib.request.urlretrieve(
            'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.vec',
            path)
    vectors = Vectors('wiki.en.vec', cache='/home/g19tka13/wordvector')
    vocab = Vocab(collections.Counter(
        count_all_words(train_data['citation_context'].append(
            test_data['citation_context'],
            ignore_index=True).append(label_vector, ignore_index=True))),
                  specials=['<pad>', '<unk>'],
                  vectors=vectors)
    return vocab
Example #15
def vocab_from_vectors(vector_kwargs_list, vocab_kwargs):
    r"""Get Vocab object encompassing all the words in each vector list items.
    Each item in vector_kwargs_list corresponds to the kwargs needed to obtain
    individual vector lookup table. All of these are combined by concatenation
    to get a unified vocab object.
    
    NOTE: Since multiple vectors can be used, vector_kwargs_list must contain
    argument names even for positional arguments. Incase of vocab_kwargs, counter 
    and vectors will be inferred and hence need not be provided."""

    assert len(vector_kwargs_list) > 0
    vocab_kwargs = deepcopy(vocab_kwargs)

    # obtain vectors and counter from list of vector creating keyword arguments
    vectors = list()
    vocab_kwargs["counter"] = Counter()

    for kwargs in vector_kwargs_list:
        vecs = Vectors(**kwargs)
        vectors.append(vecs)
        vocab_kwargs["counter"].update(vecs.itos)

    vocab_kwargs["vectors"] = vectors
    vocab = Vocab(**vocab_kwargs)

    return vocab
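
A hedged usage sketch of the helper above; the file names and cache directory are placeholders for vector files that torchtext's Vectors can load:

vector_kwargs_list = [
    {'name': 'glove.6B.100d.txt', 'cache': '.vector_cache'},  # placeholder files
    {'name': 'wiki.simple.vec', 'cache': '.vector_cache'},
]
vocab_kwargs = {'max_size': 50000, 'specials': ['<unk>', '<pad>']}
vocab = vocab_from_vectors(vector_kwargs_list, vocab_kwargs)
print(len(vocab), vocab.vectors.shape)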
Example #16
def PT_preprocessing(bsize=10, bptt=32, shuf=True, cuda=-1):
    # Our input $x$
    TEXT = torchtext.data.Field()

    # Language Modelling Dataset from the Penn Treebank
    # http://aclweb.org/anthology/J93-2004
    train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
        path=".",
        train="train.txt",
        validation="valid.txt",
        test="valid.txt",
        text_field=TEXT)

    #Full length vocab build
    TEXT.build_vocab(train)

    #Batching
    train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
        (train, val, test),
        batch_size=bsize,
        device=cuda,
        bptt_len=bptt,
        repeat=False,
        shuffle=shuf)

    # Build the vocabulary with word embeddings
    url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
    TEXT.vocab.load_vectors(vectors=Vectors('wiki.simple.vec', url=url))

    return train_iter, val_iter, test_iter, TEXT


#train_iter, val_iter, test_iter, TEXT = pp.PT_preprocessing(bsize=bsize, bptt=bptt, shuf=False, cuda=cuda)
Example #17
def preprocess_data(args):
    TEXT = torchtext.data.Field(lower=True)
    train, dev, test = torchtext.datasets.LanguageModelingDataset.splits(
        path='./data',
        train='text8.train.txt',
        validation='text8.dev.txt',
        test='text8.test.txt',
        text_field=TEXT)
    cache = 'mycache'
    if not os.path.exists(cache):
        os.mkdir(cache)
    vectors = Vectors(
        name='/Users/zhoup/wordEmbedding/glove/glove.6B.300d.txt', cache=cache)
    # vectors.unk_init = nn.init.xavier_uniform_

    TEXT.build_vocab(train, vectors=vectors)
    VOCAB_SIZE = len(TEXT.vocab)
    train_iter, dev_iter, test_iter = torchtext.data.BPTTIterator.splits(
        (train, dev, test),
        batch_size=args.batch_size,
        device=args.device,
        bptt_len=100,
        repeat=False,
        shuffle=True,
    )
    return VOCAB_SIZE, train_iter, dev_iter, test_iter, TEXT.vocab.vectors
Example #18
    def __init__(self, root_dir='data', batch_size=64, use_vector=True):
        self.TEXT = Field(sequential=True, use_vocab=True,
                          tokenize='spacy', lower=True, batch_first=True)
        self.LABEL = LabelField(tensor_type=torch.FloatTensor)
        vectors = Vectors(name='mr_vocab.txt', cache='./')

        dataset_path = os.path.join(root_dir, '{}.tsv')
        self.dataset = {}
        self.dataloader = {}
        for target in ['train', 'dev', 'test']:
            self.dataset[target] = TabularDataset(
                path=dataset_path.format(target),
                format='tsv',
                fields=[('text', self.TEXT), ('label', self.LABEL)]
            )
            if use_vector:
                self.TEXT.build_vocab(self.dataset[target], max_size=25000, vectors=vectors)
            else:
                self.TEXT.build_vocab(self.dataset[target], max_size=25000)

            self.LABEL.build_vocab(self.dataset[target])
            self.dataloader[target] = Iterator(self.dataset[target],
                                               batch_size=batch_size,
                                               device=None,
                                               repeat=False,
                                               sort_key=lambda x: len(x.text),
                                               shuffle=True)
Example #19
    def __init__(self, langs, cache):
        self.langs = langs
        self.lEmbed = {}
        self.lExtracted = {}
        for lang in self.langs:
            print(f'Loading vectors for {lang}...')
            self.lEmbed[lang] = Vectors(f'wiki.multi.{lang}.vec', cache)
Example #20
def main(n_epochs, learning_rate):
    # Text processing library and methods for pretrained word embeddings

    # Our input $x$
    TEXT = torchtext.data.Field()

    # Our labels $y$
    LABEL = torchtext.data.Field(sequential=False)

    train_dataset, val_dataset, test_dataset = torchtext.datasets.SST.splits(
        TEXT, LABEL, filter_pred=lambda ex: ex.label != 'neutral')

    TEXT.build_vocab(train_dataset)
    LABEL.build_vocab(train_dataset)

    train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
        (train_dataset, val_dataset, test_dataset), batch_size=10, device=-1)

    url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
    TEXT.vocab.load_vectors(vectors=Vectors('wiki.simple.vec', url=url))

    W, b = train_CBOW(train_iter, val_iter, TEXT, learning_rate, n_epochs)

    upload = []
    true = []
    for batch in test_iter:
        # Your prediction data here (don't cheat!)
        probs = (predict_CBOW(batch, TEXT, W, b) > 0.5).long()
        upload += list(probs.data)
        true += batch.label.data.numpy().tolist()
    true = [x if x == 1 else 0 for x in true]
    print("test accuracy:")
    print(sum([(x == y) for x, y in zip(upload, true)]) / len(upload))
Example #21
    def test_vocab_download_custom_vectors(self):
        c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
        # Build a vocab and get vectors twice to test caching.
        for i in range(2):
            v = vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'],
                            vectors=Vectors('wiki.simple.vec',
                                            url=FastText.url_base.format('simple')))

            self.assertEqual(v.itos, ['<unk>', '<pad>', '<bos>',
                                      'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world'])
            vectors = v.vectors.numpy()

            # The first 5 entries in each vector.
            expected_fasttext_simple_en = {
                'hello': [0.39567, 0.21454, -0.035389, -0.24299, -0.095645],
                'world': [0.10444, -0.10858, 0.27212, 0.13299, -0.33165],
            }

            for word in expected_fasttext_simple_en:
                assert_allclose(vectors[v.stoi[word], :5],
                                expected_fasttext_simple_en[word])

            assert_allclose(vectors[v.stoi['<unk>']], np.zeros(300))
        # Delete the vectors after we're done to save disk space on CI
        if os.environ.get("TRAVIS") == "true":
            vec_file = os.path.join(self.project_root, ".vector_cache", "wiki.simple.vec")
            conditional_remove(vec_file)
Example #22
    def __init__(self,
                 data_folder,
                 dataset_name,
                 split,
                 text_vector=None,
                 device=None):
        vectors = Vectors(
            name=text_vector,
            cache='.vector_cache') if text_vector is not None else None
        self.img_path = os.path.join(
            data_folder, "{}_IMAGES_{}.hdf5".format(dataset_name, split))
        self.caption_path = os.path.join(
            data_folder, "{}_CAPTIONS_{}.hdf5".format(dataset_name, split))
        self.h = h5py.File(self.img_path, 'r')
        self.imgs = self.h['images']
        with open(self.caption_path, 'r', encoding='utf-8') as f:
            self.captions = f.read().splitlines()
        assert len(self.captions) == len(self.imgs)
        self.caption2img = []
        self.caption_tokens = []
        for i, caption in enumerate(self.captions):
            sentences = caption.split('|||')
            for sentence in sentences:
                self.caption_tokens.append(tokenizer(sentence))
                self.caption2img.append(i)
        self.max_length = max([len(token) for token in self.caption_tokens])
        self.corpus = data.Field(eos_token='<eos>',
                                 init_token='<bos>',
                                 unk_token='<unk>',
                                 sequential=True,
                                 fix_length=self.max_length)
        self.corpus.build_vocab(self.caption_tokens, vectors=vectors)
        self.caption_tokens = self.corpus.pad(self.caption_tokens)
Example #23
    def test_extend_vocab_1(self):
        vectors_cache_dir = '.cache'
        if os.path.exists(vectors_cache_dir):
            shutil.rmtree(vectors_cache_dir)

        mf = MatchingField()
        lf = MatchingField(id=True, sequential=False)
        fields = [('id', lf), ('left_a', mf), ('right_a', mf), ('label', lf)]
        col_naming = {
            'id': 'id',
            'label': 'label',
            'left': 'left_',
            'right': 'right_'
        }

        pathdir = os.path.abspath(os.path.join(test_dir_path, 'test_datasets'))
        filename = 'fasttext_sample.vec'
        file = os.path.join(pathdir, filename)
        url_base = urljoin('file:', pathname2url(file))
        vecs = Vectors(name=filename, cache=vectors_cache_dir, url=url_base)

        data_path = os.path.join(test_dir_path, 'test_datasets',
                                 'sample_table_small.csv')
        md = MatchingDataset(fields, col_naming, path=data_path)

        mf.build_vocab()
        mf.vocab.vectors = torch.Tensor(len(mf.vocab.itos), 300)
        mf.extend_vocab(md, vectors=vecs)
        self.assertEqual(len(mf.vocab.itos), 6)
        self.assertEqual(mf.vocab.vectors.size(), torch.Size([6, 300]))
Example #24
    def iters(cls,
              path,
              vectors_name,
              vectors_cache,
              batch_size=64,
              shuffle=True,
              device=0,
              vectors=None,
              unk_init=torch.Tensor.zero_):
        """
        :param path: directory containing train, test, dev files
        :param vectors_name: name of word vectors file
        :param vectors_cache: directory containing word vectors file
        :param batch_size: batch size
        :param device: GPU device
        :param vectors: custom vectors - either predefined torchtext vectors or your own custom Vector classes
        :param unk_init: function used to generate vector for OOV words
        :return:
        """
        if vectors is None:
            vectors = Vectors(name=vectors_name,
                              cache=vectors_cache,
                              unk_init=unk_init)

        train, validation, test = cls.splits(path)

        cls.TEXT_FIELD.build_vocab(train, validation, test, vectors=vectors)

        return BucketIterator.splits((train, validation, test),
                                     batch_size=batch_size,
                                     repeat=False,
                                     shuffle=shuffle,
                                     sort_within_batch=True,
                                     device=device)
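
For orientation, a hedged call sketch; MyDataset is a hypothetical subclass providing splits() and TEXT_FIELD, and the directory and vector file names are placeholders:

train_iter, dev_iter, test_iter = MyDataset.iters(
    path='data/my_task',               # placeholder directory with train/dev/test files
    vectors_name='glove.6B.300d.txt',  # placeholder word-vector file
    vectors_cache='.vector_cache',
    batch_size=32,
    device='cuda:0')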
Example #25
def dataset2iter(workpath=WORK_PATH, data_path=DATA_PATH):
    fields = [("PhraseId", None), ("SentenceId", None), ('Phrase', TEXT),
              ('Sentiment', LABEL)]

    data_all = TabularDataset(path=os.path.join(workpath, data_path),
                              format='tsv',
                              fields=fields)
    data_train, data_valid, data_test = data_all.split(
        split_ratio=[0.6, 0.2, 0.2])

    pretrained_vectors = Vectors(name=os.path.join(workpath, TRAINED_VECTORS + '.txt'),
                                 cache=workpath)
    TEXT.build_vocab(data_train, vectors=pretrained_vectors)
    LABEL.build_vocab(data_train)

    iter_train = BucketIterator(data_train,
                                batch_size=BATCH_SIZE,
                                sort_key=lambda x: len(x.Phrase),
                                sort=True)
    iter_valid = BucketIterator(data_valid,
                                batch_size=BATCH_SIZE,
                                train=False,
                                sort_key=lambda x: len(x.Phrase))
    iter_test = BucketIterator(data_test,
                               batch_size=BATCH_SIZE,
                               train=False,
                               sort_key=lambda x: len(x.Phrase))
    return iter_train, iter_valid, iter_test
Example #26
def load_data(word_vec_path, cache_path: str, max_vocab_size: int = 25000):
    """Load data from local files.
    When executed first time without '.data/' folder in path,
    the text data will be automatically downloaded.

    :param word_vec_path: str, path of pre-trained word vector data file
    :param max_vocab_size: int, maximum vocabulary size
    :param cache_path: str, path of word vector cache storage
    :return: text, label and tuple of (train, test) data
    """

    torch.manual_seed(seed)
    # Improve reproducibility by using deterministic convolution algorithms.
    torch.backends.cudnn.deterministic = True
    # Improve efficiency as the input size of model doesn't change.
    torch.backends.cudnn.benchmark = True
    text = data.Field(tokenize='spacy')
    label = data.LabelField(dtype=torch.float)
    # Split the dataset into train and test sets.
    train_data, test_data = datasets.IMDB.splits(text, label)
    # Load word vector from local file.
    word_vec = Vectors(name=word_vec_path, cache=cache_path)
    # Build vocabulary.
    text.build_vocab(train_data,
                     max_size=max_vocab_size,
                     vectors=word_vec,
                     unk_init=torch.Tensor.normal_)
    label.build_vocab(train_data)
    return text, label, (train_data, test_data)
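
A hedged usage sketch; the vector file name and cache directory are placeholders, and the module-level seed the function references is assumed to be defined:

text, label, (train_data, test_data) = load_data(
    word_vec_path='glove.6B.100d.txt',  # placeholder local word-vector file
    cache_path='.vector_cache')
print(len(text.vocab), len(label.vocab))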
Example #27
def get_wv_embedding(name, embed_size, vocab):
    """Construct embedding tensor.

    Args:
        name (str): Name or path of the word-vector file to load.
        embed_size (int): Dimensionality of the embeddings.
        vocab: Vocabulary to generate embeddings for.
    Returns:
        embedding (vocab_size, embed_size): Tensor of
            word embeddings.
    """
    """    for index, w in zip(vocab.values(), vocab.keys()):
        if w in list(word_vecs.wv.vocab):
            vec = model[w]
        else:
            vec = np.random.uniform(-0.25,0.25, embed_size)
        embedding[index] = vec    
    
    glove = torchtext.vocab.GloVe(name=name,
                                  dim=str(embed_size))
    """
    #name='/home/sarroutim2/PosDoc NLM/Question Answering/Embedding and pretained models/wikipedia-pubmed-and-PMC-w2v.txt'
    w2v = Vectors(name=name)  ##cache='.vector_cache/wiki-PubMed-w2v.txt.pt'
    vocab_size = len(vocab)
    embedding = torch.zeros(vocab_size, embed_size)
    for i in range(vocab_size):
        embedding[i] = w2v[vocab.idx2word[str(i)]]

    return embedding
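
A hedged usage sketch; the file name is a placeholder, and vocab is assumed to expose __len__ and an idx2word mapping keyed by string indices, as the loop above requires:

embedding = get_wv_embedding('wiki-PubMed-w2v.txt', 200, vocab)
print(embedding.shape)  # torch.Size([len(vocab), 200])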
Example #28
    def __init__(self, args):
        super(TextCNN, self).__init__(args)

        self.class_num = args.class_num
        self.chanel_num = 1
        self.filter_num = args.filter_num
        self.filter_sizes = args.filter_sizes

        self.vocabulary_size = args.vocabulary_size
        self.embedding_dimension = args.embedding_dim
        self.embedding = nn.Embedding(self.vocabulary_size,
                                      self.embedding_dimension).to(DEVICE)
        if args.static:
            logger.info('loading word vectors from {}'.format(
                args.vector_path))
            vectors = Vectors(args.vector_path).vectors
            self.embedding = self.embedding.from_pretrained(
                vectors, freeze=not args.non_static).to(DEVICE)
        if args.multichannel:
            self.embedding2 = nn.Embedding(
                self.vocabulary_size,
                self.embedding_dimension).from_pretrained(
                    args.vectors).to(DEVICE)
            self.chanel_num += 1
        else:
            self.embedding2 = None
        self.convs = nn.ModuleList([
            nn.Conv2d(self.chanel_num, self.filter_num,
                      (size, self.embedding_dimension))
            for size in self.filter_sizes
        ]).to(DEVICE)
        self.dropout = nn.Dropout(args.dropout).to(DEVICE)
        self.fc = nn.Linear(
            len(self.filter_sizes) * self.filter_num,
            self.class_num).to(DEVICE)
Example #29
    def iters(cls, path, vectors_name, vectors_dir, batch_size=64, shuffle=True, device=0, pt_file=False, vectors=None,
              unk_init=torch.Tensor.zero_):
        """
        :param path: directory containing train, test, dev files
        :param vectors_name: name of word vectors file
        :param vectors_dir: directory containing word vectors file
        :param batch_size: batch size
        :param device: GPU device
        :param vectors: custom vectors - either predefined torchtext vectors or your own custom Vector classes
        :param pt_file: load cached embedding file from disk if it is true
        :param unk_init: function used to generate vector for OOV words
        :return:
        """

        train, validation, test = cls.splits(path)
        if not pt_file:
            if vectors is None:
                vectors = Vectors(name=vectors_name, cache=vectors_dir, unk_init=unk_init)
            cls.TEXT_FIELD.build_vocab(train, validation, test, vectors=vectors)
        else:
            cls.TEXT_FIELD.build_vocab(train, validation, test)
            cls.TEXT_FIELD = cls.set_vectors(cls.TEXT_FIELD, os.path.join(vectors_dir, vectors_name))

        cls.LABEL_FIELD.build_vocab(train, validation, test)

        cls.VOCAB_SIZE = len(cls.TEXT_FIELD.vocab)

        return BucketIterator.splits((train, validation, test), batch_size=batch_size, repeat=False, shuffle=shuffle,
                                     sort_within_batch=True, device=device)
Example #30
def get_dataiter(path, sequence, type='train'):
    # Define the label field (the text field `sequence` is passed in)
    label_field = data.Field(sequential=False, use_vocab=False)
    # Read the raw data
    with open(path, 'r') as fp:
        dataset = fp.readlines()
    # Assemble the Examples
    examples = []
    fields = [('seq_1', sequence), ('seq_2', sequence), ('label', label_field)]
    for idx in range(1, len(dataset)):
        label = dataset[idx].split('\t')[0]
        seq_1 = dataset[idx].split('\t')[5]
        seq_2 = dataset[idx].split('\t')[6]
        if label == '-':
            continue
        examples.append(data.Example.fromlist([seq_1, seq_2, label_to_idx[label]], fields))
    # Build the Dataset
    data_set = data.Dataset(examples, fields)
    print(len(data_set))
    # Only the training split builds the vocabulary
    if type == 'train':
        sequence.build_vocab(data_set, vectors=Vectors(args.glove840_path))
        dataiter = data.BucketIterator(data_set, batch_size=args.batch_size, shuffle=True)
        return sequence, dataiter
    else:
        dataiter = data.BucketIterator(data_set, batch_size=args.batch_size, shuffle=False)
        return dataiter
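
A hedged usage sketch for get_dataiter; it relies on the module-level args (with glove840_path and batch_size) and label_to_idx used above, and the SNLI-style file paths are placeholders:

sequence_field = data.Field(sequential=True, tokenize=str.split, lower=True, batch_first=True)
sequence_field, train_iter = get_dataiter('data/snli_train.tsv', sequence_field, type='train')
dev_iter = get_dataiter('data/snli_dev.tsv', sequence_field, type='dev')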