def ptb(cls, text_field, batch_size=16, device=-1, vector: Optional[str] = None, **kwargs):
    train, valid, test = cls.splits(text_field, **kwargs)
    train_iter, valid_iter, test_iter = data.BucketIterator.splits(
        (train, valid, test),
        batch_sizes=(batch_size, batch_size, batch_size),
        device=device, shuffle=True, repeat=True,
        sort_key=lambda x: len(x.text), **kwargs)

    if vector == 'glove_6B':
        vectors = GloVe('6B', dim=300)
    elif vector == 'glove_840B':
        vectors = GloVe('840B', dim=300)
    elif vector == 'glove_42B':
        vectors = GloVe('42B', dim=300)

    try:
        text_field.build_vocab(train, valid, test, vectors=vectors)
    except UnboundLocalError:
        print('No word embedding loaded.')
        text_field.build_vocab(train, valid, test)

    return (iter(train_iter), iter(valid_iter), iter(test_iter)), text_field

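# A minimal usage sketch of the classmethod above, not part of the original code. It assumes
# the method is decorated with @classmethod and lives on a torchtext language-modelling dataset
# class, here hypothetically named PTBDataset. The `vector` argument selects the pretrained
# vectors by name ('glove_6B', 'glove_840B', or 'glove_42B'); any other value falls back to
# building the vocabulary without pretrained embeddings.
text_field = data.Field(lower=True)
(train_iter, valid_iter, test_iter), text_field = PTBDataset.ptb(
    text_field, batch_size=32, device=-1, vector='glove_6B')
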
def test_vectors_get_vecs(self):
    vec = GloVe(name='twitter.27B', dim='25')
    self.assertEqual(vec.vectors.shape[0], len(vec))

    tokens = ['chip', 'baby', 'Beautiful']
    token_vecs = vec.get_vecs_by_tokens(tokens).numpy()
    self.assertEqual(token_vecs.shape[0], len(tokens))
    self.assertEqual(token_vecs.shape[1], vec.dim)
    assert_allclose(vec[tokens[0]].numpy(), token_vecs[0])
    assert_allclose(vec[tokens[1]].numpy(), token_vecs[1])
    assert_allclose(vec['<unk>'].numpy(), token_vecs[2])

    token_one_vec = vec.get_vecs_by_tokens(tokens[0], lower_case_backup=True).numpy()
    self.assertEqual(token_one_vec.shape[0], vec.dim)
    assert_allclose(vec[tokens[0].lower()].numpy(), token_one_vec)

    # Delete the vectors after we're done to save disk space on CI.
    # Remove the twitter.27B files that this test downloaded.
    if os.environ.get("TRAVIS") == "true":
        zip_file = os.path.join(self.project_root, ".vector_cache", "glove.twitter.27B.zip")
        conditional_remove(zip_file)
        for dim in ["25", "50", "100", "200"]:
            conditional_remove(
                os.path.join(self.project_root, ".vector_cache",
                             "glove.twitter.27B.{}d.txt".format(dim)))

def __init__(self, data_path, alphabet_path, is_labeled=True, l0=501, l1=131,
             max_samples=None, word_emb_name="twitter.27B", word_emb_dim=200,
             vector_cache_path=None):
    """A dataset object whose samples consist of *both*
    - the (padded) concatenation of the word vectors of a tweet, and
    - the per-character one-hot encoding of the same tweet.

    Arguments:
        data_path: path of the (label and) data file, in csv.
        alphabet_path: path of the alphabet json file.
        is_labeled: whether the data_path file contains labels, or only the tweets.
        l0: max length of a sample, in number of characters.
        l1: max length of a sample, in number of words.
        max_samples: (for dev,) only keep the max_samples first samples of the data.
        word_emb_name: name of the word embedding to use, passed to torchtext's GloVe.
        word_emb_dim: dimension of the word embedding to use, passed to torchtext's GloVe.
        vector_cache_path: path to the vector cache directory, passed to torchtext's GloVe.
    """
    self.glove = GloVe(name=word_emb_name, dim=word_emb_dim, cache=vector_cache_path)
    print("Loaded pretrained GloVe word embeddings.")

    self.data_path = data_path
    self.alphabet_path = alphabet_path
    self.is_labeled = is_labeled
    self.l0 = l0
    self.l1 = l1
    with open(alphabet_path) as f:
        self.alphabet = ''.join(json.load(f))
    self.raw_nb_feats = len(self.alphabet)
    self.pro_nb_feats = word_emb_dim

    # TODO: setting max_samples only makes sense if the csv itself was shuffled
    # X_txt = pd.read_csv(data_path, nrows=max_samples)  # only keep max_samples first samples, or keep all if None
    X_txt = pd.read_csv(data_path)
    if max_samples:
        assert is_labeled, ("must not use `max_samples` for unlabeled (assumed test-) data, "
                            "as shuffling would modify the samples' ordering")
        # Shuffle, then keep the first max_samples rows.
        X_txt = X_txt.sample(frac=1).reset_index(drop=True).iloc[:max_samples]
    self.y = X_txt['label'].to_numpy().astype(np.integer, copy=False) if is_labeled else None
    self.X_pro = X_txt['preprocessed_segmented_tweet'].to_numpy()
    self.X_raw = X_txt['raw_tweet'].to_numpy()

def _process_movie_fea(self):
    """
    Uses
    ----
    movie_info : pd.DataFrame
    name : str

    Returns
    -------
    movie_features : np.ndarray
        Movie features generated by concatenating the averaged GloVe title embedding,
        the scaled release year, and the genre indicators.
    """
    if self._name == 'ml-100k':
        GENRES = GENRES_ML_100K
    elif self._name == 'ml-1m':
        GENRES = GENRES_ML_1M
    elif self._name == 'ml-10m':
        GENRES = GENRES_ML_10M
    else:
        raise NotImplementedError

    TEXT = data.Field(tokenize='spacy')
    embedding = GloVe(name='840B', dim=300)

    title_embedding = np.zeros(shape=(self.movie_info.shape[0], 300), dtype=np.float32)
    release_years = np.zeros(shape=(self.movie_info.shape[0], 1), dtype=np.float32)
    p = re.compile(r'(.+)\s*\((\d+)\)')
    for i, title in enumerate(self.movie_info['title']):
        match_res = p.match(title)
        if match_res is None:
            print('{} cannot be matched, index={}, name={}'.format(title, i, self._name))
            title_context, year = title, 1950
        else:
            title_context, year = match_res.groups()
        # We use the average of the GloVe vectors of the title tokens.
        title_embedding[i, :] = embedding.get_vecs_by_tokens(
            TEXT.tokenize(title_context)).numpy().mean(axis=0)
        release_years[i] = float(year)
    movie_features = np.concatenate(
        (title_embedding,
         (release_years - 1950.0) / 100.0,
         self.movie_info[GENRES]),
        axis=1)
    return movie_features

def __init__(self, emb_dim=50, mbsize=32):
    self.TEXT = data.Field(init_token='<start>', eos_token='<eos>',
                           lower=True, tokenize='spacy', fix_length=None)
    self.LABEL = data.Field(sequential=False, unk_token=None)

    # Only take sentences with length <= 15.
    f = lambda ex: len(ex.text) <= 15

    train, test = bookreader.BookReader.splits(self.TEXT, self.LABEL, filter_pred=f)

    self.TEXT.build_vocab(train, vectors=GloVe('6B', dim=emb_dim))
    self.LABEL.build_vocab(train)

    self.n_vocab = len(self.TEXT.vocab.itos)
    self.emb_dim = emb_dim

    self.train_iter, _ = data.BucketIterator.splits(
        (train, test), batch_size=mbsize, device=-1, shuffle=True)

def load_TREC_data(batch_size=32, embedding_length=100, fix_length=10):
    # Set up fields.
    tokenize = lambda x: x.split()
    TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True,
                      include_lengths=True, batch_first=True, fix_length=fix_length)
    # LABEL = data.LabelField()
    LABEL = data.LabelField(dtype=torch.float)

    # Make splits for data.
    train, test = datasets.TREC.splits(TEXT, LABEL)

    # Build the vocabulary.
    TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=embedding_length))
    LABEL.build_vocab(train)

    # Make iterators for the splits.
    train_iter, test_iter = data.BucketIterator.splits(
        (train, test), batch_size=batch_size, device=0)

    word_embeddings = TEXT.vocab.vectors
    vocab_size = len(TEXT.vocab)

    return TEXT, vocab_size, word_embeddings, train_iter, test_iter

def load_datasets(test_sen=None):
    tokenize = lambda x: x.split()
    TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True,
                      include_lengths=True, batch_first=True, fix_length=200)
    LABEL = data.LabelField(dtype=torch.float)

    train = data.TabularDataset(path='/content/query_classifier_data.csv', format='tsv',
                                fields=[("question", TEXT), ("label", LABEL)],
                                skip_header=True)
    train_data, test_data = train.split(random_state=random.getstate())

    TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(train_data)

    word_embeddings = TEXT.vocab.vectors
    print("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
    print("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print("Label Length: " + str(len(LABEL.vocab)))

    # Further split training_data to create new training_data and validation_data.
    train_data, valid_data = train_data.split()
    train_iter, valid_iter, test_iter = data.BucketIterator.splits(
        (train_data, valid_data, test_data), batch_size=32,
        sort_key=lambda x: len(x.question), repeat=False, shuffle=True)

    vocab_size = len(TEXT.vocab)

    return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter

def _initialize_glove(self) -> torch.Tensor:
    r"""
    Initialize embeddings of all the tokens in a given
    :class:`~allennlp.data.vocabulary.Vocabulary` by their GloVe vectors.

    Extended Summary
    ----------------
    It is recommended to train an
    :class:`~updown.models.updown_captioner.UpDownCaptioner` with frozen word embeddings
    when one wishes to perform Constrained Beam Search decoding during inference. This is
    because the constraint words may not appear in the caption vocabulary (they are out of
    domain), so their embeddings would never be updated during training. Initializing with
    frozen GloVe embeddings helps, because they capture more meaningful semantics than
    randomly initialized embeddings.

    Returns
    -------
    torch.Tensor
        GloVe embeddings corresponding to the tokens.
    """
    glove = GloVe(name="42B", dim=300)
    glove_vectors = torch.zeros(self._vocabulary.get_vocab_size(), 300)

    for word, i in self._vocabulary.get_token_to_index_vocabulary().items():
        if word in glove.stoi:
            glove_vectors[i] = glove.vectors[glove.stoi[word]]
        elif word != self._pad_index:
            # Initialize by a random vector.
            glove_vectors[i] = 2 * torch.randn(300) - 1

    return glove_vectors

def __init__(self, vocab_size, max_len, embedding_size, hidden_size, use_glove,
             sos_id, eos_id, model_name, n_layers=1, rnn_cell='lstm',
             bidirectional=False, input_dropout_p=0, dropout_p=0, use_attention=False):
    super(DecoderRNN, self).__init__(vocab_size, max_len, hidden_size,
                                     input_dropout_p, dropout_p, n_layers, rnn_cell)

    self.bidirectional_encoder = bidirectional
    self.rnn = self.rnn_cell(embedding_size, hidden_size, n_layers, batch_first=True,
                             dropout=dropout_p, bidirectional=bidirectional)

    self.output_size = vocab_size
    self.max_length = max_len
    self.use_attention = use_attention
    self.eos_id = eos_id
    self.sos_id = sos_id
    self.init_input = None
    self.use_glove = use_glove

    if self.use_glove:
        embedding_glove = GloVe(name="twitter.27B", dim=200)
        self.embedding = embedding_glove.vectors
        self.rnn = self.rnn_cell(200, hidden_size, n_layers, batch_first=True,
                                 dropout=dropout_p, bidirectional=bidirectional)
    else:
        self.embedding = nn.Embedding(self.output_size, embedding_size)

    if use_attention:
        self.attention = Attention(self.hidden_size, model_name)

    self.out = nn.Linear(self.hidden_size, self.output_size)

def __init__(self, emb_dim=50, mbsize=32, main=True, dataset2=None, **kwargs):
    self.TEXT = data.Field(init_token='<start>', eos_token='<eos>',
                           lower=True, tokenize='spacy', fix_length=None)
    self.LABEL = data.Field(sequential=False, unk_token=None)

    train, test = datasets.IMDB.splits(
        self.TEXT, self.LABEL, filter_pred=utils.filter(6)
    )
    self.train = train

    if main:
        train_datasets = [train.text, dataset2.get_train().text] if dataset2 else [train]
        self.TEXT.build_vocab(*train_datasets, vectors=GloVe('6B', dim=emb_dim))
        self.LABEL.build_vocab(train)

        self.n_vocab = len(self.TEXT.vocab.itos)
        print(self.n_vocab)
        self.emb_dim = emb_dim

        self.train_iter, _ = data.BucketIterator.splits(
            (train, test), batch_size=mbsize, device=-1, shuffle=True, repeat=True
        )
        self.train_iter = iter(self.train_iter)

def _initialize_glove(self):
    assert self.embedding_size == 300
    glove = GloVe(name="42B", dim=self.embedding_size)

    caption_oov = 0
    glove_caption_tokens = torch.zeros(self._vocabulary.get_vocab_size(), self.embedding_size)
    for word, i in self._vocabulary.get_token_to_index_vocabulary().items():
        if word in glove.stoi:
            glove_caption_tokens[i] = glove.vectors[glove.stoi[word]]
        else:
            # Use a random vector instead.
            caption_oov += 1
            glove_caption_tokens[i] = 2 * torch.randn(self.embedding_size) - 1
    print("Caption OOV: %d / %d = %.2f" %
          (caption_oov, self.vocab_size, 100 * caption_oov / self.vocab_size))

    for p in self._output_layer.parameters():
        p.requires_grad = False
    self._output_layer.weight.copy_(glove_caption_tokens)

    for p in self._embedding_layer.parameters():
        p.requires_grad = False
    self._embedding_layer.weight.copy_(glove_caption_tokens)

def iters(cls, batch_size=32, device=-1):
    TEXT = data.Field(include_lengths=True)
    LABEL = data.Field(sequential=False, use_vocab=False)
    ID = data.Field(sequential=False)

    train, val, test = cls.splits(
        path='.', train='train.csv', validation='val.csv', test='dev.csv',
        format='csv', skip_header=True,
        fields=[('id', ID),
                ('project_title', TEXT),
                ('project_resource_summary', TEXT),
                ('project_essay_1', TEXT),
                ('project_essay_2', TEXT),
                ('project_is_approved', LABEL)])

    # The vocab is shared across all the text fields.
    # CAUTION: GloVe will download all embeddings locally (862 MB).
    # If not interested, remove the "vectors" argument.
    TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
    ID.build_vocab(train)

    return data.BucketIterator.splits((train, val, test),
                                      batch_size=batch_size, device=device)

def load_data(path, batch_size, max_seq_length, glove="840B", emb_size=300):
    TEXT = Field(sequential=True, fix_length=max_seq_length, lower=True)
    LABEL = Field(sequential=False, use_vocab=False)
    ID = Field(sequential=False, use_vocab=False)
    data_fields = [("id", ID), ("sent", TEXT), ("label", LABEL)]

    train_path = os.path.join(path, "train.csv")
    train = TabularDataset(path=train_path, format="csv", skip_header=False, fields=data_fields)
    test_path = os.path.join(path, "dev.csv")
    test = TabularDataset(path=test_path, format="csv", skip_header=False, fields=data_fields)

    TEXT.build_vocab(train, vectors=GloVe(name=glove, dim=emb_size))
    LABEL.build_vocab(train)
    vocab_size = len(TEXT.vocab)
    vocab_weights = TEXT.vocab.vectors

    train_iter = BucketIterator(dataset=train, batch_size=batch_size,
                                sort_key=lambda x: x.id, shuffle=True, repeat=False)
    test_iter = BucketIterator(dataset=test, batch_size=batch_size,
                               sort_key=lambda x: x.id, shuffle=False, repeat=False)

    return train_iter, test_iter, vocab_size, vocab_weights

def load_imdb(args):
    TEXT = data.Field(lower=True, tokenize=tokenize, batch_first=True,
                      fix_length=args.max_seq_len)
    LABEL = data.LabelField(dtype=torch.long)
    train_data, test_data = datasets.IMDB.splits(TEXT, LABEL, root=args.data_dir)

    # Build a vocabulary.
    TEXT.build_vocab(train_data, max_size=args.max_vocab_size - 2,
                     vectors=GloVe(name='6B', dim=args.embedding_size))
    LABEL.build_vocab(train_data)

    train_iter, test_iter = data.BucketIterator.splits(
        (train_data, test_data), batch_size=args.batch_size,
        sort_key=lambda x: len(x.text))

    n_token = len(TEXT.vocab)
    n_classes = len(LABEL.vocab)
    print("{} unique tokens in TEXT vocabulary".format(n_token))
    print("{} class labels".format(n_classes))

    return train_iter, test_iter, n_token, n_classes, TEXT.vocab.vectors

def __init__(self, args):
    super(ARC, self).__init__(args)

    self.LABEL = data.Field(sequential=False, unk_token=None, tensor_type=torch.FloatTensor)

    self.train, self.dev, self.test = data.TabularDataset.splits(
        path='.data/arc/preprocessed/single',
        train='train.txt', validation='dev.txt', test='test.txt',
        format='tsv', skip_header=True,
        fields=[('id', self.RAW), ('warrant', self.TEXT), ('label', self.LABEL),
                ('reason', self.TEXT), ('claim', self.TEXT),
                ('debateTitle', self.TEXT), ('debateInfo', self.TEXT)])

    self.TEXT.build_vocab(self.train, self.dev, self.test,
                          vectors=GloVe(name='840B', dim=300))
    self.LABEL.build_vocab(self.train)

    self.sort_key = lambda x: len(x.warrant) + len(x.reason) + len(x.claim)

    self.train_iter, self.dev_iter, self.test_iter = data.Iterator.splits(
        (self.train, self.dev, self.test),
        batch_sizes=[self.args.batch_size, 256, 256],
        device=self.args.gpu,
        sort_key=self.sort_key)

    self.dev_iter.sort = False
    self.dev_iter.sort_within_batch = False
    self.test_iter.sort = False
    self.test_iter.sort_within_batch = False

def __init__(self, emb_dim=50, mbsize=32):
    # self.TEXT = data.Field(tokenize='spacy', include_lengths=True)
    # self.LABEL = data.LabelField(dtype=torch.float)
    self.TEXT = data.Field(init_token='<start>', eos_token='<eos>',
                           lower=True, tokenize='spacy', fix_length=16)
    self.LABEL = data.Field(sequential=False, unk_token=None)

    # Only take sentences with length <= 15 that are not labeled 'neutral'.
    f = lambda ex: len(ex.text) <= 15 and ex.label != 'neutral'

    train, test = datasets.IMDB.splits(self.TEXT, self.LABEL)

    self.TEXT.build_vocab(train, vectors=GloVe('6B', dim=emb_dim))
    self.LABEL.build_vocab(train)

    self.n_vocab = len(self.TEXT.vocab.itos)
    self.emb_dim = emb_dim

    self.train_iter, self.test_iter = data.BucketIterator.splits(
        (train, test), batch_size=mbsize, device=-1, shuffle=True, repeat=True
    )
    self.train_iter = iter(self.train_iter)
    self.test_iter = iter(self.test_iter)

def load_data(batch_size=16, embedding_length=100):
    tokenize = lambda x: x.split()
    TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True,
                      include_lengths=True, batch_first=True, fix_length=30)
    LABELS = data.LabelField(batch_first=True, dtype=torch.float)

    train, val, test = data.TabularDataset.splits(
        path='app/data/sentiment_data/', train='train.tsv',
        validation='dev.tsv', test='test.tsv', format='tsv',
        fields=[('text', TEXT), ('labels', LABELS)])

    # train_iter, val_iter, test_iter = data.BucketIterator.splits(
    #     (train, val, test), batch_sizes=(batch_size, batch_size, batch_size),
    #     sort_key=lambda x: len(x.text), device=0)

    # Build the vocabulary.
    TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=embedding_length))
    # LABELS.build_vocab(train)
    # print(LABELS.vocab.__dict__)
    # word_embeddings = TEXT.vocab.vectors
    # vocab_size = len(TEXT.vocab)

    return TEXT

def load_dataset(test_sen=None):
    """
    tokenizer : Breaks sentences into a list of words. If sequential=False, no tokenization is applied.
    Field : A class that stores information about how the data should be preprocessed.
    fix_length : An important property of TorchText is that it can handle variable-length input and
        dynamically pad each sequence to the longest sequence in its batch. Here we use fix_length
        instead, which pads every sequence to a fixed length of 40.
    build_vocab : First builds a vocabulary (a dictionary mapping every unique word in train_data to
        an index), then uses the GloVe word embedding to map each index to the corresponding word
        embedding.
    vocab.vectors : Returns a torch tensor of shape (vocab_size x embedding_dim) containing the
        pre-trained word embeddings.
    BucketIterator : Defines an iterator that batches examples of similar lengths together to
        minimize the amount of padding needed.
    """
    # tokenize = lambda x: x.split()
    TEXT = data.Field(sequential=True, tokenize=tokenize_en, lower=True,
                      include_lengths=True, batch_first=True, fix_length=40)
    LABEL = data.LabelField(dtype=torch.float)
    fields = [(None, None), (None, None), ('text', TEXT), ('label', LABEL)]

    train_data, valid_data, test_data = data.TabularDataset.splits(
        path='',
        train='V1.4_Training_original.csv',
        validation='SubtaskA_Trial_Test_Labeled - Copy.csv',
        test='SubtaskA_EvaluationData_labeled.csv',
        # train='train_spacy.csv',
        # validation='valid_spacy.csv',
        # test='test_spacy.csv',
        # sort_key=lambda x: len(x.Text),
        format='csv', fields=fields, skip_header=True)
    print(vars(train_data[0]))

    TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=100))
    LABEL.build_vocab(train_data)

    word_embeddings = TEXT.vocab.vectors
    print("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
    print("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print("Label Length: " + str(len(LABEL.vocab)))

    # train_data, valid_data = train_data.split()  # Further split training_data into new training_data and validation_data.
    train_iter, valid_iter, test_iter = data.BucketIterator.splits(
        (train_data, valid_data, test_data), batch_size=64,
        sort_key=lambda x: len(x.text), repeat=False, shuffle=True, device=device)

    '''Alternatively, we can also use the default configurations.'''
    # train_iter, test_iter = datasets.IMDB.iters(batch_size=32)

    vocab_size = len(TEXT.vocab)

    return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter

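# The docstring above notes that TEXT.vocab.vectors (returned as word_embeddings) is a
# (vocab_size x embedding_dim) tensor. A minimal sketch, not part of the original code, of how
# such a tensor is typically turned into an embedding layer: nn.Embedding.from_pretrained copies
# the GloVe vectors into the layer's weights, and freeze controls whether they are fine-tuned.
import torch.nn as nn

def build_embedding_layer(word_embeddings, freeze=False):
    # word_embeddings: the (vocab_size, embedding_dim) tensor returned by load_dataset.
    return nn.Embedding.from_pretrained(word_embeddings, freeze=freeze)
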
def __init__(self, args):
    super().__init__(args)

    fix_length = args.max_sent_len if args.max_sent_len >= 0 else None

    self.TEXT = data.Field(batch_first=True, init_token="<s>", eos_token="</s>",
                           preprocessing=preprocessor, fix_length=fix_length,
                           include_lengths=True, tokenize="spacy")
    self.LABEL = data.Field(sequential=False, unk_token=None)

    self.train, self.dev, self.test = datasets.SNLI.splits(self.TEXT, self.LABEL)

    self.TEXT.build_vocab(self.train, self.dev, self.test,
                          vectors=GloVe(name='840B', dim=300))
    self.build_char_vocab()
    self.LABEL.build_vocab(self.train)

    self.train_iter, self.dev_iter, self.test_iter = data.BucketIterator.splits(
        (self.train, self.dev, self.test),
        batch_sizes=[args.batch_size] * 3,
        device=torch.device('cuda', args.gpu) if args.gpu >= 0 else torch.device('cpu'),
        repeat=False)

def __init__(self, batch_size, word_dim):
    self.RAW = data.RawField()
    self.TEXT = data.Field(batch_first=True)
    self.LABEL = data.Field(sequential=False, unk_token=None)

    self.train, self.dev, self.test = data.TabularDataset.splits(
        path='.data/quora',
        train='train.tsv', validation='dev.tsv', test='test.tsv',
        format='tsv',
        fields=[('label', self.LABEL), ('q1', self.TEXT),
                ('q2', self.TEXT), ('id', self.RAW)])

    self.TEXT.build_vocab(self.train, self.dev, self.test,
                          vectors=GloVe(name='6B', dim=word_dim))
    self.LABEL.build_vocab(self.train)

    sort_key = lambda x: data.interleave_keys(len(x.q1), len(x.q2))

    self.train_iter, self.dev_iter, self.test_iter = data.BucketIterator.splits(
        (self.train, self.dev, self.test),
        device=-1,
        batch_sizes=[batch_size] * 3,
        sort_key=sort_key)

    self.max_word_len = max([len(w) for w in self.TEXT.vocab.itos])

def __init__(self, emb_dim=50, mbsize=32):
    self.TEXT = data.Field(init_token='<start>', eos_token='<eos>',
                           lower=True, tokenize='spacy', fix_length=16)
    self.LABEL = data.Field(sequential=False, unk_token=None)

    # Only take sentences with length <= 15 that are not labeled 'neutral'.
    f = lambda ex: len(ex.text) <= 15 and ex.label != 'neutral'

    train, val, test = datasets.SST.splits(self.TEXT, self.LABEL,
                                           fine_grained=False, train_subtrees=False,
                                           filter_pred=f)

    self.TEXT.build_vocab(train, vectors=GloVe('6B', dim=emb_dim))
    self.LABEL.build_vocab(train)

    self.n_vocab = len(self.TEXT.vocab.itos)
    self.emb_dim = emb_dim

    self.train_iter, self.val_iter, _ = data.BucketIterator.splits(
        (train, val, test), batch_size=mbsize, device=-1, shuffle=True, repeat=True)
    self.train_iter = iter(self.train_iter)
    self.val_iter = iter(self.val_iter)

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', default=None, type=str, required=True)
    parser.add_argument('--w2v_path', default=None, type=str, required=True)
    parser.add_argument('--labels', default=None, type=str, required=True)
    parser.add_argument('--batch_size', default=32, type=int)
    parser.add_argument('--epochs', default=3, type=int)
    parser.add_argument('--logging_steps', default=20, type=int)
    parser.add_argument('--learning_rate', default=5e-3, type=float)
    args = parser.parse_args()
    args.device = torch.device('cuda')

    labels = get_labels(args.labels)
    glove = GloVe(cache=args.w2v_path)

    # Model
    model = LstmCrf(w2v=glove, num_tags=len(labels), hidden_dim=512)
    model.to(args.device)

    # Datasets
    train_dataset = NerDataset(args.data_dir, labels, glove, mode='train')
    eval_dataset = NerDataset(args.data_dir, labels, glove, mode='dev')

    # Train
    train(args, model, train_dataset)

    # Evaluate
    result = eval(args, model, eval_dataset, labels)
    print(result)

def vocab_builder(self):
    # self.eid_field = Field(sequential=False, tokenize)
    print('Build Vocabulary')

    tokenize = BiGraphTextDataset.tokenize_text
    TEXT = Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True,
                 batch_first=True, fix_length=35, use_vocab=True)
    datafields = [('eid', None), ('idxP', None), ('idxC', None),
                  ('MaxDegree', None), ('MaxL', None), ('text', TEXT)]

    path = '/data1/home2/AgainstRumor/data/Pheme/data.text.txt'
    train_data = TabularDataset(path=path, format='tsv', skip_header=False, fields=datafields)

    TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
    # train_iter = BucketIterator(train_data, batch_size=32, sort_key=lambda x: len(x.text),
    #                             repeat=False, shuffle=True)

    self.stoi_dict = TEXT.vocab.stoi
    self.vocab_vectors = TEXT.vocab.vectors

def get_all_vectors(pretrained_model):
    emb_vectors = []

    if pretrained_model == "":
        return emb_vectors

    emb_vector_names = pretrained_model.split(",")
    for emb_vector_name in emb_vector_names:
        emb_info = emb_vector_name.split("_")
        if len(emb_info) == 3:
            emb_name, emb_set, emb_size = emb_info[0], emb_info[1], emb_info[2]
        else:
            emb_name, emb_set = emb_info[0], emb_info[1]

        if emb_name == "glove":  # e.g. glove_840B_300
            print("glove")
            emb_vectors.append(GloVe(name=emb_set, dim=emb_size))
        elif emb_name == "fasttext":
            if emb_set == "subwordcc":  # fasttext_subwordcc
                print("fasttext_subwordcc")
                emb_vectors.append(FastTextSubwordCC())
            elif emb_set == "wiki":  # e.g. fasttext_wiki_en
                print("fasttext_wiki")
                emb_vectors.append(FastText(language=emb_size))
            elif emb_set == "cc":  # e.g. fasttext_cc_en
                print("fasttext_cc")
                emb_vectors.append(FastTextCC(language=emb_size))
        elif emb_name == "char":  # char_ngram
            if emb_set == "ngram":
                print("char_ngram")
                emb_vectors.append(CharNGram())

    return emb_vectors

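# A minimal usage sketch (not part of the original code) of the specifier string parsed by
# get_all_vectors: entries are comma-separated, and each entry joins its fields with underscores
# as <name>_<set>[_<size or language>]. The example below assumes a GloVe 840B/300d set plus an
# English fastText wiki model; per the code above this yields
# [GloVe(name='840B', dim='300'), FastText(language='en')].
vectors = get_all_vectors("glove_840B_300,fasttext_wiki_en")
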
def __init__(self, path, text_field, label_field, pad_length, eval):
    """Create an IMDB dataset instance given a path and fields.

    Arguments:
        path: Path to the dataset's highest level directory.
        text_field: The field that will be used for text data.
        label_field: The field that will be used for label data.
        Remaining keyword arguments: Passed to the constructor of data.Dataset.
    """
    self.eval = eval
    self.dataset = []
    dataframe = pd.read_csv(path, sep="\t", encoding="utf-8")
    for i in range(dataframe.shape[0]):
        self.dataset.append({
            'text': text_field.preprocess(dataframe['text'][i]),
            'label': dataframe['label'][i]
        })

    if not self.eval:
        text_field.build_vocab(
            [t['text'] for t in self.dataset],
            vectors=GloVe(name='6B', dim=args.embedding_dims)
        )
        label_field.build_vocab(t['label'] for t in self.dataset)

    self.TEXT = text_field
    self.LABEL = label_field
    self.pad_length = pad_length

def get_data():
    # Set up fields.
    TEXT = data.Field(lower=True, include_lengths=True, batch_first=True, tokenize='spacy')
    LABEL = data.Field(sequential=False)

    # Make splits for data.
    print("Accessing raw input and preprocessing")
    train, val, test = datasets.SNLI.splits(TEXT, LABEL, root='.data',
                                            train='snli_1.0_train.jsonl',
                                            validation='snli_1.0_dev.jsonl',
                                            test='snli_1.0_test.jsonl')
    print("done")

    # Build the vocabulary.
    print("Building vocabulary with GloVe")
    TEXT.build_vocab(train, vectors=GloVe(name='840B', dim=300))
    LABEL.build_vocab(train)
    print("done")

    # Make iterators for the splits.
    print("Loading data into iterables")
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test), batch_size=64, device="cuda")
    print("done, returning data")

    # TEXT contains metadata, so return it as well.
    return train_iter, val_iter, test_iter, TEXT, LABEL

def imdb_detail_get():
    # Set up fields.
    TEXT = data.Field(lower=True, include_lengths=True, batch_first=True, tokenize=(lambda x: x))
    # TEXT = data.Field(sequential=False)
    LABEL = data.Field(sequential=False)

    # Make splits for data.
    train, test = datasets.IMDB.splits(TEXT, LABEL)

    # Print information about the data.
    print('>>> train.fields', train.fields)
    print('>>> len(train)', len(train))
    print('>>> vars(train[0])', vars(train[0]))
    print('>>> vars(test[0])', vars(test[0]))

    # Build the vocabulary.
    TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(train)

    # Print vocab information.
    print('>>> len(TEXT.vocab)', len(TEXT.vocab))
    print('>>> TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

    return train, test

def get_dataset(train_df, valid_df, batch_size, cache=None, gpus=1, vectors=None):
    TEXT = data.Field(init_token='<START>', eos_token='<END>', tokenize=None,
                      tokenizer_language='en', batch_first=True, lower=True,
                      stop_words=set(string.punctuation))
    LABEL = data.Field(dtype=torch.float, is_target=True, unk_token=None,
                       sequential=False, use_vocab=False)

    train_dataset = DataFrameDataset(train_df, {'text': TEXT, 'label': LABEL})
    val_dataset = DataFrameDataset(valid_df, {'text': TEXT, 'label': LABEL})

    train_loader, val_loader = BucketIterator.splits(
        (train_dataset, val_dataset),
        batch_size=batch_size,
        sort_key=lambda x: len(x.text),
        device='cuda' if torch.cuda.is_available() and gpus else 'cpu'
    )

    embeddings = vectors if vectors is not None else GloVe('42B', cache=cache)
    TEXT.build_vocab(train_dataset.text, vectors=embeddings)

    return TEXT, LABEL, train_loader, val_loader

def __init__(self, args):
    self.TEXT = data.Field(batch_first=True, tokenize=word_tokenize, lower=True)
    self.LABEL = data.Field(sequential=False, unk_token=None)

    self.train, self.dev, self.test = datasets.SNLI.splits(self.TEXT, self.LABEL)

    self.TEXT.build_vocab(self.train, self.dev, self.test,
                          vectors=GloVe(name='840B', dim=300))
    self.LABEL.build_vocab(self.train)

    self.train_iter, self.dev_iter, self.test_iter = data.BucketIterator.splits(
        (self.train, self.dev, self.test),
        batch_sizes=[args.batch_size] * 3,
        device=args.gpu)

    self.max_word_len = max([len(w) for w in self.TEXT.vocab.itos])
    # for <pad>
    self.char_vocab = {'': 0}
    # for <unk> and <pad>
    self.characterized_words = [[0] * self.max_word_len, [0] * self.max_word_len]

    if args.use_char_emb:
        self.build_char_vocab()

def _data_reader(self):
    if self.datatype == "test":
        dataset = data.TabularDataset(path=self.data_path, format='csv',
                                      fields=[('label', self.LABEL), ('text', self.TEXT)])
        self.LABEL.build_vocab(dataset)
        self.itol = self.LABEL.vocab.itos
    elif self.datatype == "train":
        dataset = data.TabularDataset(path=self.data_path, format='csv',
                                      fields=[('text', self.TEXT)])
    else:
        raise Exception("datatype other than train or test...")

    self.TEXT.build_vocab(dataset, vectors=GloVe(name=self.glove, dim=self.embed_dim,
                                                 cache=self.vectors_path))

    self.dataset = dataset
    self.data_size = len(dataset)
    self.embed_matrix = self.TEXT.vocab.vectors
    self.word_size = len(self.embed_matrix)
    self.stoi = self.TEXT.vocab.stoi
    self.itos = self.TEXT.vocab.itos
