def make_iterator(self, train):
    args = self.args
    t1, t2, t3 = [], [], []
    # Segregate training data by its output length
    for row in train:
        out_len = len(row.out)
        if out_len < 100:
            t1.append(row)
        elif out_len < 220:
            t2.append(row)
        else:
            t3.append(row)
    t1_dataset = data.Dataset(t1, self.fields)
    t2_dataset = data.Dataset(t2, self.fields)
    t3_dataset = data.Dataset(t3, self.fields)
    valid = data.TabularDataset(path=args.path.replace("train", "val"),
                                format='tsv',
                                fields=self.fields)
    print("Dataset Sizes (t1, t2, t3, valid):", end=' ')
    for dataset in [t1_dataset, t2_dataset, t3_dataset, valid]:
        print(len(dataset.examples), end=' ')
        for row in dataset:
            row.rawent = row.ent.split(" ; ")
            # row.ent: tuple of ((# of entities in x, max entity len), (# of entities))
            row.ent = self.vectorize_entity(row.ent, self.ENT)
            row.rel = self.make_graph(row.rel, len(row.ent[1]))
            row.tgt = row.out
            # row.out: remove tag indices from out (e.g. <method_0> => <method>)
            row.out = [y.split("_")[0] + ">" if "_" in y else y for y in row.out]
        dataset.fields["tgt"] = self.TARGET
        dataset.fields["rawent"] = data.RawField()
        dataset.fields["rawent"].is_target = False
    self.t1_iter = data.Iterator(t1_dataset, args.t1size, device=args.device,
                                 sort_key=lambda x: len(x.out), repeat=False, train=True)
    self.t2_iter = data.Iterator(t2_dataset, args.t2size, device=args.device,
                                 sort_key=lambda x: len(x.out), repeat=False, train=True)
    self.t3_iter = data.Iterator(t3_dataset, args.t3size, device=args.device,
                                 sort_key=lambda x: len(x.out), repeat=False, train=True)
    self.val_iter = data.Iterator(valid, args.t3size, device=args.device,
                                  sort_key=lambda x: len(x.out), sort=False,
                                  repeat=False, train=False)
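# Hedged usage sketch (assumed, not from the snippet above): one way the three
# length-bucketed iterators built by make_iterator might be consumed in a single
# epoch. `loader` holds the iterators created above; `train_step` is a
# hypothetical callback supplied by the caller.
import itertools

def run_epoch(loader, train_step):
    # Chain the bucket iterators so short, medium and long targets are each seen
    # once per epoch, while keeping the per-bucket batch sizes (t1size/t2size/t3size).
    for batch in itertools.chain(loader.t1_iter, loader.t2_iter, loader.t3_iter):
        train_step(batch)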
def load_jester(load_text=True, batch_size=1, subsample_rate=1.0, repeat=False, shuffle=True, ratings_path='jester_ratings.dat.gz', jokes_path='jester_items.clean.dat.gz', max_vocab_size=150, gpu=False): DEV = 0 if gpu else -1 assert os.path.exists( jokes_path), "jokes file %s does not exist!" % jokes_path assert os.path.exists( ratings_path), "ratings file %s does not exist!" % ratings_path text_field = data.Field(lower=True, include_lengths=True, batch_first=True) rating_field = data.Field(sequential=False, use_vocab=False) user_field = data.Field(sequential=False, use_vocab=False) joke_field = data.Field(sequential=False, use_vocab=False) if load_text: fields = [('text', text_field), ('ratings', rating_field), ('users', user_field), ('jokes', joke_field)] else: fields = [('ratings', rating_field), ('users', user_field), ('jokes', joke_field)] jokes_text = {} joke = -1 all_tokens = [] with gzip.open(jokes_path) as f: for i, line in enumerate(f): l = line.decode('utf-8') if len(l.strip()) == 0: continue if l.strip()[-1] == ':': joke = int(l.strip().strip(':')) else: joke_text = l.strip() tokens = l.strip().split() all_tokens.extend(tokens) jokes_text[joke] = joke_text counts = Counter(all_tokens) most_common = counts.most_common(max_vocab_size) most_common = set([item[0] for item in most_common]) print('Loading Data, this might take several minutes') if subsample_rate < 1.0: print('Subsampling rate set to %f' % subsample_rate) train, val, test = [], [], [] with gzip.open(ratings_path) as f: for i, l in enumerate(f): if i % 100000 == 0: print('%d lines read' % i) user, joke, rating = l.split() user = int(user) joke = int(joke) rating = int(rating) if load_text: assert joke in jokes_text example = Example.fromlist([ ' '.join([ item for item in jokes_text[joke].split() if item in most_common ]), rating, user, joke ], fields) else: example = Example.fromlist([rating, user, joke], fields) p = random.random() q = random.random() if p < 0.98: if q < subsample_rate: train.append(example) elif p < 0.99: val.append(example) elif p < 1.0: test.append(example) train = data.Dataset(train, fields) val = data.Dataset(val, fields) test = data.Dataset(test, fields) train_iter, val_iter, test_iter = data.BucketIterator.splits( (train, val, test), batch_size=batch_size, device=DEV, repeat=repeat, shuffle=shuffle) train_iter.sort_key = lambda p: len(p.text) if hasattr(p, 'text' ) else 0 val_iter.sort_key = lambda p: len(p.text) if hasattr(p, 'text') else 0 test_iter.sort_key = lambda p: len(p.text) if hasattr(p, 'text') else 0 print('Data Loaded') if load_text: text_field.build_vocab(train) return train_iter, val_iter, test_iter, text_field else: return train_iter, val_iter, test_iter,
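# Hedged usage sketch (assumed, not from the snippet above): consuming the
# iterators returned by load_jester when load_text=True. Because text_field was
# built with include_lengths=True, batch.text is a (token_ids, lengths) pair;
# ratings/users/jokes are plain integer tensors. `model` is a hypothetical
# rating predictor.
def run_jester_epoch(train_iter, model):
    for batch in train_iter:
        tokens, lengths = batch.text              # [batch, seq_len], [batch]
        preds = model(tokens, lengths, batch.users, batch.jokes)
        loss = ((preds - batch.ratings.float()) ** 2).mean()
        loss.backward()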
Tweet = data.Field(sequential=True, tokenize='spacy', batch_first=True,
                   include_lengths=True)
Label = data.LabelField(tokenize='spacy', is_target=True, batch_first=True,
                        sequential=False)
col = df.columns
fields = [(col[0], Tweet), (col[1], Label)]
example = [
    data.Example.fromlist([df.tweets[i], df.labels[i]], fields)
    for i in range(df.shape[0])
]
twitterDataset = data.Dataset(example, fields)
(train, valid) = twitterDataset.split(split_ratio=[0.85, 0.15],
                                      random_state=random.seed(SEED))
# print((len(train), len(valid)))
# print(vars(train.examples[10]))
Tweet.build_vocab(train)
Label.build_vocab(train)
# print('Size of input vocab : ', len(Tweet.vocab))
# print('Size of label vocab : ', len(Label.vocab))
# print('Top 10 words appeared repeatedly :', list(Tweet.vocab.freqs.most_common(10)))
# print('Labels : ', Label.vocab.stoi)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def __init__(self, args): path = '.data/squad' train_examples_paths = [] dev_examples_paths = [] test_examples_paths = [] for i in args.dev_files: dataset_path = path + '/torchtext/' + i.replace("/", "_") + "/" train_examples_paths.append(dataset_path + 'train_examples.pt') dev_examples_paths.append(dataset_path + 'dev_examples.pt') test_examples_paths.append(dataset_path + 'test_examples.pt') print("preprocessing data files...") for i in args.dev_files: if not os.path.exists('{}/{}l'.format(path, i)): self.preprocess_file('{}/{}'.format(path, i)) for i in args.train_files: if not os.path.exists('{}/{}l'.format(path, i)): self.preprocess_file('{}/{}'.format(path, i)) for i in args.test_files: if not os.path.exists('{}/{}l'.format(path, i)): self.preprocess_file('{}/{}'.format(path, i)) self.RAW = data.RawField() # explicit declaration for torchtext compatibility self.RAW.is_target = False self.CHAR_NESTING = data.Field(batch_first=True, tokenize=list, lower=True) self.CHAR = data.NestedField(self.CHAR_NESTING, tokenize=word_tokenize) self.WORD = data.Field(batch_first=True, tokenize=word_tokenize, lower=True, include_lengths=True) self.LABEL = data.Field(sequential=False, unk_token=None, use_vocab=False) dict_fields = { 'id': ('id', self.RAW), 's_idx': ('s_idx', self.LABEL), 'e_idx': ('e_idx', self.LABEL), 'context': [('c_word', self.WORD), ('c_char', self.CHAR)], 'question': [('q_word', self.WORD), ('q_char', self.CHAR)] } list_fields = [('id', self.RAW), ('s_idx', self.LABEL), ('e_idx', self.LABEL), ('c_word', self.WORD), ('c_char', self.CHAR), ('q_word', self.WORD), ('q_char', self.CHAR)] self.train = [] self.dev = [] self.test = [] if all([ os.path.exists(i) for i in train_examples_paths + dev_examples_paths + test_examples_paths ]): print("loading splits...") for i in train_examples_paths: examples = torch.load(i) print(i, ":", len(examples)) self.train.append( data.Dataset(examples=examples, fields=list_fields)) for i in dev_examples_paths: examples = torch.load(i) print(i, ":", len(examples)) self.dev.append( data.Dataset(examples=examples, fields=list_fields)) for i in test_examples_paths: examples = torch.load(i) print(i, ":", len(examples)) self.test.append( data.Dataset(examples=examples, fields=list_fields)) else: print("building splits...") for train_path, dev_path, test_path, i in zip( args.train_files, args.dev_files, args.test_files, range(0, len(args.train_files))): train, dev, test = data.TabularDataset.splits( path=path, train='{}l'.format(train_path), validation='{}l'.format(dev_path), test='{}l'.format(test_path), format='json', fields=dict_fields) try: os.makedirs("".join( os.path.split(train_examples_paths[i])[:-1])) except FileExistsError: pass torch.save(train.examples, train_examples_paths[i]) torch.save(dev.examples, dev_examples_paths[i]) torch.save(test.examples, test_examples_paths[i]) self.train.append(train) self.dev.append(dev) self.test.append(test) #cut too long context in the training set for efficiency. 
if args.context_threshold > 0: for i in range(0, len(self.train)): print(len(self.train[i].examples)) self.train[i].examples = [ e for e in self.train[i].examples if len(e.c_word) <= args.context_threshold ] print(len(self.train[i].examples)) # self.other_train.examples = [e for e in self.other_train.examples if len(e.c_word) <= args.context_threshold] print("building vocab...") self.CHAR.build_vocab(*self.train, *self.dev, *self.test) self.WORD.build_vocab(*self.train, *self.dev, *self.test, vectors=GloVe(name='6B', dim=args.word_dim)) print("CHAR SIZE", len(self.CHAR.vocab)) print("WORD SIZE", len(self.WORD.vocab)) print("building iterators...") device = torch.device( "cuda:{}".format(args.gpu) if torch.cuda.is_available() else "cpu") def train_bucket_iter(train): for i in range(0, len(train)): yield data.BucketIterator(train.pop(0), batch_size=args.train_batch_size, device=device, repeat=True, shuffle=True, sort_key=lambda x: len(x.c_word)) self.train_iter = train_bucket_iter(self.train) def dev_bucket_iter(dev): for i in range(0, len(dev)): yield data.BucketIterator(dev.pop(0), batch_size=args.dev_batch_size, device=device, repeat=False, sort_key=lambda x: len(x.c_word)) self.dev_iter = dev_bucket_iter(self.dev) def test_bucket_iter(test): for i in range(0, len(test)): yield data.BucketIterator(test.pop(0), batch_size=args.test_batch_size, device=device, repeat=False, sort_key=lambda x: len(x.c_word)) self.test_iter = test_bucket_iter(self.test)
def get_input_processor_words(inputs, type_model, vocab_word, vocab_char): if "one_sequence" in type_model: inputs_word = data.Field(init_token="<bos>", eos_token="<eos>", batch_first=True, include_lengths=True) inputs_char_nesting = data.Field(tokenize=list, init_token="<bos>", eos_token="<eos>", batch_first=True) inputs_char = data.NestedField(inputs_char_nesting, init_token="<bos>", eos_token="<eos>") inputs_word.vocab = vocab_word inputs_char.vocab = inputs_char_nesting.vocab = vocab_char fields = [(('inputs_word', 'inputs_char'), (inputs_word, inputs_char))] if not isinstance(inputs, list): inputs = [inputs] examples = [] for line in inputs: examples.append(data.Example.fromlist([line], fields)) dataset = data.Dataset(examples, fields) batchs = data.Batch( data=dataset, dataset=dataset, device=torch.device( "cuda:0" if torch.cuda.is_available() else "cpu")) else: inputs_word_query = data.Field(init_token="<bos>", eos_token="<eos>", batch_first=True, include_lengths=True) inputs_char_query_nesting = data.Field(tokenize=list, init_token="<bos>", eos_token="<eos>", batch_first=True) inputs_char_query = data.NestedField(inputs_char_query_nesting, init_token="<bos>", eos_token="<eos>") inputs_word_document = data.Field(init_token="<bos>", eos_token="<eos>", batch_first=True, include_lengths=True) inputs_char_document_nesting = data.Field(tokenize=list, init_token="<bos>", eos_token="<eos>", batch_first=True) inputs_char_document = data.NestedField(inputs_char_document_nesting, init_token="<bos>", eos_token="<eos>") fields = ([(('inputs_word_query', 'inputs_char_query'), (inputs_word_query, inputs_char_query)), (('inputs_word_document', 'inputs_char_document'), (inputs_word_document, inputs_char_document))]) inputs_word_query.vocab = inputs_word_document.vocab = vocab_word inputs_char_query.vocab = inputs_char_query_nesting.vocab = \ inputs_char_document_nesting.vocab = inputs_char_document.vocab = vocab_char # print(vocab_word.stoi) # print(vocab_char.stoi) if not isinstance(inputs, list): inputs = [inputs] examples = [] for line in inputs: tuple_line = line.split("\t") example = data.Example.fromlist(tuple_line, fields) examples.append(example) dataset = data.Dataset(examples, fields) batchs = data.Batch( data=dataset, dataset=dataset, device=torch.device( "cuda:0" if torch.cuda.is_available() else "cpu")) # Entire input in one batch return batchs
def word_vector_features(): from torchtext import data def word_mover_distance(model, row, granularity='w'): if granularity == 'w': q1, q2 = row.q1_wid, row.q2_wid return model.wmdistance(q1.split(), q2.split()) else: q1, q2 = row.q1_cid, row.q2_cid return model.wmdistance(q1.split(), q2.split()) # Word Mover Distance filepath = f'../data/word_vectors.txt' tmppath = f'../data/gensim_tmp_word_vector.txt' if not os.path.exists(tmppath): glove2word2vec(filepath, tmppath) word_model = KeyedVectors.load_word2vec_format(tmppath) filepath = f'../data/char_vectors.txt' tmppath = f'../data/gensim_tmp_char_vector.txt' if not os.path.exists(tmppath): glove2word2vec(filepath, tmppath) char_model = KeyedVectors.load_word2vec_format(tmppath) word_wmd = [ word_mover_distance(word_model, row, 'w') for row in concat_df.itertuples(index=False) ] char_wmd = [ word_mover_distance(char_model, row, 'c') for row in concat_df.itertuples(index=False) ] # tf-idf weighted word vector as sentence representation # then calculate cosine similarity, l1-norm, l2-norm word_embedding_path = '../data/word_vectors.txt' char_embedding_path = '../data/char_vectors.txt' cache = '../cache' word_vectors = Vectors(word_embedding_path, cache) char_vectors = Vectors(char_embedding_path, cache) word_vectors.unk_init = lambda x: init.uniform_(x, -0.05, 0.05) char_vectors.unk_init = lambda x: init.uniform_(x, -0.05, 0.05) wordTEXT = data.Field(batch_first=True) charTEXT = data.Field(batch_first=True) fields = [('q1_word', wordTEXT), ('q2_word', wordTEXT), ('q1_char', charTEXT), ('q2_char', charTEXT)] examples = [ data.Example.fromlist(row, fields) for row in concat_df.itertuples(index=False) ] dataset = data.Dataset(examples, fields) wordTEXT.build_vocab(dataset, min_freq=1, vectors=word_vectors) charTEXT.build_vocab(dataset, min_freq=1, vectors=char_vectors) word_embedding = wordTEXT.vocab.vectors char_embedding = charTEXT.vocab.vectors num_word = word_embedding.size(0) num_char = char_embedding.size(0) word_index2idf = np.zeros(num_word) char_index2idf = np.zeros(num_char) word_counter = Counter() char_counter = Counter() for wid in question_df['wid']: word_counter.update(wid.split()) for cid in question_df['cid']: char_counter.update(cid.split()) N = len(concat_df) # 0 --> <unk> # 1 --> <pad> # start from 2 for i in range(2, num_word): word = wordTEXT.vocab.itos[i] idf = np.log(N / word_counter[word]) word_index2idf[i] = idf for i in range(2, num_char): char = charTEXT.vocab.itos[i] idf = np.log(N / char_counter[char]) char_index2idf[i] = idf device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') word_idf = torch.tensor(word_index2idf, dtype=torch.float32).to(device) char_idf = torch.tensor(char_index2idf, dtype=torch.float32).to(device) word_embedder = nn.Embedding.from_pretrained(word_embedding).to(device) char_embedder = nn.Embedding.from_pretrained(char_embedding).to(device) word_similarity, char_similarity = [], [] word_l1, char_l1 = [], [] word_l2, char_l2 = [], [] iter = data.BucketIterator(dataset, 1024, sort_key=None, shuffle=False, device=torch.device('cuda:0'), sort_within_batch=False) for data in iter: # [batch, seq_len] q1_word, q2_word, q1_char, q2_char = data.q1_word, data.q2_word, data.q1_char, data.q2_char q1_word_embed = word_embedder(q1_word) # [batch, seq_len, 300] q2_word_embed = word_embedder(q2_word) q1_char_embed = char_embedder(q1_char) # [batch, seq_len, 300] q2_char_embed = char_embedder(q2_char) batch = q1_word_embed.size(0) q1_word_flat = q1_word.view(-1) # [batch * seq_len] q2_word_flat 
= q2_word.view(-1) q1_char_flat = q1_char.view(-1) q2_char_flat = q2_char.view(-1) q1_word_idfs = word_idf.index_select(0, index=q1_word_flat).view( batch, -1) # [batch, seq_len] q2_word_idfs = word_idf.index_select(0, index=q2_word_flat).view( batch, -1) q1_char_idfs = char_idf.index_select(0, index=q1_char_flat).view( batch, -1) q2_char_idfs = char_idf.index_select(0, index=q2_char_flat).view( batch, -1) # q1_word_idfs = F.softmax(q1_word_idfs, dim=1).unsqueeze(-1) # [batch, seq_len, 1] # q2_word_idfs = F.softmax(q2_word_idfs, dim=1).unsqueeze(-1) # q1_char_idfs = F.softmax(q1_char_idfs, dim=1).unsqueeze(-1) # q2_char_idfs = F.softmax(q2_char_idfs, dim=1).unsqueeze(-1) q1_word_idfs = (q1_word_idfs / q1_word_idfs.sum(dim=1, keepdim=True)).unsqueeze( -1) # [batch, seq_len, 1] q2_word_idfs = (q2_word_idfs / q2_word_idfs.sum(dim=1, keepdim=True)).unsqueeze(-1) q1_char_idfs = (q1_char_idfs / q1_char_idfs.sum(dim=1, keepdim=True)).unsqueeze(-1) q2_char_idfs = (q2_char_idfs / q2_char_idfs.sum(dim=1, keepdim=True)).unsqueeze(-1) q1_word_repre = torch.bmm(q1_word_embed.transpose(1, 2), q1_word_idfs).squeeze() # [batch, 300] q2_word_repre = torch.bmm(q2_word_embed.transpose(1, 2), q2_word_idfs).squeeze() q1_char_repre = torch.bmm(q1_char_embed.transpose(1, 2), q1_char_idfs).squeeze() q2_char_repre = torch.bmm(q2_char_embed.transpose(1, 2), q2_char_idfs).squeeze() word_cos_sim = F.cosine_similarity(q1_word_repre, q2_word_repre, dim=1) char_cos_sim = F.cosine_similarity(q1_char_repre, q2_char_repre, dim=1) word_l1_norm = torch.norm(q1_word_repre - q2_word_repre, p=1, dim=-1) char_l1_norm = torch.norm(q1_char_repre - q2_char_repre, p=1, dim=-1) word_l2_norm = torch.norm(q1_word_repre - q2_word_repre, p=2, dim=-1) char_l2_norm = torch.norm(q1_char_repre - q2_char_repre, p=2, dim=-1) word_similarity.append(word_cos_sim) char_similarity.append(char_cos_sim) word_l1.append(word_l1_norm) char_l1.append(char_l1_norm) word_l2.append(word_l2_norm) char_l2.append(char_l2_norm) word_similarity = torch.cat(word_similarity).cpu().numpy() char_similarity = torch.cat(char_similarity).cpu().numpy() word_l1 = torch.cat(word_l1).cpu().numpy() char_l1 = torch.cat(char_l1).cpu().numpy() word_l2 = torch.cat(word_l2).cpu().numpy() char_l2 = torch.cat(char_l2).cpu().numpy() rt = pd.DataFrame({ 'word_wmd': word_wmd, 'word_wv_cos_sim': word_similarity, 'word_wv_l1': word_l1, 'word_wv_l2': word_l2, 'char_wmd': char_wmd, 'char_wv_cos_sim': char_similarity, 'char_wv_l1': char_l1, 'char_wv_l2': char_l2 }) return rt
def __init__(self, args): path = '.data/squad' dataset_path = path + '/torchtext/' train_examples_path = dataset_path + 'train_examples.pt' dev_examples_path = dataset_path + 'dev_examples.pt' print("preprocessing data files...") if not os.path.exists(f'{path}/{args.train_file}l'): self.preprocess_file(f'{path}/{args.train_file}') if not os.path.exists(f'{path}/{args.dev_file}l'): self.preprocess_file(f'{path}/{args.dev_file}') self.RAW = data.RawField() self.CHAR_NESTING = data.Field(batch_first=True, tokenize=list, lower=True) self.CHAR = data.NestedField(self.CHAR_NESTING, tokenize=word_tokenize) self.WORD = data.Field(batch_first=True, tokenize=word_tokenize, lower=True, include_lengths=True) self.LABEL = data.Field(sequential=False, unk_token=None, use_vocab=False) dict_fields = {'id': ('id', self.RAW), 's_idx': ('s_idx', self.LABEL), 'e_idx': ('e_idx', self.LABEL), 'context': [('c_word', self.WORD), ('c_char', self.CHAR)], 'question': [('q_word', self.WORD), ('q_char', self.CHAR)]} list_fields = [('id', self.RAW), ('s_idx', self.LABEL), ('e_idx', self.LABEL), ('c_word', self.WORD), ('c_char', self.CHAR), ('q_word', self.WORD), ('q_char', self.CHAR)] if os.path.exists(dataset_path): print("loading splits...") train_examples = torch.load(train_examples_path) dev_examples = torch.load(dev_examples_path) self.train = data.Dataset(examples=train_examples, fields=list_fields) self.dev = data.Dataset(examples=dev_examples, fields=list_fields) else: print("building splits...") self.train, self.dev = data.TabularDataset.splits( path=path, train=f'{args.train_file}l', validation=f'{args.dev_file}l', format='json', fields=dict_fields) os.makedirs(dataset_path) torch.save(self.train.examples, train_examples_path) torch.save(self.dev.examples, dev_examples_path) #cut too long context in the training set for efficiency. if args.context_threshold > 0: self.train.examples = [e for e in self.train.examples if len(e.c_word) <= args.context_threshold] print("building vocab...") self.CHAR.build_vocab(self.train, self.dev) self.WORD.build_vocab(self.train, self.dev, vectors=FastText(language='ja')) print("building iterators...") self.train_iter, self.dev_iter = \ data.BucketIterator.splits((self.train, self.dev), batch_sizes=[args.train_batch_size, args.dev_batch_size], device=args.gpu, sort_key=lambda x: len(x.c_word))
def __init__(self, path, train_file, dev_file, vocab_max_size, train_samples, dev_samples, train_batch_size, dev_batch_size, word_dim=100, glove_tokens='840B'): #path = '..data/squad' dataset_path = path + '/torchtext/' train_examples_path = dataset_path + 'train_examples.pt' dev_examples_path = dataset_path + 'dev_examples.pt' self.train_file = train_file self.dev_file = dev_file self.context_threshold = 400 self.word_dim = word_dim self.gpu = 0 self.train_batch_size = train_batch_size self.dev_batch_size = dev_batch_size self.train_samples = train_samples self.dev_samples = dev_samples self.glove_tokens = glove_tokens print("preprocessing data files...") if not os.path.exists(path + '/' + self.train_file + 'l'): self.preprocess_file(path + '/' + self.train_file, self.train_samples) if not os.path.exists(path + '/' + self.dev_file + 'l'): self.preprocess_file(path + '/' + self.dev_file, self.dev_samples) self.RAW = data.RawField() # explicit declaration for torchtext compatibility self.RAW.is_target = False self.CHAR_NESTING = data.Field(batch_first=True, tokenize=list, lower=True) self.CHAR = data.NestedField(self.CHAR_NESTING, tokenize=word_tokenize) self.WORD = data.Field(batch_first=True, tokenize=word_tokenize, lower=True, include_lengths=True) self.LABEL = data.Field(sequential=False, unk_token=None, use_vocab=False) dict_fields = { 'id': ('id', self.RAW), 's_idx': ('s_idx', self.LABEL), 'e_idx': ('e_idx', self.LABEL), 'context': [('c_word', self.WORD), ('c_char', self.CHAR)], 'question': [('q_word', self.WORD), ('q_char', self.CHAR)] } list_fields = [('id', self.RAW), ('s_idx', self.LABEL), ('e_idx', self.LABEL), ('c_word', self.WORD), ('c_char', self.CHAR), ('q_word', self.WORD), ('q_char', self.CHAR)] if os.path.exists(dataset_path): print("loading splits...") train_examples = torch.load(train_examples_path) dev_examples = torch.load(dev_examples_path) self.train = data.Dataset(examples=train_examples, fields=list_fields) self.dev = data.Dataset(examples=dev_examples, fields=list_fields) else: print("building splits...") self.train, self.dev = data.TabularDataset.splits( path=path, train=f'{self.train_file}l', validation=f'{self.dev_file}l', format='json', fields=dict_fields) os.makedirs(dataset_path) torch.save(self.train.examples, train_examples_path) torch.save(self.dev.examples, dev_examples_path) #cut too long context in the training set for efficiency. if self.context_threshold > 0: self.train.examples = [ e for e in self.train.examples if len(e.c_word) <= self.context_threshold ] print("building vocab...") self.CHAR.build_vocab(self.train, self.dev) self.WORD.build_vocab(self.train, self.dev, max_size=vocab_max_size, vectors=GloVe(name=self.glove_tokens, dim=self.word_dim)) print("building iterators...") device = torch.device( f"cuda:{self.gpu}" if torch.cuda.is_available() else "cpu") self.train_iter, self.dev_iter = \ data.BucketIterator.splits((self.train, self.dev), batch_sizes=[self.train_batch_size, self.dev_batch_size], device=device, sort_key=lambda x: len(x.c_word))
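# Hedged usage sketch (assumed, not from the snippet above): how a batch from the
# BucketIterators built in the SQuAD loaders above is typically unpacked. Field
# names follow list_fields; since WORD was declared with include_lengths=True,
# c_word and q_word are (ids, lengths) pairs. `unpack_squad_batch` is hypothetical.
def unpack_squad_batch(batch):
    c_word, c_lengths = batch.c_word       # [batch, c_len], [batch]
    q_word, q_lengths = batch.q_word       # [batch, q_len], [batch]
    c_char = batch.c_char                  # [batch, c_len, word_len]
    q_char = batch.q_char                  # [batch, q_len, word_len]
    start, end = batch.s_idx, batch.e_idx  # answer-span start/end indices
    return c_word, c_lengths, q_word, q_lengths, c_char, q_char, start, end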
f_text = data.Field(sequential=True, use_vocab=True)
f_pos_tag = data.Field(sequential=True, use_vocab=False, pad_token=1, unk_token=0)
f_lemma = data.Field(sequential=True, use_vocab=True)
f_label = data.LabelField(tensor_type=torch.FloatTensor)
fields = [('text', f_text), ('pos', f_pos_tag), ('lemma', f_lemma),
          ('label', f_label)]
pipe = TwitterPipeline()
full_examples = pipe.process_data(IN_FILE, fields)[0]
# test_examples = pipe.process_data(
#     IN_FILE_TEST, fields)[0]
full_ds = data.Dataset(full_examples, fields)
# tst_ds = data.Dataset(test_examples, fields)
# do the splitting for trn/val with torchtext
trn_ds, val_ds, tst_ds = full_ds.split(split_ratio=[0.8, 0.1, 0.1],
                                       stratified=True,
                                       random_state=random.seed(SEED))
print(f'train len {len(trn_ds.examples)}')
print(f'val len {len(val_ds.examples)}')
print(f'test len {len(tst_ds.examples)}')
vec = torchtext.vocab.Vectors('embed_tweets_de_100D_fasttext',
                              cache='/Users/michel/Downloads/')
# validation + test data should by no means influence the model, so build the vocab just on trn
f_text.build_vocab(trn_ds, vectors=vec)
def load_data_cls_wo_valid(self, train_file, test_file=None, val_file=None): ''' Loads the data from files Sets up iterators for training, validation and test data Also create vocabulary and word embeddings based on the data Inputs: train_file (String): path to training file test_file (String): path to test file val_file (String): path to validation file ''' NLP = spacy.load('en_core_web_sm') tokenizer = lambda sent: [x.text for x in NLP.tokenizer(sent) if x.text != " "] # Creating Field for data # TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True, fix_length=self.config.max_sen_len) TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True) LABEL = data.Field(sequential=False, use_vocab=False) datafields = [("text", TEXT), ("label", LABEL)] # Load data from pd.DataFrame into torchtext.data.Dataset train_df = self.get_pandas_df(train_file) # train_examples = [data.Example.fromlist(i, datafields) for i in train_df.values.tolist()] train_test_examples = [] train_examples = [] for i in train_df.values.tolist(): label = i[1] text = i[0] text = text.split(' ') category = text[-5::] text_str = text[0:-5] text_str = (' ').join(text_str) text_str = clean_str(text_str) text_str = text_str.split(' ') text = ['<cls>'] + text_str + ['<sep>'] + category text = (' ').join(text) example = data.Example.fromlist([text, label], datafields) train_examples.append(example) train_test_examples.append(example) train_data = data.Dataset(train_examples, datafields) test_df = self.get_pandas_df(test_file) # test_examples = [data.Example.fromlist(i, datafields) for i in test_df.values.tolist()] test_examples = [] for i in test_df.values.tolist(): label = i[1] text = i[0] text = text.split(' ') category = text[-5::] text_str = text[0:-5] text_str = (' ').join(text_str) text_str = clean_str(text_str) text_str = text_str.split(' ') text = ['<cls>'] + text_str + ['<sep>'] + category text = (' ').join(text) example = data.Example.fromlist([text, label], datafields) test_examples.append(example) train_test_examples.append(example) val_data = data.Dataset(test_examples, datafields) train_test_data = data.Dataset(train_test_examples, datafields) # # If validation file exists, load it. Otherwise get validation data from training data # if val_file: # val_df = self.get_pandas_df(val_file) # # val_examples = [data.Example.fromlist(i, datafields) for i in val_df.values.tolist()] # val_examples = [] # for i in val_df.values.tolist(): # label = i[1] # text = i[0] # text = text.split(' ') # text = ['<cls>'] + text[0:-5] + ['<sep>'] + text[-5::] # text = (' ').join(text) # example = data.Example.fromlist([text, label],datafields) # val_examples.append(example) # val_data = data.Dataset(val_examples, datafields) # else: # train_data, val_data = train_data.split(split_ratio=0.8) TEXT.build_vocab(train_test_data) self.vocab = TEXT.vocab # print('toprecreationclimbing' in TEXT.vocab.itos) self.train_iterator = data.BucketIterator( (train_data), batch_size=self.config.batch_size, sort_key=lambda x: len(x.text), repeat=False, shuffle=True) self.val_iterator = data.BucketIterator( (val_data), batch_size=self.config.batch_size, sort_key=lambda x: len(x.text), repeat=False, shuffle=False) # self.val_iterator, self.test_iterator = data.BucketIterator.splits( # (val_data, test_data), # batch_size=self.config.batch_size, # sort_key=lambda x: len(x.text), # repeat=False, # shuffle=False) print("Loaded {} training examples".format(len(train_data))) print("Loaded {} validation examples".format(len(val_data)))
def load_data(self, w2v_file, train_file, test_file, val_file=None):
    '''
    Loads the data from files
    Sets up iterators for training, validation and test data
    Also creates vocabulary and word embeddings based on the data

    Inputs:
        w2v_file (String): absolute path to file containing word embeddings (GloVe/Word2Vec)
        train_file (String): absolute path to training file
        test_file (String): absolute path to test file
        val_file (String): absolute path to validation file
    '''
    NLP = spacy.load('en')
    tokenizer = lambda sent: [x.text for x in NLP.tokenizer(sent) if x.text != " "]

    # Creating Field for data
    TEXT = data.Field(sequential=True, tokenize=tokenizer,
                      lower=True)  # , fix_length=self.config.max_sen_len)
    LABEL = data.LabelField(dtype=torch.float)  # dtype must be a torch dtype
    datafields = [("text", TEXT), ("label", LABEL)]

    # Load data from pd.DataFrame into torchtext.data.Dataset
    train_df = self.get_pandas_df(train_file)
    train_examples = [
        data.Example.fromlist(i, datafields) for i in train_df.values.tolist()
    ]
    train_data = data.Dataset(train_examples, datafields)

    test_df = self.get_pandas_df(test_file)
    test_examples = [
        data.Example.fromlist(i, datafields) for i in test_df.values.tolist()
    ]
    test_data = data.Dataset(test_examples, datafields)

    # If validation file exists, load it. Otherwise get validation data from training data
    if val_file:
        val_df = self.get_pandas_df(val_file)
        val_examples = [
            data.Example.fromlist(i, datafields) for i in val_df.values.tolist()
        ]
        val_data = data.Dataset(val_examples, datafields)
    else:
        train_data, val_data = train_data.split(split_ratio=0.8)

    TEXT.build_vocab(train_data, vectors=Vectors(w2v_file))
    self.word_embeddings = TEXT.vocab.vectors
    self.vocab = TEXT.vocab

    self.train_iterator = data.BucketIterator(
        train_data,
        batch_size=self.config.batch_size,
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=True)

    self.val_iterator, self.test_iterator = data.BucketIterator.splits(
        (val_data, test_data),
        batch_size=self.config.batch_size,
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=False)

    print("Loaded {} training examples".format(len(train_data)))
    print("Loaded {} test examples".format(len(test_data)))
    print("Loaded {} validation examples".format(len(val_data)))
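# Hedged usage sketch (assumed, not from the snippet above): one way the objects
# prepared by load_data might be used — initialize an embedding layer from the
# pretrained vectors and iterate over the training batches. `MyClassifier` and
# `build_and_train` are hypothetical names.
import torch
import torch.nn as nn

def build_and_train(loader, config):
    embedding = nn.Embedding.from_pretrained(loader.word_embeddings, freeze=False)
    model = MyClassifier(embedding, config)      # hypothetical model
    optimizer = torch.optim.Adam(model.parameters())
    for batch in loader.train_iterator:
        optimizer.zero_grad()
        logits = model(batch.text)               # batch.text is [seq_len, batch] by default
        loss = nn.functional.binary_cross_entropy_with_logits(
            logits.squeeze(-1), batch.label)
        loss.backward()
        optimizer.step()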
def main(): args = handleInputs() setRNGSeed(args.rng_seed) use_pyro = args.model_type is not 'nmt' if args.combine_results is not None: if os.path.isdir('./.results'): combineResults(args.combine_results) return else: ValueError( ".results/ does not exist, assumed no experiments previously ran" ) #create directory to store experiments if not os.path.isdir('./.results'): os.mkdir('./.results') #create directory for dataset source to target language pair exp_dir = './.results/{}_{}-{}/'.format(args.dataset, args.source, args.target) if not os.path.isdir(exp_dir): try: os.mkdir(exp_dir) except FileExistsError as e: logging.warning( "You might be trying to create {} twice (you running several runs?)" .format(exp_dir)) if use_pyro: args_name = 'kl-anneal_{}_{}_latents_{}_particles_{}_attn_{}/'.format( args.kl_anneal, args.to_anneal, args.z_dim, args.num_particles, args.use_attention) if args.use_flows: args_name = '{}_{}_'.format(args.flow_type, args.num_flows) + args_name exp_dir = exp_dir + '{}_'.format(args.model_type) + args_name else: exp_dir = exp_dir + 'RNNSearch/' #flag on whether this is an experiment continuation or not if args.opt == 'test' or args.opt == 'validate': #if we are test or validating, it is assumed the experiment was run 1st args.load_latest_epoch = True args.load_epoch = 1 args.load_latest_epoch = args.load_epoch >= 0 and args.load_latest_epoch cont_exp = args.load_epoch >= 0 or args.load_latest_epoch if not os.path.isdir(exp_dir): os.mkdir(exp_dir) else: #there's a logic gate for this...but can't remember what it is if not cont_exp: if not args.debug: raise ValueError( "{} already exists, if change other parameter, please rename existing file" .format(exp_dir)) #keep track of all parameters used log_file = exp_dir + 'experiment.log' init_logger(log_file, cont_exp) if cont_exp: logging.info( "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" ) logging.info( "load_epoch ({}) set. Loading exp config (seems silly otherwise)". format(args.load_epoch)) try: #to_pop is set to things we may want to actually update on the experiment. to_pop = [ "load_epoch", "epochs", "print_every", "decode_alg", "k", "length_norm", "load_latest_epoch", "opt", "bleu_score" ] args = load_args(args, exp_dir, to_pop=to_pop) except FileNotFoundError as e: logging.error( "could not load previous arguments, are you sure you put same parameters as experiment?" 
) logging.error( "Starting experiment over and setting load_epoch = -1") args.load_epoch = -1 args.load_latest_epoch = False cont_exp = False #whether or not we loaded arguments, presumably should also make sure things are da same write_args(args, exp_dir) optimization_dict = get_optimization_dict(args) if args.use_bpe: logging.info("Using BPE models : {} -> {}".format( args.src_bpe, args.trg_bpe)) tokenize_src, tokenize_trg = getBPE(args.src_bpe, args.trg_bpe) else: logging.info("Using Tokenizer: {} -> {}".format( args.source, args.target)) tokenize_src, tokenize_trg = getTokenizer(args.source, args.target, args.on_whitespace) # we include lengths to provide to the RNNs data_save_path = './.data/{}_data_{}_to_{}.pth'.format( args.dataset, args.source, args.target) datahandler = DataHandler(tokenize_src, tokenize_trg, LOWER, EOS_TOKEN, SOS_TOKEN, PAD_TOKEN, UNK_TOKEN, args.min_freq, DEVICE) fields = [('src', datahandler.getSRCField()), ('trg', datahandler.getTRGField())] try: #TODO...figure out how to make this work if possible since...loading is expensive f = torch.load(data_save_path, pickle_module=dill) logging.info( 'found previous saved train and valid data, delete if undesired') datahandler.load_vocabs(f['src_vocab'], f['trg_vocab']) train_data = data.Dataset(f['train_examples'], fields=fields, filter_pred=None) valid_data = data.Dataset(f['valid_examples'], fields=fields, filter_pred=None) test_data = data.Dataset(f['test_examples'], fields=fields, filter_pred=None) except FileNotFoundError as e: logging.warning('could not find previous saved file, building new one') if args.dataset == 'tabular': logging.info("Using Tabular file, assumes no header in files") max_len = args.max_len train_data, valid_data, test_data = data.TabularDataset.splits( path='./.data/', format='tsv', train='train-{}-{}.tsv'.format(args.source, args.target), validation='dev-{}-{}.tsv'.format(args.source, args.target), test='test-{}-{}.tsv'.format(args.source, args.target), skip_header=False, fields=fields, filter_pred=lambda x: filter_fn(x, max_len)) elif args.dataset == 'IWSLT': logging.warning( "You need to create val.de-en.* and test.de-en.* by merging files before" ) train_data, valid_data, test_data = datasets.IWSLT.splits( exts=('.' + args.source, '.' + args.target), fields=(datahandler.getSRCField(), datahandler.getTRGField()), filter_pred=lambda x: filter_fn(x, args.max_len), validation='val', test='test') elif args.dataset == 'WMT14': train_data, valid_data, test_data = datasets.WMT14.splits( exts=('.' + args.source, '.' 
+ args.target), fields=(datahandler.getSRCField(), datahandler.getTRGField())) datahandler.build_vocabs(train_data, args.custom_vocab_src, args.custom_vocab_trg) to_save = { 'train_examples': train_data.examples, 'valid_examples': valid_data.examples, 'test_examples': test_data.examples, 'src_vocab': datahandler.getSRCVocab(), 'trg_vocab': datahandler.getTRGVocab() } torch.save(to_save, data_save_path, pickle_module=dill) logging.info('Vocab Sizes: {} (SRC) {} (TRG)'.format( len(datahandler.getSRCVocab()), len(datahandler.getTRGVocab()))) logging.info('Train dataset Size: {}, Validation dataset Size: {}'.format( len(train_data), len(valid_data))) train_iter = datahandler.getBucketIter(train_data, batch_size=args.batch_size, train=True, sort_within_batch=True, sort_key=lambda x: (len(x.src), len(x.trg)), repeat=False) valid_iter = datahandler.getIter(valid_data, batch_size=1, train=False, sort=False, repeat=False) test_iter = datahandler.getIter(test_data, batch_size=1, train=False, sort=False, repeat=False) if args.use_bpe: trg_bpe = spm.SentencePieceProcessor() trg_bpe.Load(args.trg_bpe) src_bpe = spm.SentencePieceProcessor() src_bpe.Load(args.src_bpe) else: trg_bpe = None if args.bleu_score == 'raw': bleu_func = rawBLEU elif args.bleu_score == 'multi': bleu_func = get_moses_multi_bleu #this is where the magic starts (hopefully) modelfactory = ModelFactory(len(datahandler.getSRCVocab()), len(datahandler.getTRGVocab()), emb_size=args.emb_size, hidden_size=args.hidden_size, num_layers=args.num_layers, dropout=args.dropout, z_layer=args.z_dim, pool_size=args.max_out_dim, use_projection=args.use_projection) model = modelfactory.getModel(args.model_type, use_attention=args.use_attention) cond_flow_scale = 2 if args.use_flows and args.model_type is not 'nmt': if args.flow_type == 'planar': model.loadPlanarFlows(args.num_flows, z_dim=args.z_dim) elif args.flow_type == 'iaf': model.loadIAFs(args.num_flows, z_dim=args.z_dim) elif args.flow_type == 'cond-planar': model.loadConditionalPlanarFlows(args.num_flows, args.hidden_size * cond_flow_scale, z_dim=args.z_dim) elif args.flow_type == 'cond-planar-v2': model.loadConditionalPlanarFlows_v2(args.num_flows, args.hidden_size * cond_flow_scale, z_dim=args.z_dim) elif args.flow_type == 'cond-iaf': model.loadConditionalIAFFlows(args.num_flows, args.hidden_size * cond_flow_scale, z_dim=args.z_dim) if not cont_exp: logging.info( "Initialializing Model parameters randomly with {} scheme".format( args.init_type)) model.initParameters(args.init_type) if not cont_exp: logging.info(model) if USE_CUDA: model = model.cuda() #some internal hacky stuff to let me do hacky things.... 
model.setTrainDataSize(len(train_data)) model.setUnkTokenIndex(datahandler.getTRGVocab().stoi[UNK_TOKEN]) model.setSOSTokenIndex( datahandler.getSRCVocab().stoi[SOS_TOKEN]) #for gnmt model.setPadIndex(datahandler.getSRCVocab().stoi[PAD_TOKEN]) model.setWordDropout(args.word_dropout) model.setUseMeanField("Mean" in args.elbo_type) model.setToAnneal(args.to_anneal) if 'q' not in args.to_anneal and "Mean" in args.elbo_type and args.kl_anneal > 1.0: msg = "You are not annealing the variational distribution even though you request to anneal and are using mean field...which would use analytic form and needs to anneal q" logging.warning(msg) print(msg) if args.model_pth is not None: #model.load('./model_final.pth') model.load(args.model_pth) train_translator = Translator( valid_data, valid_iter, model, max_len=args.max_len, sos_index=datahandler.getTRGVocab().stoi[SOS_TOKEN], eos_index=datahandler.getTRGVocab().stoi[EOS_TOKEN], pad_index=datahandler.getPadIndex(), use_cuda=USE_CUDA) trainer = Trainer(model, train_iter, valid_iter, use_pyro, datahandler.getPadIndex(), train_translator, bleu_func, datahandler.getTRGVocab(), bpe_model=trg_bpe, use_cuda=USE_CUDA, savedir=exp_dir, optim_dict=optimization_dict, kl_anneal=args.kl_anneal, use_aux_loss=args.use_aux_loss, load_epoch=args.load_epoch, use_latest_epoch=args.load_latest_epoch) if args.opt == 'all' or args.opt == 'train': dev_perplexities = trainer.train(num_epochs=args.epochs, print_every=args.print_every) torch.save(dev_perplexities, exp_dir + 'perplexities.pth') elif args.model_pth is None: # get best performing model logging.info("No model path provided, using best model for evaluation") dev_perplexities = trainer.initDevPerplexities() #if dev perplexities is not in order it was trained, this will not work best = {'i': -1, 'val_bleu': 0.0} for i, p in enumerate(dev_perplexities): cur_bleu = p['val_bleu'] if cur_bleu > best['val_bleu']: best['i'] = i best['val_bleu'] = cur_bleu args.model_pth = trainer.getCheckpointPth(best['i']) try: check_pt = torch.load(args.model_pth) model.load(check_pt['model']) #with mutation...this is probably not necessary, but just in case.... trainer.setModel(model) except Exception as e: logging.warning( "Failed to load a model...you do know you request to evaluate right?" ) else: model.load(args.model_pth) val_or_test = args.opt == 'all' or args.opt == 'validate' or args.opt == 'test' or args.opt == 'test_lengths' if val_or_test: if args.opt == 'test' or args.opt == 'test_lengths': dataset = test_data data_iter = test_iter else: dataset = valid_data data_iter = valid_iter scores = {} debug = True if val_or_test and use_pyro and debug: #Test utility of latent variable #Another way to see how useful z is to 0 it out at translation time. 
That way, it gets no weight #This sort of test only makes sense if z is concatentaed as input at each step of decoding model.setUseLatent(False) translator = Translator( dataset, data_iter, model, max_len=args.max_len, sos_index=datahandler.getTRGVocab().stoi[SOS_TOKEN], eos_index=datahandler.getTRGVocab().stoi[EOS_TOKEN], pad_index=datahandler.getPadIndex(), use_cuda=USE_CUDA, k=args.k, length_norm=args.length_norm) no_latent_bleu, hypotheses, references = translator.FullEvalTranslate( datahandler.getTRGVocab(), bleu_func, decodefn=args.decode_alg, bpe_model=trg_bpe) #store information no_latent_name = exp_dir + 'no-latent-{}.tsv'.format(args.opt) write_translations(no_latent_name, hypotheses, references) scores['{}-no_latent'.format(args.opt)] = no_latent_bleu #subtle, but remember we need to use it after this test model.setUseLatent(True) #TODO: Probably not gonna do this...but presumably, because of mutation..., I really don't need to make another one of these... #Do this after the no latent test, because the Translator at this point can be used below for testing lengths if debug: translator = Translator( dataset, data_iter, model, max_len=args.max_len, sos_index=datahandler.getTRGVocab().stoi[SOS_TOKEN], eos_index=datahandler.getTRGVocab().stoi[EOS_TOKEN], pad_index=datahandler.getPadIndex(), use_cuda=USE_CUDA, k=args.k, length_norm=args.length_norm) bleu, hypotheses, references = translator.FullEvalTranslate( datahandler.getTRGVocab(), bleu_func, decodefn=args.decode_alg, bpe_model=trg_bpe) logging.info("{} BLEU score: {} which was ran using {} opt".format( args.bleu_score, bleu, args.opt)) scores[args.opt] = bleu translation_name = exp_dir + '{}.tsv'.format(args.opt) write_translations(translation_name, hypotheses, references) joint_modeling = isinstance(model, GenerativeEncoderDecoder) or isinstance( model, VanillaJointEncoderDecoder) if joint_modeling and debug: model.setDecodeTarget(False) lm_translator = Translator( dataset, data_iter, model, max_len=args.max_len, sos_index=datahandler.getSRCVocab().stoi[SOS_TOKEN], eos_index=datahandler.getSRCVocab().stoi[EOS_TOKEN], pad_index=datahandler.getPadIndex(), use_cuda=USE_CUDA, k=args.k, length_norm=args.length_norm, do_lang_model=True) #Do greedy decoding only for language model. 
With these parameters, performance isn't expected to be tooo amazing bleu, hypotheses, references = lm_translator.FullEvalTranslate( datahandler.getSRCVocab(), bleu_func, decodefn='greedy', bpe_model=src_bpe) scores["lm-{}".format(args.opt)] = bleu translation_name = exp_dir + 'lm-{}.tsv'.format(args.opt) write_translations(translation_name, hypotheses, references) #collect validation "perplexity" for models, mostly for the ELBO if joint_modeling and debug: def get_lm_toks(): return trainer.model.getSRCTokCount() eval_perplexity = trainer.run_lvnmt_eval( trainer.rebatch_iter(data_iter), custom_tok_count=get_lm_toks, count_both=True) #calculate perplexity of language model model.setTrainMT(False) model.setTrainLM(True) lm_eval_perplexity = trainer.run_lvnmt_eval( trainer.rebatch_iter(data_iter), custom_tok_count=get_lm_toks) torch.save(lm_eval_perplexity, exp_dir + '{}-lm_perplexity.pth'.format(args.opt)) else: eval_perplexity = trainer.run_lvnmt_eval( trainer.rebatch_iter(data_iter)) torch.save(eval_perplexity, exp_dir + '{}-eval_perplexity.pth'.format(args.opt)) flow_samples = generate_flow_samples(trainer.model, trainer.rebatch_iter(data_iter), datahandler.getSRCVocab(), datahandler.getTRGVocab(), src_bpe=src_bpe, trg_bpe=trg_bpe) torch.save(flow_samples, exp_dir + '{}-latent_spaces.pth'.format(args.opt)) try: with open(exp_dir + 'bleus-{}.json'.format(args.opt), 'r') as bleu_scores: prev_bleus = json.load(bleu_scores) except Exception as e: prev_bleus = {} with open(exp_dir + 'bleus-{}.json'.format(args.opt), 'w') as bleu_scores: prev_bleus[len(prev_bleus)] = scores json.dump(prev_bleus, bleu_scores) if args.opt == 'test_lengths': logging.info("Calculating BLEU score based on sentence lengths") BLEUS = {} for length in range(5, 70, 5): references_of_length = [] hypotheses_of_length = [] #TODO this is stupidly inefficient... sort the ref - hypo pairs for i in range(len(references)): count = len(references[i].split()) if (length - 4) <= count and count <= length: references_of_length.append(references[i]) hypotheses_of_length.append(hypotheses[i]) bleu = [bleu_func(hypotheses_of_length, references_of_length)] BLEUS['length={}'.format(length)] = bleu save_name = exp_dir + args.model_pth.split( '/')[-1] + "_lengths.tsv" pd.DataFrame.from_dict(BLEUS).to_csv(save_name, sep='\t', index=False) if args.opt == 'tuning': BLEUS = {} BLEUS_list = [] for i in range(0, args.epochs): load_pth = exp_dir + 'checkpoints/epoch_{}.pth'.format(i) model.load(load_pth) translator = Translator( valid_data, valid_iter, model, max_len=60, sos_index=datahandler.getTRGVocab().stoi[SOS_TOKEN], eos_index=datahandler.getTRGVocab().stoi[EOS_TOKEN], pad_index=datahandler.getPadIndex(), use_cuda=USE_CUDA) bleu, hypotheses, references = translator.FullEvalTranslate( datahandler.getTRGVocab(), bleu_func, decodefn='greedy', bpe_model=trg_bpe) BLEUS['epoch_{}'.format(i)] = [bleu] BLEUS_list.append(bleu) logging.info(load_pth) logging.info('{} BLEU score {}'.format(args.bleu_score, bleu)) logging.info("Best model for {} was {} with {} BLEU: {}".format( exp_dir, np.argmax(BLEUS_list), args.bleu_score, max(BLEUS_list))) save_name = exp_dir + "BLEU_scores.tsv" pd.DataFrame.from_dict(BLEUS).to_csv(save_name, sep='\t', index=False)
def load_my_data(self, word_embedding_pkl, pairs_pkl):
    """
    Loads the data from file
    :param word_embedding_pkl: absolute path to word embeddings {Glove/Word2Vec}
    :param pairs_pkl: pkl file containing the saved data
    :param context_flag:
        # 0: only include pairs
        # 1: include pairs and local context
        # 2: include pairs and global context
        # 3: include pairs, local context and global context
    :return:
    """
    tokenizer = lambda text: [x for x in text]
    TEXT = data.Field(sequential=True, tokenize=tokenizer,
                      fix_length=self.config.max_sen_len)
    LABEL = data.Field(sequential=False, use_vocab=False)
    datafields = [("text", TEXT), ("label", LABEL)]

    # Load data from pd.DataFrame into torchtext.data.Dataset
    train_df, test_df, val_df = self.get_my_pandas_df(pairs_pkl,
                                                      self.config.context_flag)
    train_examples = [
        data.Example.fromlist(i, datafields) for i in train_df.values.tolist()
    ]
    train_data = data.Dataset(train_examples, datafields)
    test_examples = [
        data.Example.fromlist(i, datafields) for i in test_df.values.tolist()
    ]
    test_data = data.Dataset(test_examples, datafields)
    val_examples = [
        data.Example.fromlist(i, datafields) for i in val_df.values.tolist()
    ]
    val_data = data.Dataset(val_examples, datafields)

    TEXT.build_vocab(train_data, vectors=Vectors(name=word_embedding_pkl))
    self.word_embeddings = TEXT.vocab.vectors
    self.vocab = TEXT.vocab

    self.train_iterator = data.BucketIterator(
        train_data,
        batch_size=self.config.batch_size,
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=True)
    self.val_iterator, self.test_iterator = data.BucketIterator.splits(
        (val_data, test_data),
        batch_size=self.config.batch_size,
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=True)

    print('Loaded %d training examples' % len(train_data))
    print('Loaded %d test examples' % len(test_data))
    print('Loaded %d validation examples' % len(val_data))
def process_data(hparams, predict_sentences=None): train_fp, dev_fp, test_fp = hparams.train_fp, hparams.dev_fp, hparams.test_fp hparams.bos_token_id, hparams.eos_token_id = 101, 102 do_lower_case = 'uncased' in hparams.model_str tokenizer = AutoTokenizer.from_pretrained( hparams.model_str, do_lower_case=do_lower_case, use_fast=True, data_dir='data/pretrained_cache', add_special_tokens=False, additional_special_tokens=['[unused1]', '[unused2]', '[unused3]']) nlp = spacy.load("en_core_web_sm") pad_index = tokenizer.convert_tokens_to_ids(tokenizer.pad_token) TEXT = data.Field(use_vocab=False, batch_first=True, pad_token=pad_index) WORD_STARTS = data.Field(use_vocab=False, batch_first=True, pad_token=0) POS = data.Field(use_vocab=False, batch_first=True, pad_token=0) POS_INDEX = data.Field(use_vocab=False, batch_first=True, pad_token=0) VERB = data.Field(use_vocab=False, batch_first=True, pad_token=0) VERB_INDEX = data.Field(use_vocab=False, batch_first=True, pad_token=0) META_DATA = data.Field(sequential=False) VERB_WORDS = data.Field(sequential=False) POS_WORDS = data.Field(sequential=False) LABELS = data.NestedField(data.Field(use_vocab=False, batch_first=True, pad_token=-100), use_vocab=False) fields = { 'text': ('text', TEXT), 'labels': ('labels', LABELS), 'word_starts': ('word_starts', WORD_STARTS), 'meta_data': ('meta_data', META_DATA) } if 'predict' not in hparams.mode: fields['pos'] = ('pos', POS) fields['pos_index'] = ('pos_index', POS_INDEX) fields['verb'] = ('verb', VERB) fields['verb_index'] = ('verb_index', VERB_INDEX) if hparams.task == 'oie': label_dict = { 'NONE': 0, 'ARG1': 1, 'REL': 2, 'ARG2': 3, 'LOC': 4, 'TIME': 4, 'TYPE': 5, 'ARGS': 3 } else: # hparams.task == 'conj': label_dict = { 'CP_START': 2, 'CP': 1, 'CC': 3, 'SEP': 4, 'OTHERS': 5, 'NONE': 0 } cached_train_fp, cached_dev_fp, cached_test_fp = f'{train_fp}.{hparams.model_str.replace("/","_")}.pkl', f'{dev_fp}.{hparams.model_str.replace("/","_")}.pkl', f'{test_fp}.{hparams.model_str.replace("/","_")}.pkl' all_sentences = [] if 'predict' in hparams.mode: # no caching used in predict mode if predict_sentences == None: # predict if hparams.inp != None: predict_f = open(hparams.inp, 'r') else: predict_f = open(hparams.predict_fp, 'r') predict_lines = predict_f.readlines() fullstops = [] predict_sentences = [] for line in predict_lines: # Normalize the quotes - similar to that in training data line = line.replace('’', '\'') line = line.replace('”', '\'\'') line = line.replace('“', '\'\'') tokenized_line = line.split() predict_sentences.append(' '.join(tokenized_line) + ' [unused1] [unused2] [unused3]') predict_sentences.append('\n') predict_examples, all_sentences = _process_data( predict_sentences, hparams, fields, tokenizer, label_dict, None) META_DATA.build_vocab( data.Dataset(predict_examples, fields=fields.values())) predict_dataset = [(len(ex.text), idx, ex, fields) for idx, ex in enumerate(predict_examples)] train_dataset, dev_dataset, test_dataset = predict_dataset, predict_dataset, predict_dataset else: if not os.path.exists(cached_train_fp) or hparams.build_cache: train_examples, _ = _process_data(train_fp, hparams, fields, tokenizer, label_dict, nlp) pickle.dump(train_examples, open(cached_train_fp, 'wb')) else: train_examples = pickle.load(open(cached_train_fp, 'rb')) if not os.path.exists(cached_dev_fp) or hparams.build_cache: dev_examples, _ = _process_data(dev_fp, hparams, fields, tokenizer, label_dict, nlp) pickle.dump(dev_examples, open(cached_dev_fp, 'wb')) else: dev_examples = 
pickle.load(open(cached_dev_fp, 'rb')) if not os.path.exists(cached_test_fp) or hparams.build_cache: test_examples, _ = _process_data(test_fp, hparams, fields, tokenizer, label_dict, nlp) pickle.dump(test_examples, open(cached_test_fp, 'wb')) else: test_examples = pickle.load(open(cached_test_fp, 'rb')) META_DATA.build_vocab( data.Dataset(train_examples, fields=fields.values()), data.Dataset(dev_examples, fields=fields.values()), data.Dataset(test_examples, fields=fields.values())) train_dataset = [(len(ex.text), idx, ex, fields) for idx, ex in enumerate(train_examples)] dev_dataset = [(len(ex.text), idx, ex, fields) for idx, ex in enumerate(dev_examples)] test_dataset = [(len(ex.text), idx, ex, fields) for idx, ex in enumerate(test_examples)] train_dataset.sort() # to simulate bucket sort (along with pad_data) return train_dataset, dev_dataset, test_dataset, META_DATA.vocab, all_sentences
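# Hedged sketch (assumed, not from the snippet above): process_data returns plain
# lists of (length, idx, example, fields) tuples, pre-sorted by length "to simulate
# bucket sort (along with pad_data)". pad_data itself is not shown in the snippet;
# the collate function below is a hypothetical illustration of how such a list
# could be batched with a torch DataLoader, padding token ids with the tokenizer's
# pad index computed above.
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def make_loader(dataset, pad_index, batch_size=32):
    def collate(items):
        # each item is (length, idx, example, fields)
        texts = [torch.tensor(ex.text) for _, _, ex, _ in items]
        return pad_sequence(texts, batch_first=True, padding_value=pad_index)

    # shuffle=False keeps the length-sorted order, so each batch stays roughly uniform in length
    return DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate)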
def load_data(self, dataset, more=False, examples=None, already_read=True): print("Preparing Data Loaders") self.sentence_field = data.Field( sequential=True, use_vocab=True, init_token='<BOS>', eos_token='<EOS>', #function to preprocess preprocessing=data.Pipeline(convert_token=preprocess), tensor_type=torch.LongTensor, lower=True, tokenize='spacy') fields = [('text', self.sentence_field)] if not already_read: datapath = None trainpath, validpath, testpath = None, None, None if dataset == 'wikitext': datapath = WIKI_PATH paths = [datapath + 'wiki.' + s + '.tokens' for s \ in ['train', 'valid', 'test']] trainpath, validpath, testpath = paths[0], paths[1], paths[2] elif dataset == 'ptb': datapath = PTB_PATH paths = [ datapath + s + '.txt' for s in ['train', 'valid', 'test'] ] trainpath, validpath, testpath = paths[0], paths[1], paths[2] elif dataset == 'gigaword': datapath = GIGA_PATH trainpath = datapath + 'thread5.txt' elif dataset == 'gigasmall': datapath = GIGA_PATH trainpath = datapath + 'gigaword_small_train.txt' validpath = datapath + 'gigaword_small_val.txt' testpath = datapath + 'gigaword_small_test.txt' elif dataset == 'reviews': trainpath = 'data/reviews/reviews.txt' print("Retrieving Train Data from file: {}...".format(trainpath)) start = time() self.train_sentences = datasets.LanguageModelingDataset(trainpath,\ self.sentence_field, newline_eos = False) finish = time() - start print("Downloaded in {} minutes".format(finish / 60)) print("Got Train Dataset with {n_tokens} words".format(n_tokens =\ len(self.train_sentences.examples[0].text))) if validpath is not None: print( "Retrieving Valid Data from file: {}...".format(validpath)) self.valid_sentences = datasets.LanguageModelingDataset(validpath,\ self.sentence_field, newline_eos = False) else: self.valid_sentences = None if testpath is not None: print("Retrieving Test Data from file: {}...".format(testpath)) self.test_sentences = datasets.LanguageModelingDataset(testpath,\ self.sentence_field, newline_eos = False) else: self.test_sentences = None elif more: if examples is None: examples = [] already_split = False if already_split: for i, fold in enumerate(dataset): print('Reading fold:{}'.format(i)) pid = os.getpid() py = psutil.Process(pid) memoryUse = py.memory_info( )[0] / 2.**30 # memory use in GB...I think print('memory use:', memoryUse) examples.append(data.Example.fromlist([fold], fields)) self.train_sentences = data.Dataset(examples, fields) self.sentence_field.build_vocab(self.train_sentences) return self.sentence_field.freqs ''' print("EXAMPLES") print(len(examples)) print([ex.text[:100] for ex in examples]) print("EXAMPLES END") ''' else: examples.append(data.Example.fromlist( [dataset], fields)) #[i*fold_size:(i+1)*fold_size one = time() #print("READ EXAMPLES IN {}".format(one - start)) self.train_sentences = data.Dataset(examples, fields)
def get_dataset(load_data, text_field, label_field):
    fields = [('text', text_field), ('label', label_field)]
    examples = []
    for data in load_data:
        content = words_after_jieba(data['content'])
        label = trans_labels(data['label'])
        examples.append(Data.Example.fromlist([content, label], fields))
    return examples, fields

train_examples, train_fields = get_dataset(load_train, TEXT, LABEL)
valid_examples, valid_fields = get_dataset(load_valid, TEXT, LABEL)
test_examples, test_fields = get_dataset(load_test, TEXT, LABEL)

train_set = Data.Dataset(train_examples, train_fields)
valid_set = Data.Dataset(valid_examples, valid_fields)
test_set = Data.Dataset(test_examples, test_fields)

# Build the vocabulary from the pretrained word vectors
TEXT.build_vocab(train_set, vectors=Vectors('w2vformat.txt'))
LABEL.build_vocab(train_set)

train_iter = Data.BucketIterator(train_set, batch_size, sort=False, device=device)
valid_iter = Data.BucketIterator(valid_set, batch_size, sort=False, device=device)
test_iter = Data.BucketIterator(test_set, batch_size, sort=False, device=device)

# print(next(iter(train_iter)).text.shape)
# print(next(iter(train_iter)).label.shape)

# %% [markdown]
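# Hedged note (assumed, not from the snippet above): the commented-out prints
# above can be used as a quick shape check. Assuming TEXT was declared without
# batch_first=True, the batches come out time-major.
batch = next(iter(train_iter))
print(batch.text.shape)   # torch.Size([seq_len, batch_size])
print(batch.label.shape)  # torch.Size([batch_size])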
def __init__(self, config): # logger self.logger = logging.getLogger('MC') # params self.config = config["data_loader"]["args"] # set path (for raw data) data_path = self.config["data_path"] # get data_path_l (for processed data (.jsonl and .pt)) if "search" in self.config["train_file"]: data_path_process = os.path.join(data_path, "search") elif "zhidao" in self.config["train_file"]: data_path_process = os.path.join(data_path, "zhidao") else: raise Exception("not supported data set now!") data_path_process = os.path.join(data_path_process, self.config["process_info"]) ensure_dir(data_path_process) # (for .pt)ls processed_dataset_path = data_path_process + "/torchtext/" train_examples_path = processed_dataset_path + f'{self.config["train_file"]}.pt' dev_examples_path = processed_dataset_path + f'{self.config["dev_file"]}.pt' test_examples_path = processed_dataset_path + f'{self.config["test_file"]}.pt' # define Field self.logger.info("construct data loader....") self.RAW = data.RawField() self.RAW.is_target = False # 读取id值 self.Q_WORD = data.Field(sequential=True, use_vocab=True, batch_first=True, tokenize=lambda x: x, lower=False, include_lengths=True) self.T_WORD = data.Field(sequential=True, use_vocab=True, batch_first=True, tokenize=lambda x: x, lower=False, include_lengths=False) # for multi para [b, para_num, seq_len] or [b, para_num, seq_len, w_len] self.PARAS = data.NestedField(self.T_WORD, use_vocab=True, tokenize=lambda x: x, include_lengths=True) self.LABEL = data.Field(sequential=False, use_vocab=False, unk_token=None) self.ALL_LABELS = data.NestedField(self.LABEL, use_vocab=False, pad_token=0, dtype=torch.long) dict_fields = { 'question_id': ('id', self.RAW), 'question': ('q_word', self.Q_WORD), 'question_type': ('question_type', self.RAW), 'yesno_answers': ('yesno_answers', self.RAW), 'paragraphs': ('paras_word', self.PARAS), 's_idxs': ('s_idxs', self.ALL_LABELS), 'e_idxs': ('e_idxs', self.ALL_LABELS), 'answer_para_idxs': ('answer_para_idxs', self.ALL_LABELS), 'match_scores': ('match_scores', self.RAW) } list_fields = [('id', self.RAW), ('q_word', self.Q_WORD), ('question_type', self.RAW), ('yesno_answers', self.RAW), ('paras_word', self.PARAS), ('s_idxs', self.ALL_LABELS), ('e_idxs', self.ALL_LABELS), ('answer_para_idxs', self.ALL_LABELS), ('match_scores', self.RAW)] test_dict_fields = { 'question_id': ('id', self.RAW), 'question': ('q_word', self.Q_WORD), 'question_type': ('question_type', self.RAW), 'yesno_answers': ('yesno_answers', self.RAW), 'paragraphs': ('paras_word', self.PARAS), } test_list_fields = [ ('id', self.RAW), ('q_word', self.Q_WORD), ('question_type', self.RAW), ('yesno_answers', self.RAW), ('paras_word', self.PARAS), ] # judge if need to build dataSet if not os.path.exists(train_examples_path) or not os.path.exists( dev_examples_path): self.logger.info("build train dataSet....") self.train, self.dev = data.TabularDataset.splits( path=f'{data_path_process}', train=f'{self.config["train_file"]}l', validation=f'{self.config["dev_file"]}l', format='json', fields=dict_fields) # save preprocessed data ensure_dir(processed_dataset_path) torch.save(self.train.examples, train_examples_path) torch.save(self.dev.examples, dev_examples_path) else: self.logger.info("loading train dataSet.....") train_examples = torch.load(train_examples_path) dev_examples = torch.load(dev_examples_path) self.train = data.Dataset(examples=train_examples, fields=list_fields) self.dev = data.Dataset(examples=dev_examples, fields=list_fields) # for test data if not 
os.path.exists(test_examples_path): self.logger.info("build test dataSet....") self.test = data.TabularDataset( path=f'{data_path_process}/{self.config["test_file"]}l', format='json', fields=test_dict_fields) # save preprocessed data ensure_dir(processed_dataset_path) torch.save(self.test.examples, test_examples_path) else: self.logger.info("loading test dataSet......") test_examples = torch.load(test_examples_path) self.test = data.Dataset(examples=test_examples, fields=test_list_fields) # build vocab # vocab_cache_path = f"{data_path}/{self.config['vocab_cache']}" # if not os.path.exists(vocab_cache_path): self.logger.info("build vocab....") # self.CHAR.build_vocab(self.train, self.dev) self.PARAS.build_vocab(self.train.paras_word, self.train.q_word, self.dev.paras_word, self.dev.q_word) self.Q_WORD.vocab = self.PARAS.vocab # load pretrained embeddings vectors = vocab.Vectors(self.config["pretrain_emd_file"]) self.PARAS.vocab.load_vectors(vectors) # # save vocab cache # self.logger.info("save vocab....") # with open(vocab_cache_path, 'wb') as fout: # pickle.dump(self.PARAS.vocab, fout) # else: # # load vocab # self.logger.info(f"load vocab from {vocab_cache_path} ....") # with open(vocab_cache_path, 'rb') as fin: # self.PARAS.vocab = pickle.load(fin) # self.WORD.vocab = self.PARAS.vocab # self.Q_WORD.vocab = self.PARAS.vocab # aliases for convenient access self.vocab_vectors = self.PARAS.vocab.vectors self.vocab = self.PARAS.vocab # build iterators self.logger.info("building iterators....") self.train_iter = data.BucketIterator( dataset=self.train, batch_size=self.config["train_batch_size"], device=self.config["device"], shuffle=True) self.eval_iter = data.BucketIterator( dataset=self.dev, batch_size=self.config["dev_batch_size"], device=self.config["device"], sort_key=lambda x: max( [max(para_len) for para_len in x.paras_word[2]]), sort_within_batch=False, shuffle=False) self.test_iter = data.BucketIterator( dataset=self.test, batch_size=self.config["dev_batch_size"], sort_key=lambda x: max( [max(para_len) for para_len in x.paras_word[2]]), sort_within_batch=False, device=self.config["device"], shuffle=False)
def read_one(self, data_file, dataset_type="train"): pkl_data = pickle.load(Path(data_file).open('rb')) examples = [Example.fromdict(x, self.fields1) for x in pkl_data] dataset = data.Dataset(examples, fields=self.fields2) return dataset
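A hedged usage sketch for read_one; the pickle path, batch size, and the `loader` instance are illustrative, and self.fields1 / self.fields2 are assumed to be defined elsewhere on the class:
# Illustrative only: `loader` is an instance of the class this method belongs to.
train_ds = loader.read_one('data/train.pkl', dataset_type="train")
train_iter = data.Iterator(train_ds, batch_size=32, repeat=False, shuffle=True)
for batch in train_iter:
    pass  # batch attributes follow the field names declared in loader.fields2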
def load_data(self, w2v_file, train_file, test_file, val_file=None): ''' Read data from files and build the iterators, vocabulary and embeddings Inputs: w2v_file(String): pretrained word-vector file (Glove/Word2Vec) train_file(String): path to the training data test_file(String): path to the test data val_file(String): path to the validation data ''' NLP = spacy.load('en') tokenizer = lambda sent: [ x.text for x in NLP.tokenizer(sent) if x.text != " " ] # Create the Field objects TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True, fix_length=self.config.max_sen_len) # sequential must be set to False for LABEL LABEL = data.Field( sequential=False, use_vocab=False ) # integer labels need no numericalization, so use_vocab=False datafields = [("text", TEXT), ("label", LABEL)] # Load the DataFrame rows into torchtext.data.Dataset objects train_df = self.get_pandas_df(train_file) train_examples = [ data.Example.fromlist(i, datafields) for i in train_df.values.tolist() ] # build the training examples train_data = data.Dataset(train_examples, datafields) test_df = self.get_pandas_df(test_file) test_examples = [ data.Example.fromlist(i, datafields) for i in test_df.values.tolist() ] # build the test examples test_data = data.Dataset(test_examples, datafields) # Build the validation set if val_file: val_df = self.get_pandas_df(val_file) val_example = [ data.Example.fromlist(i, datafields) for i in val_df.values.tolist() ] val_data = data.Dataset(val_example, datafields) else: train_data, val_data = train_data.split( split_ratio=0.8) # split off a validation set # Load the pretrained word embeddings TEXT.build_vocab(train_data, vectors=Vectors(w2v_file)) self.word_embeddings = TEXT.vocab.vectors self.vocab = TEXT.vocab # Build the training iterator self.train_iterator = data.BucketIterator( (train_data), batch_size=self.config.batch_size, sort_key=lambda x: len(x.text), repeat=False, shuffle=True) # Build the validation and test iterators self.val_iterator, self.test_iterator = data.BucketIterator.splits( (val_data, test_data), batch_size=self.config.batch_size, sort_key=lambda x: len(x.text), repeat=False, shuffle=False) print("Loaded {} train examples".format(len(train_data))) print("Loaded {} test examples".format(len(test_data))) print("Loaded {} validation examples".format(len(val_data)))
vectors = vocab.Vectors(name='glove.840B.300d.txt', cache='content/drive/') id.build_vocab(dataset) label.build_vocab(dataset) sent.build_vocab(dataset, vectors=vectors) #[protects the non-nested fields(.py#629) from flattening ] embedding_vectors = sent.vocab.vectors vocab_size = len(sent.vocab) train = np.array(dataset.examples[:87170]) valid = np.array(dataset.examples[87170:95239]) test = np.array(dataset.examples[95239:]) train_ds = data.Dataset(train, fields) valid_ds = data.Dataset(valid, fields) test_ds = data.Dataset(test, fields) train_loader = BucketIterator( train_ds, train=True, batch_size=200, shuffle=True, # sort_key=lambda x: x.id, # device=torch.device(0), ) valid_loader = BucketIterator( valid_ds, batch_size=200,
def load_data(embeddings=None, device='cpu', batch_size=32, bptt_len=35, path_to_data="../data", train="02-21.10way.clean", valid="22.auto.clean", test="23.auto.clean", bos_token='<bos>'): """ Function that loads and prepares the Penn Treebank data in two different ways. The first takes fixed length pieces of the entire training text. The second takes each sentence as is. Args: embeddings: the pre-trained word embeddings to use. batch_size: size of batches. bptt_len: the length of the sequences in the batches path_to_data: where the Penn Treebank data is located. train, valid, test: the files to use as train/valid/test. """ # Already tokenized so use identity function. TEXT = data.Field(lower=True, tokenize=lambda x: x) SENTENCE = data.Field(lower=True, tokenize=lambda x: x, include_lengths=True) lm_fields = [("text", TEXT)] s_fields = [("text", SENTENCE), ("target", SENTENCE)] print("Loading data...") # Extract sentences from files; turn into examples. splits_langmodel = [] splits_sentences = [] for f in [train, valid, test]: path = os.path.join(path_to_data, f) # Remove POS tags and concatenate into one list for language modelling. nr_lines = 0 total_tokens = 0 lm_example = [] s_examples = [] with io.open(path, encoding='utf-8') as f: for line in f: nr_lines += 1 # remove POS tags and tree structure. tokens = [bos_token] + re.sub(r"\([0-9] |\)", "", line).split() tokens = [ token for token in tokens if not token.startswith('(') ] total_tokens += len(tokens) lm_example.extend(tokens) s_examples.append([tokens, tokens[1:] + [bos_token]]) avg_length = total_tokens / nr_lines print("Average Sentence Length: {}".format(avg_length)) # The language model datasets are one big Example with all sentences. lm_example = data.Example.fromlist([lm_example], lm_fields) dataset = data.Dataset([lm_example], lm_fields) splits_langmodel.append(dataset) # the sentence datasets contain each sentence as a separate Example. examples = [ data.Example.fromlist(example, s_fields) for example in s_examples ] dataset = data.Dataset(examples, s_fields) splits_sentences.append(dataset) print("Done loading.") # To reduce vocabulary to roughly 22.000 . MIN_FREQ = 2 specials = ['<unk>', '<pad>', bos_token] if embeddings: TEXT.build_vocab(*splits_langmodel, min_freq=MIN_FREQ, vectors=embeddings, specials=specials) else: TEXT.build_vocab(*splits_langmodel, min_freq=MIN_FREQ, specials=specials) # Use BPTTIterator for LM variant. train, valid, test = splits_langmodel lm_train_iter, lm_valid_iter, lm_test_iter = data.BPTTIterator.splits( (train, valid, test), batch_size=batch_size, bptt_len=bptt_len, shuffle=True, device=device) # Make validation/test fit in memory (multi-sample estimates required a bit more). VALID_TEST_BATCH_SIZE = 16 train, valid, test = splits_sentences s_train_iter = data.BucketIterator(train, batch_size=batch_size, sort_key=lambda x: x, shuffle=True, sort=False, device=device) s_valid_iter, s_test_iter = data.BucketIterator.splits( (valid, test), batch_size=VALID_TEST_BATCH_SIZE, shuffle=True, sort=False, device=device) SENTENCE.vocab = TEXT.vocab return (lm_train_iter, lm_valid_iter, lm_test_iter, TEXT), \ (s_train_iter, s_valid_iter, s_test_iter, SENTENCE)
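A hedged consumption sketch for the two iterator families returned above, assuming the Penn Treebank files exist at the default path_to_data; the names on the left are illustrative only:
(lm_train, lm_valid, lm_test, lm_field), (s_train, s_valid, s_test, s_field) = load_data(device='cpu')
for batch in lm_train:
    x, y = batch.text, batch.target          # BPTTIterator pairs each chunk with its one-step-shifted target
    break
for batch in s_train:
    (tokens, lengths), (targets, _) = batch.text, batch.target   # include_lengths=True yields (data, lengths) tuples
    break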
def preprocess(question, equation, sni_model, fields, use_sni=True): """ Returns preprocessed version of question and equation using sni_model and fields """ # handle %'s question = question.replace('%', ' % ') # handle fractions parser = Parser() fractions = re.findall(r'\(\d+\)/\(\d+\)', question) fractions = np.append(fractions, re.findall(r'\(\d+/\d+\)', question)) for i, fraction in enumerate(fractions): #question = question.replace(fraction, str(sys.maxsize - i)) #equation = equation.replace(fraction, str(sys.maxsize - i)) question = question.replace( fraction, str(parser.evaluate(fraction, variables=None))) equation = equation.replace( fraction, str(parser.evaluate(fraction, variables=None))) # handle numbers with units question = re.sub(r'(\d+)([A-Za-z]{1,2})', r'\1 \2', question) # separate equation at operators equation = equation.replace('[', ' ( ') equation = equation.replace(']', ' ) ') equation = equation.replace('+', ' + ') equation = equation.replace('-', ' - ') equation = equation.replace('*', ' * ') equation = equation.replace('/', ' / ') equation = equation.replace('(', ' ( ') equation = equation.replace(')', ' ) ') equation = equation.replace('=', ' = ') equation = equation.replace('^', ' ^ ') # reduce %'s #equation = equation.replace('%', ' / 100 ') equation = re.sub(r'(\d*.{0,1}\d+)%', r'(\1 / 100 )', equation) # preprocess question equation = equation.split() question = question.split() # prepend and postpend null tokens to question to allow for sni window size # of three question = ['null', 'null', 'null'] + question + ['null', 'null', 'null'] # prevent inplace changes on question question_copy = [t for t in question] #print('question_copy:', question_copy) # replace significant numbers in question and equation i = 0 variable_values = dict() for j, token in enumerate(question): if isFloat(token): example = question_copy[j - 3:j + 4] ex = data.Example.fromlist([' '.join(example), ''], fields) dataset = data.Dataset([ex], fields) inp = None iterator = data.Iterator(dataset, batch_size=1) iterator.repeat = False for batch in iterator: inp = batch.text.t() if (not use_sni) or (use_sni and isSignificant(inp, sni_model)): #if (use_sni and isSignificant(inp, sni_model)) or (not use_sni): for symbol in equation: if symbol == token: equation[equation.index(symbol)] = '[' + chr(97 + i) + ']' character = '[' + chr(97 + i) + ']' variable_values[character] = token for q in question: if q == token: question[question.index(q)] = '[' + chr(97 + i) + ']' i += 1 # remove pre/postpended null tokens from question question = question[3:-3] question = ' '.join(question) + '\n' equation = ' '.join(equation) + '\n' return question, equation, variable_values
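A hedged illustration of the number masking this function performs, run with use_sni=False so every number counts as significant; the word problem is made up and `fields` is assumed to be the [('text', TEXT), ...] list (with a built vocabulary) that the SNI model expects:
# Illustrative input/output only; sni_model is unused when use_sni=False.
q, eq, values = preprocess(
    'Dan picked 5 apples and then picked 3 more . How many apples does Dan have ?',
    'x=5+3', sni_model=None, fields=fields, use_sni=False)
# q      -> 'Dan picked [a] apples and then picked [b] more . How many apples does Dan have ?\n'
# eq     -> 'x = [a] + [b]\n'
# values -> {'[a]': '5', '[b]': '3'}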
rel_label += y_hat.tolist() net.train() # switch back to training mode with open('./test_rel.txt', 'w') as fp: for label in rel_label: label = idx_to_label[label] fp.write(str(label) + '\n') test_path = './snli.test' examples = [] fields = [('seq1', sequence), ('seq2', sequence)] with open(test_path, 'r') as fp: contents = fp.readlines() for content in contents: seqs = content.strip().split('|||') examples.append(data.Example.fromlist(seqs, fields)) test_dataset = data.Dataset(examples, fields) print(len(test_dataset)) test_iter = data.Iterator(test_dataset, batch_size=args.batch_size, shuffle=False) test(net, test_iter)
def repeat_augment_and_train(dir_to_save, iter_func, model_wrapper, data_source, aug_algo, encoder_model, sim_measure, datasets, text_field, label_field, frac, num_classes, classifier_params, k, learning_type): """ Runs k trials of augmentation & repeat-classification for a given fraction of labeled training data. Args: dir_to_save (str): directory to save models created/loaded during this process aug_algo (str): which augmentation algorithm to use encoder_model (str): encoder model to use for augmentation (w similarity measure between these encodings) sim_measure (str): which similarity measure to use datasets (list(Dataset)): train/val/test torchtext datasets text_field (Field): torchtext field for sentences label_field (LabelField): torchtext LabelField for class labels frac (float): Fraction of labeled training data to use classifier_params (dict): params for intent classifier to use on augmented data. k (int): Number of times to repeat augmentation-classifier training process learning_type (str): inductive|transductive Returns: 8 statistical measures of the results of these trials """ train_ds, val_ds, test_ds = datasets class_accs, aug_accs, aug_fracs = [], [], [] ps, rs, fs = [], [], [] # FOR ENTROPY HEURISTIC # mst_sigmas, entropies, sigmas, accs, fracs = [], [], [], [], [] # # ABLATION STUDY # sigmas, f1_means, f1_stds, aug_acc_means, aug_acc_stds, frac_used_means, frac_used_stds = [],[],[],[],[],[],[] # for sigma in np.arange(0.035, 0.155, 0.005): # sigmas.append(sigma) for i in tqdm(range(k), total=k): examples = train_ds.examples np.random.shuffle(examples) cutoff = int(frac*len(examples)) if learning_type == "transductive": labeled_examples = train_ds.examples unlabeled_examples = test_ds.examples elif frac == 0: # 1 labeled eg from each class classes_seen = {i: 0 for i in range(num_classes)} labeled_examples, unlabeled_examples = [], [] for eg in examples: if classes_seen[eg.y] == 0: labeled_examples.append(eg) classes_seen[eg.y] += 1 else: unlabeled_examples.append(eg) else: # at least one labeled eg from each class while True: labeled_examples = examples[:cutoff] unlabeled_examples = examples[cutoff:] if len(set([eg.y for eg in labeled_examples])) == num_classes: break np.random.shuffle(examples) ################################################################################################################## # PROPAGATION PROCESS VISUALISATION (FOR DEMO) # from matplotlib import pyplot as plt # from pandas import DataFrame # from sklearn.decomposition import PCA # from sklearn.manifold import TSNE # import matplotlib.transforms as transforms # # EXTRACT DATA & COMPUTE DIM_REDUCED EMBEDDINGS # pickle.dump(labeled_examples, Path(f'./paper/{frac}_labeled_egs.pkl').open('wb')) # pickle.dump(unlabeled_examples, Path(f'./paper/{frac}_unlabeled_egs.pkl').open('wb')) # labeled_examples = pickle.load(Path(f'./paper/{frac}_labeled_egs.pkl').open('rb')) # unlabeled_examples = pickle.load(Path(f'./paper/{frac}_unlabeled_egs.pkl').open('rb')) # intents = pickle.load(Path(f'./data/ic/{data_source}/intents.pkl').open('rb')) # res = encode_data_with_pretrained(data_source, train_ds, test_ds, text_field, encoder_model, labeled_examples, unlabeled_examples) # x_l, y_l, x_u, y_u, _ = res # X = np.concatenate([x_l, x_u]) # Y = np.concatenate([y_l, y_u]) # pca = PCA(n_components=100) # pca_res = pca.fit_transform(X) # tsne = TSNE(n_components=2, verbose=0, perplexity=30, n_iter=1000) # tsne_pca_res = tsne.fit_transform(pca_res) # ts1, ts2 = tsne_pca_res[:,0], tsne_pca_res[:,1] # df_tsne_pca = 
DataFrame([{ # 'intent': intents[y], # 'x-tsne-pca': t1, # 'y-tsne-pca': t2, # 'og_idx': idx # } for idx, (y,t1,t2) in enumerate(zip(Y,ts1,ts2))]) # df_tsne_pca.to_pickle(f'./paper/{frac}_dataframe.pkl') # df_tsne_pca = pd.read_pickle(f'./paper/{frac}_dataframe.pkl') # # PLOT INITIAL DATASET # fig, ax = plt.subplots() # n_l = len(labeled_examples) # for idx, intent in enumerate(set(df_tsne_pca['intent'].values)): # values = [v for v in df_tsne_pca.loc[df_tsne_pca['intent']==intent].drop(columns=['intent']).values] # for i, v in enumerate(values): # if v[0] < n_l: # ax.scatter(v[1], v[2], color=f'C{idx}', s=100, alpha=1, label=intent) # else: # ax.scatter(v[1], v[2], color='black', s=100, alpha=0.2) # title = 'propagation_initial_labeled_only' # for idx, intent in enumerate(set(df_tsne_pca['intent'].values)): # values = [v for v in df_tsne_pca.loc[df_tsne_pca['intent']==intent].drop(columns=['intent']).values] # ax.scatter([v[1] for v in values], [v[2] for v in values], color=f'C{idx}', s=100, alpha=1, label=intent) # title = 'propagation_initial_all' # ax.grid(b=False) # ax.set_ylim(-7.6, 12.5) # ax.set_xlim(-10.5, 5.2) # fig.set_size_inches(15, 10) # plt.legend(loc='lower right', frameon=True, fancybox=True, shadow=True, fontsize='large') # plt.savefig(f'./paper/{title}.pdf', format='pdf', dpi=100) # plt.show() # assert(False) # # PRELIMINARY DATA FOR MAIN PLOT # dim_reduced_points = [0 for _ in range(100)] # for idx, intent in enumerate(set(df_tsne_pca['intent'].values)): # values = [v for v in df_tsne_pca.loc[df_tsne_pca['intent']==intent].drop(columns=['intent']).values] # for v in values: # dim_reduced_points[int(v[0])] = (v[1:],intent) # data = pickle.load(Path('./paper/propagation_data.pkl').open('rb')) # indices = pickle.load(Path('./paper/indices_data.pkl').open('rb')) # classifications = pickle.load(Path('./paper/classifications_data.pkl').open('rb')) # colors = {'findconnection': 'C1', 'departuretime': 'C0'} # intent_map = {0: 'findconnection', 1: 'departuretime'} # classified_indices = [0, 1] # classified_true_labels = ['findconnection', 'departuretime'] # classified_intents = ['findconnection', 'departuretime'] # classified_xs = [dim_reduced_points[i][0][0] for i in classified_indices] # classified_ys = [dim_reduced_points[i][0][1] for i in classified_indices] # # PLOT EACH RECURSION & PROPAGATION ITERATION # with plt.style.context('seaborn-whitegrid'): # plt.rcParams['font.family'] = 'serif' # plt.rcParams['mathtext.fontset'] = 'dejavuserif' # # starting point plot # title = '0_final' # fig, ax = plt.subplots() # unclassified_indices = [i for i in range(100) if i not in classified_indices] # unclassified_xs = [dim_reduced_points[i][0][0] for i in unclassified_indices] # unclassified_ys = [dim_reduced_points[i][0][1] for i in unclassified_indices] # ax.scatter(unclassified_xs, unclassified_ys, color='black', s=100, alpha=0.2) # ax.scatter(classified_xs[1], classified_ys[1], color=colors[classified_intents[1]], marker='s', s=200, alpha=1, label=classified_intents[1]) # ax.scatter(classified_xs[0], classified_ys[0], color=colors[classified_intents[0]], marker='s', s=200, alpha=1, label=classified_intents[0]) # ax.text(2, 10, 'Recursion 0 -- complete', fontsize=15, color='black', ha="center", va="center") # ax.grid(b=False) # ax.set_ylim(-7.6, 12.5) # ax.set_xlim(-10.5, 5.2) # fig.set_size_inches(15, 10) # plt.legend(loc='lower right', frameon=True, fancybox=True, shadow=True, fontsize='large') # plt.savefig(f'./paper/prop_plots_2/{title}.png', format='png', dpi=150) # 
plt.close() # for recursion_idx, prop_data in tqdm(enumerate(data), total=len(data)): # # plot results during propagation # Y_us = [prop_data[0]] if len(prop_data) == 1 else np.array(prop_data)[range(0, len(prop_data), 100)] # for prop_idx, Y_u in enumerate(Y_us): # title = f'{recursion_idx+1}_{(prop_idx+1)*100}' # fig, ax = plt.subplots() # for idx, row in enumerate(Y_u): # color = colors[intent_map[np.argmax(row)]] # prob = np.max(row) # ax.scatter(unclassified_xs[idx], unclassified_ys[idx], color=color, s=100, alpha=prob*0.75) # for (x, y, intent, true_label) in zip(classified_xs[2:], classified_ys[2:], classified_intents[2:], classified_true_labels[2:]): # ax.scatter(x, y, color=colors[intent], marker='s', s=100, alpha=1) # if intent != true_label: # ax.scatter(x, y, color='black', marker='x', s=150, alpha=1) # ax.scatter(classified_xs[1], classified_ys[1], color=colors[classified_intents[1]], marker='s', s=200, alpha=1, label=classified_intents[1]) # ax.scatter(classified_xs[0], classified_ys[0], color=colors[classified_intents[0]], marker='s', s=200, alpha=1, label=classified_intents[0]) # ax.text(2, 10, f'Recursion {recursion_idx+1} -- iterating...', fontsize=15, color='black', ha="center", va="center") # ax.grid(b=False) # ax.set_ylim(-7.6, 12.5) # ax.set_xlim(-10.5, 5.2) # fig.set_size_inches(15, 10) # plt.legend(loc='lower right', frameon=True, fancybox=True, shadow=True, fontsize='large') # plt.savefig(f'./paper/prop_plots_2/{title}.png', format='png', dpi=150) # plt.close() # # plot the end result of each recursion - i.e. new ground truth classifications # classified_indices += [i + 2 for i in indices[recursion_idx]] # classified_xs = [dim_reduced_points[i][0][0] for i in classified_indices] # classified_ys = [dim_reduced_points[i][0][1] for i in classified_indices] # classified_true_labels = [dim_reduced_points[i][1] for i in classified_indices] # classified_intents += [intent_map[intent_class] for intent_class in classifications[recursion_idx]] # unclassified_indices = [i for i in range(100) if i not in classified_indices] # unclassified_xs = [dim_reduced_points[i][0][0] for i in unclassified_indices] # unclassified_ys = [dim_reduced_points[i][0][1] for i in unclassified_indices] # title = f'{recursion_idx+1}_final' # fig, ax = plt.subplots() # ax.scatter(unclassified_xs, unclassified_ys, color='black', s=100, alpha=0.2) # for (x, y, intent, true_label) in zip(classified_xs[2:], classified_ys[2:], classified_intents[2:], classified_true_labels[2:]): # ax.scatter(x, y, color=colors[intent], marker='s', s=100, alpha=1) # if intent != true_label: # ax.scatter(x, y, color='black', marker='x', s=150, alpha=1) # ax.scatter(classified_xs[1], classified_ys[1], color=colors[classified_intents[1]], marker='s', s=200, alpha=1, label=classified_intents[1]) # ax.scatter(classified_xs[0], classified_ys[0], color=colors[classified_intents[0]], marker='s', s=200, alpha=1, label=classified_intents[0]) # ax.text(2, 10, f'Recursion {recursion_idx+1} -- complete', fontsize=15, color='black', ha="center", va="center") # ax.grid(b=False) # ax.set_ylim(-7.6, 12.5) # ax.set_xlim(-10.5, 5.2) # fig.set_size_inches(15, 10) # plt.legend(loc='lower right', frameon=True, fancybox=True, shadow=True, fontsize='large') # plt.savefig(f'./paper/prop_plots_2/{title}.png', format='png', dpi=150) # plt.close() # assert(False) ################################################################################################################## # # ENTROPY HEURISTIC # res = encode_data_with_pretrained(data_source, 
train_ds, test_ds, text_field, encoder_model, labeled_examples, unlabeled_examples) # x_l, y_l, x_u, y_u, _ = res # mst_sigma, entropy, sigma, acc, frac_used = sigma_fit(x_l, y_l, x_u, y_u, num_classes, data_source) # mst_sigmas.append(mst_sigma); entropies.append(entropy); sigmas.append(sigma); accs.append(acc); fracs.append(frac_used) # continue if aug_algo == "eda": x_l, y_l = [eg.x for eg in labeled_examples], [eg.y for eg in labeled_examples] augmented_x_l, augmented_y_l = eda_corpus(x_l, y_l) new_labeled_data = [{'x': x, 'y': y} for x,y in zip(augmented_x_l, augmented_y_l)] augmented_train_examples = [Example.fromdict(x, {'x': ('x', text_field), 'y': ('y', label_field)}) for x in new_labeled_data] aug_acc = 1; frac_used = 0 elif aug_algo == "none": augmented_train_examples = labeled_examples aug_acc = 1; frac_used = 0 elif aug_algo == "self_feed": sf_thresh = 0.7 augmented_train_examples, aug_acc, frac_used = self_feed(data_source, dir_to_save, iter_func, model_wrapper, labeled_examples, unlabeled_examples, val_ds, test_ds, text_field, label_field, classifier_params, thresh=sf_thresh) else: augmented_train_examples, aug_acc, frac_used = augment(data_source, aug_algo, encoder_model, sim_measure, labeled_examples, unlabeled_examples, train_ds, test_ds, text_field, label_field, num_classes, sigma=None) aug_accs.append(aug_acc); aug_fracs.append(frac_used) new_train_ds = data.Dataset(augmented_train_examples, {'x': text_field, 'y': label_field}) new_datasets = (new_train_ds, val_ds, test_ds) if learning_type == "inductive": acc, p, r, f = do_basic_train_and_classify(new_train_ds, test_ds, classifier_params, data_source) else: # transductive predictions = [eg.y for eg in augmented_train_examples[len(train_ds.examples):]] test_Y = [eg.y for eg in test_ds.examples] acc = accuracy_score(predictions, test_Y) avg = "macro avg" if data_source == "chat" else "weighted avg" report = classification_report(predictions, test_Y, output_dict=True)[avg] p, r, f = report['precision'], report['recall'], report['f1-score'] class_accs.append(acc); ps.append(p); rs.append(r); fs.append(f) # # ENTROPY HEURISTIC # print(np.mean(entropies), np.std(entropies)) # print(np.mean(mst_sigmas), np.std(mst_sigmas)) # print(np.mean(sigmas), np.std(sigmas)) # print(np.mean(accs), np.std(accs)) # print(np.mean(fracs), np.std(fracs)) # assert(False) # # ABLATION STUDY # print(f"SIGMA: {sigma}") # f1_means.append(np.mean(class_accs)); f1_stds.append(np.std(class_accs)) # aug_acc_means.append(np.mean(aug_accs)); aug_acc_stds.append(np.std(aug_accs)) # frac_used_means.append(np.mean(aug_fracs)); frac_used_stds.append(np.std(aug_fracs)) # assert(False) print(f"FRAC '{frac}' Results Below:") print(f'classification acc --> mean: {np.mean(class_accs)}; std: {np.std(class_accs)}') print(f'augmentation acc --> mean: {np.mean(aug_accs)}; std: {np.std(aug_accs)}\t (average frac used: {np.mean(aug_fracs)})') print(f'p/r/f1 means --> precision mean: {np.mean(ps)}; recall mean: {np.mean(rs)}; f1 mean: {np.mean(fs)}') print(f'p/r/f1 stds --> precision std: {np.std(ps)}; recall std: {np.std(rs)}; f1 std: {np.std(fs)}') class_acc_mean, class_acc_std = np.mean(class_accs), np.std(class_accs) aug_acc_mean, aug_acc_std, aug_frac_mean = np.mean(aug_accs), np.std(aug_accs), np.mean(aug_fracs) p_mean, r_mean, f_mean = np.mean(ps), np.mean(rs), np.mean(fs) p_std, r_std, f_std = np.std(ps), np.std(rs), np.std(fs) # # ABLATION STUDY # print([round(s, 3) for s in sigmas]) # print(f1_means) # print(f1_stds) # print(aug_acc_means) # 
print(aug_acc_stds) # print(frac_used_means) # print(frac_used_stds) # assert(False) return class_acc_mean, class_acc_std, aug_acc_mean, aug_acc_std, aug_frac_mean, p_mean, p_std, r_mean, r_std, f_mean, f_std
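A hedged call sketch showing how the eleven values returned above unpack; every argument is a placeholder for objects the surrounding training script would define:
# Placeholder arguments; only the parameter names and the unpacking order come from the function itself.
(class_acc_mean, class_acc_std,
 aug_acc_mean, aug_acc_std, aug_frac_mean,
 p_mean, p_std, r_mean, r_std,
 f_mean, f_std) = repeat_augment_and_train(
    dir_to_save='./checkpoints', iter_func=iter_func, model_wrapper=model_wrapper,
    data_source='chat', aug_algo='none', encoder_model=encoder_model, sim_measure='cosine',
    datasets=(train_ds, val_ds, test_ds), text_field=text_field, label_field=label_field,
    frac=0.1, num_classes=num_classes, classifier_params=classifier_params,
    k=5, learning_type='inductive')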
def load_data(self, train_file, test_file=None, val_file=None): ''' Loads the data from files Sets up iterators for training, validation and test data Also create vocabulary and word embeddings based on the data Inputs: train_file (String): path to training file test_file (String): path to test file val_file (String): path to validation file ''' NLP = spacy.load('en_core_web_sm') tokenizer = lambda sent: [ x.text for x in NLP.tokenizer(sent) if x.text != " " ] # Creating Field for data TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True, fix_length=self.config.max_sen_len) LABEL = data.Field(sequential=False, use_vocab=False) datafields = [("text", TEXT), ("label", LABEL)] # Load data from pd.DataFrame into torchtext.data.Dataset train_df = self.get_pandas_df(train_file) # print(train_df) # exit() train_examples = [ data.Example.fromlist(i, datafields) for i in train_df.values.tolist() ] train_data = data.Dataset(train_examples, datafields) test_df = self.get_pandas_df(test_file) test_examples = [ data.Example.fromlist(i, datafields) for i in test_df.values.tolist() ] test_data = data.Dataset(test_examples, datafields) # If validation file exists, load it. Otherwise get validation data from training data # train_data, val_data = train_data.split(split_ratio=0.9) TEXT.build_vocab(train_data) self.vocab = TEXT.vocab f = open("vocab.txt", "w") out = list(self.vocab.stoi.keys()) for line in out: f.write(line + '\n') f.close() self.train_iterator = data.BucketIterator( (train_data), batch_size=self.config.batch_size, sort_key=lambda x: len(x.text), repeat=False, shuffle=True) self.val_iterator = data.BucketIterator( test_data, batch_size=self.config.batch_size, sort_key=lambda x: len(x.text), repeat=False, shuffle=False) print("Loaded {} training examples".format(len(train_data))) print("Loaded {} test examples".format(len(test_data)))
def translate_list(self, src_sequences, show_progbar=True, n_jobs=1, debug=False): """ Given a list of sequences in the source language to translate to the target language, run them through the model and translate them. Parameters ---------- src_sequences: List of str A list of str sequences in the source language to translate to the target language. show_progbar: boolean, optional (default=True) Whether or not to show a progress bar during translation. Returns ------- target_sequences: List of str A list of str with the translation predictions for each sequence in src_sequences. """ self.model.eval() # Convert the list of src_sequences to a list of Examples. src_examples = [ data.Example.fromlist([src_sequence], [("source", self.source_field)]) for src_sequence in src_sequences ] # Instantiate a Dataset object src_dataset = data.Dataset(src_examples, {"source": self.source_field}) # Create batches on current GPU if CUDA is available, else CPU device = None if torch.cuda.is_available() else -1 # Create an iterator over the source data src_iter = data.Iterator(dataset=src_dataset, batch_size=self.batch_size, device=device, sort=False, repeat=False, shuffle=False, train=False) # Run the data through the model to predict translations all_predicted_indices = [] if show_progbar: predict_iter = tqdm(enumerate(src_iter), total=len(src_iter), file=sys.stdout, desc="Prediction Batches") else: predict_iter = enumerate(src_iter) for predict_batch_idx, predict_batch in predict_iter: # Sort the source data and lengths by length, and translate it. source_data, source_lengths = predict_batch.source source_lengths, sort_indices = torch.sort(source_lengths, -1, descending=True) source_data = Variable( source_data.data.gather(1, sort_indices.expand_as(source_data))) predicted_indices, _ = self._translate_batch( source_data, source_lengths) # Reverse the sorting we did for compatibility with the model # to restore the original input ordering _, reverse_sort_indices = torch.sort(sort_indices, -1) predicted_indices = Variable( predicted_indices.data.gather( 1, reverse_sort_indices.expand_as(predicted_indices))) # Originally shape (seq_len, batch_size) # Transpose to shape (batch_size, seq_len), and then split into # tuple of length batch_size, with each element of shape (1, seq_len) all_predicted_indices.extend( predicted_indices.transpose(0, 1).split(1)) self.model.train() # Convert the predicted indices to tokens with the target side vocab final_strings = [] for seq_predicted_indices in all_predicted_indices: final_string = [] for tok_idx in seq_predicted_indices.squeeze(0).data: if tok_idx == self.target_padding_idx or tok_idx == self.target_eos_idx: break final_string.append(self.target_field.vocab.itos[tok_idx]) if final_string[-1] == self.target_field.eos_token: final_strings.append(self._format_output(final_string[1:-1])) else: final_strings.append(self._format_output(final_string[1:])) return final_strings
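A hedged usage sketch for translate_list; `translator` and the source sentences are illustrative:
# `translator` is assumed to be an already-constructed, trained instance of the surrounding class.
src = ["the cat sat on the mat .", "where is the library ?"]
predictions = translator.translate_list(src, show_progbar=False)
for source, target in zip(src, predictions):
    print(source, "->", target)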
def prep_data(fake_headlines, real_headlines, embedding_length): random.seed(0) split_ratio = 0.15 sentence = data.Field(sequential=True, fix_length=embedding_length, tokenize=data_processor.clean, pad_first=True, tensor_type=torch.LongTensor, lower=True) label = data.Field(sequential=False, use_vocab=False, tensor_type=torch.ByteTensor) fields = [('sentence_text', sentence), ('label', label)] examples = [] headlines = fake_headlines + real_headlines labels = [0] * len(fake_headlines) + [1] * len(real_headlines) for item in zip(headlines, labels): example = data.Example.fromlist(item, fields) examples.append(example) # random.shuffle(examples) sentence.build_vocab(data.Dataset(examples, fields), min_freq=3, vectors="glove.6B.100d") vocab = sentence.vocab embedding = torch.nn.Embedding( num_embeddings=len(vocab), embedding_dim=100, #TODO: change depending on final used word2vec ) embedding.weight.data.copy_(vocab.vectors) temp = list(zip(headlines, labels)) random.shuffle(temp) headlines, labels = zip(*temp) test_split = int(len(headlines) * split_ratio) val_split = int(len(headlines) * split_ratio) + test_split train = headlines[val_split:] val = headlines[test_split:val_split] test = headlines[:test_split] train_labels = labels[val_split:] val_labels = labels[test_split:val_split] test_labels = labels[:test_split] train = sentence.process(train, -1, True) val = sentence.process(val, -1, False) test = sentence.process(test, -1, False) with open('embedding_layer.pkl', 'wb') as f: pickle.dump(embedding, f, pickle.HIGHEST_PROTOCOL) with open('vocabstoi.pkl', 'wb') as f: pickle.dump(vocab.stoi, f, pickle.HIGHEST_PROTOCOL) return train, val, test, train_labels, val_labels, test_labels, embedding, vocab.stoi
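A hedged sketch of unpacking prep_data; the headline lists and the embedding length are placeholders:
# `fake_headlines` / `real_headlines` are assumed lists of raw headline strings.
train_x, val_x, test_x, train_y, val_y, test_y, embedding, stoi = prep_data(
    fake_headlines, real_headlines, embedding_length=100)
embedded = embedding(train_x)   # look up the GloVe-initialised vectors for the processed id tensor
print(len(stoi))                # stoi maps each vocabulary token to its index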
def __init__(self, args): path = 'data/test_path' dataset_path = path + '/Medmentions/' train_examples_path = dataset_path + 'train_examples.pt' dev_examples_path = dataset_path + 'dev_examples.pt' print("preprocessing data files...") if not os.path.exists('{}/{}l'.format(path, args.train_file)): self.preprocess_file('{}/{}'.format(path, args.train_file)) if not os.path.exists('{}/{}l'.format(path, args.dev_file)): self.preprocess_file('{}/{}'.format(path, args.dev_file)) self.RAW = data.RawField() # explicit declaration for torchtext compatibility self.RAW.is_target = False self.CHAR_NESTING = data.Field(batch_first=True, tokenize=list, lower=True) self.CHAR = data.NestedField(self.CHAR_NESTING, tokenize=word_tokenize) self.WORD = data.Field(batch_first=True, tokenize=word_tokenize, lower=True, include_lengths=True) self.LABEL = data.Field(sequential=False, unk_token=None, use_vocab=False) dict_fields = { 'id': ('id', self.RAW), 'p_label': ('p_label', self.LABEL), 'n_label': ('n_label', self.LABEL), 'context': [('c_word', self.WORD), ('c_char', self.CHAR)], 'positive': [('p_word', self.WORD), ('p_char', self.CHAR)], 'negative': [('n_word', self.WORD), ('n_char', self.CHAR)] } list_fields = [('id', self.RAW), ('p_label', self.LABEL), ('n_label', self.LABEL), ('c_word', self.WORD), ('c_char', self.CHAR), ('p_word', self.WORD), ('p_char', self.CHAR), ('n_word', self.WORD), ('n_char', self.CHAR)] if os.path.exists(dataset_path): print("loading splits...") train_examples = torch.load(train_examples_path) dev_examples = torch.load(dev_examples_path) self.train = data.Dataset(examples=train_examples, fields=list_fields) self.dev = data.Dataset(examples=dev_examples, fields=list_fields) else: print("building splits...") self.train, self.dev = data.TabularDataset.splits( path=path, train='{}l'.format(args.train_file), validation='{}l'.format(args.dev_file), format='json', fields=dict_fields) os.makedirs(dataset_path) torch.save(self.train.examples, train_examples_path) torch.save(self.dev.examples, dev_examples_path) #cut too long context in the training set for efficiency. if args.context_threshold > 0: self.train.examples = [ e for e in self.train.examples if len(e.c_word) <= args.context_threshold ] print("building vocab...") self.CHAR.build_vocab(self.train, self.dev) self.WORD.build_vocab(self.train, self.dev, vectors=GloVe(name='6B', dim=args.word_dim)) print("building iterators...") device = torch.device( "cuda:{}".format(args.gpu) if torch.cuda.is_available() else "cpu") self.train_iter = data.BucketIterator(self.train, batch_size=args.train_batch_size, device=device, repeat=True, shuffle=True, sort_key=lambda x: len(x.c_word)) self.dev_iter = data.BucketIterator(self.dev, batch_size=args.dev_batch_size, device=device, repeat=False, sort_key=lambda x: len(x.c_word))
# print(f'\n{model_name} test_acc={test_acc:.4f} oov_ratio={oov_ratio:.4f} oov_acc={oov_acc:.4f}') """ ORI'S MAIN: """ # train_data_fn = 'en-ud-train.upos.tsv' # test_data_fn = 'en-ud-dev.upos.tsv' train_data_fn = 'train_small.tsv' test_data_fn = 'test_small.tsv' TEXT, TAGS = data.Field(lower=True), data.Field(unk_token=None) fields = [('text', TEXT), ('tags', TAGS)] # TRAIN HMM MODEL corpus = load_annotated_corpus(train_data_fn) hmm_model = learn_params(corpus) test = "You are such a good boy!" A = hmm_model[4] B = hmm_model[5] tagged_base = baseline_tag_sentence(word_tokenize(test), hmm_model[1], hmm_model[0]) tagged_hmm = hmm_tag_sentence(word_tokenize(test), A, B) print(tagged_base) print(tagged_hmm) # GET TEST PERFORMANCE test_examples = get_examples_from_data(load_annotated_corpus(test_data_fn), fields) test_data = data.Dataset(test_examples, fields) test_hmm(hmm_model, test_data) print('\ndone')
torch.manual_seed(config.RANDOM_SEED) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Text field: custom tokenizer, lowercase the content, cap the sentence length, etc. TEXT = data.Field(tokenize=utils.en_seg, lower=True, fix_length=config.MAX_SENTENCE_SIZE, batch_first=True) # Label field for the text LABEL = data.LabelField(dtype=torch.float) # Build the examples pos_examples, pos_fields = dataloader.get_dataset(config.POS_CORPUS_PATH, TEXT, LABEL, 'pos') neg_examples, neg_fields = dataloader.get_dataset(config.NEG_CORPUS_PATH, TEXT, LABEL, 'neg') all_examples, all_fields = pos_examples + neg_examples, pos_fields + neg_fields # Build a torchtext Dataset total_data = data.Dataset(all_examples, all_fields) # Split the dataset train_data, test_data = total_data.split(random_state=random.seed(config.RANDOM_SEED), split_ratio=0.8) # Inspect the split # # check the dataset sizes print('len of train data: %r' % len(train_data)) # len of train data: 8530 print('len of test data: %r' % len(test_data)) # len of test data: 2132 # # look at one example print(train_data.examples[100].text) # ['never', 'engaging', ',', 'utterly', 'predictable', 'and', 'completely', 'void', 'of', 'anything', 'remotely', # 'interesting', 'or', 'suspenseful'] print(train_data.examples[100].label) # 0
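A hedged continuation sketch that builds the vocabulary and batches the two splits; the batch size, vocabulary cap, and absence of pretrained vectors are assumptions, not taken from config:
# Assumed follow-up; swap in config values / pretrained vectors as appropriate.
TEXT.build_vocab(train_data, max_size=25000)
LABEL.build_vocab(train_data)
train_iter, test_iter = data.BucketIterator.splits(
    (train_data, test_data), batch_size=64,
    sort_key=lambda x: len(x.text), sort=False, device=device)
batch = next(iter(train_iter))
print(batch.text.shape)   # (batch, MAX_SENTENCE_SIZE): batch_first=True and fix_length are set on TEXT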