def load_data(dataset, n_train, n_val):
    print("Loading embeddings... This can take a while the first time.")
    return encode_sentiment_data(
        dataset,
        embeddings.GloveEmbedding("wikipedia_gigaword", d_emb=EMBEDDING_SIZE, show_progress=True),
        n_train,
        n_val,
    )
def __init__(self, args):
    super().__init__()
    self.glove = E.GloveEmbedding('wikipedia_gigaword', 300, default='zero')
    ### Start of your code
    self.linear1 = nn.Linear(300, 100)
    self.tanh = nn.Tanh()
    self.linear2 = nn.Linear(100, 1)
    self.sigmoid = nn.Sigmoid()
    ### End of your code
    # do not touch this line below
    self.optim = torch.optim.Adam(self.parameters(), args.learning_rate)
def __init__(self, args):
    super().__init__()
    self.glove = E.GloveEmbedding('wikipedia_gigaword', 300, default='zero')
    self.lossfunction = torch.nn.BCELoss()
    ### Start of your code
    self.fullyConnectedOne = torch.nn.Sequential(torch.nn.Linear(300, 100), torch.nn.Tanh())
    self.outputLayer = torch.nn.Sequential(torch.nn.Linear(100, 1), torch.nn.Sigmoid())
    # do not touch this line below
    self.optim = torch.optim.Adam(self.parameters(), args.learning_rate)
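# --- Added illustration, not part of the original exercise code above. ---
# A minimal sketch of how the layers defined in the two __init__ variants might be
# wired into a forward pass: average the 300-d GloVe vectors of a sentence's tokens,
# then apply Linear -> Tanh -> Linear -> Sigmoid to get a probability. The whitespace
# tokenizer and mean pooling are assumptions, and the layers here are untrained, so
# the output only demonstrates shapes and data flow.
import torch
import embeddings as E

glove = E.GloveEmbedding('wikipedia_gigaword', 300, default='zero')
hidden = torch.nn.Sequential(torch.nn.Linear(300, 100), torch.nn.Tanh())
output = torch.nn.Sequential(torch.nn.Linear(100, 1), torch.nn.Sigmoid())

def predict(sentence):
    # one 300-d vector per token; unknown words fall back to zeros (default='zero')
    vecs = torch.tensor([glove.emb(w) for w in sentence.lower().split()])
    pooled = vecs.mean(dim=0)          # average pooling over tokens
    return output(hidden(pooled)).item()

print(predict('a genuinely moving film'))  # a value in (0, 1)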
def from_file(cls, root, dspider, dcache, debug=False):
    train_database, dev_database = editsql_preprocess.read_db_split(dspider)
    conv = converter.Converter()
    kmaps = evaluation.build_foreign_key_map_from_json(os.path.join(dspider, 'tables.json'))

    splits = {}
    for k in ['train', 'dev']:
        with open(os.path.join(root, '{}.json'.format(k)), 'rb') as f:
            splits[k] = []
            for ex in json.load(f):
                splits[k].append(ex)
                if debug and len(splits[k]) > 100:
                    break

    tokenizer = DistilBertTokenizer.from_pretrained(BERT_MODEL, cache_dir=dcache)

    sql_voc = Vocab(['PAD', 'EOS', 'GO', 'SEP', '`', "'", '1', '%', 'yes', '2', '.', '5', 'f', 'm', 'name', 'song', 't', 'l'])

    # make contexts and populate vocab
    for s, data in splits.items():
        proc = []
        for i, ex in enumerate(tqdm.tqdm(data, desc='preprocess {}'.format(s))):
            for turn_i, turn in enumerate(ex['interaction']):
                turn['id'] = '{}/{}:{}'.format(ex['database_id'], i, turn_i)
                turn['db_id'] = ex['database_id']
                turn['prev'] = ex['interaction'][turn_i-1] if turn_i > 0 else None
                new = cls.make_example(turn, tokenizer, sql_voc, kmaps, conv, train=s == 'train')
                if new is not None and (s != 'train' or not new['invalid']):
                    proc.append(new)
        splits[s] = proc

    # make candidate list using vocab
    for s, data in splits.items():
        for ex in data:
            ex['cands_query'], ex['cands_value'] = cls.make_cands(ex, sql_voc)
        splits[s] = data

    # make pointers for training data
    for ex in splits['train']:
        ex['pointer_query'], ex['pointer_value'] = cls.make_query_pointer(ex['sup_query'], ex['cands_query'], ex['cands_value'], sql_voc)

    # look up pretrained word embeddings
    emb = E.ConcatEmbedding([E.GloveEmbedding(), E.KazumaCharEmbedding()], default='zero')
    sql_emb = torch.tensor([emb.emb(w) for w in sql_voc._index2word])
    ext = dict(sql_voc=sql_voc, sql_emb=sql_emb)
    return splits, ext
def from_file(cls, root, dcache, debug=False):
    conv = converter.Converter(os.path.join(root, 'tables.json'))

    splits = {}
    for k in ['train', 'dev']:
        with open(os.path.join(root, '{}.json'.format(k)), 'rb') as f:
            splits[k] = []
            for ex in json.load(f):
                ex['query_orig'] = ex['query']
                splits[k].append(ex)
                if debug and len(splits[k]) > 100:
                    break

    tokenizer = DistilBertTokenizer.from_pretrained(BERT_MODEL, cache_dir=dcache)

    utt_voc = Vocab(['PAD', 'EOS', 'GO'])

    # make contexts and populate vocab
    for s, data in splits.items():
        proc = []
        for i, ex in enumerate(tqdm.tqdm(data, desc='preprocess {}'.format(s))):
            ex['id'] = '{}/{}'.format(ex['db_id'], i)
            new = cls.make_example(ex, tokenizer, utt_voc, conv, train=s == 'train')
            if new is not None and (s != 'train' or not new['invalid']):
                proc.append(new)
        splits[s] = proc

    # make candidate list using vocab
    for s, data in splits.items():
        for ex in data:
            ex['cands_question'] = cls.make_cands(ex, utt_voc)
        splits[s] = data

    # make pointers for training data
    for ex in splits['train']:
        ex['pointer_question'] = cls.make_question_pointer(ex['sup_question'], ex['cands_question'], utt_voc)

    # look up pretrained word embeddings
    emb = E.ConcatEmbedding([E.GloveEmbedding(), E.KazumaCharEmbedding()], default='zero')
    utt_emb = torch.tensor([emb.emb(w) for w in utt_voc._index2word])
    ext = dict(utt_voc=utt_voc, utt_emb=utt_emb)
    return splits, ext
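# --- Added illustration, not part of the loaders above. ---
# ConcatEmbedding concatenates the vectors of its constituents, so with the library
# defaults used in the loaders (300-d GloVe plus the 100-d Kazuma char-ngram
# embedding) each vocab word maps to a 400-d vector, and words missing from GloVe
# have that slice zero-filled because of default='zero'. The exact default
# dimensions are an assumption about the library's defaults.
import embeddings as E

emb = E.ConcatEmbedding([E.GloveEmbedding(), E.KazumaCharEmbedding()], default='zero')
print(len(emb.emb('database')))    # expected 400 = 300 (GloVe) + 100 (Kazuma)
print(len(emb.emb('xyzzy12345')))  # same length; the GloVe slice falls back to zeros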
) print(f"missing pre-trained embedding for {len(unks)} unknown words") return (X_train, y_train), (X_val, y_val) if __name__ == "__main__": train_size = 450 validation_size = 100 learning_rate = 0.01 max_epochs = 250 (X_train, y_train), (X_val, y_val) = encode_sentiment_data( load_dataset("glue", "sst2"), embeddings.GloveEmbedding("wikipedia_gigaword", d_emb=50, show_progress=True), train_size, validation_size, ) model_trainer = SentenceSentimentTrain( CNNSentimentKim(feature_map_size=100, filter_sizes=[3, 4, 5], dropout=0.25)) model_trainer.train( (X_train, y_train), learning_rate, max_epochs=max_epochs, data_val=(X_val, y_val), )
#!/usr/bin/env python
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

import random
import time

from nltk.corpus import brown
import tqdm

import embeddings as E


if __name__ == '__main__':
    random.seed(0)
    n_samples = 10000

    k1 = E.KazumaCharEmbedding(check_same_thread=True)
    k2 = E.KazumaCharEmbedding(check_same_thread=False)
    g1 = E.GloveEmbedding('common_crawl_840', d_emb=300, show_progress=True, check_same_thread=True)
    g2 = E.GloveEmbedding('common_crawl_840', d_emb=300, show_progress=True, check_same_thread=False)

    for w in ['canada', 'vancouver', 'toronto']:
        assert(k1.emb(w) == k2.emb(w))
        assert(g1.emb(w) == g2.emb(w))
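# --- Added follow-up sketch, not in the original script. ---
# The check above verifies that lookups agree regardless of check_same_thread.
# Presumably check_same_thread=False is intended for cases where lookups happen on
# a thread other than the one that created the embedding object (the flag mirrors
# sqlite3.connect's parameter of the same name). The executor and query word below
# are arbitrary choices for illustration.
from concurrent.futures import ThreadPoolExecutor
import embeddings as E

g = E.GloveEmbedding('wikipedia_gigaword', d_emb=100, check_same_thread=False)
with ThreadPoolExecutor(max_workers=1) as pool:
    vec = pool.submit(g.emb, 'vancouver').result()  # lookup runs on a worker thread
print(len(vec))  # 100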
    train_trees = json.load(f)
with open('sharc/trees_dev.json') as f:
    dev_trees = json.load(f)

dout = 'sharc/editor_disjoint'
if not os.path.isdir(dout):
    os.makedirs(dout)

print('Flattening train')
train = create_split(train_trees, vocab)
print('Flattening dev')
dev = create_split(dev_trees, vocab)

par = joblib.Parallel(12)
print('Segmenting train')
train_ba = par(joblib.delayed(segment)(ex, vocab) for ex in tqdm(train))
train_filtered = []
for ex, ba in zip(train, train_ba):
    if ba:
        ex.update(ba)
        train_filtered.append(ex)
print('filtered train from {} to {}'.format(len(train), len(train_filtered)))
print('vocab size {}'.format(len(vocab)))

emb = embeddings.ConcatEmbedding([embeddings.GloveEmbedding(), embeddings.KazumaCharEmbedding()], default='zero')
mat = torch.Tensor([emb.emb(w) for w in vocab._index2word])
torch.save({'vocab': vocab, 'emb': mat}, dout + '/vocab.pt')
torch.save(train_filtered, dout + '/proc_train.pt')
torch.save(dev, dout + '/proc_dev.pt')
#!/usr/bin/env python
import random
import time

from nltk.corpus import brown
import tqdm

import embeddings as E


if __name__ == '__main__':
    random.seed(0)
    n_samples = 10000
    emb = E.GloveEmbedding()

    times = []
    vocab = list(brown.words())
    samples = [random.choice(vocab) for i in range(n_samples)]
    for w in tqdm.tqdm(samples):
        start = time.time()
        emb.emb(w)
        end = time.time()
        times.append(end - start)
    print(sum(times) / len(times))
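# --- Added variation, not in the original benchmark. ---
# The first pass can be dominated by one-time setup (downloading the embedding
# database) and cold disk caches, so timing a second pass over the same samples is
# a rough way to separate setup cost from steady-state lookup time. The helper name
# below is an invention for this sketch.
import time

def mean_lookup_time(emb, words):
    times = []
    for w in words:
        start = time.time()
        emb.emb(w)
        times.append(time.time() - start)
    return sum(times) / len(times)

# reusing `emb` and `samples` from the script above:
# print(mean_lookup_time(emb, samples))  # first pass, includes any setup cost
# print(mean_lookup_time(emb, samples))  # second pass, steady-state lookups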