def get_embd(cfg, vocab):
    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    emb_file = os.path.join(cfg.input_dir(), 'sick_embed.pth')
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:
        # load glove embeddings and vocab
        glove_vocab, glove_emb = utils.load_word_vectors(
            os.path.join(cfg.glove_dir(), 'glove.840B.300d'))
        cfg.logger.debug('==> GLOVE vocabulary size: %d ' % glove_vocab.size())
        emb = torch.zeros(vocab.size(), glove_emb.size(1),
                          dtype=torch.float, device=cfg.device())
        emb.normal_(0, 0.05)
        # zero out the embeddings for padding and other special words if they are absent in vocab
        for idx, item in enumerate([
                Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD,
                Constants.EOS_WORD
        ]):
            emb[idx].zero_()
        for word in vocab.labelToIdx.keys():
            if glove_vocab.getIndex(word):
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(word)]
        torch.save(emb, emb_file)
    return emb
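# Usage sketch (assumptions: `cfg` is the project config object exposing
# input_dir()/glove_dir()/device() and a logger, `vocab` is the dataset Vocab, and
# the model exposes an nn.Embedding as `model.emb`, matching the
# `model.emb.weight.data.copy_(emb)` pattern used further down in this file):
#
#     emb = get_embd(cfg, vocab)
#     model.emb.weight.data.copy_(emb)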
def main(): """ Create a range of clusters and compare them """ word_vectors, used_words, unused_words = load_word_vectors(FILENAME) start = time.time() n_clusters = range(1, 21) print("Using cluster sizes from {} to {}".format(min(n_clusters), max(n_clusters))) kmeans_clusters = [KMeans(n_clusters=n).fit(word_vectors) for n in n_clusters] centroids = [k.cluster_centers_ for k in kmeans_clusters] D_k = [cdist(word_vectors, cent, 'euclidean') for cent in centroids] cIdx = [np.argmin(D, axis=1) for D in D_k] dist = [np.min(D, axis=1) for D in D_k] avgWithinSS = [sum(d) / len(word_vectors) for d in dist] # Total with-in sum of square wcss = [sum(d**2) for d in dist] tss = sum(pdist(word_vectors)**2)/len(word_vectors) bss = tss-wcss stop = time.time() print("Time taken for clustering: {} seconds.".format(stop - start)) print("Plotting elbow curve") plot_elbow_curve(n_clusters=n_clusters, avgWithinSS=avgWithinSS, bss=bss, tss=tss)
def main(): """ Open csv """ word_vectors, used_words, unused_words = load_word_vectors(FILENAME) start = time.time() n_clusters = 3 print("Clustering") kmeans_clustering_predict = KMeans(n_clusters=n_clusters) idx = kmeans_clustering_predict.fit_predict(word_vectors) clustered_words = {} for index, cluster in enumerate(idx): key = used_words[index] if key not in clustered_words: clustered_words[key] = cluster else: raise Exception("Key {} already exists!".format(key)) kmeans_clustering = kmeans_clustering_predict.fit(word_vectors) labels = kmeans_clustering.labels_ metrics.silhouette_score(word_vectors, labels, metric='euclidean') metrics.calinski_harabaz_score(word_vectors, labels) end = time.time() elapsed = end - start print("Time taken for clustering: {} seconds.".format(elapsed)) print("Silhouette score: {}".format( metrics.silhouette_score(word_vectors, labels, metric='euclidean'))) print("CH score: {}".format( metrics.calinski_harabaz_score(word_vectors, labels))) print("Saving word clusters to {}".format(CLUSTER_OUTPUT_FILE)) with open(CLUSTER_OUTPUT_FILE, "w", encoding="utf-8") as f: writer = csv.writer(f, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL) for clustered_word, cluster in clustered_words.items(): writer.writerow([clustered_word, cluster]) with open("wordsNOTvec_output.txt", "w", encoding="utf8") as f: for unused_word in unused_words: f.write(unused_word + "\n") print("Saving word vectors to {}".format(VECTOR_OUTPUT_FILE)) with open(VECTOR_OUTPUT_FILE, "w", encoding="utf-8", newline="") as f: writer = csv.writer(f, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL) for i, word in enumerate(used_words): vectors = word_vectors[i] writer.writerow([ word, ] + vectors.tolist())
def load_glove(glove_path, vocab_size, word_dict):
    # glove_path = "/Users/cyy7645/Desktop/treelstm.pytorch-master/data/glove/"
    glove_vocab, glove_emb = utils.load_word_vectors(
        os.path.join(glove_path, 'glove.840B.300d'))
    # logger.debug('==> GLOVE vocabulary size: %d ' % glove_vocab.size())
    # initialize the word embedding matrix with random normal vectors
    emb = torch.zeros(vocab_size, 300, dtype=torch.float)
    emb.normal_(0, 0.05)
    # zero out the embeddings for padding and other special words if they are absent in vocab
    for idx, item in enumerate(['<PAD>']):
        emb[idx].zero_()
    # if a word from the dataset vocabulary appears in glove_vocab, replace its row
    # with the pretrained GloVe vector
    for word in word_dict.keys():
        if glove_vocab.getIndex(word):
            emb[word_dict[word]] = glove_emb[glove_vocab.getIndex(word)]
    return emb
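# Usage sketch (the glove directory and the toy word_dict below are hypothetical;
# word_dict is assumed to map word -> row index, with index 0 reserved for '<PAD>'
# as in load_glove above):
#
#     word_dict = {'<PAD>': 0, 'the': 1, 'cat': 2}
#     emb = load_glove('./data/glove/', vocab_size=len(word_dict), word_dict=word_dict)
#     embedding = torch.nn.Embedding.from_pretrained(emb, freeze=False)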
def main(): """ Open csv """ args = parse_args() word_vectors, used_words, unused_words = load_word_vectors(args.input) with open(args.output, "w", encoding="utf-8", newline="") as f: writer = csv.writer(f, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL) for i, word in enumerate(used_words): vectors = word_vectors[i] writer.writerow([ word, ] + vectors.tolist())
def main(): global args args = parse_args() vocab_file = os.path.join(args.dtree, 'snli_vocab_cased.txt') vocab = Vocab(filename=vocab_file) args.cuda = args.cuda and torch.cuda.is_available() device = torch.device("cuda:0" if args.cuda else "cpu") torch.manual_seed(args.seed) np.random.seed(args.seed) if args.cuda: torch.cuda.manual_seed(args.seed) torch.backends.cudnn.benchmark = True if not os.path.exists(args.save): os.makedirs(args.save) l_train_file = os.path.join(args.dtree, args.premise_train) r_train_file = os.path.join(args.dtree, args.hypothesis_train) label_train_file = os.path.join(args.dtree, args.label_train) l_dev_file = os.path.join(args.dtree, args.premise_dev) r_dev_file = os.path.join(args.dtree, args.hypothesis_dev) label_dev_file = os.path.join(args.dtree, args.label_dev) l_test_file = os.path.join(args.dtree, args.premise_test) r_test_file = os.path.join(args.dtree, args.hypothesis_test) label_test_file = os.path.join(args.dtree, args.label_test) l_train_squence_file = os.path.join(args.ctree, args.premise_train) r_train_squence_file = os.path.join(args.ctree, args.hypothesis_train) l_dev_squence_file = os.path.join(args.ctree, args.premise_dev) r_dev_squence_file = os.path.join(args.ctree, args.hypothesis_dev) l_test_squence_file = os.path.join(args.ctree, args.premise_test) r_test_squence_file = os.path.join(args.ctree, args.hypothesis_test) print(l_train_file, l_dev_file, l_test_file) print(r_train_file, r_dev_file, r_test_file) print(label_train_file, label_dev_file, label_test_file) # load SICK dataset splits train_file = os.path.join(args.data, 'train.pth') if os.path.isfile(train_file): train_dataset = torch.load(train_file) else: train_dataset = NLIdataset(premise_tree=l_train_file, hypothesis_tree=r_train_file, premise_seq=l_train_squence_file, hypothesis_seq=r_train_squence_file, label=label_train_file, vocab=vocab, num_classes=3, args=args) torch.save(train_dataset, train_file) if args.savedev == 1: dev_file = os.path.join(args.data, 'dev.pth') if os.path.isfile(dev_file): dev_dataset = torch.load(dev_file) else: dev_dataset = NLIdataset(premise_tree=l_dev_file, hypothesis_tree=r_dev_file, premise_seq=l_dev_squence_file, hypothesis_seq=r_dev_squence_file, label=label_dev_file, vocab=vocab, num_classes=3, args=args) torch.save(dev_dataset, dev_file) test_file = os.path.join(args.data, 'test.pth') if os.path.isfile(test_file): test_dataset = torch.load(test_file) else: test_dataset = NLIdataset(premise_tree=l_test_file, hypothesis_tree=r_test_file, premise_seq=l_test_squence_file, hypothesis_seq=r_test_squence_file, label=label_test_file, vocab=vocab, num_classes=3, args=args) torch.save(test_dataset, test_file) else: dev_dataset = NLIdataset(premise_tree=l_dev_file, hypothesis_tree=r_dev_file, premise_seq=l_dev_squence_file, hypothesis_seq=r_dev_squence_file, label=label_dev_file, vocab=vocab, num_classes=3, args=args) test_dataset = NLIdataset(premise_tree=l_test_file, hypothesis_tree=r_test_file, premise_seq=l_test_squence_file, hypothesis_seq=r_test_squence_file, label=label_test_file, vocab=vocab, num_classes=3, args=args) train_data_loader = DataLoader(train_dataset, batch_size=args.batchsize, shuffle=False) dev_data_loader = DataLoader(dev_dataset, batch_size=args.batchsize, shuffle=False) test_data_loader = DataLoader(test_dataset, batch_size=args.batchsize, shuffle=False) # for data in train_data_loader: # lsent, lgraph, rsent, rgraph, label = data # print(label) # break # # initialize model, criterion/loss_function, optimizer # model = 
TreeLSTMforNLI( # vocab.size(), # args.input_dim, # args.mem_dim, # args.hidden_dim, # args.num_classes, # args.sparse, # args.freeze_embed) # for words common to dataset vocab and GLOVE, use GLOVE vectors # for other words in dataset vocab, use random normal vectors emb_file = os.path.join(args.data, 'snli_embed.pth') if os.path.isfile(emb_file): emb = torch.load(emb_file) else: # load glove embeddings and vocab glove_vocab, glove_emb = utils.load_word_vectors( os.path.join(args.glove, 'glove.840B.300d')) emb = torch.zeros(vocab.size(), glove_emb.size(1), dtype=torch.float, device=device) emb.normal_(0, 0.05) # zero out the embeddings for padding and other special words if they are absent in vocab for idx, item in enumerate(['_PAD_', '_UNK_', '_BOS_', '_EOS_']): emb[idx].zero_() for word in vocab.labelToIdx.keys(): if glove_vocab.getIndex(word): emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex( word)] torch.save(emb, emb_file) # plug these into embedding matrix inside model # model.emb.weight.data.copy_(emb) model = ESIM(vocab.size(), args.input_dim, args.mem_dim, embeddings=emb, dropout=0.5, num_classes=args.num_classes, device=device, freeze=args.freeze_embed).to(device) criterion = nn.CrossEntropyLoss() model.to(device), criterion.to(device) if args.optim == 'adam': optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd) elif args.optim == 'adagrad': optimizer = optim.Adagrad(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd) elif args.optim == 'sgd': optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd) trainer = Trainer(args, model, criterion, optimizer, device) best = -999.0 best_loop = 0 for epoch in range(args.epochs): train_loss = trainer.train(train_data_loader) train_loss, train_acc = trainer.test(train_data_loader) dev_loss, dev_acc = trainer.test(dev_data_loader) test_loss, test_acc = trainer.test(test_data_loader) print('==> Epoch {}, Train \tLoss: {}\tAcc: {}'.format( epoch, train_loss, train_acc)) print('==> Epoch {}, Dev \tLoss: {}\tAcc: {}'.format( epoch, dev_loss, dev_acc)) print('==> Epoch {}, Test \tLoss: {}\tAcc: {}'.format( epoch, test_loss, test_acc)) if best < test_acc: best = test_acc best_loop = 0 print('Get Improvement,Save Model, The best performence is %f' % (best)) checkpoint = { 'model': trainer.model.state_dict(), 'optim': trainer.optimizer, 'acc': test_acc, 'args': args, 'epoch': epoch } print('==> New optimum found, checkpointing everything now...') torch.save(checkpoint, '%s.pt' % os.path.join(args.save, args.expname)) else: best_loop += 1 if best_loop > args.patience: print('Early Stop,Best Acc:%f' % (best)) break
def main(): global args args = parse_args(type=1) print(args.name) print(args.model_name) if args.mem_dim == 0: if args.model_name == 'dependency': args.mem_dim = 168 elif args.model_name == 'constituency': args.mem_dim = 150 elif args.model_name == 'lstm': args.mem_dim = 168 elif args.model_name == 'bilstm': args.mem_dim = 168 if args.num_classes == 0: if args.fine_grain: args.num_classes = 5 # 0 1 2 3 4 else: args.num_classes = 3 # 0 1 2 (1 neutral) elif args.num_classes == 2: # assert False # this will not work assert not args.fine_grain args.cuda = args.cuda and torch.cuda.is_available() # args.cuda = False print(args) # torch.manual_seed(args.seed) # if args.cuda: # torch.cuda.manual_seed(args.seed) train_dir = os.path.join(args.data, 'train/') dev_dir = os.path.join(args.data, 'dev/') test_dir = os.path.join(args.data, 'test/') # write unique words from all token files token_files = [ os.path.join(split, 'sents.toks') for split in [train_dir, dev_dir, test_dir] ] # vocab_file = os.path.join(args.data, 'vocab-cased.txt') # use vocab-cased if not os.path.isfile(vocab_file): build_vocab(token_files, vocab_file) # build_vocab(token_files, vocab_file) NO, DO NOT BUILD VOCAB, USE OLD VOCAB # get vocab object from vocab file previously written vocab = Vocab(filename=vocab_file) # vocab.add(Constants.UNK) print('==> SST vocabulary size : %d ' % vocab.size()) # Load SST dataset splits is_preprocessing_data = False # let program turn off after preprocess data if args.model_name == 'dependency' or args.model_name == 'constituency': DatasetClass = SSTDataset elif args.model_name == 'lstm' or args.model_name == 'bilstm': DatasetClass = SeqSSTDataset # train train_file = os.path.join(args.data, 'sst_train.pth') if os.path.isfile(train_file): train_dataset = torch.load(train_file) else: train_dataset = DatasetClass(train_dir, vocab, args.num_classes, args.fine_grain, args.model_name) torch.save(train_dataset, train_file) is_preprocessing_data = True # dev dev_file = os.path.join(args.data, 'sst_dev.pth') if os.path.isfile(dev_file): dev_dataset = torch.load(dev_file) else: dev_dataset = DatasetClass(dev_dir, vocab, args.num_classes, args.fine_grain, args.model_name) torch.save(dev_dataset, dev_file) is_preprocessing_data = True # test test_file = os.path.join(args.data, 'sst_test.pth') if os.path.isfile(test_file): test_dataset = torch.load(test_file) else: test_dataset = DatasetClass(test_dir, vocab, args.num_classes, args.fine_grain, args.model_name) torch.save(test_dataset, test_file) is_preprocessing_data = True criterion = nn.NLLLoss() # initialize model, criterion/loss_function, optimizer if args.embedding == 'multi_channel': args.channel = 2 embedding_model2 = nn.Embedding(vocab.size(), args.input_dim) else: args.channel = 1 if args.model_name == 'dependency' or args.model_name == 'constituency': model = TreeLSTMSentiment(args.cuda, args.channel, args.input_dim, args.mem_dim, args.num_classes, args.model_name, criterion) elif args.model_name == 'lstm' or args.model_name == 'bilstm': model = LSTMSentiment(args.cuda, args.channel, args.input_dim, args.mem_dim, args.num_classes, args.model_name, criterion, pooling=args.pooling) embedding_model = nn.Embedding(vocab.size(), args.input_dim) if args.cuda: embedding_model = embedding_model.cuda() if args.channel == 2: embedding_model2 = embedding_model2.cuda() if args.cuda: model.cuda(), criterion.cuda() # for words common to dataset vocab and GLOVE, use GLOVE vectors # for other words in dataset vocab, use random normal vectors emb_split_token = ' 
' if args.embedding == 'glove': emb_torch = 'sst_embed.pth' emb_vector = 'glove.840B.300d' emb_vector_path = os.path.join(args.glove, emb_vector) # assert os.path.isfile(emb_vector_path+'.txt') elif args.embedding == 'paragram': emb_torch = 'sst_embed_paragram.pth' emb_vector = 'paragram_300_sl999' emb_vector_path = os.path.join(args.paragram, emb_vector) assert os.path.isfile(emb_vector_path + '.txt') elif args.embedding == 'paragram_xxl': emb_torch = 'sst_embed_paragram_xxl.pth' emb_vector = 'paragram-phrase-XXL' emb_vector_path = os.path.join(args.paragram, emb_vector) assert os.path.isfile(emb_vector_path + '.txt') elif args.embedding == 'other': emb_torch = 'other.pth' emb_vector = args.embedding_other emb_vector_path = emb_vector emb_split_token = '\t' assert os.path.isfile(emb_vector_path + '.txt') elif args.embedding == 'multi_channel': emb_torch = 'sst_embed1.pth' emb_torch2 = 'sst_embed2.pth' emb_vector_path = args.embedding_other emb_vector_path2 = args.embedding_othert assert os.path.isfile(emb_vector_path + '.txt') assert os.path.isfile(emb_vector_path2 + '.txt') assert (os.path.isfile(emb_vector_path2 + '.txt'), emb_vector_path2) else: assert False emb_file = os.path.join(args.data, emb_torch) if os.path.isfile(emb_file): emb = torch.load(emb_file) print('load %s' % (emb_file)) else: # load glove embeddings and vocab glove_vocab, glove_emb = load_word_vectors(emb_vector_path, emb_split_token) print('==> Embedding vocabulary size: %d ' % glove_vocab.size()) emb = torch.zeros(vocab.size(), glove_emb.size(1)) for word in vocab.labelToIdx.keys(): if glove_vocab.getIndex(word): emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex( word)] else: emb[vocab.getIndex(word)] = torch.Tensor( emb[vocab.getIndex(word)].size()).normal_(-0.05, 0.05) # torch.save(emb, emb_file) glove_emb = None glove_vocab = None gc.collect() # add pretrain embedding # pretrain embedding would overwrite exist embedding from glove embed1_txt = os.path.join(args.state_dir, 'embed1') if os.path.isfile(embed1_txt + '.txt'): print('load %s' % (embed1_txt)) glove_vocab, glove_emb = load_word_vectors(embed1_txt, emb_split_token) print('==> embed1 vocabulary size: %d ' % glove_vocab.size()) for word in vocab.labelToIdx.keys(): if glove_vocab.getIndex(word): emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex( word)] else: emb[vocab.getIndex(word)] = torch.Tensor( emb[vocab.getIndex(word)].size()).normal_(-0.05, 0.05) torch.save(emb, emb_file) # saved word embedding matrix is_preprocessing_data = True # flag to quit print('done creating emb, quit') if args.embedding == 'multi_channel': emb_file2 = os.path.join(args.data, emb_torch2) if os.path.isfile(emb_file2): emb2 = torch.load(emb_file2) print('load %s' % (emb_file2)) else: # load glove embeddings and vocab glove_vocab, glove_emb = load_word_vectors(emb_vector_path2, emb_split_token) print('==> Embedding vocabulary size: %d ' % glove_vocab.size()) emb2 = torch.zeros(vocab.size(), glove_emb.size(1)) for word in vocab.labelToIdx.keys(): if glove_vocab.getIndex(word): emb2[vocab.getIndex(word)] = glove_emb[ glove_vocab.getIndex(word)] else: emb2[vocab.getIndex(word)] = torch.Tensor( emb2[vocab.getIndex(word)].size()).normal_( -0.05, 0.05) embed2_txt = os.path.join(args.state_dir, 'embed2') if os.path.isfile(embed2_txt + '.txt'): print('load %s' % (embed2_txt)) glove_vocab, glove_emb = load_word_vectors( embed2_txt, emb_split_token) print('==> embed1 vocabulary size: %d ' % glove_vocab.size()) for word in vocab.labelToIdx.keys(): if 
glove_vocab.getIndex(word): emb2[vocab.getIndex(word)] = glove_emb[ glove_vocab.getIndex(word)] else: emb2[vocab.getIndex(word)] = torch.Tensor( emb2[vocab.getIndex(word)].size()).normal_( -0.05, 0.05) torch.save(emb2, emb_file2) glove_emb = None glove_vocab = None gc.collect() is_preprocessing_data = True # flag to quit print('done creating emb, quit') if is_preprocessing_data: print('quit program') quit() # plug these into embedding matrix inside model if args.cuda: emb = emb.cuda() if args.channel == 2: emb2 = emb2.cuda() embedding_model.state_dict()['weight'].copy_(emb) if args.channel == 2: embedding_model2.state_dict()['weight'].copy_(emb2) # load cnn, lstm state_dict here if args.state_dir != 'meow': #TODO: here model.load_state_files(args.state_dir) if args.optim == 'adam': optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd) elif args.optim == 'adagrad': # optimizer = optim.Adagrad(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd) optimizer = optim.Adagrad(model.parameters(), lr=args.lr, weight_decay=args.wd) elif args.optim == 'adadelta': optimizer = optim.Adadelta(model.parameters(), lr=args.lr, weight_decay=args.wd) elif args.optim == 'adam_combine': optimizer = optim.Adam([{ 'params': model.parameters(), 'lr': args.lr, 'weight_decay': args.wd }, { 'params': embedding_model.parameters(), 'lr': args.emblr, 'weight_decay': args.embwd }]) args.manually_emb = 0 elif args.optim == 'adagrad_combine': optimizer = optim.Adagrad([{ 'params': model.parameters(), 'lr': args.lr, 'weight_decay': args.wd }, { 'params': embedding_model.parameters(), 'lr': args.emblr, 'weight_decay': args.embwd }]) args.manually_emb = 0 elif args.optim == 'adam_combine_v2': model.embedding_model = embedding_model optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd) args.manually_emb = 0 metrics = Metrics(args.num_classes) utils.count_param(model) # create trainer object for training and testing # if args.model_name == 'dependency' or args.model_name == 'constituency': # trainer = SentimentTrainer(args, model, embedding_model, criterion, optimizer) # elif args.model_name == 'lstm' or args.model_name == 'bilstm': # trainer = SentimentTrainer(args, model, embedding_model, criterion, optimizer) if args.channel == 1: # trainer = MultiChannelSentimentTrainer(args, model, [embedding_model], criterion, optimizer) trainer = SentimentTrainer(args, model, embedding_model, criterion, optimizer) else: trainer = MultiChannelSentimentTrainer( args, model, [embedding_model, embedding_model2], criterion, optimizer) trainer.set_initial_emb(emb) # trainer = SentimentTrainer(args, model, embedding_model ,criterion, optimizer) test_idx_dir = os.path.join(args.data, args.test_idx) test_idx = None if os.path.isfile(test_idx_dir): print('load test idx %s' % (args.test_idx)) test_idx = np.load(test_idx_dir) mode = args.mode dev_loss, dev_pred, _ = trainer.test( dev_dataset) # make sure thing go smooth before train dev_acc = metrics.sentiment_accuracy_score(dev_pred, dev_dataset.labels) print('==> Dev loss : %f \t' % dev_loss, end="") print('before run dev percentage ', dev_acc) if mode == 'DEBUG': for epoch in range(args.epochs): # print a tree tree, sent, label = dev_dataset[3] utils.print_span(tree, sent, vocab) quit() dev_loss = trainer.train(dev_dataset) dev_loss, dev_pred, _ = trainer.test(dev_dataset) test_loss, test_pred, _ = trainer.test(test_dataset) dev_acc = metrics.sentiment_accuracy_score(dev_pred, dev_dataset.labels) # test_acc = 
metrics.sentiment_accuracy_score(test_pred, test_dataset.labels) print('==> Dev loss : %f \t' % dev_loss, end="") print('Epoch ', epoch, 'dev percentage ', dev_acc) elif mode == "PRINT_TREE": file_path = os.path.join('print_tree', args.name + '.npy') print_list = np.load(file_path) utils.print_trees_file_v2(args, vocab, test_dataset, print_list, name='tree') print('break') quit() elif mode == 'EVALUATE': print('EVALUATION') print('--Model information--') print(model) filename = args.name + '.pth' model = torch.load(os.path.join(args.saved, '_model_' + filename)) embedding_model = torch.load( os.path.join(args.saved, '_embedding_' + filename)) if args.channel == 1: trainer = SentimentTrainer(args, model, embedding_model, criterion, optimizer) elif args.channel == 2: embedding_model2 = torch.load( os.path.join(args.saved, '_embedding2_' + filename)) trainer = MultiChannelSentimentTrainer( args, model, [embedding_model, embedding_model2], criterion, optimizer) test_loss, test_pred, subtree_metrics = trainer.test(test_dataset) test_acc = metrics.sentiment_accuracy_score( test_pred, test_dataset.labels, num_classes=args.num_classes) print(' |test percentage ' + str(test_acc)) result_filename = os.path.join(args.logs, args.name) + 'result.txt' rwriter = open(result_filename, 'w') for i in range(test_pred.size()[0]): rwriter.write( str(int(test_pred[i])) + ' ' + str(int(test_dataset.labels[i])) + '\n') rwriter.close() result_link = log_util.up_gist( result_filename, args.name, __file__, client_id='ec3ce6baf7dad6b7cf2c', client_secret='82240b38a7e662c28b2ca682325d634c9059efb0') print(result_link) print_list = subtree_metrics.print_list utils.print_trees_file_all(args, vocab, test_dataset, print_list, name='Tree') print('____________________' + str(args.name) + '___________________') elif mode == "EXPERIMENT": print('--Model information--') print(model) # dev_loss, dev_pred = trainer.test(dev_dataset) # dev_acc = metrics.sentiment_accuracy_score(dev_pred, dev_dataset.labels, num_classes=args.num_classes) max_dev = 0 max_dev_epoch = 0 filename = args.name + '.pth' for epoch in range(args.epochs): train_loss_while_training = trainer.train(train_dataset) if epoch % 5 == 0: # save at least 1 hours train_loss, train_pred, _ = trainer.test(train_dataset) train_acc = metrics.sentiment_accuracy_score( train_pred, train_dataset.labels, num_classes=args.num_classes) print('Train acc %f ' % (train_acc)) dev_loss, dev_pred, _ = trainer.test(dev_dataset) dev_acc = metrics.sentiment_accuracy_score( dev_pred, dev_dataset.labels, num_classes=args.num_classes) print('==> Train loss : %f \t' % train_loss_while_training, end="") print('Epoch ', epoch, 'dev percentage ', dev_acc) print('Epoch %d dev percentage %f ' % (epoch, dev_acc)) if dev_acc > max_dev: print('update best dev acc %f ' % (dev_acc)) max_dev = dev_acc max_dev_epoch = epoch utils.mkdir_p(args.saved) torch.save(model, os.path.join(args.saved, '_model_' + filename)) torch.save(embedding_model, os.path.join(args.saved, '_embedding_' + filename)) if args.channel == 2: torch.save( embedding_model2, os.path.join(args.saved, '_embedding2_' + filename)) gc.collect() print('epoch ' + str(max_dev_epoch) + ' dev score of ' + str(max_dev)) print('eva on test set ') model = torch.load(os.path.join(args.saved, '_model_' + filename)) embedding_model = torch.load( os.path.join(args.saved, '_embedding_' + filename)) if args.channel == 1: trainer = SentimentTrainer(args, model, embedding_model, criterion, optimizer) elif args.channel == 2: embedding_model2 = torch.load( 
os.path.join(args.saved, '_embedding2_' + filename)) trainer = MultiChannelSentimentTrainer( args, model, [embedding_model, embedding_model2], criterion, optimizer) test_loss, test_pred, subtree_metrics = trainer.test(test_dataset) test_acc = metrics.sentiment_accuracy_score( test_pred, test_dataset.labels, num_classes=args.num_classes) print('Epoch with max dev:' + str(max_dev_epoch) + ' |test percentage ' + str(test_acc)) print_list = subtree_metrics.print_list torch.save(print_list, os.path.join(args.saved, args.name + 'printlist.pth')) utils.print_trees_file(args, vocab, test_dataset, print_list, name='tree') print('____________________' + str(args.name) + '___________________') else: for epoch in range(args.epochs): train_loss = trainer.train(train_dataset) train_loss, train_pred, _ = trainer.test(train_dataset) dev_loss, dev_pred, _ = trainer.test(dev_dataset) test_loss, test_pred, subtree_metrics = trainer.test(test_dataset) train_acc = metrics.sentiment_accuracy_score( train_pred, train_dataset.labels) dev_acc = metrics.sentiment_accuracy_score(dev_pred, dev_dataset.labels) test_acc = metrics.sentiment_accuracy_score( test_pred, test_dataset.labels) print('==> Train loss : %f \t' % train_loss, end="") print('Epoch ', epoch, 'train percentage ', train_acc) print('Epoch ', epoch, 'dev percentage ', dev_acc) print('Epoch ', epoch, 'test percentage ', test_acc) print_list = subtree_metrics.print_list torch.save(print_list, os.path.join(args.saved, args.name + 'printlist.pth')) utils.print_trees_file(args, vocab, test_dataset, print_list, name='tree')
def main(): global args args = parse_args() args.input_dim, args.mem_dim = 300, 150 args.hidden_dim, args.num_classes = 20, 2 args.cuda = args.cuda and torch.cuda.is_available() if args.sparse and args.wd != 0: print('Sparsity and weight decay are incompatible, pick one!') exit() print(args) torch.manual_seed(args.seed) random.seed(args.seed) numpy.random.seed(args.seed) if args.cuda: torch.cuda.manual_seed(args.seed) torch.backends.cudnn.benchmark = True if not os.path.exists(args.save): os.makedirs(args.save) train_dir = os.path.join(args.data, 'train/') dev_dir = os.path.join(args.data, 'dev/') test_dir = os.path.join(args.data, 'test/') # write unique words from all token files sick_vocab_file = os.path.join(args.data, 'sick.vocab') if not os.path.isfile(sick_vocab_file): token_files_a = [ os.path.join(split, 'toks.a') for split in [train_dir, dev_dir, test_dir] ] token_files_b = [ os.path.join(split, 'toks.b') for split in [train_dir, dev_dir, test_dir] ] token_files = token_files_a + token_files_b sick_vocab_file = os.path.join(args.data, 'sick.vocab') build_vocab(token_files, sick_vocab_file) # get vocab object from vocab file previously written vocab = Vocab(filename=sick_vocab_file, data=[ Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD ]) print('==> SICK vocabulary size : %d ' % vocab.size()) # load SICK dataset splits train_file = os.path.join(args.data, 'sick_train.pth') if os.path.isfile(train_file): train_dataset = torch.load(train_file) else: train_dataset = SICKDataset(train_dir, vocab, args.num_classes) torch.save(train_dataset, train_file) print('==> Size of train data : %d ' % len(train_dataset)) dev_file = os.path.join(args.data, 'sick_dev.pth') if os.path.isfile(dev_file): dev_dataset = torch.load(dev_file) else: dev_dataset = SICKDataset(dev_dir, vocab, args.num_classes) torch.save(dev_dataset, dev_file) print('==> Size of dev data : %d ' % len(dev_dataset)) test_file = os.path.join(args.data, 'sick_test.pth') if os.path.isfile(test_file): test_dataset = torch.load(test_file) else: test_dataset = SICKDataset(test_dir, vocab, args.num_classes) torch.save(test_dataset, test_file) print('==> Size of test data : %d ' % len(test_dataset)) # initialize model, criterion/loss_function, optimizer model = SimilarityTreeLSTM(args.cuda, vocab.size(), args.input_dim, args.mem_dim, args.hidden_dim, args.num_classes, args.sparse) criterion = nn.KLDivLoss() if args.cuda: model.cuda(), criterion.cuda() if args.optim == 'adam': optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd) elif args.optim == 'adagrad': optimizer = optim.Adagrad(model.parameters(), lr=args.lr, weight_decay=args.wd) elif args.optim == 'sgd': optimizer = optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.wd) metrics = Metrics(args.num_classes) # for words common to dataset vocab and GLOVE, use GLOVE vectors # for other words in dataset vocab, use random normal vectors emb_file = os.path.join(args.data, 'sick_embed.pth') if os.path.isfile(emb_file): emb = torch.load(emb_file) else: # load glove embeddings and vocab glove_vocab, glove_emb = load_word_vectors( os.path.join(args.glove, 'glove.840B.300d')) print('==> GLOVE vocabulary size: %d ' % glove_vocab.size()) emb = torch.Tensor(vocab.size(), glove_emb.size(1)).normal_(-0.05, 0.05) # zero out the embeddings for padding and other special words if they are absent in vocab for idx, item in enumerate([ Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD ]): emb[idx].zero_() for 
word in vocab.labelToIdx.keys(): word_new = word.decode("utf8") idx_set = [ glove_vocab.getIndex(token) for token in word_tokenize(word_new) ] idx_set = [id for id in idx_set if id is not None] if len(idx_set) != 0: idx_set = torch.LongTensor(idx_set) sum_emb = F.torch.sum(glove_emb.index_select(0, idx_set), 0) else: sum_emb = glove_emb[1] * 0 # for token in word_tokenize(word_new): # idx = glove_vocab.getIndex(token) # if idx is not None: # if sum_emb is None: # sum_emb = glove_emb[idx] # else: # sum_emb += glove_emb[idx] emb[vocab.getIndex(word)] = sum_emb torch.save(emb, emb_file) # plug these into embedding matrix inside model if args.cuda: emb = emb.cuda() model.childsumtreelstm.emb.state_dict()['weight'].copy_(emb) # create trainer object for training and testing trainer = Trainer(args, model, criterion, optimizer) best = -float('inf') for epoch in range(args.epochs): train_loss = trainer.train(train_dataset) train_loss, train_pred = trainer.test(train_dataset) print(train_pred) dev_loss, dev_pred = trainer.test(dev_dataset) print(dev_pred) test_loss, test_pred = trainer.test(test_dataset) train_pearson = metrics.pearson(train_pred, train_dataset.labels) train_mse = metrics.accuracy(train_pred, train_dataset.labels) print('==> Train Loss: {}\tPearson: {}\tL1: {}'.format( train_loss, train_pearson, train_mse)) dev_pearson = metrics.pearson(dev_pred, dev_dataset.labels) dev_mse = metrics.accuracy(dev_pred, dev_dataset.labels) print('==> Dev Loss: {}\tPearson: {}\tL1: {}'.format( dev_loss, dev_pearson, dev_mse)) test_pearson = metrics.pearson(test_pred, test_dataset.labels) test_mse = metrics.accuracy(test_pred, test_dataset.labels) print('==> Test Loss: {}\tPearson: {}\tL1: {}'.format( test_loss, test_pearson, test_mse)) if best < test_pearson: best = test_pearson checkpoint = { 'model': trainer.model.state_dict(), 'optim': trainer.optimizer, 'pearson': test_pearson, 'mse': test_mse, 'args': args, 'epoch': epoch } print('==> New optimum found, checkpointing everything now...') torch.save( checkpoint, '%s.pt' % os.path.join(args.save, args.expname + '.pth'))
def main(): global args args = parse_args() # global logger logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) formatter = logging.Formatter( "[%(asctime)s] %(levelname)s:%(name)s:%(message)s") # file logger fh = logging.FileHandler(os.path.join(args.save, args.expname) + '.log', mode='w') fh.setLevel(logging.INFO) fh.setFormatter(formatter) logger.addHandler(fh) # console logger ch = logging.StreamHandler() ch.setLevel(logging.DEBUG) ch.setFormatter(formatter) logger.addHandler(ch) # argument validation args.cuda = args.cuda and torch.cuda.is_available() if args.sparse and args.wd != 0: logger.error('Sparsity and weight decay are incompatible, pick one!') exit() logger.debug(args) torch.manual_seed(args.seed) random.seed(args.seed) if args.cuda: torch.cuda.manual_seed(args.seed) torch.backends.cudnn.benchmark = True if not os.path.exists(args.save): os.makedirs(args.save) train_dir = os.path.join(args.data, 'train/') dev_dir = os.path.join(args.data, 'dev/') test_dir = os.path.join(args.data, 'test/') # write unique words from all token files sick_vocab_file = os.path.join(args.data, 'sick.vocab') if not os.path.isfile(sick_vocab_file): token_files_a = [ os.path.join(split, 'a.toks') for split in [train_dir, dev_dir, test_dir] ] token_files_b = [ os.path.join(split, 'b.toks') for split in [train_dir, dev_dir, test_dir] ] token_files = token_files_a + token_files_b sick_vocab_file = os.path.join(args.data, 'sick.vocab') build_vocab(token_files, sick_vocab_file) # get vocab object from vocab file previously written vocab = Vocab(filename=sick_vocab_file, data=[ Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD ]) logger.debug('==> SICK vocabulary size : %d ' % vocab.size()) # load SICK dataset splits train_file = os.path.join(args.data, 'sick_train.pth') if os.path.isfile(train_file): train_dataset = torch.load(train_file) else: train_dataset = SICKDataset(train_dir, vocab, args.num_classes) torch.save(train_dataset, train_file) logger.debug('==> Size of train data : %d ' % len(train_dataset)) dev_file = os.path.join(args.data, 'sick_dev.pth') if os.path.isfile(dev_file): dev_dataset = torch.load(dev_file) else: dev_dataset = SICKDataset(dev_dir, vocab, args.num_classes) torch.save(dev_dataset, dev_file) logger.debug('==> Size of dev data : %d ' % len(dev_dataset)) test_file = os.path.join(args.data, 'sick_test.pth') if os.path.isfile(test_file): test_dataset = torch.load(test_file) else: test_dataset = SICKDataset(test_dir, vocab, args.num_classes) torch.save(test_dataset, test_file) logger.debug('==> Size of test data : %d ' % len(test_dataset)) # initialize model, criterion/loss_function, optimizer model = SimilarityTreeLSTM(args.cuda, vocab.size(), args.input_dim, args.mem_dim, args.hidden_dim, args.num_classes, args.sparse) criterion = nn.KLDivLoss() if args.cuda: model.cuda(), criterion.cuda() if args.optim == 'adam': optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd) elif args.optim == 'adagrad': optimizer = optim.Adagrad(model.parameters(), lr=args.lr, weight_decay=args.wd) elif args.optim == 'sgd': optimizer = optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.wd) metrics = Metrics(args.num_classes) # for words common to dataset vocab and GLOVE, use GLOVE vectors # for other words in dataset vocab, use random normal vectors emb_file = os.path.join(args.data, 'sick_embed.pth') if os.path.isfile(emb_file): emb = torch.load(emb_file) else: # load glove embeddings and vocab glove_vocab, glove_emb = 
load_word_vectors( os.path.join(args.glove, 'glove.840B.300d')) logger.debug('==> GLOVE vocabulary size: %d ' % glove_vocab.size()) emb = torch.Tensor(vocab.size(), glove_emb.size(1)).normal_(-0.05, 0.05) # zero out the embeddings for padding and other special words if they are absent in vocab for idx, item in enumerate([ Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD ]): emb[idx].zero_() for word in vocab.labelToIdx.keys(): if glove_vocab.getIndex(word): emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex( word)] torch.save(emb, emb_file) # plug these into embedding matrix inside model if args.cuda: emb = emb.cuda() model.childsumtreelstm.emb.state_dict()['weight'].copy_(emb) # create trainer object for training and testing trainer = Trainer(args, model, criterion, optimizer) best = -float('inf') for epoch in range(args.epochs): train_loss = trainer.train(train_dataset) train_loss, train_pred = trainer.test(train_dataset) dev_loss, dev_pred = trainer.test(dev_dataset) test_loss, test_pred = trainer.test(test_dataset) train_pearson = metrics.pearson(train_pred, train_dataset.labels) train_mse = metrics.mse(train_pred, train_dataset.labels) logger.info( '==> Epoch {}, Train \tLoss: {}\tPearson: {}\tMSE: {}'.format( epoch, train_loss, train_pearson, train_mse)) dev_pearson = metrics.pearson(dev_pred, dev_dataset.labels) dev_mse = metrics.mse(dev_pred, dev_dataset.labels) logger.info( '==> Epoch {}, Dev \tLoss: {}\tPearson: {}\tMSE: {}'.format( epoch, dev_loss, dev_pearson, dev_mse)) test_pearson = metrics.pearson(test_pred, test_dataset.labels) test_mse = metrics.mse(test_pred, test_dataset.labels) logger.info( '==> Epoch {}, Test \tLoss: {}\tPearson: {}\tMSE: {}'.format( epoch, test_loss, test_pearson, test_mse)) if best < test_pearson: best = test_pearson checkpoint = { 'model': trainer.model.state_dict(), 'optim': trainer.optimizer, 'pearson': test_pearson, 'mse': test_mse, 'args': args, 'epoch': epoch } logger.debug( '==> New optimum found, checkpointing everything now...') torch.save(checkpoint, '%s.pt' % os.path.join(args.save, args.expname))
def main(): args = parse_args() print(args) args.cuda = args.cuda and torch.cuda.is_available() torch.manual_seed(args.seed) random.seed(args.seed) if args.cuda: torch.cuda.manual_seed(args.seed) torch.backends.cudnn.benchmark = True if not os.path.exists(args.save): os.makedirs(args.save) train_dir = os.path.join(args.data, 'train/') dev_dir = os.path.join(args.data, 'dev/') test_dir = os.path.join(args.data, 'test/') # write unique words from all token files sick_vocab_file = os.path.join(args.data, 'sick.vocab') if not os.path.isfile(sick_vocab_file): token_files_a = [ os.path.join(split, 'a.toks') for split in [train_dir, dev_dir, test_dir] ] token_files_b = [ os.path.join(split, 'b.toks') for split in [train_dir, dev_dir, test_dir] ] token_files = token_files_a + token_files_b sick_vocab_file = os.path.join(args.data, 'sick.vocab') build_vocab(token_files, sick_vocab_file) # get vocab object from vocab file previously written vocab = Vocab(filename=sick_vocab_file, data=[ Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD ]) # load SICK dataset splits train_file = os.path.join(args.data, 'sick_train.pth') if os.path.isfile(train_file): train_dataset = torch.load(train_file) else: train_dataset = SICKDataset(train_dir, vocab, args.num_classes) torch.save(train_dataset, train_file) dev_file = os.path.join(args.data, 'sick_dev.pth') if os.path.isfile(dev_file): dev_dataset = torch.load(dev_file) else: dev_dataset = SICKDataset(dev_dir, vocab, args.num_classes) torch.save(dev_dataset, dev_file) test_file = os.path.join(args.data, 'sick_test.pth') if os.path.isfile(test_file): test_dataset = torch.load(test_file) else: test_dataset = SICKDataset(test_dir, vocab, args.num_classes) torch.save(test_dataset, test_file) # initialize model, criterion/loss_function, optimizer model = SimilarityTreeLSTM(args.cuda, vocab.size(), args.input_dim, args.mem_dim, args.hidden_dim1, args.hidden_dim2, args.hidden_dim3, args.num_classes, args.sparse, args.att_hops, args.att_units, args.maxlen, args.dropout1, args.dropout2, args.dropout3, freeze_emb=True) criterion = nn.KLDivLoss() if args.cuda: model.cuda(), criterion.cuda() if args.optim == 'adam': optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd) elif args.optim == 'adagrad': optimizer = optim.Adagrad(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd) elif args.optim == 'sgd': optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd) elif args.optim == 'adadelta': optimizer = optim.Adadelta(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd) elif args.optim == 'asgd': optimizer = optim.ASGD(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd) metrics = Metrics(args.num_classes) # for words common to dataset vocab and GLOVE, use GLOVE vectors # for other words in dataset vocab, use random normal vectors emb_file = os.path.join(args.data, 'sick_embed.pth') if os.path.isfile(emb_file): emb = torch.load(emb_file) else: # load glove embeddings and vocab glove_vocab, glove_emb = load_word_vectors( os.path.join(args.glove, 'glove.840B.300d')) emb = torch.Tensor(vocab.size(), glove_emb.size(1)).normal_(-0.05, 0.05) # zero out the embeddings for padding and other special words if they are absent in vocab for idx, item in enumerate([ Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD ]): 
emb[idx].zero_() for word in vocab.labelToIdx.keys(): if glove_vocab.getIndex(word): emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex( word)] torch.save(emb, emb_file) # plug these into embedding matrix inside model if args.cuda: emb = emb.cuda() model.emb.weight.data.copy_(emb) # create trainer object for training and testing trainer = Trainer(args, model, criterion, optimizer) best = -float('inf') def adjust_learning_rate(optimizer, epoch): """Sets the learning rate to the initial LR decayed by 5 every 3 epochs""" lr = args.lr * (0.01**(epoch // 15)) for param_group in optimizer.param_groups: param_group['lr'] = lr for epoch in range(args.epochs): adjust_learning_rate(optimizer, epoch) train_loss = trainer.train(train_dataset) dev_loss, dev_pred = trainer.test(dev_dataset, mode='test') test_pearson = metrics.pearson(dev_pred, dev_dataset.labels) test_mse = metrics.mse(dev_pred, dev_dataset.labels) if best < test_pearson: best = test_pearson checkpoint = { 'model': trainer.model.state_dict(), 'optim': trainer.optimizer, 'pearson': test_pearson, 'mse': test_mse, 'args': args, 'epoch': epoch, 'vocab': vocab } torch.save( checkpoint, '%s.pt' % os.path.join( args.save, args.expname + '_' + str(test_pearson))) # Evaluate trainer.model.load_state_dict(checkpoint['model']) # trainer.train(train_dataset) test_loss, test_pred = trainer.test(test_dataset, mode='test') test_pearson = metrics.pearson(test_pred, test_dataset.labels) test_mse = metrics.mse(test_pred, test_dataset.labels) # Final read out checkpoint = { 'model': trainer.model.state_dict(), 'optim': trainer.optimizer, 'pearson': test_pearson, 'mse': test_mse, 'args': args, 'vocab': vocab } torch.save( checkpoint, '%s.pt' % os.path.join(args.save, 'end_model_test' + str(test_pearson) + '.pt'))
def main():
    global args
    args = parse_args(type=1)
    args.input_dim, args.mem_dim = 300, 168
    if args.fine_grain:
        args.num_classes = 5  # 0 1 2 3 4
    else:
        args.num_classes = 3  # 0 1 2 (1 neutral)
    args.cuda = args.cuda and torch.cuda.is_available()
    print(args)
    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    train_dir = os.path.join(args.data, 'train/')
    dev_dir = os.path.join(args.data, 'dev/')
    test_dir = os.path.join(args.data, 'test/')

    # write unique words from all token files
    token_files = [
        os.path.join(split, 'sents.toks')
        for split in [train_dir, dev_dir, test_dir]
    ]
    vocab_file = os.path.join(args.data, 'vocab.txt')
    build_vocab(token_files, vocab_file)

    # get vocab object from vocab file previously written
    vocab = Vocab(filename=vocab_file,
                  data=[
                      Constants.PAD_WORD, Constants.UNK_WORD,
                      Constants.BOS_WORD, Constants.EOS_WORD
                  ])
    print('==> SST vocabulary size : %d ' % vocab.size())

    # Load SST dataset splits
    is_preprocessing_data = False  # let program turn off after preprocess data

    # train
    train_file = os.path.join(args.data, 'sst_train.pth')
    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = SSTDataset(train_dir, vocab, args.num_classes,
                                   args.fine_grain)
        torch.save(train_dataset, train_file)
        is_preprocessing_data = True

    # dev
    dev_file = os.path.join(args.data, 'sst_dev.pth')
    if os.path.isfile(dev_file):
        dev_dataset = torch.load(dev_file)
    else:
        dev_dataset = SSTDataset(dev_dir, vocab, args.num_classes,
                                 args.fine_grain)
        torch.save(dev_dataset, dev_file)
        is_preprocessing_data = True

    # test
    test_file = os.path.join(args.data, 'sst_test.pth')
    if os.path.isfile(test_file):
        test_dataset = torch.load(test_file)
    else:
        test_dataset = SSTDataset(test_dir, vocab, args.num_classes,
                                  args.fine_grain)
        torch.save(test_dataset, test_file)
        is_preprocessing_data = True

    # initialize model, criterion/loss_function, optimizer
    model = TreeLSTMSentiment(args.cuda, vocab.size(), args.input_dim,
                              args.mem_dim, args.num_classes)
    criterion = nn.CrossEntropyLoss()
    if args.cuda:
        model.cuda(), criterion.cuda()
    if args.optim == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=args.lr,
                               weight_decay=args.wd)
    elif args.optim == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(), lr=args.lr,
                                  weight_decay=args.wd)
    metrics = Metrics(args.num_classes)
    utils.count_param(model)

    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    emb_file = os.path.join(args.data, 'sst_embed.pth')
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:
        # load glove embeddings and vocab
        glove_vocab, glove_emb = load_word_vectors(
            os.path.join(args.glove, 'glove.840B.300d'))
        print('==> GLOVE vocabulary size: %d ' % glove_vocab.size())
        emb = torch.Tensor(vocab.size(), glove_emb.size(1)).normal_(-0.05, 0.05)
        # zero out the embeddings for padding and other special words if they are absent in vocab
        for idx, item in enumerate([
                Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD,
                Constants.EOS_WORD
        ]):
            emb[idx].zero_()
        for word in vocab.labelToIdx.keys():
            if glove_vocab.getIndex(word):
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(word)]
        torch.save(emb, emb_file)
        is_preprocessing_data = True  # flag to quit
        print('done creating emb, quit')

    if is_preprocessing_data:
        print('quit program due to memory leak during preprocess data, please rerun sentiment.py')
        quit()

    # plug these into embedding matrix inside model
    if args.cuda:
        emb = emb.cuda()
    model.childsumtreelstm.emb.state_dict()['weight'].copy_(emb)

    # create trainer object for training and testing
    trainer = SentimentTrainer(args, model, criterion, optimizer)
    for epoch in range(args.epochs):
        train_loss = trainer.train(train_dataset)
        # train_loss, train_pred = trainer.test(dev_dataset)
        dev_loss, dev_pred = trainer.test(dev_dataset)
        test_loss, test_pred = trainer.test(test_dataset)
        # TODO: torch.Tensor(dev_dataset.labels) turn label into tensor # done
        dev_acc = metrics.sentiment_accuracy_score(dev_pred,
                                                   dev_dataset.labels)
        test_acc = metrics.sentiment_accuracy_score(test_pred,
                                                    test_dataset.labels)
        print('==> Train loss : %f \t' % train_loss, end="")
        print('Epoch ', epoch, 'dev percentage ', dev_acc)
        print('Epoch ', epoch, 'test percentage ', test_acc)
def main(): global args args = parse_args(type=1) print(args.name) print(args.model_name) args.input_dim = 300 if args.mem_dim == 0: if args.model_name == 'dependency': args.mem_dim = 168 elif args.model_name == 'constituency': args.mem_dim = 150 elif args.model_name == 'lstm': args.mem_dim = 168 elif args.model_name == 'bilstm': args.mem_dim = 168 if args.num_classes == 0: if args.fine_grain: args.num_classes = 5 # 0 1 2 3 4 else: args.num_classes = 3 # 0 1 2 (1 neutral) elif args.num_classes == 2: # assert False # this will not work assert not args.fine_grain args.cuda = args.cuda and torch.cuda.is_available() # args.cuda = False print(args) # torch.manual_seed(args.seed) # if args.cuda: # torch.cuda.manual_seed(args.seed) train_dir = os.path.join(args.data, 'train/') dev_dir = os.path.join(args.data, 'dev/') test_dir = os.path.join(args.data, 'test/') # write unique words from all token files token_files = [ os.path.join(split, 'sents.toks') for split in [train_dir, dev_dir, test_dir] ] vocab_file = os.path.join(args.data, 'vocab-cased.txt') # use vocab-cased # build_vocab(token_files, vocab_file) NO, DO NOT BUILD VOCAB, USE OLD VOCAB # get vocab object from vocab file previously written vocab = Vocab(filename=vocab_file) print('==> SST vocabulary size : %d ' % vocab.size()) # Load SST dataset splits is_preprocessing_data = False # let program turn off after preprocess data # train train_file = os.path.join(args.data, 'sst_train.pth') if os.path.isfile(train_file): train_dataset = torch.load(train_file) else: train_dataset = SSTDataset(train_dir, vocab, args.num_classes, args.fine_grain, args.model_name) torch.save(train_dataset, train_file) is_preprocessing_data = True # dev dev_file = os.path.join(args.data, 'sst_dev.pth') if os.path.isfile(dev_file): dev_dataset = torch.load(dev_file) else: dev_dataset = SSTDataset(dev_dir, vocab, args.num_classes, args.fine_grain, args.model_name) torch.save(dev_dataset, dev_file) is_preprocessing_data = True # test test_file = os.path.join(args.data, 'sst_test.pth') if os.path.isfile(test_file): test_dataset = torch.load(test_file) else: test_dataset = SSTDataset(test_dir, vocab, args.num_classes, args.fine_grain, args.model_name) torch.save(test_dataset, test_file) is_preprocessing_data = True criterion = nn.NLLLoss() # initialize model, criterion/loss_function, optimizer model = DMNWraper(args.cuda, args.input_dim, args.mem_dim, criterion, args.train_subtrees, args.num_classes, args.embdrop) embedding_model = nn.Embedding(vocab.size(), args.input_dim) if args.cuda: embedding_model = embedding_model.cuda() if args.cuda: model.cuda(), criterion.cuda() # for words common to dataset vocab and GLOVE, use GLOVE vectors # for other words in dataset vocab, use random normal vectors if args.embedding == 'glove': emb_torch = 'sst_embed.pth' emb_vector = 'glove.840B.300d' emb_vector_path = os.path.join(args.glove, emb_vector) assert os.path.isfile(emb_vector_path + '.txt') elif args.embedding == 'paragram': emb_torch = 'sst_embed_paragram.pth' emb_vector = 'paragram_300_sl999' emb_vector_path = os.path.join(args.paragram, emb_vector) assert os.path.isfile(emb_vector_path + '.txt') elif args.embedding == 'paragram_xxl': emb_torch = 'sst_embed_paragram_xxl.pth' emb_vector = 'paragram-phrase-XXL' emb_vector_path = os.path.join(args.paragram, emb_vector) assert os.path.isfile(emb_vector_path + '.txt') else: assert False emb_file = os.path.join(args.data, emb_torch) if os.path.isfile(emb_file): emb = torch.load(emb_file) else: # load glove embeddings and vocab 
glove_vocab, glove_emb = load_word_vectors(emb_vector_path) print('==> Embedding vocabulary size: %d ' % glove_vocab.size()) emb = torch.zeros(vocab.size(), glove_emb.size(1)) for word in vocab.labelToIdx.keys(): if glove_vocab.getIndex(word): emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex( word)] else: emb[vocab.getIndex(word)] = torch.Tensor( emb[vocab.getIndex(word)].size()).normal_(-0.05, 0.05) torch.save(emb, emb_file) is_preprocessing_data = True # flag to quit print('done creating emb, quit') if is_preprocessing_data: print('quit program') quit() # plug these into embedding matrix inside model if args.cuda: emb = emb.cuda() embedding_model.state_dict()['weight'].copy_(emb) if args.optim == 'adam': optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd) elif args.optim == 'adagrad': # optimizer = optim.Adagrad(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd) optimizer = optim.Adagrad(model.parameters(), lr=args.lr, weight_decay=args.wd) elif args.optim == 'adam_combine': optimizer = optim.Adam([{ 'params': model.parameters(), 'lr': args.lr, 'weight_decay': args.wd }, { 'params': embedding_model.parameters(), 'lr': args.emblr, 'weight_decay': args.embwd }]) args.manually_emb = 0 elif args.optim == 'adagrad_combine': optimizer = optim.Adagrad([{ 'params': model.parameters(), 'lr': args.lr, 'weight_decay': args.wd }, { 'params': embedding_model.parameters(), 'lr': args.emblr, 'weight_decay': args.embwd }]) args.manually_emb = 0 elif args.optim == 'adam_combine_v2': model.embedding_model = embedding_model optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd) args.manually_emb = 0 metrics = Metrics(args.num_classes) utils.count_param(model) trainer = SentimentTrainer(args, model, embedding_model, criterion, optimizer) trainer.set_initial_emb(emb) question_idx = vocab.labelToIdx['sentiment'] question_idx = torch.Tensor([question_idx]) trainer.set_question(question_idx) # trainer = SentimentTrainer(args, model, embedding_model ,criterion, optimizer) mode = args.mode if mode == 'DEBUG': for epoch in range(args.epochs): # print a tree tree, sent, label = dev_dataset[3] utils.print_span(tree, sent, vocab) quit() dev_loss = trainer.train(dev_dataset) dev_loss, dev_pred, _ = trainer.test(dev_dataset) test_loss, test_pred, _ = trainer.test(test_dataset) dev_acc = metrics.sentiment_accuracy_score(dev_pred, dev_dataset.labels) test_acc = metrics.sentiment_accuracy_score( test_pred, test_dataset.labels) print('==> Dev loss : %f \t' % dev_loss, end="") print('Epoch ', epoch, 'dev percentage ', dev_acc) elif mode == "PRINT_TREE": for i in range(0, 10): ttree, tsent, tlabel = dev_dataset[i] utils.print_tree(ttree, 0) print('_______________') print('break') quit() elif mode == 'EVALUATE': filename = args.name + '.pth' epoch = args.epochs model_name = str(epoch) + '_model_' + filename embedding_name = str(epoch) + '_embedding_' + filename model = torch.load(os.path.join(args.saved, model_name)) embedding_model = torch.load(os.path.join(args.saved, embedding_name)) trainer = SentimentTrainer(args, model, embedding_model, criterion, optimizer) trainer.set_question(question_idx) test_loss, test_pred, subtree_metrics = trainer.test(test_dataset) test_acc = metrics.sentiment_accuracy_score( test_pred, test_dataset.labels, num_classes=args.num_classes) print('Epoch with max dev:' + str(epoch) + ' |test percentage ' + str(test_acc)) print('____________________' + str(args.name) + '___________________') print_list = 
subtree_metrics.print_list torch.save(print_list, os.path.join(args.saved, args.name + 'printlist.pth')) utils.print_trees_file(args, vocab, test_dataset, print_list, name='tree') elif mode == "EXPERIMENT": # dev_loss, dev_pred = trainer.test(dev_dataset) # dev_acc = metrics.sentiment_accuracy_score(dev_pred, dev_dataset.labels, num_classes=args.num_classes) max_dev = 0 max_dev_epoch = 0 filename = args.name + '.pth' for epoch in range(args.epochs): # train_loss, train_pred, _ = trainer.test(train_dataset) train_loss_while_training = trainer.train(train_dataset) train_loss, train_pred, _ = trainer.test(train_dataset) dev_loss, dev_pred, _ = trainer.test(dev_dataset) dev_acc = metrics.sentiment_accuracy_score( dev_pred, dev_dataset.labels, num_classes=args.num_classes) train_acc = metrics.sentiment_accuracy_score( train_pred, train_dataset.labels, num_classes=args.num_classes) print('==> Train loss : %f \t' % train_loss_while_training, end="") print('Epoch ', epoch, 'dev percentage ', dev_acc) print('Epoch %d dev percentage %f ' % (epoch, dev_acc)) print('Train acc %f ' % (train_acc)) if dev_acc > max_dev: print('update best dev acc %f ' % (dev_acc)) max_dev = dev_acc max_dev_epoch = epoch utils.mkdir_p(args.saved) torch.save( model, os.path.join(args.saved, str(epoch) + '_model_' + filename)) torch.save( embedding_model, os.path.join(args.saved, str(epoch) + '_embedding_' + filename)) gc.collect() print('epoch ' + str(max_dev_epoch) + ' dev score of ' + str(max_dev)) print('eva on test set ') model = torch.load( os.path.join(args.saved, str(max_dev_epoch) + '_model_' + filename)) embedding_model = torch.load( os.path.join(args.saved, str(max_dev_epoch) + '_embedding_' + filename)) trainer = SentimentTrainer(args, model, embedding_model, criterion, optimizer) trainer.set_question(question_idx) test_loss, test_pred, _ = trainer.test(test_dataset) test_acc = metrics.sentiment_accuracy_score( test_pred, test_dataset.labels, num_classes=args.num_classes) print('Epoch with max dev:' + str(max_dev_epoch) + ' |test percentage ' + str(test_acc)) print('____________________' + str(args.name) + '___________________') else: for epoch in range(args.epochs): train_loss = trainer.train(train_dataset) train_loss, train_pred, _ = trainer.test(train_dataset) dev_loss, dev_pred, _ = trainer.test(dev_dataset) test_loss, test_pred, _ = trainer.test(test_dataset) train_acc = metrics.sentiment_accuracy_score( train_pred, train_dataset.labels) dev_acc = metrics.sentiment_accuracy_score(dev_pred, dev_dataset.labels) test_acc = metrics.sentiment_accuracy_score( test_pred, test_dataset.labels) print('==> Train loss : %f \t' % train_loss, end="") print('Epoch ', epoch, 'train percentage ', train_acc) print('Epoch ', epoch, 'dev percentage ', dev_acc) print('Epoch ', epoch, 'test percentage ', test_acc)
def prepare_to_train(data=None, glove=None): args = parse_args() if data is not None: args.data = data if glove is not None: args.glove = glove args.input_dim, args.mem_dim = 300, 150 args.hidden_dim, args.num_classes = 50, 5 args.cuda = args.cuda and torch.cuda.is_available() if args.sparse and args.wd != 0: print('Sparsity and weight decay are incompatible, pick one!') exit() print(args) torch.manual_seed(args.seed) random.seed(args.seed) numpy.random.seed(args.seed) if args.cuda: torch.cuda.manual_seed(args.seed) torch.backends.cudnn.benchmark = True if not os.path.exists(args.save): os.makedirs(args.save) train_dir = os.path.join(args.data, 'train/') dev_dir = os.path.join(args.data, 'dev/') test_dir = os.path.join(args.data, 'test/') # write unique words from all token files sick_vocab_file = os.path.join(args.data, 'sick.vocab') if not os.path.isfile(sick_vocab_file): token_files_a = [ os.path.join(split, 'a.toks') for split in [train_dir, dev_dir, test_dir] ] token_files_b = [ os.path.join(split, 'b.toks') for split in [train_dir, dev_dir, test_dir] ] token_files = token_files_a + token_files_b sick_vocab_file = os.path.join(args.data, 'sick.vocab') build_vocab(token_files, sick_vocab_file) # get vocab object from vocab file previously written vocab = Vocab(filename=sick_vocab_file, data=[ Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD ]) print('==> SICK vocabulary size : %d ' % vocab.size()) # load SICK dataset splits train_file = os.path.join(args.data, 'sick_train.pth') if os.path.isfile(train_file): train_dataset = torch.load(train_file) else: train_dataset = SICKDataset(train_dir, vocab, args.num_classes) torch.save(train_dataset, train_file) print('==> Size of train data : %d ' % len(train_dataset)) dev_file = os.path.join(args.data, 'sick_dev.pth') if os.path.isfile(dev_file): dev_dataset = torch.load(dev_file) else: dev_dataset = SICKDataset(dev_dir, vocab, args.num_classes) torch.save(dev_dataset, dev_file) print('==> Size of dev data : %d ' % len(dev_dataset)) test_file = os.path.join(args.data, 'sick_test.pth') if os.path.isfile(test_file): test_dataset = torch.load(test_file) else: test_dataset = SICKDataset(test_dir, vocab, args.num_classes) torch.save(test_dataset, test_file) print('==> Size of test data : %d ' % len(test_dataset)) # initialize model, criterion/loss_function, optimizer model = SimilarityTreeLSTM(args.cuda, vocab.size(), args.input_dim, args.mem_dim, args.hidden_dim, args.num_classes, args.sparse) criterion = nn.KLDivLoss() if args.cuda: model.cuda(), criterion.cuda() if args.optim == 'adam': optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd) elif args.optim == 'adagrad': optimizer = optim.Adagrad(model.parameters(), lr=args.lr, weight_decay=args.wd) elif args.optim == 'sgd': optimizer = optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.wd) metrics = Metrics(args.num_classes) # for words common to dataset vocab and GLOVE, use GLOVE vectors # for other words in dataset vocab, use random normal vectors emb_file = os.path.join(args.data, 'sick_embed.pth') if os.path.isfile(emb_file): emb = torch.load(emb_file) else: # load glove embeddings and vocab glove_vocab, glove_emb = load_word_vectors( os.path.join(args.glove, 'glove.840B.300d')) print('==> GLOVE vocabulary size: %d ' % glove_vocab.size()) emb = torch.Tensor(vocab.size(), glove_emb.size(1)).normal_(-0.05, 0.05) # zero out the embeddings for padding and other special words if they are absent in vocab for idx, item in enumerate([ 
Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD ]): emb[idx].zero_() for word in vocab.labelToIdx.keys(): if glove_vocab.getIndex(word) is not None: emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex( word)] torch.save(emb, emb_file) # plug these into embedding matrix inside model if args.cuda: emb = emb.cuda() model.childsumtreelstm.emb.state_dict()['weight'].copy_(emb) # create trainer object for training and testing #trainer = Trainer(args, model, criterion, optimizer) best = -float('inf') return (args, best, train_dataset, dev_dataset, test_dataset, metrics, optimizer, criterion, model)
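Independent of the SICK-specific setup, the final copy into the embedding layer follows a small reusable pattern. The sketch below uses made-up sizes and a fresh nn.Embedding purely for illustration; note that Tensor.cuda() returns a copy, so the result has to be reassigned rather than called in place.

import torch
import torch.nn as nn

vocab_size, emb_dim = 1000, 300          # illustrative sizes
emb = torch.randn(vocab_size, emb_dim)   # stand-in for the GloVe-initialised matrix built above

layer = nn.Embedding(vocab_size, emb_dim)
if torch.cuda.is_available():
    emb = emb.cuda()                     # Tensor.cuda() is not in-place; reassign
    layer = layer.cuda()
layer.weight.data.copy_(emb)             # same effect as state_dict()['weight'].copy_(emb)
layer.weight.requires_grad = False       # freeze if the pretrained vectors should stay fixed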
def main(): global args args = parse_args() # global logger logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) formatter = logging.Formatter("[%(asctime)s] %(levelname)s:%(name)s:%(message)s") # file logger fh = logging.FileHandler(os.path.join(args.save, args.expname)+'.log', mode='w') fh.setLevel(logging.INFO) fh.setFormatter(formatter) logger.addHandler(fh) # console logger ch = logging.StreamHandler() ch.setLevel(logging.DEBUG) ch.setFormatter(formatter) logger.addHandler(ch) if not torch.cuda.is_available() and args.cuda: args.cuda = False logger.info("CUDA is unavailable, convert to cpu mode") if args.sparse and args.wd != 0: logger.error('Sparsity and weight decay are incompatible, pick one!') exit() logger.debug(args) torch.manual_seed(args.seed) random.seed(args.seed) if args.cuda: torch.cuda.manual_seed(args.seed) torch.backends.cudnn.benchmark = True if not os.path.exists(args.save): os.makedirs(args.save) # set directory train_dir = os.path.join(args.data, 'train/') dev_dir = os.path.join(args.data, 'dev/') test_dir = os.path.join(args.data, 'test/') # load vocabulary vocab_path = os.path.join(args.data, "vocab.npy") vocab = Vocab( filename=vocab_path, labels=[constants.PAD_WORD, constants.UNK_WORD, constants.BOS_WORD, constants.EOS_WORD] ) logger.debug('==> vocabulary size : %d ' % len(vocab)) # load train dataset train_file = os.path.join(train_dir, "ERdata.pt") if os.path.isfile(train_file): train_dataset = torch.load(train_file) else: train_dataset = ERDataset(train_dir, vocab, 2) torch.save(train_dataset, train_file) logger.debug('==> train data size: %d' % len(train_dataset)) # load dev dataset dev_file = os.path.join(dev_dir, "ERdata.pt") if os.path.isfile(dev_file): dev_dataset = torch.load(dev_file) else: dev_dataset = ERDataset(dev_dir, vocab, 2) torch.save(dev_dataset, dev_file) logger.debug('==> dev data size: %d' % len(dev_dataset)) # load test dataset test_file = os.path.join(test_dir, "ERdata.pt") if os.path.isfile(test_file): test_dataset = torch.load(test_file) else: test_dataset = ERDataset(test_dir, vocab, 2) torch.save(test_dataset, test_file) logger.debug('==> test data size: %d' % len(test_dataset)) # trainer: # tree model model = TreeModel( len(vocab), args.input_dim, args.mem_dim, 2, # 0-1 prediction args.sparse, args.freeze_embed ) # criterion criterion = nn.KLDivLoss() if args.cuda: model.cuda(), criterion.cuda() # optimizer if args.optim == 'adam': optimizer = optim.Adam( filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd ) elif args.optim == 'adagrad': optimizer = optim.Adagrad( filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd ) elif args.optim == 'sgd': optimizer = optim.SGD( filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd ) else: raise Exception("Unknown optimizer") # metrics metrics = Metrics(2) # 0-1 prediction # embeddings sent_emb_path = os.path.join(args.data, "sent_emb.pt") raw_sent_emb_path = os.path.join(args.glove, 'glove.840B.300d.txt') sent_emb = load_word_vectors(sent_emb_path, vocab, raw_sent_emb_path) logger.debug('==> sentence embedding size: %d * %d' % (sent_emb.size()[0], sent_emb.size()[1])) if args.cuda: sent_emb.cuda() model.sent_emb.weight.data.copy_(sent_emb) trainer = Trainer(args, model, criterion, optimizer) # train and test best = float("-inf") for epoch in range(args.epochs): train_loss = trainer.train(train_dataset) train_loss, train_pred = trainer.test(train_dataset) dev_loss, dev_pred = 
trainer.test(dev_dataset) test_loss, test_pred = trainer.test(test_dataset) train_pearson = metrics.pearson(train_pred, train_dataset.labels) train_mse = metrics.mse(train_pred, train_dataset.labels) logger.info('==> Epoch {}, Train \tLoss: {}\tPearson: {}\tMSE: {}'.format(epoch, train_loss, train_pearson, train_mse)) dev_pearson = metrics.pearson(dev_pred, dev_dataset.labels) dev_mse = metrics.mse(dev_pred, dev_dataset.labels) logger.info('==> Epoch {}, Dev \tLoss: {}\tPearson: {}\tMSE: {}'.format(epoch, dev_loss, dev_pearson, dev_mse)) test_pearson = metrics.pearson(test_pred, test_dataset.labels) test_mse = metrics.mse(test_pred, test_dataset.labels) logger.info('==> Epoch {}, Test \tLoss: {}\tPearson: {}\tMSE: {}'.format(epoch, test_loss, test_pearson, test_mse)) if best < dev_pearson: best = dev_pearson checkpoint = { 'model': trainer.model.state_dict(), 'optim': trainer.optimizer, 'pearson': dev_pearson, 'mse': dev_mse, 'args': args, 'epoch': epoch } logger.debug('==> New optimum found, checkpointing everything now...') torch.save(checkpoint, '%s.pt' % os.path.join(args.save, args.expname))
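The checkpoint dictionary above can be written and restored with a pair of small helpers. The sketch below is illustrative only: save_checkpoint/load_checkpoint and the use of optimizer.state_dict() are assumptions, not the original Trainer API.

import torch

def save_checkpoint(path, model, optimizer, epoch, dev_pearson, dev_mse, args):
    torch.save({
        'model': model.state_dict(),
        'optim': optimizer.state_dict(),  # state_dict() keeps the file smaller than pickling the optimizer object
        'pearson': dev_pearson,
        'mse': dev_mse,
        'args': args,
        'epoch': epoch,
    }, path)

def load_checkpoint(path, model):
    checkpoint = torch.load(path, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    return model, checkpoint['epoch'], checkpoint['pearson']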
def main(): global args args = parse_args() mkl.set_num_threads(1) args.cuda = args.cuda and torch.cuda.is_available() if args.sparse and args.wd != 0: print('Sparsity and weight decay are incompatible, pick one!') exit() print(args) torch.manual_seed(args.seed) if args.cuda: torch.cuda.manual_seed(args.seed) np.random.seed(args.seed) random.seed(args.seed) train_dir = os.path.join(args.data, 'train/') dev_dir = os.path.join(args.data, 'dev/') test_dir = os.path.join(args.data, 'test/') # write unique words from all token files token_files_a = [ os.path.join(split, 'a.toks') for split in [train_dir, dev_dir, test_dir] ] token_files_b = [ os.path.join(split, 'b.toks') for split in [train_dir, dev_dir, test_dir] ] token_files = token_files_a + token_files_b sick_vocab_file = os.path.join(args.data, 'sick.vocab') build_vocab(token_files, sick_vocab_file) # get vocab object from vocab file previously written vocab = Vocab(filename=sick_vocab_file, data=[ Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD ]) print('==> SICK vocabulary size : %d ' % vocab.size()) # load SICK dataset splits train_file = os.path.join(args.data, 'sick_train.pth') if os.path.isfile(train_file): train_dataset = torch.load(train_file) else: train_dataset = SICKDataset(train_dir, vocab, args.num_classes) torch.save(train_dataset, train_file) print('==> Size of train data : %d ' % len(train_dataset)) dev_file = os.path.join(args.data, 'sick_dev.pth') if os.path.isfile(dev_file): dev_dataset = torch.load(dev_file) else: dev_dataset = SICKDataset(dev_dir, vocab, args.num_classes) torch.save(dev_dataset, dev_file) print('==> Size of dev data : %d ' % len(dev_dataset)) test_file = os.path.join(args.data, 'sick_test.pth') if os.path.isfile(test_file): test_dataset = torch.load(test_file) else: test_dataset = SICKDataset(test_dir, vocab, args.num_classes) torch.save(test_dataset, test_file) print('==> Size of test data : %d ' % len(test_dataset)) # initialize model, criterion/loss_function, optimizer model = SimilarityTreeLSTM(args.encoder_type, args.cuda, vocab.size(), args.input_dim, args.mem_dim, args.hidden_dim, args.num_classes, args.sparse, args) criterion = nn.KLDivLoss() if args.cuda: model.cuda(), criterion.cuda() trainable_parameters = [ param for param in model.parameters() if param.requires_grad ] if args.optim == 'adam': optimizer = optim.Adam(trainable_parameters, lr=args.lr, weight_decay=args.wd) elif args.optim == 'adagrad': optimizer = optim.Adagrad(trainable_parameters, lr=args.lr, weight_decay=args.wd) elif args.optim == 'sgd': optimizer = optim.SGD(trainable_parameters, lr=args.lr, weight_decay=args.wd) metrics = Metrics(args.num_classes) # for words common to dataset vocab and GLOVE, use GLOVE vectors # for other words in dataset vocab, use random normal vectors emb_file = os.path.join(args.data, 'sick_embed.pth') if os.path.isfile(emb_file): emb = torch.load(emb_file) else: # load glove embeddings and vocab glove_vocab, glove_emb = load_word_vectors( os.path.join(args.glove, 'glove.840B.300d')) print('==> GLOVE vocabulary size: %d ' % glove_vocab.size()) emb = torch.Tensor(vocab.size(), glove_emb.size(1)).normal_(-0.05, 0.05) # zero out the embeddings for padding and other special words if they are absent in vocab for idx, item in enumerate([ Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD ]): # TODO '<s>', '</s>' these tokens present in glove w2v but probably with different meaning. 
# though they are not currently used emb[idx].zero_() for word in vocab.labelToIdx.keys(): if word in glove_vocab.labelToIdx.keys(): emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex( word)] torch.save(emb, emb_file) # plug these into embedding matrix inside model if args.cuda: emb = emb.cuda() model.encoder.emb.state_dict()['weight'].copy_(emb) # create trainer object for training and testing trainer = Trainer(args, model, criterion, optimizer) metric_functions = [metrics.pearson, metrics.mse] for epoch in range(args.epochs): train_loss = trainer.train(train_dataset) train_loss, train_pred = trainer.test(train_dataset) dev_loss, dev_pred = trainer.test(dev_dataset) test_loss, test_pred = trainer.test(test_dataset) pearson_stats, mse_stats = get_median_and_confidence_interval( train_pred, train_dataset.labels, metric_functions) print_results("Train", train_loss, pearson_stats, mse_stats) pearson_stats, mse_stats = get_median_and_confidence_interval( dev_pred, dev_dataset.labels, metric_functions) print_results("Dev", dev_loss, pearson_stats, mse_stats) pearson_stats, mse_stats = get_median_and_confidence_interval( test_pred, test_dataset.labels, metric_functions) print_results("Test", test_loss, pearson_stats, mse_stats)
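get_median_and_confidence_interval is not defined in this snippet; under the assumption that it bootstraps each metric over the predictions, it could look roughly like the following (bootstrap_stats is a hypothetical stand-in, not the original helper).

import numpy as np

def bootstrap_stats(pred, gold, metric_fn, n_resamples=1000, alpha=0.05, seed=0):
    rng = np.random.RandomState(seed)
    pred, gold = np.asarray(pred), np.asarray(gold)
    scores = []
    for _ in range(n_resamples):
        idx = rng.randint(0, len(pred), size=len(pred))   # resample with replacement
        scores.append(metric_fn(pred[idx], gold[idx]))
    scores = np.sort(np.asarray(scores))
    lo = scores[int((alpha / 2) * n_resamples)]
    hi = scores[int((1 - alpha / 2) * n_resamples) - 1]
    return np.median(scores), (lo, hi)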
def main(write_to): startTime = time.time() global args args = parse_args(type=1) args.input_dim = 300 if args.model_name == 'dependency': args.mem_dim = 168 elif args.model_name == 'constituency': args.mem_dim = 150 if args.fine_grain: args.num_classes = 5 # 0 1 2 3 4 else: args.num_classes = 3 # 0 1 2 (1 neutral) args.cuda = args.cuda and torch.cuda.is_available() # args.cuda = False print(args) # torch.manual_seed(args.seed) # if args.cuda: # torch.cuda.manual_seed(args.seed) # train_dir = os.path.join(args.data,'train/') train_dir = os.path.join( args.data, 'dev/') # Fei: wants to train on a smaller data set # dev_dir = os.path.join(args.data,'dev/') # test_dir = os.path.join(args.data,'test/') # write unique words from all token files token_files = [os.path.join(split, 'sents.toks') for split in [train_dir]] vocab_file = os.path.join(args.data, 'vocab-cased.txt') # use vocab-cased # build_vocab(token_files, vocab_file) NO, DO NOT BUILD VOCAB, USE OLD VOCAB # vocab_file = os.path.join(args.data, 'vocab-cased-dev.txt') # build_vocab(token_files, vocab_file) # get vocab object from vocab file previously written vocab = Vocab(filename=vocab_file) print('==> SST vocabulary size : %d ' % vocab.size()) # Load SST dataset splits is_preprocessing_data = False # let program turn off after preprocess data # train train_file = os.path.join(args.data, 'sst_train.pth') if os.path.isfile(train_file): train_dataset = torch.load(train_file) else: train_dataset = SSTDataset(train_dir, vocab, args.num_classes, args.fine_grain, args.model_name) torch.save(train_dataset, train_file) is_preprocessing_data = True # dev # dev_file = os.path.join(args.data,'sst_dev.pth') # if os.path.isfile(dev_file): # dev_dataset = torch.load(dev_file) # else: # dev_dataset = SSTDataset(dev_dir, vocab, args.num_classes, args.fine_grain, args.model_name) # torch.save(dev_dataset, dev_file) # is_preprocessing_data = True # test # test_file = os.path.join(args.data,'sst_test.pth') # if os.path.isfile(test_file): # test_dataset = torch.load(test_file) # else: # test_dataset = SSTDataset(test_dir, vocab, args.num_classes, args.fine_grain, args.model_name) # torch.save(test_dataset, test_file) # is_preprocessing_data = True criterion = nn.NLLLoss() # initialize model, criterion/loss_function, optimizer model = TreeLSTMSentiment(args.cuda, vocab.size(), args.input_dim, args.mem_dim, args.num_classes, args.model_name, criterion) embedding_model = nn.Embedding(vocab.size(), args.input_dim) # Fei: don't optimize embedding embedding_model.weight.requires_grad = False if args.cuda: embedding_model = embedding_model.cuda() if args.cuda: model.cuda(), criterion.cuda() if args.optim == 'adam': optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd) elif args.optim == 'adagrad': # optimizer = optim.Adagrad(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd) optimizer = optim.Adagrad( [{ 'params': filter(lambda p: p.requires_grad, model.parameters()), 'lr': args.lr } # Fei: filter non_trainable ], lr=args.lr, weight_decay=args.wd) metrics = Metrics(args.num_classes) utils.count_param(model) # for words common to dataset vocab and GLOVE, use GLOVE vectors # for other words in dataset vocab, use random normal vectors emb_file = os.path.join(args.data, 'sst_embed.pth') if os.path.isfile(emb_file): emb = torch.load(emb_file) else: # load glove embeddings and vocab glove_vocab, glove_emb = load_word_vectors( os.path.join(args.glove, 'glove.840B.300d')) 
print('==> GLOVE vocabulary size: %d ' % glove_vocab.size()) emb = torch.zeros(vocab.size(), glove_emb.size(1)) for word in vocab.labelToIdx.keys(): if glove_vocab.getIndex(word): emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex( word)] else: emb[vocab.getIndex(word)] = torch.Tensor( emb[vocab.getIndex(word)].size()).normal_(-0.05, 0.05) torch.save(emb, emb_file) is_preprocessing_data = True # flag to quit print('done creating emb, quit') if is_preprocessing_data: print('done preprocessing data, quit program to prevent memory leak') print('please run again') quit() # plug these into embedding matrix inside model if args.cuda: emb = emb.cuda() # model.childsumtreelstm.emb.state_dict()['weight'].copy_(emb) embedding_model.state_dict()['weight'].copy_(emb) # create trainer object for training and testing trainer = SentimentTrainer(args, model, embedding_model, criterion, optimizer) loopStart = time.time() #print('prepare time is %s ' % (loopStart - startTime)) loss_save = [] mode = 'EXPERIMENT' if mode == 'DEBUG': for epoch in range(args.epochs): dev_loss = trainer.train(dev_dataset) dev_loss, dev_pred = trainer.test(dev_dataset) test_loss, test_pred = trainer.test(test_dataset) dev_acc = metrics.sentiment_accuracy_score(dev_pred, dev_dataset.labels) test_acc = metrics.sentiment_accuracy_score( test_pred, test_dataset.labels) print('==> Dev loss : %f \t' % dev_loss, end="") print('Epoch ', epoch, 'dev percentage ', dev_acc) elif mode == "PRINT_TREE": for i in range(0, 10): ttree, tsent, tlabel = dev_dataset[i] utils.print_tree(ttree, 0) print('_______________') print('break') quit() elif mode == "EXPERIMENT": max_dev = 0 max_dev_epoch = 0 filename = args.name + '.pth' for epoch in range(args.epochs): train_loss = trainer.train(train_dataset) #dev_loss, dev_pred = trainer.test(dev_dataset) #dev_acc = metrics.sentiment_accuracy_score(dev_pred, dev_dataset.labels) print('==> Train loss : %f \t' % train_loss, end="") loss_save.append(train_loss) #print('Epoch ', epoch, 'dev percentage ', dev_acc) #torch.save(model, args.saved + str(epoch) + '_model_' + filename) #torch.save(embedding_model, args.saved + str(epoch) + '_embedding_' + filename) #if dev_acc > max_dev: # max_dev = dev_acc # max_dev_epoch = epoch #gc.collect() print("done") #print('epoch ' + str(max_dev_epoch) + ' dev score of ' + str(max_dev)) #print('eva on test set ') #model = torch.load(args.saved + str(max_dev_epoch) + '_model_' + filename) #embedding_model = torch.load(args.saved + str(max_dev_epoch) + '_embedding_' + filename) #trainer = SentimentTrainer(args, model, embedding_model, criterion, optimizer) #test_loss, test_pred = trainer.test(test_dataset) #test_acc = metrics.sentiment_accuracy_score(test_pred, test_dataset.labels) #print('Epoch with max dev:' + str(max_dev_epoch) + ' |test percentage ' + str(test_acc)) #print('____________________' + str(args.name) + '___________________') else: for epoch in range(args.epochs): train_loss = trainer.train(train_dataset) train_loss, train_pred = trainer.test(train_dataset) dev_loss, dev_pred = trainer.test(dev_dataset) test_loss, test_pred = trainer.test(test_dataset) train_acc = metrics.sentiment_accuracy_score( train_pred, train_dataset.labels) dev_acc = metrics.sentiment_accuracy_score(dev_pred, dev_dataset.labels) test_acc = metrics.sentiment_accuracy_score( test_pred, test_dataset.labels) print('==> Train loss : %f \t' % train_loss, end="") print('Epoch ', epoch, 'train percentage ', train_acc) print('Epoch ', epoch, 'dev percentage ', dev_acc) print('Epoch ', 
epoch, 'test percentage ', test_acc) loopEnd = time.time() print('looptime is %s ' % (loopEnd - loopStart)) prepareTime = loopStart - startTime loopTime = loopEnd - loopStart timePerEpoch = loopTime / args.epochs with open(write_to, "w") as f: f.write("unit: " + "1 epoch\n") for loss in loss_save: f.write(str(loss) + "\n") f.write("run time: " + str(prepareTime) + " " + str(timePerEpoch) + "\n")
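The benchmark file written above has a fixed layout: a 'unit:' header, one training loss per line, and a final 'run time:' line holding the prepare time and the time per epoch. A small reader for that layout (read_benchmark is a new helper, not part of the original script) could be:

def read_benchmark(path):
    with open(path) as f:
        lines = [line.strip() for line in f if line.strip()]
    unit = lines[0].split(':', 1)[1].strip()
    prepare_time, time_per_epoch = (float(x) for x in lines[-1].split()[-2:])
    losses = [float(x) for x in lines[1:-1]]
    return unit, losses, prepare_time, time_per_epoch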
def main(): global args args = parse_args() # global logger logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) formatter = logging.Formatter("[%(asctime)s] %(levelname)s:%(name)s:%(message)s") # file logger fh = logging.FileHandler(os.path.join(args.save, args.expname)+'.log', mode='w') fh.setLevel(logging.INFO) fh.setFormatter(formatter) logger.addHandler(fh) # console logger ch = logging.StreamHandler() ch.setLevel(logging.DEBUG) ch.setFormatter(formatter) logger.addHandler(ch) # argument validation args.cuda = args.cuda and torch.cuda.is_available() if args.sparse and args.wd != 0: logger.error('Sparsity and weight decay are incompatible, pick one!') exit() logger.debug(args) torch.manual_seed(args.seed) random.seed(args.seed) if args.cuda: torch.cuda.manual_seed(args.seed) torch.backends.cudnn.benchmark = True if not os.path.exists(args.save): os.makedirs(args.save) train_dir = os.path.join(args.data, 'train/') dev_dir = os.path.join(args.data, 'dev/') test_dir = os.path.join(args.data, 'test/') # write unique words from all token files sick_vocab_file = os.path.join(args.data, 'sick.vocab') if not os.path.isfile(sick_vocab_file): token_files_b = [os.path.join(split, 'b.toks') for split in [train_dir, dev_dir, test_dir]] token_files_a = [os.path.join(split, 'a.toks') for split in [train_dir, dev_dir, test_dir]] token_files = token_files_a + token_files_b sick_vocab_file = os.path.join(args.data, 'sick.vocab') build_vocab(token_files, sick_vocab_file) # get vocab object from vocab file previously written vocab = Vocab(filename=sick_vocab_file, data=[Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD]) logger.debug('==> SICK vocabulary size : %d ' % vocab.size()) # load SICK dataset splits train_file = os.path.join(args.data, 'sick_train.pth') if os.path.isfile(train_file): train_dataset = torch.load(train_file) else: train_dataset = SICKDataset(train_dir, vocab, args.num_classes) torch.save(train_dataset, train_file) logger.debug('==> Size of train data : %d ' % len(train_dataset)) dev_file = os.path.join(args.data, 'sick_dev.pth') if os.path.isfile(dev_file): dev_dataset = torch.load(dev_file) else: dev_dataset = SICKDataset(dev_dir, vocab, args.num_classes) torch.save(dev_dataset, dev_file) logger.debug('==> Size of dev data : %d ' % len(dev_dataset)) test_file = os.path.join(args.data, 'sick_test.pth') if os.path.isfile(test_file): test_dataset = torch.load(test_file) else: test_dataset = SICKDataset(test_dir, vocab, args.num_classes) torch.save(test_dataset, test_file) logger.debug('==> Size of test data : %d ' % len(test_dataset)) # initialize model, criterion/loss_function, optimizer model = SimilarityTreeLSTM( vocab.size(), args.input_dim, args.mem_dim, args.hidden_dim, args.num_classes, args.sparse, args.freeze_embed) criterion = nn.KLDivLoss() if args.cuda: model.cuda(), criterion.cuda() if args.optim == 'adam': optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd) elif args.optim == 'adagrad': optimizer = optim.Adagrad(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd) elif args.optim == 'sgd': optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd) metrics = Metrics(args.num_classes) # for words common to dataset vocab and GLOVE, use GLOVE vectors # for other words in dataset vocab, use random normal vectors emb_file = os.path.join(args.data, 'sick_embed.pth') if 
os.path.isfile(emb_file): emb = torch.load(emb_file) else: # load glove embeddings and vocab glove_vocab, glove_emb = load_word_vectors(os.path.join(args.glove, 'glove.840B.300d')) logger.debug('==> GLOVE vocabulary size: %d ' % glove_vocab.size()) emb = torch.Tensor(vocab.size(), glove_emb.size(1)).normal_(-0.05, 0.05) # zero out the embeddings for padding and other special words if they are absent in vocab for idx, item in enumerate([Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD]): emb[idx].zero_() for word in vocab.labelToIdx.keys(): if glove_vocab.getIndex(word): emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(word)] torch.save(emb, emb_file) # plug these into embedding matrix inside model if args.cuda: emb = emb.cuda() model.emb.weight.data.copy_(emb) # create trainer object for training and testing trainer = Trainer(args, model, criterion, optimizer) best = -float('inf') for epoch in range(args.epochs): train_loss = trainer.train(train_dataset) train_loss, train_pred = trainer.test(train_dataset) dev_loss, dev_pred = trainer.test(dev_dataset) test_loss, test_pred = trainer.test(test_dataset) train_pearson = metrics.pearson(train_pred, train_dataset.labels) train_mse = metrics.mse(train_pred, train_dataset.labels) logger.info('==> Epoch {}, Train \tLoss: {}\tPearson: {}\tMSE: {}'.format(epoch, train_loss, train_pearson, train_mse)) dev_pearson = metrics.pearson(dev_pred, dev_dataset.labels) dev_mse = metrics.mse(dev_pred, dev_dataset.labels) logger.info('==> Epoch {}, Dev \tLoss: {}\tPearson: {}\tMSE: {}'.format(epoch, dev_loss, dev_pearson, dev_mse)) test_pearson = metrics.pearson(test_pred, test_dataset.labels) test_mse = metrics.mse(test_pred, test_dataset.labels) logger.info('==> Epoch {}, Test \tLoss: {}\tPearson: {}\tMSE: {}'.format(epoch, test_loss, test_pearson, test_mse)) if best < test_pearson: best = test_pearson checkpoint = { 'model': trainer.model.state_dict(), 'optim': trainer.optimizer, 'pearson': test_pearson, 'mse': test_mse, 'args': args, 'epoch': epoch } logger.debug('==> New optimum found, checkpointing everything now...') torch.save(checkpoint, '%s.pt' % os.path.join(args.save, args.expname))
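The Metrics class used for Pearson and MSE is not shown here; over two 1-D prediction/label tensors the two scores can be computed as in this sketch (an assumption, not the original implementation).

import torch

def pearson(pred, gold):
    pred, gold = pred.float(), gold.float()
    pred_c = pred - pred.mean()
    gold_c = gold - gold.mean()
    return (pred_c * gold_c).sum() / (pred_c.norm(2) * gold_c.norm(2) + 1e-12)

def mse(pred, gold):
    return torch.mean((pred.float() - gold.float()) ** 2)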
print('test score %.2f' % (cum_score / cum_samples)) if __name__ == '__main__': args = docopt(__doc__) textual_data_path = args['--textual-data-path'] visual_data_path = args['--visual-data-path'] batch_size = int(args['--batch-size']) delta = int(args['--delta']) K = int(args['--K']) threshold = float(args['--threshold']) word_embed_size=50 words, word_vectors = load_word_vectors('glove.6B.{}d.txt'.format(word_embed_size)) vocab = Vocab(words) if args['tacos']: dataset = TACoS(textual_data_path=textual_data_path, visual_data_path=visual_data_path, K=K, delta=delta, threshold=threshold) elif args['acnet']: dataset = ActivityNet(textual_data_path=textual_data_path, visual_data_path=visual_data_path, K=K, delta=delta, threshold=threshold) device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') print('use device: %s' % device, file=sys.stderr) print('loading the model from %s ...' % args['--model-path']) model = TGN.load(args['--model-path']) model.to(device)
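load_word_vectors for the plain-text glove.6B files is also not shown; assuming one "word v1 v2 ..." entry per line, a minimal loader could be written as follows (load_glove_txt is an illustrative name, not the project's helper).

import numpy as np

def load_glove_txt(path):
    words, vectors = [], []
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            words.append(parts[0])
            vectors.append(np.asarray(parts[1:], dtype=np.float32))
    return words, np.stack(vectors)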
# + ###For changing embeddings # - # for words common to dataset vocab and GLOVE, use GLOVE vectors # for other words in dataset vocab, use random normal vectors emb_file = os.path.join(args.data, 'sick_embed.pth') if os.path.isfile(emb_file): emb = torch.load(emb_file) else: # load glove embeddings and vocab print("embedding") glove_vocab, glove_emb = utils.load_word_vectors( os.path.join(args.glove, 'glove.840B.300d')) #glove.840B.300d logger.debug('==> GLOVE vocabulary size: %d ' % glove_vocab.size()) emb = torch.zeros(vocab.size(), glove_emb.size(1), dtype=torch.float, device=device) emb.normal_(0, 0.05) # zero out the embeddings for padding and other special words if they are absent in vocab for idx, item in enumerate([Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD]): emb[idx].zero_() for word in vocab.labelToIdx.keys(): if glove_vocab.getIndex(word): emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(word)] torch.save(emb, emb_file) # plug these into embedding matrix inside model model.emb.weight.data.copy_(emb) # +
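When rebuilding the embedding matrix like this, it can be useful to know how many dataset words actually received a GloVe vector. The coverage check below is new code that reuses the same vocab/glove_vocab interfaces and assumes getIndex returns None for unknown words.

def glove_coverage(vocab, glove_vocab):
    hits = [w for w in vocab.labelToIdx.keys() if glove_vocab.getIndex(w) is not None]
    misses = [w for w in vocab.labelToIdx.keys() if glove_vocab.getIndex(w) is None]
    print('%d / %d words found in GloVe (%d missing, left as random/zero rows)'
          % (len(hits), len(vocab.labelToIdx), len(misses)))
    return hits, misses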
def train(): """ Build and Train model by given params """ # params # assigned after loading data max_seq_length = None exp_name = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") keep_prob = 0.5 n_hidden = 64 num_classes = 5 learning_rate = 1e-3 model_save_path = os.path.join(MODELS_BASE_DIR, exp_name + '.cpkt') train_iterations = 100000 eval_iterations = None batch_size = 24 word_vector_dim = 300 # ************** Pre-Model ************** # Load data data_params = data_loader.get_data_params(DATA_BASE_DIR) max_seq_length = data_params["max_seq_length"] X_train, X_eval, y_train, y_eval = data_loader.load_data( data_params, one_hot_labels=USE_ONE_HOT_LABELS) print("==> Loaded data") eval_iterations = math.ceil(float(X_eval.shape[0]) / batch_size) # Load GloVe embbeding vectors word_vectors = load_word_vectors(WORD_VECTORS_PATH) # Batch generators train_batch_generator = batch_generator_uniform_prob( (X_train, y_train), batch_size, num_classes) eval_batch_generator = batch_generator_uniform_prob( (X_eval, y_eval), batch_size, num_classes) # ************** Model ************** # placeholders labels = tf.placeholder(tf.float32, [None, num_classes]) input_data = tf.placeholder(tf.int32, [None, max_seq_length]) input_data_lengths = tf.placeholder(tf.int32, batch_size) # data processing data = tf.Variable(tf.zeros([batch_size, max_seq_length, word_vector_dim]), dtype=tf.float32) data = tf.nn.embedding_lookup(word_vectors, input_data) # lstm cell lstm_cell = tf.nn.rnn_cell.LSTMCell(n_hidden) if USE_DROPOUT: lstm_cell = tf.nn.rnn_cell.DropoutWrapper(cell=lstm_cell, output_keep_prob=keep_prob) # Do we need the state tuple? Because we don't want the cell to be # initialized with the state from previous sentence ## rnn_tuple_state = tf.nn.rnn_cell.LSTMStateTuple(init_state[0], init_state[1]) if DYN_RNN_COPY_THROUGH_STATE: outputs, _ = tf.nn.dynamic_rnn(lstm_cell, data, dtype=tf.float32, sequence_length=input_data_lengths) else: outputs, _ = tf.nn.dynamic_rnn(lstm_cell, data, dtype=tf.float32) # output layer weight = tf.Variable(tf.truncated_normal([n_hidden, num_classes])) bias = tf.Variable(tf.constant(0.1, shape=[num_classes])) # Let's try this logic outputs = tf.transpose( outputs, [1, 0, 2]) # max_seq_length, batch_size, word_vector_dim last = tf.gather(outputs, int(outputs.get_shape()[0]) - 1) prediction = (tf.matmul(last, weight) + bias) correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(labels, 1)) accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) # Metrics # Should we reduce_mean? 
loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels)) optimizer = tf.train.AdamOptimizer( learning_rate=learning_rate).minimize(loss) # Summaries tf.summary.scalar('Loss', loss) tf.summary.scalar('Accuracy', accuracy) merged = tf.summary.merge_all() logdir = os.path.join(LOGS_BASE_DIR, exp_name, "") # ************** Train ************** print("Run 'tensorboard --logdir={}' to checkout tensorboard logs.".format( os.path.abspath(logdir))) print("==> training") best_accuracy = -1 # Train with tf.Session() as sess: train_writer = tf.summary.FileWriter(os.path.join(logdir, "train")) eval_writer = tf.summary.FileWriter(os.path.join(logdir, "evaluation")) saver = tf.train.Saver() sess.run(tf.global_variables_initializer()) # Py2.7 or Py3 (if 2.7 --> Change to xrange) for iteration in tqdm.tqdm(range(train_iterations)): # shoudn't get exception, but check this # pass also X, y = next(train_batch_generator) X_lengths = get_lengths(X, PADD_VAL) if DEBUG: print("X.shape = {}, X_lengths.shape = {}".format( X.shape, X_lengths.shape)) print("y.shape = {}".format(y.shape)) print("type(X) = {}, type(X_lengths) = {}".format( X.dtype, X_lengths.dtype)) idx = 3 print("X[:{0}], X_length[:{0}]".format(idx)) print(X[:idx]) print(X_lengths[:idx]) sess.run([optimizer], feed_dict={ input_data: X, labels: y, input_data_lengths: X_lengths }) # Write summary if (iteration % 30 == 0): _summary, = sess.run([merged], feed_dict={ input_data: X, labels: y, input_data_lengths: X_lengths }) train_writer.add_summary(_summary, iteration) # evaluate the network every 1,000 iterations if (iteration % 1000 == 0 and iteration != 0): total_accuracy = 0 for eval_iteration in tqdm.tqdm(range(eval_iterations)): X, y = next(eval_batch_generator) X_lengths = get_lengths(X, PADD_VAL) _accuracy, _summary = sess.run([accuracy, merged], feed_dict={ input_data: X, labels: y, input_data_lengths: X_lengths }) total_accuracy += _accuracy average_accuracy = total_accuracy / eval_iterations print("accuracy = {}".format(average_accuracy)) if average_accuracy > best_accuracy: print("Best model!") save_path = saver.save(sess, model_save_path, global_step=iteration) print("saved to %s" % save_path) best_accuracy = average_accuracy eval_writer.close() train_writer.close()
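The output layer above always reads the final time step, which for padded sequences is the output at a pad position. Since the true lengths are already fed in, an alternative is to gather each example's last valid output; the TF1-style sketch below (last_relevant_output is a new helper, applied to outputs before the transpose) shows one way to do that.

import tensorflow as tf

def last_relevant_output(outputs, lengths):
    # outputs: [batch, max_seq_length, n_hidden]; lengths: [batch] (int32)
    batch_range = tf.range(tf.shape(outputs)[0])
    indices = tf.stack([batch_range, lengths - 1], axis=1)  # (row, last valid step) per example
    return tf.gather_nd(outputs, indices)                   # [batch, n_hidden]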
def main(): global args args = parse_args(type=10) args.input_dim = 300 args.hidden_dim = 50 # args.input_dim, args.mem_dim = 300, 150 # args.hidden_dim, args.num_classes = 50, 5 if args.model_name == 'dependency': args.mem_dim = 150 elif args.model_name == 'constituency': args.mem_dim = 142 args.num_classes = 5 args.cuda = args.cuda and torch.cuda.is_available() print(args) # torch.manual_seed(args.seed) # if args.cuda: # torch.cuda.manual_seed(args.seed) train_dir = os.path.join(args.data,'train/') dev_dir = os.path.join(args.data,'dev/') test_dir = os.path.join(args.data,'test/') # write unique words from all token files token_files_a = [os.path.join(split,'a.toks') for split in [train_dir,dev_dir,test_dir]] token_files_b = [os.path.join(split,'b.toks') for split in [train_dir,dev_dir,test_dir]] token_files = token_files_a+token_files_b sick_vocab_file = os.path.join(args.data,'vocab-cased.txt') # build_vocab(token_files, sick_vocab_file) # get vocab object from vocab file previously written vocab = Vocab(filename=sick_vocab_file, data=[Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD]) print('==> SICK vocabulary size : %d ' % vocab.size()) # load SICK dataset splits train_file = os.path.join(args.data,'sick_train.pth') if os.path.isfile(train_file): train_dataset = torch.load(train_file) else: train_dataset = SICKDataset(train_dir, vocab, args.num_classes) torch.save(train_dataset, train_file) print('==> Size of train data : %d ' % len(train_dataset)) dev_file = os.path.join(args.data,'sick_dev.pth') if os.path.isfile(dev_file): dev_dataset = torch.load(dev_file) else: dev_dataset = SICKDataset(dev_dir, vocab, args.num_classes) torch.save(dev_dataset, dev_file) print('==> Size of dev data : %d ' % len(dev_dataset)) test_file = os.path.join(args.data,'sick_test.pth') if os.path.isfile(test_file): test_dataset = torch.load(test_file) else: test_dataset = SICKDataset(test_dir, vocab, args.num_classes) torch.save(test_dataset, test_file) print('==> Size of test data : %d ' % len(test_dataset)) # initialize model, criterion/loss_function, optimizer model = SimilarityTreeLSTM( args.cuda, vocab.size(), args.input_dim, args.mem_dim, args.hidden_dim, args.num_classes ) embedding_model = nn.Embedding(vocab.size(), args.input_dim) if args.cuda: embedding_model = embedding_model.cuda() criterion = nn.KLDivLoss() if args.cuda: model.cuda(), criterion.cuda() if args.optim=='adam': optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd) elif args.optim=='adagrad': optimizer = optim.Adagrad(model.parameters(), lr=args.lr, weight_decay=args.wd) metrics = Metrics(args.num_classes) # for words common to dataset vocab and GLOVE, use GLOVE vectors # for other words in dataset vocab, use random normal vectors emb_file = os.path.join(args.data, 'sick_embed.pth') if os.path.isfile(emb_file): emb = torch.load(emb_file) else: # load glove embeddings and vocab glove_vocab, glove_emb = load_word_vectors(os.path.join(args.glove,'glove.840B.300d')) print('==> GLOVE vocabulary size: %d ' % glove_vocab.size()) emb = torch.Tensor(vocab.size(),glove_emb.size(1)).normal_(-0.05,0.05) # zero out the embeddings for padding and other special words if they are absent in vocab for idx, item in enumerate([Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD]): emb[idx].zero_() for word in vocab.labelToIdx.keys(): if glove_vocab.getIndex(word): emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(word)] torch.save(emb, emb_file) # plug these into 
embedding matrix inside model if args.cuda: emb = emb.cuda() embedding_model.state_dict()['weight'].copy_(emb) # model.childsumtreelstm.emb.state_dict()['weight'].copy_(emb) # create trainer object for training and testing trainer = SimilarityTrainer(args, model, embedding_model, criterion, optimizer) for epoch in range(args.epochs): train_loss = trainer.train(train_dataset) train_loss, train_pred = trainer.test(train_dataset) dev_loss, dev_pred = trainer.test(dev_dataset) test_loss, test_pred = trainer.test(test_dataset) print('==> Train loss : %f \t' % train_loss, end="") print('Train Pearson : %f \t' % metrics.pearson(train_pred,train_dataset.labels), end="") print('Train MSE : %f \t' % metrics.mse(train_pred,train_dataset.labels), end="\n") print('==> Dev loss : %f \t' % dev_loss, end="") print('Dev Pearson : %f \t' % metrics.pearson(dev_pred,dev_dataset.labels), end="") print('Dev MSE : %f \t' % metrics.mse(dev_pred,dev_dataset.labels), end="\n") print('==> Test loss : %f \t' % test_loss, end="") print('Test Pearson : %f \t' % metrics.pearson(test_pred,test_dataset.labels), end="") print('Test MSE : %f \t' % metrics.mse(test_pred,test_dataset.labels), end="\n")
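nn.KLDivLoss expects a target distribution rather than the raw relatedness score; one common construction maps a score y in [1, num_classes] to a sparse two-class distribution whose expectation equals y. The helper below is hypothetical (the dataset code that performs this mapping is not shown here).

import math
import torch

def map_score_to_target(score, num_classes=5):
    target = torch.zeros(1, num_classes)
    floor, ceil = int(math.floor(score)), int(math.ceil(score))
    if floor == ceil:
        target[0, floor - 1] = 1.0
    else:
        target[0, floor - 1] = ceil - score    # weight on the lower class
        target[0, ceil - 1] = score - floor    # weight on the upper class
    return target                              # expectation over classes 1..num_classes equals score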