def run_clf(ebd_type, clf_type, index=None, result_suffix=''):
    """Fit a classifier on one embedding type and save its test predictions."""
    start = time.time()
    print('start {} with model {}:'.format(ebd_type, clf_type))
    data, label = load_embedding(ebd_type, False, index)
    test_data, test_label = load_embedding(ebd_type, True)
    print('data loaded {}'.format(
        time.strftime('%Hh %Mm %Ss', time.gmtime(time.time() - start))))
    if clf_type == knn:
        clf = KNeighborsClassifier()
    elif clf_type == svm:
        clf = SVC()
    elif clf_type == random_forest:
        clf = RandomForestClassifier()
    else:
        # Fail fast instead of falling through to clf.fit with clf unbound.
        raise ValueError('undefined clf: {}'.format(clf_type))
    clf.fit(data, label)
    print('model fitted {}'.format(
        time.strftime('%Hh %Mm %Ss', time.gmtime(time.time() - start))))
    result = clf.predict(test_data)
    print('model predicted {}'.format(
        time.strftime('%Hh %Mm %Ss', time.gmtime(time.time() - start))))
    with open(
            ebd_type + '_embedding/' + clf_type + '_result' + result_suffix + '.txt',
            'w') as f:
        f.write(str(list(result)))
    print('end {} with model {}:'.format(ebd_type, clf_type))
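
# A minimal driver sketch, assuming the module-level name constants
# (fasttext, knn, svm, random_forest, small, ...) and the *_embedding/
# output directories referenced elsewhere in this file already exist;
# load_index(small) mirrors the commented-out loop in __main__ below.
run_clf(fasttext, svm)                               # train/test on the full split
run_clf(fasttext, knn, load_index(small), '_small')  # on the 'small' index subset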
def preload_wordvec_embed(dir_location):
    """Load the filtered word-vector embedding stored under dir_location."""
    start = time.time()
    import dataloader
    embs = dataloader.load_embedding(
        os.path.join(dir_location, "embedding_filtered"))
    print("took {} seconds".format(time.time() - start))
    print("preloaded embeddings from amazon dataset.")
    print("")
    return embs
def preload_embed():
    """Load the filtered embedding from a fixed path on the local machine."""
    start = time.time()
    import dataloader
    embs = dataloader.load_embedding(
        "/home/jessedd/data/amazon_categories/original_mix/embedding_filtered")
    print("took {} seconds".format(time.time() - start))
    print("preloaded embeddings from amazon dataset.")
    print("")
    return embs
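
# A small usage sketch. The directory is a placeholder; judging from how
# embs[0] is enumerated in train() further down, load_embedding is assumed
# to return a (words, vectors) pair.
embs = preload_wordvec_embed("/path/to/amazon_categories/original_mix")
words, vectors = embs
print(len(words), len(vectors))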
def __init__(self, embedding, hidden_size=150, depth=1, dropout=0.3,
             cnn=False, nclasses=2):
    super(Model, self).__init__()
    self.cnn = cnn
    self.drop = nn.Dropout(dropout)
    self.emb_layer = modules.EmbeddingLayer(
        embs=dataloader.load_embedding(embedding))
    self.word2id = self.emb_layer.word2id
    if cnn:
        self.encoder = modules.CNN_Text(
            self.emb_layer.n_d, widths=[3, 4, 5], filters=hidden_size)
        d_out = 3 * hidden_size
    else:
        self.encoder = nn.LSTM(
            self.emb_layer.n_d,
            hidden_size // 2,
            depth,
            dropout=dropout,
            # batch_first=True,
            bidirectional=True)
        d_out = hidden_size
    # else:
    #     self.encoder = SRU(
    #         emb_layer.n_d,
    #         args.d,
    #         args.depth,
    #         dropout=args.dropout,
    #     )
    #     d_out = args.d
    self.out = nn.Linear(d_out, nclasses)
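
# A minimal construction sketch; the embedding path is a placeholder and the
# keyword values are illustrative, not taken from a confirmed training script.
model = Model("path/to/embedding_filtered", hidden_size=150, cnn=True, nclasses=5)
print(model.out)  # Linear(in_features=450, out_features=5): d_out = 3 * hidden_size for CNN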
def main(args):
    if args.dataset == 'mr':
        data, label = dataloader.read_MR(args.path)
    elif args.dataset == 'subj':
        data, label = dataloader.read_SUBJ(args.path)
    elif args.dataset == 'cr':
        data, label = dataloader.read_CR(args.path)
    elif args.dataset == 'mpqa':
        data, label = dataloader.read_MPQA(args.path)
    elif args.dataset == 'trec':
        train_x, train_y, test_x, test_y = dataloader.read_TREC(args.path)
        data = train_x + test_x
        label = None
    elif args.dataset == 'sst':
        train_x, train_y, valid_x, valid_y, test_x, test_y = \
            dataloader.read_SST(args.path)
        data = train_x + valid_x + test_x
        label = None
    else:
        raise Exception("unknown dataset: {}".format(args.dataset))

    emb_layer = modules.EmbeddingLayer(
        args.d, data,
        embs=dataloader.load_embedding(args.embedding)
    )

    if args.dataset == 'trec':
        train_x, train_y, valid_x, valid_y = dataloader.cv_split2(
            train_x, train_y,
            nfold=10,
            valid_id=args.cv
        )
    elif args.dataset != 'sst':
        train_x, train_y, valid_x, valid_y, test_x, test_y = dataloader.cv_split(
            data, label,
            nfold=10,
            test_id=args.cv
        )
    nclasses = max(train_y) + 1

    train_x, train_y = dataloader.create_batches(
        train_x, train_y,
        args.batch_size,
        emb_layer.word2id,
        sort=(args.dataset == 'sst')
    )
    valid_x, valid_y = dataloader.create_batches(
        valid_x, valid_y,
        args.batch_size,
        emb_layer.word2id,
        sort=(args.dataset == 'sst')
    )
    test_x, test_y = dataloader.create_batches(
        test_x, test_y,
        args.batch_size,
        emb_layer.word2id,
        sort=(args.dataset == 'sst')
    )

    model = Model(args, emb_layer, nclasses).cuda()

    need_grad = lambda x: x.requires_grad
    optimizer = optim.Adam(
        filter(need_grad, model.parameters()),
        lr=args.lr
    )

    best_valid = 1e+8
    test_err = 1e+8
    for epoch in range(args.max_epoch):
        best_valid, test_err = train_model(
            epoch, model, optimizer,
            train_x, train_y,
            valid_x, valid_y,
            test_x, test_y,
            best_valid, test_err
        )
        if args.lr_decay > 0:
            optimizer.param_groups[0]['lr'] *= args.lr_decay

    sys.stdout.write("best_valid: {:.6f}\n".format(best_valid))
    sys.stdout.write("test_err: {:.6f}\n".format(test_err))
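
# A hypothetical entry point. main() reads exactly the attributes below from
# args (Model may additionally read args.cnn / args.dropout, as the experiment
# driver at the end of this file suggests); the flag names and defaults here
# are assumptions, not the original CLI.
if __name__ == '__main__':
    import argparse
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--dataset', default='mr',
                           choices=['mr', 'subj', 'cr', 'mpqa', 'trec', 'sst'])
    argparser.add_argument('--path', required=True, help='dataset directory')
    argparser.add_argument('--embedding', required=True, help='word vectors file')
    argparser.add_argument('--d', type=int, default=300, help='embedding dimension')
    argparser.add_argument('--cv', type=int, default=0, help='cross-validation fold id')
    argparser.add_argument('--cnn', action='store_true', help='use the CNN encoder')
    argparser.add_argument('--dropout', type=float, default=0.5)
    argparser.add_argument('--batch_size', type=int, default=32)
    argparser.add_argument('--lr', type=float, default=0.001)
    argparser.add_argument('--lr_decay', type=float, default=0)
    argparser.add_argument('--max_epoch', type=int, default=100)
    main(argparser.parse_args())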
logger.info(f"COMMAND: {cmd_msg}") write_to_file(f"{Meta.log_path}/cmd.txt", cmd_msg) logger.info(f"Config: {Meta.config}") write_to_file(f"{Meta.log_path}/config.txt", Meta.config) datasets = {} data = [] for task_name in args.task: dataset, task_data = load_data(args.data_dir, task_name, args.cv) datasets[task_name] = dataset data += task_data emb_layer = EmbeddingLayer( args.dim, data, embs=load_embedding(args.embedding), fix_emb=args.fix_emb ) dataloaders = [] for task_name in args.task: dataloaders += create_dataloaders( task_name, datasets[task_name], args.batch_size, emb_layer.word2id ) tasks = { task_name: create_task( task_name, args, datasets[task_name]["nclasses"], emb_layer ) for task_name in args.task }
write_to_file(f"{Meta.log_path}/cmd.txt", cmd_msg) logger.info(f"Config: {Meta.config}") write_to_file(f"{Meta.log_path}/config.txt", Meta.config) datasets = {} data = [] for task_name in args.task: dataset, task_data = load_data(args.data_dir, task_name, args.cv) datasets[task_name] = dataset data += task_data emb_layer = EmbeddingLayer(args.dim, data, embs=load_embedding(args.embedding), fix_emb=args.fix_emb) dataloaders = [] for task_name in args.task: dataloaders += create_dataloaders(task_name, datasets[task_name], args.batch_size, emb_layer.word2id) tasks = { task_name: create_task(task_name, args, datasets[task_name]["nclasses"], emb_layer) for task_name in args.task } model = EmmentalModel(name="TC_task")
def train():
    cmd = argparse.ArgumentParser(sys.argv[0], conflict_handler='resolve')
    cmd.add_argument('--seed', default=1, type=int, help='The random seed.')
    cmd.add_argument('--gpu', default=-1, type=int, help='Use id of gpu, -1 if cpu.')
    cmd.add_argument('--train_path', required=True, help='The path to the training file.')
    cmd.add_argument('--valid_path', help='The path to the development file.')
    cmd.add_argument('--test_path', help='The path to the testing file.')
    cmd.add_argument('--config_path', required=True, help='The path to the config file.')
    cmd.add_argument("--word_embedding", help="The path to word vectors.")
    cmd.add_argument('--optimizer', default='sgd', choices=['sgd', 'adam', 'adagrad'],
                     help='the type of optimizer: valid options=[sgd, adam, adagrad]')
    cmd.add_argument("--lr", type=float, default=0.01, help='the learning rate.')
    cmd.add_argument("--lr_decay", type=float, default=0, help='the learning rate decay.')
    cmd.add_argument("--model", required=True, help="path to save model")
    cmd.add_argument("--batch_size", "--batch", type=int, default=32, help='the batch size.')
    cmd.add_argument("--max_epoch", type=int, default=100, help='the maximum number of iterations.')
    cmd.add_argument("--clip_grad", type=float, default=5, help='the threshold for gradient clipping.')
    cmd.add_argument('--max_sent_len', type=int, default=20, help='maximum sentence length.')
    cmd.add_argument('--min_count', type=int, default=5, help='minimum word count.')
    cmd.add_argument('--max_vocab_size', type=int, default=150000, help='maximum vocabulary size.')
    cmd.add_argument('--save_classify_layer', default=False, action='store_true',
                     help="whether to save the classify layer")
    cmd.add_argument('--valid_size', type=int, default=0,
                     help="size of validation dataset when there's no valid.")
    cmd.add_argument('--eval_steps', required=False, type=int, help='report every xx batches.')

    opt = cmd.parse_args(sys.argv[2:])

    with open(opt.config_path, 'r') as fin:
        config = json.load(fin)

    # Dump configurations.
    print(opt)
    print(config)

    # Set the seed.
    torch.manual_seed(opt.seed)
    random.seed(opt.seed)
    if opt.gpu >= 0:
        torch.cuda.set_device(opt.gpu)
        if opt.seed > 0:
            torch.cuda.manual_seed(opt.seed)
    use_cuda = opt.gpu >= 0 and torch.cuda.is_available()

    token_embedder_name = config['token_embedder']['name'].lower()
    token_embedder_max_chars = config['token_embedder'].get('max_characters_per_token', None)

    # Read the training, validation and test corpora.
    if token_embedder_name == 'cnn':
        train_data = read_corpus(opt.train_path, token_embedder_max_chars, opt.max_sent_len)
    elif token_embedder_name == 'lstm':
        train_data = read_corpus(opt.train_path, opt.max_sent_len)
    else:
        raise ValueError('Unknown token embedder name: {}'.format(token_embedder_name))
    logging.info('training instance: {}, training tokens: {}.'.format(
        len(train_data), sum([len(s) - 1 for s in train_data])))

    if opt.valid_path is not None:
        if token_embedder_name == 'cnn':
            valid_data = read_corpus(opt.valid_path, token_embedder_max_chars, opt.max_sent_len)
        elif token_embedder_name == 'lstm':
            valid_data = read_corpus(opt.valid_path, opt.max_sent_len)
        else:
            raise ValueError('Unknown token embedder name: {}'.format(token_embedder_name))
        logging.info('valid instance: {}, valid tokens: {}.'.format(
            len(valid_data), sum([len(s) - 1 for s in valid_data])))
    elif opt.valid_size > 0:
        train_data, valid_data = divide(train_data, opt.valid_size)
        logging.info('training instance: {}, training tokens after division: {}.'.format(
            len(train_data), sum([len(s) - 1 for s in train_data])))
        logging.info('valid instance: {}, valid tokens: {}.'.format(
            len(valid_data), sum([len(s) - 1 for s in valid_data])))
    else:
        valid_data = None

    if opt.test_path is not None:
        if token_embedder_name == 'cnn':
            test_data = read_corpus(opt.test_path, token_embedder_max_chars, opt.max_sent_len)
        elif token_embedder_name == 'lstm':
            test_data = read_corpus(opt.test_path, opt.max_sent_len)
        else:
            raise ValueError('Unknown token embedder name: {}'.format(token_embedder_name))
        logging.info('testing instance: {}, testing tokens: {}.'.format(
            len(test_data), sum([len(s) - 1 for s in test_data])))
    else:
        test_data = None

    if opt.word_embedding is not None:
        embs = load_embedding(opt.word_embedding)
        word_lexicon = {word: i for i, word in enumerate(embs[0])}
    else:
        embs = None
        word_lexicon = {}

    # Maintain the vocabulary: it is used in either WordEmbeddingInput
    # or softmax classification.
    vocab = get_truncated_vocab(train_data, opt.min_count)

    # Ensure the index of '<oov>' is 0.
    for special_word in ['<oov>', '<bos>', '<eos>', '<pad>']:
        if special_word not in word_lexicon:
            word_lexicon[special_word] = len(word_lexicon)

    for word, _ in vocab:
        if word not in word_lexicon:
            word_lexicon[word] = len(word_lexicon)

    # Word embedding
    if config['token_embedder']['word_dim'] > 0:
        word_emb_layer = EmbeddingLayer(
            config['token_embedder']['word_dim'], word_lexicon, fix_emb=False, embs=embs)
        logging.info('Word embedding size: {0}'.format(len(word_emb_layer.word2id)))
    else:
        word_emb_layer = None
        logging.info('Vocabulary size: {0}'.format(len(word_lexicon)))

    # Character lexicon
    if config['token_embedder']['char_dim'] > 0:
        char_lexicon = {}
        for sentence in train_data:
            for word in sentence:
                for ch in word:
                    if ch not in char_lexicon:
                        char_lexicon[ch] = len(char_lexicon)
        for special_char in ['<bos>', '<eos>', '<oov>', '<pad>', '<bow>', '<eow>']:
            if special_char not in char_lexicon:
                char_lexicon[special_char] = len(char_lexicon)
        char_emb_layer = EmbeddingLayer(
            config['token_embedder']['char_dim'], char_lexicon, fix_emb=False)
        logging.info('Char embedding size: {0}'.format(len(char_emb_layer.word2id)))
    else:
        char_lexicon = None
        char_emb_layer = None

    train = create_batches(
        train_data, opt.batch_size, word_lexicon, char_lexicon, config, use_cuda=use_cuda)

    if opt.eval_steps is None:
        opt.eval_steps = len(train[0])
    logging.info('Evaluate every {0} batches.'.format(opt.eval_steps))

    if valid_data is not None:
        valid = create_batches(
            valid_data, opt.batch_size, word_lexicon, char_lexicon, config,
            sort=False, shuffle=False, use_cuda=use_cuda)
    else:
        valid = None

    if test_data is not None:
        test = create_batches(
            test_data, opt.batch_size, word_lexicon, char_lexicon, config,
            sort=False, shuffle=False, use_cuda=use_cuda)
    else:
        test = None

    label_to_ix = word_lexicon
    logging.info('vocab size: {0}'.format(len(label_to_ix)))

    nclasses = len(label_to_ix)
    model = Model(config, word_emb_layer, char_emb_layer, nclasses, use_cuda)
    logging.info(str(model))
    if use_cuda:
        model = model.cuda()

    need_grad = lambda x: x.requires_grad
    if opt.optimizer.lower() == 'adam':
        optimizer = optim.Adam(filter(need_grad, model.parameters()), lr=opt.lr)
    elif opt.optimizer.lower() == 'sgd':
        optimizer = optim.SGD(filter(need_grad, model.parameters()), lr=opt.lr)
    elif opt.optimizer.lower() == 'adagrad':
        optimizer = optim.Adagrad(filter(need_grad, model.parameters()), lr=opt.lr)
    else:
        raise ValueError('Unknown optimizer {}'.format(opt.optimizer.lower()))

    try:
        os.makedirs(opt.model)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise

    # Save the character and word dictionaries alongside the model.
    if config['token_embedder']['char_dim'] > 0:
        with codecs.open(os.path.join(opt.model, 'char.dic'), 'w', encoding='utf-8') as fpo:
            for ch, i in char_emb_layer.word2id.items():
                print('{0}\t{1}'.format(ch, i), file=fpo)

    with codecs.open(os.path.join(opt.model, 'word.dic'), 'w', encoding='utf-8') as fpo:
        for w, i in word_lexicon.items():
            print('{0}\t{1}'.format(w, i), file=fpo)

    json.dump(vars(opt), codecs.open(os.path.join(opt.model, 'config.json'), 'w', encoding='utf-8'))

    best_train = 1e+8
    best_valid = 1e+8
    test_result = 1e+8
    for epoch in range(opt.max_epoch):
        best_train, best_valid, test_result = train_model(
            epoch, opt, model, optimizer, train, valid, test,
            best_train, best_valid, test_result)
        if opt.lr_decay > 0:
            optimizer.param_groups[0]['lr'] *= opt.lr_decay

    if valid_data is None:
        logging.info("best train ppl: {:.6f}.".format(best_train))
    elif test_data is None:
        logging.info("best train ppl: {:.6f}, best valid ppl: {:.6f}.".format(
            best_train, best_valid))
    else:
        logging.info("best train ppl: {:.6f}, best valid ppl: {:.6f}, test ppl: {:.6f}.".format(
            best_train, best_valid, test_result))
def load_result(ebd_type, clf_type, result_suffix=''):
    """Read back the predictions written by run_clf."""
    path = ebd_type + '_embedding/' + clf_type + '_result' + result_suffix + '.txt'
    with open(path) as f:
        line = f.readline()
    return list(map(int, line.strip().strip('[').strip(']').split(',')))


def evaluate_f1_score(dtype=''):
    _, label = load_embedding(fasttext, True)
    for clf in clf_list:
        for ebd in embedding_list:
            if dtype != origin:
                predict = load_result(ebd, clf, '_' + dtype)
            else:
                predict = load_result(ebd, clf, '')
            print('type:\t{}\t clf: \t{}\t ebd: \t{}\t f1-score: \t{:.2f}'.format(
                dtype, clf, ebd, f1_score(label, predict, average='macro')))


if __name__ == '__main__':
    # for dtype in data_type_list:
    # dtype = small
    # for clf in clf_list:
    #     for ebd in embedding_list:
    #         run_clf(ebd, clf, load_index(dtype), '_' + dtype)
    # for dt in data_type_list:
    #     evaluate_f1_score(dt)
    ebd, _ = load_embedding(transformer)
    print(len(ebd[0]))
def main(args):
    datasetList = ['mr', 'subj', 'cr', 'mpqa', 'trec', 'sst']
    numberOfTest = 5
    args.max_epoch = 100
    for dset in datasetList:
        if dset == 'mr':
            data, label = dataloader.read_MR(args.path)
        elif dset == 'subj':
            data, label = dataloader.read_SUBJ(args.path)
        elif dset == 'cr':
            data, label = dataloader.read_CR(args.path)
        elif dset == 'mpqa':
            data, label = dataloader.read_MPQA(args.path)
        elif dset == 'trec':
            train_x, train_y, test_x, test_y = dataloader.read_TREC(args.path)
            data = train_x + test_x
            label = None
        elif dset == 'sst':
            train_x, train_y, valid_x, valid_y, test_x, test_y = \
                dataloader.read_SST(args.path)
            data = train_x + valid_x + test_x
            label = None
        else:
            raise Exception("unknown dataset: {}".format(dset))

        emb_layer = modules.EmbeddingLayer(
            args.d, data,
            embs=dataloader.load_embedding(args.embedding)
        )

        if dset == 'trec':
            train_x, train_y, valid_x, valid_y = dataloader.cv_split2(
                train_x, train_y, nfold=10, valid_id=args.cv)
        elif dset != 'sst':
            train_x, train_y, valid_x, valid_y, test_x, test_y = dataloader.cv_split(
                data, label, nfold=10, test_id=args.cv)
        nclasses = max(train_y) + 1

        train_x, train_y = dataloader.create_batches(
            train_x, train_y, args.batch_size, emb_layer.word2id, sort=(dset == 'sst'))
        valid_x, valid_y = dataloader.create_batches(
            valid_x, valid_y, args.batch_size, emb_layer.word2id, sort=(dset == 'sst'))
        test_x, test_y = dataloader.create_batches(
            test_x, test_y, args.batch_size, emb_layer.word2id, sort=(dset == 'sst'))

        # Train each dataset with the SRU, CNN and LSTM encoders in turn.
        for models in range(3):
            if models == 1:
                args.cnn = True
                modelName = 'CNN'
            elif models == 2:
                args.cnn = False
                args.lstm = True
                modelName = 'LSTM'
            else:
                args.lstm = False
                modelName = 'SRU'
            sys.stdout.write("Training {} with {} architecture:\n".format(dset, modelName))
            args.dropout = 0.5
            for testNo in range(numberOfTest):
                model = Model(args, emb_layer, nclasses).cuda()
                need_grad = lambda x: x.requires_grad
                optimizer = optim.Adam(filter(need_grad, model.parameters()), lr=args.lr)
                best_valid = 1e+8
                test_err = 1e+8
                results = []
                for epoch in range(args.max_epoch):
                    results.append(train_model(
                        epoch, model, optimizer,
                        train_x, train_y, valid_x, valid_y, test_x, test_y,
                        best_valid, test_err))
                # csv.writer needs a text-mode file in Python 3 ('wb' would fail here).
                with open('results_{d}_{m}_{i}.csv'.format(d=dset, m=modelName, i=testNo + 1),
                          'w', newline='') as dump:
                    wr = csv.writer(dump, delimiter=',')
                    wr.writerow(['Epoch', 'Training Loss', 'Validation Error',
                                 'Test Error', 'Duration'])
                    for idx, value in enumerate(results):
                        wr.writerow(value)