def __train_lstmcrf(word_vecs_file, train_tok_texts_file, train_sents_file, train_valid_split_file,
                    test_tok_texts_file, test_sents_file, load_model_file, task, error_file=None):
    """Train an LSTM-CRF sequence labeler on a SemEval dataset.

    :param word_vecs_file: pickled (vocab, word-vector matrix) pair
    :param train_tok_texts_file: tokenized training texts
    :param train_sents_file: training sentences with gold terms
    :param train_valid_split_file: train/validation split indices
    :param test_tok_texts_file: tokenized test texts
    :param test_sents_file: test sentences with gold terms
    :param load_model_file: checkpoint to warm-start from, or None
    :param task: 'both' labels aspects and opinions (5 tags), otherwise 3 tags
    :param error_file: optional file to dump prediction errors to
    """
    init_logging('log/nr-{}.log'.format(str_today), mode='a', to_stdout=True)
    n_tags = 5 if task == 'both' else 3
    print('loading data ...')
    with open(word_vecs_file, 'rb') as f:
        vocab, word_vecs_matrix = pickle.load(f)
    save_model_file = None  # this entry point never saves a checkpoint
    # -1: use all training samples; test_data is loaded but evaluation here uses the valid split only
    train_data, valid_data, test_data = datautils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file, test_sents_file,
        test_tok_texts_file, vocab, -1, task)
    print('done')
    lstmcrf = LSTMCRF(n_tags, word_vecs_matrix, hidden_size_lstm=hidden_size_lstm,
                      model_file=load_model_file)
    lstmcrf.train(train_data.word_idxs_list, train_data.labels_list, valid_data.word_idxs_list,
                  valid_data.labels_list, vocab, valid_data.tok_texts,
                  valid_data.aspects_true_list, valid_data.opinions_true_list,
                  n_epochs=n_epochs, save_file=save_model_file, error_file=error_file)
def __train_dlc(word_vecs_file, train_tok_texts_file, train_sents_file, train_valid_split_file,
                test_tok_texts_file, test_sents_file):
    """Jointly train the DSLSTMCRF model on two rule-labeled source corpora
    (aspect and opinion) plus the SemEval target dataset.

    NOTE(review): ``pre_aspect_terms_file``, ``pre_opinion_terms_file`` and
    ``pre_tok_texts_file`` are read from module scope — confirm they are
    defined in the enclosing file.
    """
    init_logging('log/dlc-jtrain2-{}.log'.format(str_today), mode='a', to_stdout=True)
    n_train = -1  # -1: use all training samples
    label_opinions = True  # label both aspect and opinion terms in the target data
    batch_size = 10
    lr = 0.001
    print('loading data ...')
    with open(word_vecs_file, 'rb') as f:
        vocab, word_vecs_matrix = pickle.load(f)
    logging.info('word vec dim: {}, n_words={}'.format(word_vecs_matrix.shape[1], word_vecs_matrix.shape[0]))
    # source task 1: rule-labeled aspect terms; source task 2: rule-labeled opinion terms
    train_data_src1, valid_data_src1 = datautils.get_data_amazon(
        vocab, pre_aspect_terms_file, pre_tok_texts_file, 'aspect')
    train_data_src2, valid_data_src2 = datautils.get_data_amazon(
        vocab, pre_opinion_terms_file, pre_tok_texts_file, 'opinion')
    train_data, valid_data, test_data = datautils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file, test_sents_file,
        test_tok_texts_file, vocab, n_train, label_opinions)
    print('done')
    dlc = DSLSTMCRF(word_vecs_matrix, hidden_size_lstm=hidden_size_lstm,
                    model_file=None, batch_size=batch_size)
    dlc.joint_train(train_data_src1, valid_data_src1, train_data_src2, valid_data_src2,
                    train_data, valid_data, test_data, n_epochs=n_epochs, lr=lr)
def __pre_train_nrdj(word_vecs_file, tok_texts_file, aspect_terms_file, opinion_terms_file,
                     dst_model_file, task, load_model_file=None):
    """Pre-train NeuRuleDoubleJoint on rule-labeled (distant-supervision) data
    and save the resulting checkpoint.

    :param word_vecs_file: pickled (vocab, word-vector matrix) pair
    :param tok_texts_file: tokenized texts of the pre-training corpus
    :param aspect_terms_file: rule-produced aspect terms for the corpus
    :param opinion_terms_file: rule-produced opinion terms for the corpus
    :param dst_model_file: where to save the pre-trained checkpoint
    :param task: unused here; kept for interface compatibility with callers
    :param load_model_file: checkpoint to warm-start from, or None
    """
    init_logging('log/nrdj-pre-{}.log'.format(str_today), mode='a', to_stdout=True)
    label_opinions = True
    n_tags = 5 if label_opinions else 3
    batch_size = 20
    lr = 0.001
    share_lstm = False  # separate LSTMs for the aspect and opinion tasks
    print('loading data ...')
    with open(word_vecs_file, 'rb') as f:
        vocab, word_vecs_matrix = pickle.load(f)
    train_data_src1, valid_data_src1 = datautils.get_data_amazon(
        vocab, aspect_terms_file, tok_texts_file, 'aspect')
    train_data_src2, valid_data_src2 = datautils.get_data_amazon(
        vocab, opinion_terms_file, tok_texts_file, 'opinion')
    print('done')
    nrdj = NeuRuleDoubleJoint(n_tags, word_vecs_matrix, share_lstm, hidden_size_lstm=hidden_size_lstm,
                              model_file=load_model_file, batch_size=batch_size)
    nrdj.pre_train(train_data_src1, valid_data_src1, train_data_src2, valid_data_src2, vocab,
                   n_epochs=30, lr=lr, save_file=dst_model_file)
def __pretrain_lstmcrf(word_vecs_file, pre_tok_texts_file, pre_aspect_terms_file,
                       pre_opinion_terms_file, dst_model_file, task):
    """Pre-train an LSTM-CRF on rule-labeled data and save the checkpoint.

    :param word_vecs_file: pickled (vocab, word-vector matrix) pair
    :param pre_tok_texts_file: tokenized texts of the pre-training corpus
    :param pre_aspect_terms_file: rule-produced aspect terms
    :param pre_opinion_terms_file: rule-produced opinion terms
    :param dst_model_file: where to save the pre-trained checkpoint
    :param task: 'both', 'aspect' or 'opinion' — which terms to label
    """
    init_logging('log/nr-pre-{}.log'.format(str_today), mode='a', to_stdout=True)
    n_tags = 5 if task == 'both' else 3
    print('loading data ...')
    with open(word_vecs_file, 'rb') as f:
        vocab, word_vecs_matrix = pickle.load(f)
    load_model_file = None  # pre-training always starts from scratch
    save_model_file = dst_model_file
    # pick the data loader matching the labeling task
    if task == 'both':
        train_data, valid_data = datautils.get_data_amazon_ao(
            vocab, pre_aspect_terms_file, pre_opinion_terms_file, pre_tok_texts_file)
    elif task == 'aspect':
        train_data, valid_data = datautils.get_data_amazon(
            vocab, pre_aspect_terms_file, pre_tok_texts_file, task)
    else:
        train_data, valid_data = datautils.get_data_amazon(
            vocab, pre_opinion_terms_file, pre_tok_texts_file, task)
    print('done')
    lstmcrf = LSTMCRF(n_tags, word_vecs_matrix, hidden_size_lstm=hidden_size_lstm,
                      model_file=load_model_file)
    lstmcrf.train(train_data.word_idxs_list, train_data.labels_list, valid_data.word_idxs_list,
                  valid_data.labels_list, vocab, valid_data.tok_texts,
                  valid_data.aspects_true_list, valid_data.opinions_true_list,
                  n_epochs=n_epochs, save_file=save_model_file)
def __pretrain_bertnrdj(dataset, n_labels, seq_length, n_steps, batch_size, dropout, n_layers,
                        load_model_file, dst_model_file):
    """Pre-train the BertNRDJ model from tfrecord files of the given dataset,
    evaluating against rule-produced terms on the held-out split, and save the
    resulting checkpoint to ``dst_model_file``."""
    init_logging('log/{}-pre-bertnrdj-{}-{}.log'.format(
        cur_script_name, utils.get_machine_name(), str_today), mode='a', to_stdout=True)
    files = config.DATA_FILES[dataset]

    print('init robert ...')
    encoder = robert.Robert(
        bertmodel.BertConfig.from_json_file(config.BERT_CONFIG_FILE),
        n_labels=n_labels, seq_length=config.BERT_SEQ_LEN, is_train=False,
        init_checkpoint=files['bert_init_checkpoint'])
    print('done')

    # pick the train/valid split index file for the pre-training corpus
    if dataset == 'se14l':
        tv_idxs_file = os.path.join(
            config.RES_DIR, 'amazon/laptops-reivews-sent-tok-text-tvidxs.txt')
    else:
        tv_idxs_file = os.path.join(
            config.RES_DIR, 'yelp/eng-part/yelp-rest-sents-r9-tok-eng-p0_04-tvidxs.txt')

    print('loading data ...')
    _, idxs_valid = datautils.load_train_valid_idxs(tv_idxs_file)
    logging.info('{} valid samples'.format(len(idxs_valid)))
    valid_aspect_terms = __load_terms_list(idxs_valid, files['pretrain_aspect_terms_file'])
    valid_opinion_terms = __load_terms_list(idxs_valid, files['pretrain_opinion_terms_file'])
    print('done')

    model = BertNRDJ(n_labels, config.BERT_EMBED_DIM, hidden_size_lstm=hidden_size_lstm,
                     batch_size=batch_size, model_file=load_model_file, n_lstm_layers=n_layers)
    model.pretrain(
        robert_model=encoder,
        train_aspect_tfrec_file=files['pretrain_train_aspect_tfrec_file'],
        valid_aspect_tfrec_file=files['pretrain_valid_aspect_tfrec_file'],
        train_opinion_tfrec_file=files['pretrain_train_opinion_tfrec_file'],
        valid_opinion_tfrec_file=files['pretrain_valid_opinion_tfrec_file'],
        valid_tokens_file=files['pretrain_valid_token_file'],
        seq_length=seq_length,
        valid_aspect_terms_list=valid_aspect_terms,
        valid_opinion_terms_list=valid_opinion_terms,
        n_steps=n_steps, batch_size=batch_size, dropout=dropout, save_file=dst_model_file)
def __train_bert():
    """Train a BertLSTMCRF tagger on pre-computed BERT embeddings for one
    SemEval dataset (selected by the hard-coded ``dataset`` variable).

    The per-dataset branches previously duplicated the path construction;
    they now share a single (base dir, file prefix) mapping.
    """
    str_today = datetime.date.today().strftime('%y-%m-%d')
    init_logging('log/bertlstmcrf3-{}.log'.format(str_today), mode='a', to_stdout=True)
    # dataset = 'se14r'
    dataset = 'se15r'
    dataset_files = config.DATA_FILES[dataset]

    # dataset id -> (base directory, subdir/file-prefix) for the BERT embedding files
    if dataset == 'se14l':
        base_dir, prefix = config.SE14_DIR, 'laptops/laptops'
    elif dataset == 'se14r':
        base_dir, prefix = config.SE14_DIR, 'restaurants/restaurants'
    else:
        base_dir, prefix = config.SE15_DIR, 'restaurants/restaurants'
    bert_embed_file_train = os.path.join(base_dir, '{}_train_texts_tok_bert.txt'.format(prefix))
    bert_embed_file_test = os.path.join(base_dir, '{}_test_texts_tok_bert.txt'.format(prefix))

    print('loading data ...')
    data_train, data_valid = bldatautils.load_train_data_bert(
        bert_embed_file_train, dataset_files['train_sents_file'],
        dataset_files['train_valid_split_file'])
    data_test = bldatautils.load_valid_data_bert(
        bert_embed_file_test, dataset_files['test_sents_file'])
    print('done')

    word_embed_dim = len(data_train.word_embed_seqs[0][0])  # infer dim from the loaded embeddings
    n_tags = 5
    n_epochs = 100
    lr = 0.001
    logging.info(dataset_files['test_sents_file'])
    logging.info('token_embed_dim={}'.format(word_embed_dim))
    save_model_file = None  # kept for symmetry with sibling trainers; no checkpoint is written
    lstmcrf = BertLSTMCRF(n_tags, word_embed_dim, hidden_size_lstm=500, batch_size=5)
    lstmcrf.train(data_train, data_valid, data_test, n_epochs=n_epochs, lr=lr)
def __train(word_vecs_file, train_tok_texts_file, train_sents_file, train_valid_split_file,
            test_tok_texts_file, test_sents_file, load_model_file, task):
    """Run five independent RINANTE training rounds on the given dataset files
    and log each round's test F1 plus the mean over all rounds."""
    script_name = os.path.splitext(os.path.basename(__file__))[0]
    init_logging('log/{}-train-{}-{}.log'.format(script_name, utils.get_machine_name(), str_today),
                 mode='a', to_stdout=True)

    dst_aspects_file, dst_opinions_file = None, None
    n_train, n_tags, batch_size, lr = -1, 5, 64, 0.001
    share_lstm = False

    logging.info(word_vecs_file)
    logging.info('load model {}'.format(load_model_file))
    logging.info(test_sents_file)

    print('loading data ...')
    vocab, word_vecs_matrix = datautils.load_word_vecs(word_vecs_file)
    logging.info('word vec dim: {}, n_words={}'.format(
        word_vecs_matrix.shape[1], word_vecs_matrix.shape[0]))
    train_data, valid_data, test_data = modelutils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file, test_sents_file,
        test_tok_texts_file, vocab, n_train, task)
    print('done')

    test_f1s = list()
    for round_idx in range(5):
        logging.info('turn {}'.format(round_idx))
        model = RINANTE(n_tags, word_vecs_matrix, share_lstm, hidden_size_lstm=hidden_size_lstm,
                        model_file=load_model_file, batch_size=batch_size, lamb=lamb)
        test_af1, _ = model.train(train_data, valid_data, test_data, vocab, n_epochs=n_epochs,
                                  lr=lr, dst_aspects_file=dst_aspects_file,
                                  dst_opinions_file=dst_opinions_file)
        test_f1s.append(test_af1)
        logging.info('r={} test_f1={:.4f}'.format(round_idx, test_af1))
        # a fresh TF graph per round so the rounds are independent
        tf.reset_default_graph()
    logging.info('avg_test_f1={:.4f}'.format(sum(test_f1s) / len(test_f1s)))
def __train_nrdj(word_vecs_file, train_tok_texts_file, train_sents_file, train_valid_split_file,
                 test_tok_texts_file, test_sents_file, load_model_file, task):
    """Train NeuRuleDoubleJoint on a SemEval dataset and write the predicted
    aspect/opinion terms to files.

    NOTE(review): the ``task`` parameter is accepted but unused here;
    ``label_opinions`` is hard-coded instead.
    """
    init_logging('log/{}-train-{}-{}.log'.format(
        os.path.splitext(os.path.basename(__file__))[0], utils.get_machine_name(), str_today),
        mode='a', to_stdout=True)
    # NOTE(review): hard-coded absolute output paths — only valid on the
    # original author's machine; parameterize before reuse.
    dst_aspects_file = '/home/hldai/data/aspect/semeval14/nrdj-aspects.txt'
    dst_opinions_file = '/home/hldai/data/aspect/semeval14/nrdj-opinions.txt'
    # dst_aspects_file, dst_opinions_file = None, None
    # n_train = 1000
    n_train = -1  # -1: use all training samples
    label_opinions = True
    # label_opinions = False
    n_tags = 5 if label_opinions else 3  # 5 tags when labeling both aspects and opinions
    # n_tags = 5 if task == 'train' else 3
    batch_size = 20
    lr = 0.001
    share_lstm = False  # separate LSTMs for the two tasks
    logging.info(word_vecs_file)
    logging.info('load model {}'.format(load_model_file))
    logging.info(test_sents_file)
    print('loading data ...')
    vocab, word_vecs_matrix = __load_word_vecs(word_vecs_file)
    logging.info('word vec dim: {}, n_words={}'.format(
        word_vecs_matrix.shape[1], word_vecs_matrix.shape[0]))
    train_data, valid_data, test_data = datautils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file, test_sents_file,
        test_tok_texts_file, vocab, n_train, label_opinions)
    print('done')
    nrdj = NeuRuleDoubleJoint(n_tags, word_vecs_matrix, share_lstm, hidden_size_lstm=hidden_size_lstm,
                              model_file=load_model_file, batch_size=batch_size)
    nrdj.train(train_data, valid_data, test_data, vocab, n_epochs=n_epochs, lr=lr,
               dst_aspects_file=dst_aspects_file, dst_opinions_file=dst_opinions_file)
def __train_bertnrdj(dataset, n_labels, batch_size, model_file, dropout, n_epochs, learning_rate,
                     start_eval_epoch, n_layers):
    """Train a BertNRDJ model on a SemEval dataset using tfrecord inputs and a
    frozen Robert encoder (``is_train=False``).

    :param dataset: dataset key into ``config.DATA_FILES``
    :param n_labels: number of sequence-label classes
    :param model_file: checkpoint to warm-start from, or None
    :param start_eval_epoch: epoch at which to begin evaluating
    :param n_layers: number of LSTM layers in the labeling head
    """
    init_logging('log/{}-bertnrdj-{}-{}.log'.format(cur_script_name, utils.get_machine_name(),
                                                    str_today), mode='a', to_stdout=True)
    dataset_files = config.DATA_FILES[dataset]
    # presumably n_train is the training-sample count returned alongside the
    # validation split — verify against load_train_data_bert_ol
    n_train, data_valid = bldatautils.load_train_data_bert_ol(
        dataset_files['train_sents_file'], dataset_files['train_valid_split_file'],
        dataset_files['bert_valid_tokens_file'])
    data_test = bldatautils.load_test_data_bert_ol(
        dataset_files['test_sents_file'], dataset_files['bert_test_tokens_file'])
    bert_config = bertmodel.BertConfig.from_json_file(config.BERT_CONFIG_FILE)
    bm = robert.Robert(bert_config, n_labels=n_labels, seq_length=config.BERT_SEQ_LEN,
                       is_train=False, init_checkpoint=dataset_files['bert_init_checkpoint'])
    logging.info('batch_size={}, learning_rate={}, dropout={}'.format(
        batch_size, learning_rate, dropout))
    # model_file = dataset_files['pretrained_bertnrdj_file']
    # model_file = None
    bertnrdj_model = BertNRDJ(n_labels, config.BERT_EMBED_DIM, hidden_size_lstm=hidden_size_lstm,
                              batch_size=batch_size, model_file=model_file, n_lstm_layers=n_layers)
    bertnrdj_model.train(
        robert_model=bm,
        train_tfrec_file=dataset_files['train_tfrecord_file'],
        valid_tfrec_file=dataset_files['valid_tfrecord_file'],
        test_tfrec_file=dataset_files['test_tfrecord_file'],
        seq_length=config.BERT_SEQ_LEN,
        n_train=n_train,
        data_valid=data_valid,
        data_test=data_test,
        dropout=dropout,
        # NOTE(review): 'start_eval_spoch' looks misspelled but must match the
        # keyword accepted by BertNRDJ.train — confirm there before renaming.
        start_eval_spoch=start_eval_epoch,
        n_epochs=n_epochs,
        lr=learning_rate,
    )
def __train(word_vecs_file, train_tok_texts_file, train_sents_file, train_valid_split_file,
            test_tok_texts_file, test_sents_file, load_model_file, task):
    """Train a single RINANTE model on the given SemEval dataset files."""
    log_file = 'log/{}-train-{}-{}.log'.format(
        os.path.splitext(os.path.basename(__file__))[0], utils.get_machine_name(), str_today)
    init_logging(log_file, mode='a', to_stdout=True)

    # no term-output files for this run
    dst_aspects_file = None
    dst_opinions_file = None
    n_train = -1  # -1: use all training samples
    n_tags = 5
    batch_size = 32
    lr = 0.001
    share_lstm = False

    logging.info(word_vecs_file)
    logging.info('load model {}'.format(load_model_file))
    logging.info(test_sents_file)

    print('loading data ...')
    vocab, word_vecs_matrix = datautils.load_word_vecs(word_vecs_file)
    logging.info('word vec dim: {}, n_words={}'.format(
        word_vecs_matrix.shape[1], word_vecs_matrix.shape[0]))
    train_data, valid_data, test_data = modelutils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file, test_sents_file,
        test_tok_texts_file, vocab, n_train, task)
    print('done')

    rinante = RINANTE(n_tags, word_vecs_matrix, share_lstm, hidden_size_lstm=hidden_size_lstm,
                      model_file=load_model_file, batch_size=batch_size, lamb=lamb)
    rinante.train(train_data, valid_data, test_data, vocab, n_epochs=n_epochs, lr=lr,
                  dst_aspects_file=dst_aspects_file, dst_opinions_file=dst_opinions_file)
def __train_bertlstm_ol(dataset):
    """Train a BertLSTMCRF tagger for ``dataset`` from tfrecord inputs,
    encoding tokens with a frozen Robert model."""
    today = datetime.date.today().strftime('%y-%m-%d')
    init_logging('log/{}-bertlstmcrfol-{}.log'.format(cur_script_name, today),
                 mode='a', to_stdout=True)

    # hyper-parameters
    n_labels = 5
    lstm_hidden = 200
    batch_size = 16
    n_epochs = 100
    dropout = 0.5

    files = config.DATA_FILES[dataset]
    n_train, data_valid = bldatautils.load_train_data_bert_ol(
        files['train_sents_file'], files['train_valid_split_file'],
        files['bert_valid_tokens_file'])
    data_test = bldatautils.load_test_data_bert_ol(
        files['test_sents_file'], files['bert_test_tokens_file'])

    encoder = robert.Robert(
        bertmodel.BertConfig.from_json_file(config.BERT_CONFIG_FILE),
        n_labels=n_labels, seq_length=config.BERT_SEQ_LEN, is_train=False,
        init_checkpoint=files['bert_init_checkpoint'])

    tagger = BertLSTMCRF(n_labels, config.BERT_EMBED_DIM, hidden_size_lstm=lstm_hidden,
                         batch_size=batch_size)
    tagger.train_ol(robert_model=encoder,
                    train_tfrec_file=files['train_tfrecord_file'],
                    valid_tfrec_file=files['valid_tfrecord_file'],
                    test_tfrec_file=files['test_tfrecord_file'],
                    seq_length=config.BERT_SEQ_LEN,
                    n_train=n_train, data_valid=data_valid, data_test=data_test,
                    n_epochs=n_epochs, dropout=dropout)
def __train_nrdj(word_vecs_file, train_tok_texts_file, train_sents_file, train_valid_split_file,
                 test_tok_texts_file, test_sents_file, load_model_file, task):
    """Train NeuRuleDoubleJoint on a SemEval dataset (no prediction output files)."""
    init_logging('log/nrdj-train-ns1-{}.log'.format(str_today), mode='a', to_stdout=True)

    # hyper-parameters; label_opinions=True labels both aspects and opinions (5 tags)
    n_train, batch_size, lr = -1, 10, 0.001
    label_opinions = True
    n_tags = 5 if label_opinions else 3
    share_lstm = False

    print('loading data ...')
    with open(word_vecs_file, 'rb') as f:
        vocab, word_vecs_matrix = pickle.load(f)
    logging.info('word vec dim: {}, n_words={}'.format(
        word_vecs_matrix.shape[1], word_vecs_matrix.shape[0]))
    train_data, valid_data, test_data = datautils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file, test_sents_file,
        test_tok_texts_file, vocab, n_train, label_opinions)
    print('done')

    model = NeuRuleDoubleJoint(n_tags, word_vecs_matrix, share_lstm,
                               hidden_size_lstm=hidden_size_lstm,
                               model_file=load_model_file, batch_size=batch_size)
    model.train(train_data, valid_data, test_data, vocab, n_epochs=n_epochs, lr=lr)
def __train_lstmcrf_manual_feat():
    """Train an LSTM-CRF on SemEval-14 laptops with extra manual features built
    from rule-extraction results (aspect + opinion rule hits per token).

    NOTE(review): the rule-result paths are Windows-specific absolute paths —
    only valid on the original author's machine.
    """
    init_logging('log/nrmf-{}.log'.format(str_today), mode='a', to_stdout=True)
    hidden_size_lstm = 100
    n_epochs = 200
    n_tags = 5  # labels both aspect and opinion terms
    train_aspect_rule_result_file = 'd:/data/aspect/semeval14/laptops/laptops-train-aspect-rule-result.txt'
    train_opinion_rule_result_file = 'd:/data/aspect/semeval14/laptops/laptops-train-opinion-rule-result.txt'
    valid_aspect_rule_result_file = 'd:/data/aspect/semeval14/laptops/laptops-test-aspect-rule-result.txt'
    valid_opinion_rule_result_file = 'd:/data/aspect/semeval14/laptops/laptops-test-opinion-rule-result.txt'
    print('loading data ...')
    with open(config.SE14_LAPTOP_GLOVE_WORD_VEC_FILE, 'rb') as f:
        vocab, word_vecs_matrix = pickle.load(f)
    # NOTE(review): this get_data_semeval call passes no train/valid split file
    # and unpacks only two values, unlike other call sites in these scripts —
    # confirm which signature it targets.
    train_data, valid_data = datautils.get_data_semeval(
        config.SE14_LAPTOP_TRAIN_SENTS_FILE, config.SE14_LAPTOP_TRAIN_TOK_TEXTS_FILE,
        config.SE14_LAPTOP_TEST_SENTS_FILE, config.SE14_LAPTOP_TEST_TOK_TEXTS_FILE,
        vocab, -1, 'both')
    # per-token manual features from the aspect and opinion rule results, merged
    train_aspect_feat_list = __get_manual_feat(
        config.SE14_LAPTOP_TRAIN_TOK_TEXTS_FILE, train_aspect_rule_result_file)
    train_opinion_feat_list = __get_manual_feat(
        config.SE14_LAPTOP_TRAIN_TOK_TEXTS_FILE, train_opinion_rule_result_file)
    train_feat_list = __merge_feat_list(train_aspect_feat_list, train_opinion_feat_list)
    valid_aspect_feat_list = __get_manual_feat(
        config.SE14_LAPTOP_TEST_TOK_TEXTS_FILE, valid_aspect_rule_result_file)
    valid_opinion_feat_list = __get_manual_feat(
        config.SE14_LAPTOP_TEST_TOK_TEXTS_FILE, valid_opinion_rule_result_file)
    valid_feat_list = __merge_feat_list(valid_aspect_feat_list, valid_opinion_feat_list)
    manual_feat_len = train_feat_list[0].shape[1]  # feature width taken from the first sample
    print('manual feat len: {}'.format(manual_feat_len))
    lstmcrf = LSTMCRF(n_tags, word_vecs_matrix, hidden_size_lstm=hidden_size_lstm,
                      manual_feat_len=manual_feat_len)
    # print(valid_data.aspects_true_list)
    lstmcrf.train(train_data.word_idxs_list, train_data.labels_list, valid_data.word_idxs_list,
                  valid_data.labels_list, vocab, valid_data.tok_texts,
                  valid_data.aspects_true_list, valid_data.opinions_true_list,
                  train_feat_list=train_feat_list, valid_feat_list=valid_feat_list,
                  n_epochs=n_epochs)
def __train_bertlstm_ol():
    """Train a BertLSTMCRF tagger (tfrecord input) on the hard-coded dataset."""
    today = datetime.date.today().strftime('%y-%m-%d')
    init_logging('log/bertlstmcrfol3-{}.log'.format(today), mode='a', to_stdout=True)

    # dataset = 'se14r'
    dataset = 'se15r'
    n_labels = 5
    files = config.DATA_FILES[dataset]

    n_train, data_valid = bldatautils.load_train_data_bert_ol(
        files['train_sents_file'], files['train_valid_split_file'],
        files['bert_valid_tokens_file'])
    data_test = bldatautils.load_test_data_bert_ol(
        files['test_sents_file'], files['bert_test_tokens_file'])

    encoder = robert.Robert(
        bertmodel.BertConfig.from_json_file(config.BERT_CONFIG_FILE),
        n_labels=n_labels, seq_length=config.BERT_SEQ_LEN, is_train=False,
        init_checkpoint=files['bert_init_checkpoint'])

    tagger = BertLSTMCRF(n_labels, config.BERT_EMBED_DIM, hidden_size_lstm=500, batch_size=5)
    tagger.train_ol(robert_model=encoder,
                    train_tfrec_file=files['train_tfrecord_file'],
                    valid_tfrec_file=files['valid_tfrecord_file'],
                    test_tfrec_file=files['test_tfrecord_file'],
                    seq_length=config.BERT_SEQ_LEN,
                    n_train=n_train, data_valid=data_valid, data_test=data_test)
import datetime
import os
import numpy as np
import logging
from utils import utils
from utils.loggingutils import init_logging
from models.linearrank import LinearRank
import config

str_today = datetime.date.today().strftime('%y-%m-%d')
init_logging('log/flat-yelp-{}.log'.format(str_today), mode='a', to_stdout=True)


def __run_linearrank(training_instances_file, val_linked_mentions_file, test_linked_mentions_file):
    """Train the LinearRank entity-linking model for several rounds and average
    its accuracy.

    NOTE(review): ``cand_feat_files``, ``mention_feat_files``, ``biz_feat_files``,
    ``learning_rate``, ``n_epochs``, ``l2_reg``, ``batch_size`` and ``n_rounds``
    are read from module scope — confirm they are defined elsewhere in the file.
    """
    flat_model = LinearRank(training_instances_file, val_linked_mentions_file,
                            test_linked_mentions_file, config.YELP_CANDIDATES_FILE,
                            cand_feat_files, config.YELP_MENTION_ID_IDX_FILE,
                            config.YELP_BIZ_ID_TO_IDX_FILE, mention_feat_files, biz_feat_files,
                            learning_rate, n_epochs, l2_reg, batch_size)
    acc_list = list()
    for i in range(n_rounds):
        print('Round {}'.format(i))
        acc = flat_model.train()
        acc_list.append(acc)
    # NOTE(review): the average accuracy is computed but neither returned nor logged
    avg_best_acc = sum(acc_list) / len(acc_list)
def __train_neurule_double_joint():
    """Pre-train or train NeuRuleDoubleJoint on SemEval-14 laptops (target) plus
    Amazon rule-labeled aspect/opinion data (two source tasks), depending on the
    hard-coded ``task`` variable."""
    init_logging('log/nrdj-{}.log'.format(str_today), mode='a', to_stdout=True)
    # n_train = 1000
    n_train = -1  # -1: use all training samples
    # task = 'pretrain'
    task = 'train'
    label_opinions = True
    n_tags = 5 if label_opinions else 3  # 5 tags when labeling both aspects and opinions
    # n_tags = 5 if task == 'train' else 3
    batch_size = 20  # NOTE(review): assigned but not passed to the model below
    lr = 0.001
    share_lstm = False
    train_mode = 'target-only'
    print('loading data ...')
    with open(config.SE14_LAPTOP_GLOVE_WORD_VEC_FILE, 'rb') as f:
        vocab, word_vecs_matrix = pickle.load(f)
    # NOTE(review): this get_data_semeval call passes no train/valid split file,
    # unlike other call sites in these scripts — confirm the intended signature.
    train_data_tar, valid_data_tar = datautils.get_data_semeval(
        config.SE14_LAPTOP_TRAIN_SENTS_FILE, config.SE14_LAPTOP_TRAIN_TOK_TEXTS_FILE,
        config.SE14_LAPTOP_TEST_SENTS_FILE, config.SE14_LAPTOP_TEST_TOK_TEXTS_FILE,
        vocab, n_train, label_opinions)
    # train_data_src1, valid_data_src1 = __get_data_amazon(vocab, config.AMAZON_TERMS_TRUE1_FILE)
    # train_data_src1, valid_data_src1 = __get_data_amazon(vocab, config.AMAZON_TERMS_TRUE3_FILE)
    train_data_src1, valid_data_src1 = datautils.get_data_amazon(
        vocab, config.AMAZON_TERMS_TRUE2_FILE, config.AMAZON_TOK_TEXTS_FILE, 'aspect')
    train_data_src2, valid_data_src2 = datautils.get_data_amazon(
        vocab, config.AMAZON_TERMS_TRUE4_FILE, config.AMAZON_TOK_TEXTS_FILE, 'opinion')
    # warm-start from the rule-pretrained checkpoint only in 'train' mode
    rule_model_file = config.LAPTOP_NRDJ_RULE_MODEL_FILE if task == 'train' else None
    # rule_model_file = None
    pretrain_model_file = config.LAPTOP_NRDJ_RULE_MODEL_FILE
    save_model_file = config.LAPTOP_NRDJ_RULE_MODEL_FILE  # NOTE(review): assigned but never used
    print('done')
    nrdj = NeuRuleDoubleJoint(n_tags, word_vecs_matrix, share_lstm,
                              hidden_size_lstm=hidden_size_lstm, model_file=rule_model_file)
    nrj_train_data_src1 = nrj_train_data_src2 = None
    # if train_mode != 'target-only':
    # package the two source tasks: src1 evaluates aspects only, src2 opinions only
    nrj_train_data_src1 = NeuRuleDoubleJoint.TrainData(
        train_data_src1.word_idxs_list, train_data_src1.labels_list,
        valid_data_src1.word_idxs_list, valid_data_src1.labels_list,
        valid_data_src1.tok_texts, valid_data_src1.aspects_true_list, None
    )
    nrj_train_data_src2 = NeuRuleDoubleJoint.TrainData(
        train_data_src2.word_idxs_list, train_data_src2.labels_list,
        valid_data_src2.word_idxs_list, valid_data_src2.labels_list,
        valid_data_src2.tok_texts, None, valid_data_src2.opinions_true_list
    )
    # the target task evaluates both aspects and opinions
    nrj_train_data_tar = NeuRuleDoubleJoint.TrainData(
        train_data_tar.word_idxs_list, train_data_tar.labels_list,
        valid_data_tar.word_idxs_list, valid_data_tar.labels_list,
        valid_data_tar.tok_texts, valid_data_tar.aspects_true_list,
        valid_data_tar.opinions_true_list
    )
    if task == 'pretrain':
        nrdj.pre_train(nrj_train_data_src1, nrj_train_data_src2, vocab, n_epochs=n_epochs, lr=lr,
                       save_file=pretrain_model_file)
    if task == 'train':
        nrdj.train(nrj_train_data_src1, nrj_train_data_src2, nrj_train_data_tar, vocab, train_mode,
                   n_epochs=n_epochs, lr=lr)
def __pre_train_nrdj(word_vecs_file, tok_texts_file, aspect_terms_file, opinion_terms_file,
                     dst_model_file, task, lamb, lstm_l2, train_word_embeddings=False,
                     load_model_file=None):
    """Pre-train NeuRuleDoubleJoint on rule-labeled aspect/opinion data and save
    the checkpoint to ``dst_model_file``.

    :param lamb: loss-weighting coefficient passed to the model
    :param lstm_l2: L2 regularization for the source-task LSTMs
    :param train_word_embeddings: whether word embeddings are updated during training
    :param task: unused here; kept for interface compatibility with callers
    :param load_model_file: checkpoint to warm-start from, or None
    """
    init_logging('log/{}-pre-{}-{}.log'.format(
        os.path.splitext(os.path.basename(__file__))[0], utils.get_machine_name(), str_today),
        mode='a', to_stdout=True)
    label_opinions = True
    n_tags = 5 if label_opinions else 3
    batch_size = 32
    lr = 0.001
    share_lstm = False  # separate LSTMs for the aspect and opinion tasks
    logging.info(word_vecs_file)
    logging.info(aspect_terms_file)
    logging.info(opinion_terms_file)
    logging.info('dst: {}'.format(dst_model_file))
    print('loading data ...')
    vocab, word_vecs_matrix = __load_word_vecs(word_vecs_file)
    train_data_src1, valid_data_src1 = datautils.get_data_amazon(
        vocab, aspect_terms_file, tok_texts_file, 'aspect')
    train_data_src2, valid_data_src2 = datautils.get_data_amazon(
        vocab, opinion_terms_file, tok_texts_file, 'opinion')
    print('done')
    logging.info('train_word_embeddings={} lstm_l2={}'.format(train_word_embeddings, lstm_l2))
    nrdj = NeuRuleDoubleJoint(n_tags, word_vecs_matrix, share_lstm, hidden_size_lstm=hidden_size_lstm,
                              train_word_embeddings=train_word_embeddings, lamb=lamb,
                              lstm_l2_src=lstm_l2, model_file=load_model_file,
                              batch_size=batch_size)
    nrdj.pre_train(train_data_src1, valid_data_src1, train_data_src2, valid_data_src2, vocab,
                   n_epochs=50, lr=lr, save_file=dst_model_file)
import pickle from models.ncrfae import NeuCRFAutoEncoder import config from utils.loggingutils import init_logging from utils import datautils import logging import datetime if __name__ == '__main__': str_today = datetime.date.today().strftime('%y-%m-%d') init_logging('log/ncrfae-train-{}.log'.format(str_today), mode='a', to_stdout=True) n_tags = 5 n_train = -1 label_opinions = True dataset = 'se14l' # dataset = 'se14r' dataset_files = config.DATA_FILES[dataset] word_vecs_file = dataset_files['word_vecs_file'] logging.info('word_vec_file: {}'.format(word_vecs_file)) logging.info(dataset_files['test_sents_file']) print('loading data ...') with open(word_vecs_file, 'rb') as f: vocab, word_vecs_matrix = pickle.load(f) # print(vocab) word_idx_dict = {w: i + 1 for i, w in enumerate(vocab)} unlabeled_word_seqs = datautils.read_sents_to_word_idx_seqs(
def __train_nrdj_deep_restaurant_pr(): init_logging('log/nrdj-deep-restaurant-{}.log'.format(str_today), mode='a', to_stdout=True) # n_train = 1000 n_train = -1 task = 'train' label_task = 'aspect' n_tags = 5 if label_task == 'both' else 3 # n_tags = 5 if task == 'train' else 3 batch_size = 20 hidden_size_lstm = 100 n_epochs = 500 lr = 0.001 share_lstm = True # load_pretrained_model = True load_pretrained_model = False # train_mode = 'target-only' train_mode = 'all' aspect_terms_p_file = 'd:/data/aspect/semeval14/restaurant/yelp-aspect-rule-result-p.txt' aspect_terms_r_file = 'd:/data/aspect/semeval14/restaurant/yelp-aspect-rule-result-r.txt' # opinion_terms_file = 'd:/data/aspect/semeval14/restaurant/yelp-opinion-rule-result.txt' yelp_tok_texts_file = 'd:/data/res/yelp-review-eng-tok-sents-round-9.txt' rule_model_file = 'd:/data/aspect/semeval14/tf-model/drest/yelp-nrdj.ckpl' # rule_model_file = None load_model_file = None if task == 'train' and load_pretrained_model: load_model_file = rule_model_file # save_model_file = None if task == 'train' else rule_model_file save_model_file = rule_model_file if task == 'pretrain' else None print('loading data ...') with open(config.SE14_REST_GLOVE_WORD_VEC_FILE, 'rb') as f: vocab, word_vecs_matrix = pickle.load(f) train_data_tar, valid_data_tar = datautils.get_data_semeval( config.SE14_REST_TRAIN_SENTS_FILE, config.SE14_REST_TRAIN_TOK_TEXTS_FILE, config.SE14_REST_TEST_SENTS_FILE, config.SE14_REST_TEST_TOK_TEXTS_FILE, vocab, n_train, label_task) # train_data_src1, valid_data_src1 = __get_data_amazon(vocab, config.AMAZON_TERMS_TRUE1_FILE) # train_data_src1, valid_data_src1 = __get_data_amazon(vocab, config.AMAZON_TERMS_TRUE3_FILE) train_data_src1, valid_data_src1 = datautils.get_data_amazon( vocab, aspect_terms_p_file, yelp_tok_texts_file, 'aspect') # train_data_src2, valid_data_src2 = datautils.get_data_amazon( # vocab, aspect_terms_r_file, yelp_tok_texts_file, 'opinion') train_data_src2, valid_data_src2 = 
datautils.get_data_amazon( vocab, aspect_terms_r_file, yelp_tok_texts_file, 'aspect') # train_data_src2, valid_data_src2 = datautils.get_data_amazon( # vocab, opinion_terms_file, yelp_tok_texts_file, 'opinion') print('done') nrdj = NeuRuleDoubleJointDeep(n_tags, word_vecs_matrix, share_lstm, hidden_size_lstm=hidden_size_lstm, model_file=load_model_file) nrj_train_data_src1 = nrj_train_data_src2 = None # if train_mode != 'target-only': nrj_train_data_src1 = NeuRuleDoubleJointDeep.TrainData( train_data_src1.word_idxs_list, train_data_src1.labels_list, valid_data_src1.word_idxs_list, valid_data_src1.labels_list, valid_data_src1.tok_texts, valid_data_src1.aspects_true_list, None) nrj_train_data_src2 = NeuRuleDoubleJointDeep.TrainData( train_data_src2.word_idxs_list, train_data_src2.labels_list, valid_data_src2.word_idxs_list, valid_data_src2.labels_list, valid_data_src2.tok_texts, valid_data_src2.aspects_true_list, None) nrj_train_data_tar = NeuRuleDoubleJointDeep.TrainData( train_data_tar.word_idxs_list, train_data_tar.labels_list, valid_data_tar.word_idxs_list, valid_data_tar.labels_list, valid_data_tar.tok_texts, valid_data_tar.aspects_true_list, valid_data_tar.opinions_true_list) nrdj.train(nrj_train_data_src1, nrj_train_data_src2, nrj_train_data_tar, vocab, train_mode, n_epochs=n_epochs, lr=lr)
mentions = utils.load_json_objs(mentions_file) mention_ids = {m['mention_id'] for m in mentions} mention_candidates = utils.load_candidates_for_mentions(config.YELP_CANDIDATES_FILE, mention_ids) mention_id_to_idx = utils.load_id_to_idx(mention_id_idx_file) biz_id_to_idx = utils.load_id_to_idx(config.YELP_BIZ_ID_TO_IDX_FILE) if for_pra: commuting_matrix_files = [os.path.join( config.YELP_DATA_DIR, 'network/{}_norm.txt'.format(s)) for s in path_strs] else: commuting_matrix_files = [os.path.join(config.YELP_DATA_DIR, 'network/{}.txt'.format(s)) for s in path_strs] gen_path_count_feats_file(config.YELP_DATA_INFO_FILE, mention_candidates, mention_id_to_idx, biz_id_to_idx, commuting_matrix_files, for_pra, dst_file) if __name__ == '__main__': init_logging('log/pc_feature_gen.log', mode='a', to_stdout=True) yelp_data_info_file = os.path.join(config.YELP_DATA_DIR, 'dataset-info.json') yelp_candidates_file = os.path.join(config.YELP_DATA_DIR, 'dataset/candidates.json') yelp_cs_candidates_file = os.path.join(config.YELP_DATA_DIR, 'casestudy/cs-mention-candidates.txt') # path_strs = ['MRURB'] # path_strs = ['MRURBRURB'] path_strs = ['MRUURB'] tag = 'pc' # tag = 'rw' NORM_THRES = 100 yelp_path_count_feat_file = os.path.join( config.LOCAL_DATA_DIR, '{}_features_{}.txt'.format(tag, path_strs[0])) __gen_yelp_path_count_feat(config.YELP_ALL_LINKED_MENTIONS_FILE, config.YELP_MENTION_ID_IDX_FILE, path_strs, False, yelp_path_count_feat_file)
for params in params_list: lr, lamb1, lamb2, lamb3, alpha1, alpha2, alpha3 = params if lamb1 == 0.01 and lamb2 == 0.01: continue mr = MarchRec(N_USERS + 1, N_ITEMS + 1, k, n_epoch, batch_size, lr, alpha1, alpha2, alpha3, lamb1, lamb2, lamb3) mr.fit(entries_train, entries_val, entries_test) # method = 'pmf' # method = 'biased_svd' method = 'dmf' str_today = datetime.date.today().strftime('%y-%m-%d') init_logging('log/{}-{}.log'.format(method, str_today), to_stdout=True) split_id = 1 train_file = os.path.join(DATADIR, 'u{}_train.txt'.format(split_id)) val_file = os.path.join(DATADIR, 'u{}_val.txt'.format(split_id)) test_file = os.path.join(DATADIR, 'u{}.test'.format(split_id)) if method == 'pmf': __run_pmf(train_file, val_file, test_file) if method == 'biased_svd': __run_biased_svd(train_file, val_file, test_file) if method == 'dmf': __run_dmf(train_file, val_file, test_file) # __run_mlp(train_file, val_file) # __run_march(train_file, val_file, test_file)