import datetime
import logging
import os

import tensorflow as tf

# Project-local imports. The exact module paths are assumptions inferred from
# the names used in these scripts (bertmodel, for instance, looks like Google
# BERT's modeling module); adjust them to the actual repo layout. The private
# helpers __load_word_vecs and __load_terms_list are defined elsewhere in the
# original scripts and are not reproduced here.
import config
import utils
import datautils
import modelutils
import bldatautils
import robert
import bertmodel
from utils import init_logging  # assumed to live in utils
from models import RINANTE, BertNRDJ, NeuRuleDoubleJoint  # assumed locations

# Module-level globals used by the functions below; the original scripts set
# these elsewhere, so the values here are placeholder assumptions.
str_today = datetime.date.today().strftime('%y%m%d')
cur_script_name = os.path.splitext(os.path.basename(__file__))[0]
hidden_size_lstm = 100
lamb = 0.001
n_epochs = 100


def __pretrain_bertnrdj(dataset, n_labels, seq_length, n_steps, batch_size,
                        dropout, n_layers, load_model_file, dst_model_file):
    """Pretrain BertNRDJ on auto-labeled Yelp/Amazon review data and save
    the resulting weights to dst_model_file."""
    init_logging('log/{}-pre-bertnrdj-{}-{}.log'.format(
        cur_script_name, utils.get_machine_name(), str_today),
        mode='a', to_stdout=True)

    dataset_files = config.DATA_FILES[dataset]

    print('init robert ...')
    bert_config = bertmodel.BertConfig.from_json_file(config.BERT_CONFIG_FILE)
    robert_model = robert.Robert(
        bert_config, n_labels=n_labels, seq_length=config.BERT_SEQ_LEN,
        is_train=False,
        init_checkpoint=dataset_files['bert_init_checkpoint'])
    print('done')

    yelp_tv_idxs_file = os.path.join(
        config.RES_DIR,
        'yelp/eng-part/yelp-rest-sents-r9-tok-eng-p0_04-tvidxs.txt')
    amazon_tv_idxs_file = os.path.join(
        config.RES_DIR, 'amazon/laptops-reivews-sent-tok-text-tvidxs.txt')
    # use the Amazon split for the SemEval-2014 laptops dataset, Yelp otherwise
    tv_idxs_file = amazon_tv_idxs_file if dataset == 'se14l' else yelp_tv_idxs_file

    print('loading data ...')
    idxs_train, idxs_valid = datautils.load_train_valid_idxs(tv_idxs_file)
    logging.info('{} valid samples'.format(len(idxs_valid)))
    # idxs_valid = set(idxs_valid)
    valid_aspect_terms_list = __load_terms_list(
        idxs_valid, dataset_files['pretrain_aspect_terms_file'])
    valid_opinion_terms_list = __load_terms_list(
        idxs_valid, dataset_files['pretrain_opinion_terms_file'])
    print('done')

    bertnrdj_model = BertNRDJ(
        n_labels, config.BERT_EMBED_DIM, hidden_size_lstm=hidden_size_lstm,
        batch_size=batch_size, model_file=load_model_file,
        n_lstm_layers=n_layers)
    bertnrdj_model.pretrain(
        robert_model=robert_model,
        train_aspect_tfrec_file=dataset_files['pretrain_train_aspect_tfrec_file'],
        valid_aspect_tfrec_file=dataset_files['pretrain_valid_aspect_tfrec_file'],
        train_opinion_tfrec_file=dataset_files['pretrain_train_opinion_tfrec_file'],
        valid_opinion_tfrec_file=dataset_files['pretrain_valid_opinion_tfrec_file'],
        valid_tokens_file=dataset_files['pretrain_valid_token_file'],
        seq_length=seq_length,
        valid_aspect_terms_list=valid_aspect_terms_list,
        valid_opinion_terms_list=valid_opinion_terms_list,
        n_steps=n_steps, batch_size=batch_size, dropout=dropout,
        save_file=dst_model_file)

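# A minimal sketch of the structure the BertNRDJ functions in this script
# assume for config.DATA_FILES: a dict keyed by dataset name ('se14l' matches
# the SemEval-2014 laptops check above), mapping file-role keys to paths.
# Only keys actually used in this script are listed, and every path below is
# a placeholder assumption, not the repo's real configuration.
_EXAMPLE_DATA_FILES = {
    'se14l': {
        'bert_init_checkpoint': 'data/bert/bert_model.ckpt',
        'pretrain_aspect_terms_file': 'data/amazon/aspect-terms.txt',
        'pretrain_opinion_terms_file': 'data/amazon/opinion-terms.txt',
        'pretrain_train_aspect_tfrec_file': 'data/amazon/train-aspect.tfrecord',
        'pretrain_valid_aspect_tfrec_file': 'data/amazon/valid-aspect.tfrecord',
        'pretrain_train_opinion_tfrec_file': 'data/amazon/train-opinion.tfrecord',
        'pretrain_valid_opinion_tfrec_file': 'data/amazon/valid-opinion.tfrecord',
        'pretrain_valid_token_file': 'data/amazon/valid-tokens.txt',
        'train_sents_file': 'data/se14l/train-sents.json',
        'train_valid_split_file': 'data/se14l/train-valid-split.txt',
        'bert_valid_tokens_file': 'data/se14l/valid-tokens.txt',
        'test_sents_file': 'data/se14l/test-sents.json',
        'bert_test_tokens_file': 'data/se14l/test-tokens.txt',
        'train_tfrecord_file': 'data/se14l/train.tfrecord',
        'valid_tfrecord_file': 'data/se14l/valid.tfrecord',
        'test_tfrecord_file': 'data/se14l/test.tfrecord',
    },
}
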
def __train(word_vecs_file, train_tok_texts_file, train_sents_file,
            train_valid_split_file, test_tok_texts_file, test_sents_file,
            load_model_file, task):
    """Train RINANTE on SemEval data five times and log the average test
    aspect F1 over the runs."""
    init_logging('log/{}-train-{}-{}.log'.format(
        os.path.splitext(os.path.basename(__file__))[0],
        utils.get_machine_name(), str_today), mode='a', to_stdout=True)

    dst_aspects_file, dst_opinions_file = None, None
    # n_train = 1000
    n_train = -1
    n_tags = 5
    batch_size = 64
    lr = 0.001
    share_lstm = False

    logging.info(word_vecs_file)
    logging.info('load model {}'.format(load_model_file))
    logging.info(test_sents_file)

    print('loading data ...')
    vocab, word_vecs_matrix = datautils.load_word_vecs(word_vecs_file)
    logging.info('word vec dim: {}, n_words={}'.format(
        word_vecs_matrix.shape[1], word_vecs_matrix.shape[0]))
    train_data, valid_data, test_data = modelutils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file,
        test_sents_file, test_tok_texts_file, vocab, n_train, task)
    print('done')

    test_f1s = list()
    for i in range(5):
        logging.info('turn {}'.format(i))
        model = RINANTE(n_tags, word_vecs_matrix, share_lstm,
                        hidden_size_lstm=hidden_size_lstm,
                        model_file=load_model_file, batch_size=batch_size,
                        lamb=lamb)
        test_af1, _ = model.train(
            train_data, valid_data, test_data, vocab, n_epochs=n_epochs,
            lr=lr, dst_aspects_file=dst_aspects_file,
            dst_opinions_file=dst_opinions_file)
        test_f1s.append(test_af1)
        logging.info('r={} test_f1={:.4f}'.format(i, test_af1))
        # reset the TF1 default graph before building the next model
        tf.reset_default_graph()
    logging.info('avg_test_f1={:.4f}'.format(sum(test_f1s) / len(test_f1s)))

def __train_nrdj(word_vecs_file, train_tok_texts_file, train_sents_file,
                 train_valid_split_file, test_tok_texts_file, test_sents_file,
                 load_model_file, task):
    """Train NeuRuleDoubleJoint on SemEval data and write the extracted
    aspect and opinion terms to the hard-coded output files below."""
    init_logging('log/{}-train-{}-{}.log'.format(
        os.path.splitext(os.path.basename(__file__))[0],
        utils.get_machine_name(), str_today), mode='a', to_stdout=True)

    dst_aspects_file = '/home/hldai/data/aspect/semeval14/nrdj-aspects.txt'
    dst_opinions_file = '/home/hldai/data/aspect/semeval14/nrdj-opinions.txt'
    # dst_aspects_file, dst_opinions_file = None, None
    # n_train = 1000
    n_train = -1
    label_opinions = True
    # label_opinions = False
    n_tags = 5 if label_opinions else 3
    # n_tags = 5 if task == 'train' else 3
    batch_size = 20
    lr = 0.001
    share_lstm = False

    logging.info(word_vecs_file)
    logging.info('load model {}'.format(load_model_file))
    logging.info(test_sents_file)

    print('loading data ...')
    vocab, word_vecs_matrix = __load_word_vecs(word_vecs_file)
    logging.info('word vec dim: {}, n_words={}'.format(
        word_vecs_matrix.shape[1], word_vecs_matrix.shape[0]))
    train_data, valid_data, test_data = datautils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file,
        test_sents_file, test_tok_texts_file, vocab, n_train, label_opinions)
    print('done')

    nrdj = NeuRuleDoubleJoint(n_tags, word_vecs_matrix, share_lstm,
                              hidden_size_lstm=hidden_size_lstm,
                              model_file=load_model_file,
                              batch_size=batch_size)
    nrdj.train(train_data, valid_data, test_data, vocab, n_epochs=n_epochs,
               lr=lr, dst_aspects_file=dst_aspects_file,
               dst_opinions_file=dst_opinions_file)

def __train_bertnrdj(dataset, n_labels, batch_size, model_file, dropout,
                     n_epochs, learning_rate, start_eval_epoch, n_layers):
    """Train BertNRDJ on the given dataset, optionally initializing from a
    pretrained model file."""
    init_logging('log/{}-bertnrdj-{}-{}.log'.format(
        cur_script_name, utils.get_machine_name(), str_today),
        mode='a', to_stdout=True)

    dataset_files = config.DATA_FILES[dataset]
    n_train, data_valid = bldatautils.load_train_data_bert_ol(
        dataset_files['train_sents_file'],
        dataset_files['train_valid_split_file'],
        dataset_files['bert_valid_tokens_file'])
    data_test = bldatautils.load_test_data_bert_ol(
        dataset_files['test_sents_file'],
        dataset_files['bert_test_tokens_file'])

    bert_config = bertmodel.BertConfig.from_json_file(config.BERT_CONFIG_FILE)
    bm = robert.Robert(bert_config, n_labels=n_labels,
                       seq_length=config.BERT_SEQ_LEN, is_train=False,
                       init_checkpoint=dataset_files['bert_init_checkpoint'])

    logging.info('batch_size={}, learning_rate={}, dropout={}'.format(
        batch_size, learning_rate, dropout))

    # model_file = dataset_files['pretrained_bertnrdj_file']
    # model_file = None
    bertnrdj_model = BertNRDJ(n_labels, config.BERT_EMBED_DIM,
                              hidden_size_lstm=hidden_size_lstm,
                              batch_size=batch_size, model_file=model_file,
                              n_lstm_layers=n_layers)
    bertnrdj_model.train(
        robert_model=bm,
        train_tfrec_file=dataset_files['train_tfrecord_file'],
        valid_tfrec_file=dataset_files['valid_tfrecord_file'],
        test_tfrec_file=dataset_files['test_tfrecord_file'],
        seq_length=config.BERT_SEQ_LEN,
        n_train=n_train,
        data_valid=data_valid,
        data_test=data_test,
        dropout=dropout,
        # 'spoch' is presumably a typo that mirrors the parameter name in
        # BertNRDJ.train; renaming it only here would raise a TypeError.
        start_eval_spoch=start_eval_epoch,
        n_epochs=n_epochs,
        lr=learning_rate,
    )

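# A hypothetical sketch (not part of the original scripts) of how the two
# BertNRDJ stages chain: __pretrain_bertnrdj saves a checkpoint that
# __train_bertnrdj then loads via model_file. The dataset key and all
# hyperparameter values below are placeholder assumptions.
def _example_bertnrdj_pipeline():
    pretrained_file = 'data/models/pretrained-bertnrdj.ckpt'
    __pretrain_bertnrdj(
        dataset='se14l', n_labels=5, seq_length=config.BERT_SEQ_LEN,
        n_steps=50000, batch_size=16, dropout=0.5, n_layers=1,
        load_model_file=None, dst_model_file=pretrained_file)
    # reset the TF1 default graph before building a second model in the same
    # process, mirroring the reset pattern used in the five-run loop above
    tf.reset_default_graph()
    __train_bertnrdj(
        dataset='se14l', n_labels=5, batch_size=16,
        model_file=pretrained_file, dropout=0.5, n_epochs=30,
        learning_rate=0.001, start_eval_epoch=0, n_layers=1)
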
def __train(word_vecs_file, train_tok_texts_file, train_sents_file,
            train_valid_split_file, test_tok_texts_file, test_sents_file,
            load_model_file, task):
    """Train RINANTE once on SemEval data."""
    init_logging('log/{}-train-{}-{}.log'.format(
        os.path.splitext(os.path.basename(__file__))[0],
        utils.get_machine_name(), str_today), mode='a', to_stdout=True)

    dst_aspects_file, dst_opinions_file = None, None
    # n_train = 1000
    n_train = -1
    n_tags = 5
    batch_size = 32
    lr = 0.001
    share_lstm = False

    logging.info(word_vecs_file)
    logging.info('load model {}'.format(load_model_file))
    logging.info(test_sents_file)

    print('loading data ...')
    vocab, word_vecs_matrix = datautils.load_word_vecs(word_vecs_file)
    logging.info('word vec dim: {}, n_words={}'.format(
        word_vecs_matrix.shape[1], word_vecs_matrix.shape[0]))
    train_data, valid_data, test_data = modelutils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file,
        test_sents_file, test_tok_texts_file, vocab, n_train, task)
    print('done')

    model = RINANTE(n_tags, word_vecs_matrix, share_lstm,
                    hidden_size_lstm=hidden_size_lstm,
                    model_file=load_model_file, batch_size=batch_size,
                    lamb=lamb)
    model.train(train_data, valid_data, test_data, vocab, n_epochs=n_epochs,
                lr=lr, dst_aspects_file=dst_aspects_file,
                dst_opinions_file=dst_opinions_file)

def __pre_train_nrdj(word_vecs_file, tok_texts_file, aspect_terms_file,
                     opinion_terms_file, dst_model_file, task, lamb, lstm_l2,
                     train_word_embeddings=False, load_model_file=None):
    """Pretrain NeuRuleDoubleJoint jointly on rule-labeled aspect and opinion
    term data (Amazon reviews) and save the weights to dst_model_file."""
    init_logging('log/{}-pre-{}-{}.log'.format(
        os.path.splitext(os.path.basename(__file__))[0],
        utils.get_machine_name(), str_today), mode='a', to_stdout=True)

    # n_train = 1000
    n_train = -1
    label_opinions = True
    # label_opinions = False
    n_tags = 5 if label_opinions else 3
    # n_tags = 5 if task == 'train' else 3
    batch_size = 32
    lr = 0.001
    share_lstm = False

    logging.info(word_vecs_file)
    logging.info(aspect_terms_file)
    logging.info(opinion_terms_file)
    logging.info('dst: {}'.format(dst_model_file))

    print('loading data ...')
    vocab, word_vecs_matrix = __load_word_vecs(word_vecs_file)
    # train_data_src1, valid_data_src1 = __get_data_amazon(vocab, config.AMAZON_TERMS_TRUE1_FILE)
    # train_data_src1, valid_data_src1 = __get_data_amazon(vocab, config.AMAZON_TERMS_TRUE3_FILE)
    train_data_src1, valid_data_src1 = datautils.get_data_amazon(
        vocab, aspect_terms_file, tok_texts_file, 'aspect')
    train_data_src2, valid_data_src2 = datautils.get_data_amazon(
        vocab, opinion_terms_file, tok_texts_file, 'opinion')
    print('done')

    logging.info('train_word_embeddings={} lstm_l2={}'.format(
        train_word_embeddings, lstm_l2))

    nrdj = NeuRuleDoubleJoint(n_tags, word_vecs_matrix, share_lstm,
                              hidden_size_lstm=hidden_size_lstm,
                              train_word_embeddings=train_word_embeddings,
                              lamb=lamb, lstm_l2_src=lstm_l2,
                              model_file=load_model_file,
                              batch_size=batch_size)
    nrdj.pre_train(train_data_src1, valid_data_src1, train_data_src2,
                   valid_data_src2, vocab, n_epochs=50, lr=lr,
                   save_file=dst_model_file)

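# A hypothetical entry point (placeholder paths throughout) sketching the
# corresponding two-stage NRDJ workflow: __pre_train_nrdj trains
# NeuRuleDoubleJoint on rule-labeled Amazon data and saves its weights,
# which __train_nrdj then loads before fine-tuning on SemEval.
if __name__ == '__main__':
    nrdj_pretrained_file = 'data/models/pretrained-nrdj.ckpt'
    __pre_train_nrdj(
        word_vecs_file='data/word-vecs.pkl',
        tok_texts_file='data/amazon/tok-texts.txt',
        aspect_terms_file='data/amazon/aspect-terms.txt',
        opinion_terms_file='data/amazon/opinion-terms.txt',
        dst_model_file=nrdj_pretrained_file,
        task='pretrain', lamb=0.001, lstm_l2=0.001)
    # reset the TF1 default graph before the second model is built
    tf.reset_default_graph()
    __train_nrdj(
        word_vecs_file='data/word-vecs.pkl',
        train_tok_texts_file='data/se14r/train-tok-texts.txt',
        train_sents_file='data/se14r/train-sents.json',
        train_valid_split_file='data/se14r/train-valid-split.txt',
        test_tok_texts_file='data/se14r/test-tok-texts.txt',
        test_sents_file='data/se14r/test-sents.json',
        load_model_file=nrdj_pretrained_file,
        task='train')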