def __pretrain_bertnrdj(dataset, n_labels, seq_length, n_steps, batch_size,
                        dropout, n_layers, load_model_file, dst_model_file):
    """Pretrain a BertNRDJ model on aspect/opinion term TFRecord data.

    Sets up logging, builds a frozen Robert (BERT) feature extractor from the
    dataset's init checkpoint, loads the train/valid index split and the
    validation term lists, then runs ``BertNRDJ.pretrain`` and saves the
    result to ``dst_model_file``.
    """
    init_logging(
        'log/{}-pre-bertnrdj-{}-{}.log'.format(
            cur_script_name, utils.get_machine_name(), str_today),
        mode='a', to_stdout=True)

    files = config.DATA_FILES[dataset]

    print('init robert ...')
    bert_cfg = bertmodel.BertConfig.from_json_file(config.BERT_CONFIG_FILE)
    robert_model = robert.Robert(
        bert_cfg, n_labels=n_labels, seq_length=config.BERT_SEQ_LEN,
        is_train=False, init_checkpoint=files['bert_init_checkpoint'])
    print('done')

    # Pick the train/valid index file matching the dataset's source corpus:
    # Amazon laptop reviews for se14l, Yelp restaurant sentences otherwise.
    yelp_tv_idxs_file = os.path.join(
        config.RES_DIR,
        'yelp/eng-part/yelp-rest-sents-r9-tok-eng-p0_04-tvidxs.txt')
    amazon_tv_idxs_file = os.path.join(
        config.RES_DIR, 'amazon/laptops-reivews-sent-tok-text-tvidxs.txt')
    if dataset == 'se14l':
        tv_idxs_file = amazon_tv_idxs_file
    else:
        tv_idxs_file = yelp_tv_idxs_file

    print('loading data ...')
    idxs_train, idxs_valid = datautils.load_train_valid_idxs(tv_idxs_file)
    logging.info('{} valid samples'.format(len(idxs_valid)))
    valid_aspect_terms_list = __load_terms_list(
        idxs_valid, files['pretrain_aspect_terms_file'])
    valid_opinion_terms_list = __load_terms_list(
        idxs_valid, files['pretrain_opinion_terms_file'])
    print('done')

    # NOTE(review): hidden_size_lstm is not defined in this function —
    # presumably a module-level constant; confirm it exists at module scope.
    model = BertNRDJ(
        n_labels, config.BERT_EMBED_DIM, hidden_size_lstm=hidden_size_lstm,
        batch_size=batch_size, model_file=load_model_file,
        n_lstm_layers=n_layers)
    model.pretrain(
        robert_model=robert_model,
        train_aspect_tfrec_file=files['pretrain_train_aspect_tfrec_file'],
        valid_aspect_tfrec_file=files['pretrain_valid_aspect_tfrec_file'],
        train_opinion_tfrec_file=files['pretrain_train_opinion_tfrec_file'],
        valid_opinion_tfrec_file=files['pretrain_valid_opinion_tfrec_file'],
        valid_tokens_file=files['pretrain_valid_token_file'],
        seq_length=seq_length,
        valid_aspect_terms_list=valid_aspect_terms_list,
        valid_opinion_terms_list=valid_opinion_terms_list,
        n_steps=n_steps, batch_size=batch_size, dropout=dropout,
        save_file=dst_model_file)
def __train_bertnrdj(dataset, n_labels, batch_size, model_file, dropout,
                     n_epochs, learning_rate, start_eval_epoch, n_layers):
    """Fine-tune a BertNRDJ model on the labeled train/valid/test TFRecords.

    Loads the dataset's sentence/token files, builds a frozen Robert (BERT)
    extractor, then calls ``BertNRDJ.train`` with the given hyperparameters.
    """
    init_logging(
        'log/{}-bertnrdj-{}-{}.log'.format(
            cur_script_name, utils.get_machine_name(), str_today),
        mode='a', to_stdout=True)

    files = config.DATA_FILES[dataset]
    n_train, data_valid = bldatautils.load_train_data_bert_ol(
        files['train_sents_file'], files['train_valid_split_file'],
        files['bert_valid_tokens_file'])
    data_test = bldatautils.load_test_data_bert_ol(
        files['test_sents_file'], files['bert_test_tokens_file'])

    bert_cfg = bertmodel.BertConfig.from_json_file(config.BERT_CONFIG_FILE)
    bm = robert.Robert(
        bert_cfg, n_labels=n_labels, seq_length=config.BERT_SEQ_LEN,
        is_train=False, init_checkpoint=files['bert_init_checkpoint'])

    logging.info('batch_size={}, learning_rate={}, dropout={}'.format(
        batch_size, learning_rate, dropout))

    # NOTE(review): hidden_size_lstm is not defined locally — presumably a
    # module-level constant; confirm it exists at module scope.
    model = BertNRDJ(
        n_labels, config.BERT_EMBED_DIM, hidden_size_lstm=hidden_size_lstm,
        batch_size=batch_size, model_file=model_file, n_lstm_layers=n_layers)
    model.train(
        robert_model=bm,
        train_tfrec_file=files['train_tfrecord_file'],
        valid_tfrec_file=files['valid_tfrecord_file'],
        test_tfrec_file=files['test_tfrecord_file'],
        seq_length=config.BERT_SEQ_LEN,
        n_train=n_train,
        data_valid=data_valid,
        data_test=data_test,
        dropout=dropout,
        # NOTE(review): 'start_eval_spoch' looks like a typo for
        # 'start_eval_epoch', but it must match the parameter name declared
        # on BertNRDJ.train — verify before renaming the keyword.
        start_eval_spoch=start_eval_epoch,
        n_epochs=n_epochs,
        lr=learning_rate,
    )
def __train_bertlstm_ol(dataset):
    """Train a BertLSTMCRF model (online variant) on ``dataset``.

    Hyperparameters are fixed inside the function. Builds a frozen Robert
    (BERT) feature extractor, loads train/valid/test data, then runs
    ``BertLSTMCRF.train_ol``.

    NOTE(review): a later definition in this file reuses the name
    ``__train_bertlstm_ol`` (zero-argument version), so this binding is
    replaced when the module is imported — this function is unreachable
    unless the duplicate is renamed.
    """
    str_today = datetime.date.today().strftime('%y-%m-%d')
    init_logging(
        'log/{}-bertlstmcrfol-{}.log'.format(cur_script_name, str_today),
        mode='a', to_stdout=True)

    # Fixed training hyperparameters.
    n_labels = 5
    hidden_size_lstm = 200
    batch_size = 16
    n_epochs = 100
    dropout = 0.5

    files = config.DATA_FILES[dataset]
    n_train, data_valid = bldatautils.load_train_data_bert_ol(
        files['train_sents_file'], files['train_valid_split_file'],
        files['bert_valid_tokens_file'])
    data_test = bldatautils.load_test_data_bert_ol(
        files['test_sents_file'], files['bert_test_tokens_file'])

    bert_cfg = bertmodel.BertConfig.from_json_file(config.BERT_CONFIG_FILE)
    bm = robert.Robert(
        bert_cfg, n_labels=n_labels, seq_length=config.BERT_SEQ_LEN,
        is_train=False, init_checkpoint=files['bert_init_checkpoint'])

    lstmcrf = BertLSTMCRF(
        n_labels, config.BERT_EMBED_DIM, hidden_size_lstm=hidden_size_lstm,
        batch_size=batch_size)
    lstmcrf.train_ol(
        robert_model=bm,
        train_tfrec_file=files['train_tfrecord_file'],
        valid_tfrec_file=files['valid_tfrecord_file'],
        test_tfrec_file=files['test_tfrecord_file'],
        seq_length=config.BERT_SEQ_LEN,
        n_train=n_train,
        data_valid=data_valid,
        data_test=data_test,
        n_epochs=n_epochs,
        dropout=dropout)
def __train_bertlstm_ol():
    """Train a BertLSTMCRF model on a hard-coded dataset ('se15r').

    NOTE(review): this redefines ``__train_bertlstm_ol`` — an earlier,
    parameterized definition with the same name exists in this file and is
    shadowed by this one; consider renaming one of them.
    """
    str_today = datetime.date.today().strftime('%y-%m-%d')
    init_logging(
        'log/bertlstmcrfol3-{}.log'.format(str_today),
        mode='a', to_stdout=True)

    # dataset = 'se14r'
    dataset = 'se15r'
    n_labels = 5

    files = config.DATA_FILES[dataset]
    n_train, data_valid = bldatautils.load_train_data_bert_ol(
        files['train_sents_file'], files['train_valid_split_file'],
        files['bert_valid_tokens_file'])
    data_test = bldatautils.load_test_data_bert_ol(
        files['test_sents_file'], files['bert_test_tokens_file'])

    bert_cfg = bertmodel.BertConfig.from_json_file(config.BERT_CONFIG_FILE)
    bm = robert.Robert(
        bert_cfg, n_labels=n_labels, seq_length=config.BERT_SEQ_LEN,
        is_train=False, init_checkpoint=files['bert_init_checkpoint'])

    lstmcrf = BertLSTMCRF(
        n_labels, config.BERT_EMBED_DIM, hidden_size_lstm=500, batch_size=5)
    lstmcrf.train_ol(
        robert_model=bm,
        train_tfrec_file=files['train_tfrecord_file'],
        valid_tfrec_file=files['valid_tfrecord_file'],
        test_tfrec_file=files['test_tfrecord_file'],
        seq_length=config.BERT_SEQ_LEN,
        n_train=n_train,
        data_valid=data_valid,
        data_test=data_test)