def _write_lm_encodings(lm_data_shard_cfg, device):
    data = get_data(
        word_emb_data_path_prefix=GLOVE_PREPROC_WITH_UNKS_PATH_PREFIX,
        tokenized_trn_json_path=TOKENIZED_TRN_JSON_PATH,
        tokenized_dev_json_path=TOKENIZED_DEV_JSON_PATH,
        max_ans_len=MAX_ANS_LEN,
        max_ctx_len=MAX_CTX_LEN)
    write_lm_data_shard(data, lm_data_shard_cfg, device)
def _main(config, config_idx):
    base_filename = config.name + '_cfg' + str(config_idx)
    logger = set_up_logger('logs/' + base_filename + '.log')
    title = '{}: {} ({}) config index {}'.format(__file__, config.name, config.desc, config_idx)
    logger.info('START ' + title + '\n\n{}\n'.format(config))

    data = get_data(config)

    if config.device != 'cpu':
        assert 'theano' not in sys.modules
        import theano.sandbox.cuda
        theano.sandbox.cuda.use(config.device)
    from model import get_model
    model = get_model(config, data)

    if not config.is_train:
        if config.tst_load_model_path and not model.load_if_exists(config.tst_load_model_path):
            raise AssertionError('Failed loading model weights from {}'.format(config.tst_load_model_path))
        ans_hats = _tst_epoch(config, model, data)
        write_test_predictions(ans_hats, config.tst_prd_json_path)
        logger.info('END ' + title)
        return

    # Training loop
    epoch_results = []
    max_em = -np.inf
    max_f1 = -np.inf
    np_rng = np.random.RandomState(config.seed // 2)
    for epoch in range(1, config.max_num_epochs + 1):
        trn_loss, trn_acc, trn_samples_per_sec = _trn_epoch(config, model, data, epoch, np_rng)
        dev_min_loss, dev_prx_loss, dev_max_acc, dev_prx_acc, dev_em, dev_f1 = _dev_epoch(config, model, data)
        if dev_em > max_em:
            model.save('models/' + base_filename + '_best_em.pkl')
            max_em = dev_em
        if dev_f1 > max_f1:
            model.save('models/' + base_filename + '_best_f1.pkl')
            max_f1 = dev_f1
        if epoch % 5 == 0:
            model.save('models/' + base_filename + '_e{:03d}.pkl'.format(epoch))
        epoch_results.append(
            EpochResult(trn_loss, trn_acc, dev_min_loss, dev_prx_loss, dev_max_acc, dev_prx_acc, dev_em, dev_f1))
        if config.plot:
            plot_epoch_results(epoch_results, 'logs/' + base_filename + '.png')
        logger.info(
            '\n\nFinished epoch {} for: (config index {}) (samples/sec: {:<.1f})\n{}\n\nResults:\n{}\n\n'
            .format(epoch, config_idx, trn_samples_per_sec, config.format_compared(),
                    format_epoch_results(epoch_results)))
    logger.info('END ' + title)
def __init__(self, args, logger, out_dir):
    self.args = args
    self.logger = logger
    self.out_dir = out_dir
    self.vocab, train_x, test_x, self.overall_maxlen = dataset.get_data(
        self.args.domain, vocab_size=self.args.vocab_size, maxlen=self.args.maxlen)
    self.train_x = sequence.pad_sequences(train_x, maxlen=self.overall_maxlen)
    self.test_x = sequence.pad_sequences(test_x, maxlen=self.overall_maxlen)
    self.vis_path = self.out_dir + "/visualization"
    U.mkdir_p(self.vis_path)
def _main(config, config_idx, train):
    base_filename = config.name + '_cfg' + str(config_idx)
    logger = set_up_logger('logs/' + base_filename + '.log')
    title = '{}: {} ({}) config index {}'.format(__file__, config.name, config.desc, config_idx)
    logger.info('START ' + title + '\n\n{}\n'.format(config))

    data = get_data(config, train)

    if config.device != 'cpu':
        assert 'theano' not in sys.modules
        import theano.sandbox.cuda
        theano.sandbox.cuda.use(config.device)
    from model import get_model
    model = get_model(config, data)

    if not train:
        assert config.tst_load_model_path
        if not model.load(config.tst_load_model_path):
            raise AssertionError('Failed loading model weights from {}'.format(config.tst_load_model_path))
        ans_hats = _tst_epoch(config, model, data)
        write_test_predictions(ans_hats, config.pred_json_path)
        logger.info('END ' + title)
        return

    # Training loop
    epoch_results = []
    max_em = -np.inf
    max_f1 = -np.inf
    np_rng = np.random.RandomState(config.seed // 2)
    for epoch in range(1, config.max_num_epochs + 1):
        trn_loss, trn_acc, trn_samples_per_sec = _trn_epoch(config, model, data, epoch, np_rng)
        dev_loss, dev_acc, dev_em, dev_f1 = _dev_epoch(config, model, data)
        if dev_em > max_em:
            model.save('models/' + base_filename + '_best_em.pkl')
            max_em = dev_em
        if dev_f1 > max_f1:
            model.save('models/' + base_filename + '_best_f1.pkl')
            max_f1 = dev_f1
        if config.save_freq and epoch % config.save_freq == 0:
            model.save('models/' + base_filename + '_e{:03d}.pkl'.format(epoch))
        epoch_results.append(EpochResult(trn_loss, trn_acc, dev_loss, dev_acc, dev_em, dev_f1))
        if config.plot:
            plot_epoch_results(epoch_results, 'logs/' + base_filename + '.png')
        logger.info('\n\nFinished epoch {} for: (config index {}) (samples/sec: {:<.1f})\n{}\n\nResults:\n{}\n\n'.format(
            epoch, config_idx, trn_samples_per_sec, config.format_compared(), format_epoch_results(epoch_results)))
    logger.info('END ' + title)
def prepare_data(datapaths, embedding_path=None, embedding='word2vec', embedd_dim=100, prompt_id=1, vocab_size=0,
                 tokenize_text=True, to_lower=True, sort_by_len=False, vocab_path=None, score_index=3):
    assert len(datapaths) == 3, "data paths should include train, dev and test path"
    # Note: the call below hard-codes tokenize_text/to_lower/sort_by_len/vocab_path/score_index
    # instead of forwarding the corresponding function arguments.
    (train_x, train_y), (dev_x, dev_y), (test_x, test_y), vocab, overal_maxlen, overal_maxnum = \
        reader.get_data(datapaths, prompt_id, vocab_size, tokenize_text=True, to_lower=True,
                        sort_by_len=False, vocab_path=None, score_index=6)

    X_train, y_train, mask_train = padding_sentence_sequences(train_x, train_y, overal_maxnum, overal_maxlen, post_padding=True)
    X_dev, y_dev, mask_dev = padding_sentence_sequences(dev_x, dev_y, overal_maxnum, overal_maxlen, post_padding=True)
    X_test, y_test, mask_test = padding_sentence_sequences(test_x, test_y, overal_maxnum, overal_maxlen, post_padding=True)

    train_mean = y_train.mean(axis=0)
    train_std = y_train.std(axis=0)
    dev_mean = y_dev.mean(axis=0)
    dev_std = y_dev.std(axis=0)
    test_mean = y_test.mean(axis=0)
    test_std = y_test.std(axis=0)

    Y_train = reader.get_model_friendly_scores(y_train, prompt_id)
    Y_dev = reader.get_model_friendly_scores(y_dev, prompt_id)
    Y_test = reader.get_model_friendly_scores(y_test, prompt_id)
    scaled_train_mean = reader.get_model_friendly_scores(train_mean, prompt_id)

    logger.info('Statistics:')
    logger.info('  train X shape: ' + str(X_train.shape))
    logger.info('  dev X shape:   ' + str(X_dev.shape))
    logger.info('  test X shape:  ' + str(X_test.shape))
    logger.info('  train Y shape: ' + str(Y_train.shape))
    logger.info('  dev Y shape:   ' + str(Y_dev.shape))
    logger.info('  test Y shape:  ' + str(Y_test.shape))
    logger.info('  train_y mean: %s, stdev: %s, train_y mean after scaling: %s' %
                (str(train_mean), str(train_std), str(scaled_train_mean)))

    if embedding_path:
        embedd_dict, embedd_dim, _ = load_word_embedding_dict(embedding, embedding_path, vocab, logger, embedd_dim)
        embedd_matrix = build_embedd_table(vocab, embedd_dict, embedd_dim, logger, caseless=True)
    else:
        embedd_matrix = None

    return (X_train, Y_train, mask_train), (X_dev, Y_dev, mask_dev), (X_test, Y_test, mask_test), \
        vocab, len(vocab), embedd_matrix, overal_maxlen, overal_maxnum, scaled_train_mean
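The padding_sentence_sequences helper used above is not shown in these snippets. A minimal numpy sketch of what such post-padding might look like, assuming each essay is a list of tokenized sentences (word-index lists) and scores are numeric (the helper name and exact return values here are illustrative, not taken from the project):

import numpy as np

def pad_sentence_sequences(x, y, max_sentnum, max_sentlen):
    # Post-pad every essay to max_sentnum sentences of max_sentlen word ids each,
    # and return a 0/1 mask marking the real (non-padded) positions.
    X = np.zeros((len(x), max_sentnum, max_sentlen), dtype='int32')
    mask = np.zeros_like(X, dtype='float32')
    for i, essay in enumerate(x):
        for j, sent in enumerate(essay[:max_sentnum]):
            ids = sent[:max_sentlen]
            X[i, j, :len(ids)] = ids
            mask[i, j, :len(ids)] = 1.0
    return X, np.array(y, dtype='float32'), mask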
def main():
    # Get the arguments from the command-line except the filename
    argv = sys.argv[1:]
    params = get_params(argv)

    params['data_params'], type_dict = reader.get_data_settings(params['config_file'])
    header, raw_data = reader.get_data(params)
    type_selector, converters = reader.validate_data_format(params['data_params'], header)
    key_fails, failed_rows, surveys, out_keys = data_processor.process_rows(raw_data, header, params, converters)

    if 'logfile' in params:
        with open(params['logfile'], 'w') as logfile:
            now = datetime.datetime.now()
            logfile.writelines(['Completed: ' + now.strftime("%B %d, %Y") + '\n'])
            logfile.writelines(['Total of %s rows were not completed' % len(failed_rows) + '\n'])
            for key in key_fails.keys():
                logfile.writelines([str(key_fails[key]) + ' ' + str(key) + ' key(s) could not be read' + '\n'])
            logfile.writelines(['%s surveys read, %s surveys failed' % (len(raw_data), len(failed_rows)) + '\n'])
            logfile.writelines(['%s percent success rate' % str(
                round((len(raw_data) - len(failed_rows)) / len(raw_data) * 100, 2)) + '\n'])

    out_header = header + out_keys
    writer.write_data(surveys, out_header, params)
def prepare_sentence_data(datapath, vocab_path, embedding_path=None, embedding='glove', embedd_dim=100, prompt_id=1,
                          vocab_size=0, tokenize_text=True, to_lower=True, sort_by_len=False):
    assert len(datapath) == 1, "datapath should contain a single path"
    (train_x, train_y, train_prompts), vocab, overal_maxlen, overal_maxnum = reader.get_data(
        datapath, vocab_path, prompt_id, vocab_size, tokenize_text=True, to_lower=True, sort_by_len=False)

    X_train, y_train, mask_train = utils.padding_sentence_sequences(train_x, train_y, overal_maxnum, overal_maxlen,
                                                                    post_padding=True)
    train_mean = y_train.mean(axis=0)
    train_std = y_train.std(axis=0)

    # Convert scores to boundary of [0 1] for training and evaluation (loss calculation)
    # Changed here: when prompt_id == -1, scale per essay using train_prompts instead of prompt_id.
    if prompt_id == -1:
        Y_train = reader.get_model_friendly_scores(y_train, train_prompts)
        scaled_train_mean = Y_train.mean()
    else:
        Y_train = reader.get_model_friendly_scores(y_train, prompt_id)
        scaled_train_mean = reader.get_model_friendly_scores(train_mean, prompt_id)

    logger.info('Statistics:')
    logger.info('  train X shape: ' + str(X_train.shape))
    logger.info('  train Y shape: ' + str(Y_train.shape))
    logger.info('  train_y mean: %s, stdev: %s, train_y mean after scaling: %s' %
                (str(train_mean), str(train_std), str(scaled_train_mean)))

    if embedding_path:
        embedd_dict, embedd_dim, _ = utils.load_word_embedding_dict(embedding, embedding_path, vocab, logger, embedd_dim)
        embedd_matrix = utils.build_embedd_table(vocab, embedd_dict, embedd_dim, logger, caseless=True)
    else:
        embedd_matrix = None

    return (X_train, Y_train, mask_train), vocab, len(vocab), embedd_matrix, overal_maxlen, overal_maxnum, scaled_train_mean
def main():
    # Get the arguments from the command-line except the filename
    argv = sys.argv[1:]
    params = get_params(argv)
    print('Program started, input file: %s' % params['filepath'])

    if 'debug' in params.keys():
        if params['debug'] == '1':
            print('Setting debug level: Debug')
            logging.getLogger().setLevel(logging.DEBUG)
        else:
            print('Setting debug level: Warnings')
            logging.getLogger().setLevel(logging.WARNING)
    else:
        logging.getLogger().setLevel(logging.INFO)

    params['data_params'], type_dict = reader.get_data_settings(params['config_file'])
    header, raw_data = reader.get_data(params)
    type_selector, converters = reader.validate_data_format(params['data_params'], header)
    key_fails, failed_rows, surveys, quality_assessment, out_keys, meta, raw_surveys = data_processor.process_rows(
        raw_data, header, params, converters)

    if 'logfile' in params:
        if params['logfile'].endswith('txt'):
            writer.write_log(logfiler.write_txt_log(params, key_fails, raw_data, failed_rows, meta),
                             params['logfile'], 'Log:')
        elif params['logfile'].endswith('csv'):
            writer.write_log(logfiler.create_pbi_log(surveys, quality_assessment, meta['attribute_quality'],
                                                     meta, failed_rows, params),
                             params['logfile'], ['Key', 'Value'])
        else:
            # was logging.CRITICAL(...), which is an int constant, not a logging call
            logging.critical('Logfile not written, wrong file extension provided')

    out_header = surveys[0].keys()
    if params['outfile'].endswith('.csv'):
        writer.write_data(surveys, out_header, params, rounding=9, raw_surveys=raw_surveys)
        if len(quality_assessment) > 0:
            writer.write_data(quality_assessment, quality_assessment[0].keys(), params, sub_file='group_qual')
        if len(meta['attribute_quality']) > 0 and 'logfile' in params:
            writer.write_data(meta['attribute_quality'], meta['attribute_quality'][0].keys(), params, sub_file='attr_qual')
    else:
        logging.critical("Your output filename: %s must end with .csv" % params['outfile'])
"--emb_technique", dest="emb_technique", type=str, metavar='<str>', default='w2v', help="embedding technique (w2v or fasttext)") parser.add_argument("-o", "--output", dest="out_file", type=str, metavar='<str>', help="output file name") args = parser.parse_args() vocab, train_x, overall_maxlen = dataset.get_data(vocab_size=args.vocab_size, maxlen=args.maxlen, lang=args.lang) train_x = sequence.pad_sequences(train_x, maxlen=overall_maxlen) print('Number of training examples: ', len(train_x)) print('Length of vocab: ', len(vocab)) optimizer = get_optimizer(args.algorithm) model = create_model(args, overall_maxlen, vocab) model_param = config.model_param_file[args.lang].format( args.emb_technique, config.word_emb_training_type, args.epochs, config.filter_word_on) model.load_weights(model_param) model.compile(optimizer=optimizer, loss=ut.max_margin_loss, metrics=[ut.max_margin_loss])
from reader import get_data
from basic_prepro import process, do_tokeinze

text, title = get_data('001.pdf')
processed = process(text)
print(len(do_tokeinze(processed)))
            embedding = np.zeros(300, dtype=np.float32)
            sen_embedding = sen_embedding + embedding
        return sen_embedding
    elif embedding_type == 'WORD2VEC':
        sen_embedding = np.zeros(300, dtype=np.float32)
        for word in sentence:
            try:
                embedding = model_word2vec[word]
            except KeyError:
                # out-of-vocabulary word: contribute a zero vector
                embedding = np.zeros(300, dtype=np.float32)
            sen_embedding = sen_embedding + embedding
        return sen_embedding
    return None


x, y, ac_num, ac_words = reader.get_data()

x_embedding = []
for node in x:
    node_embedding = []
    for sent in node:
        temp_sent = sent.split(" ")
        sent_embedding = sentence_embedding([temp_value.strip() for temp_value in temp_sent], "WORD2VEC")
        node_embedding.append(sent_embedding)
    x_embedding.append(node_embedding)
x_embedding = np.asarray(x_embedding)

# done with the data preparation step
x_embedding, y = shuffle_copy(x_embedding, y)

evals = []  # 0.63
}

# assert args.domain in {'restaurant', 'beer'}

if args.seed > 0:
    np.random.seed(args.seed)

###############################################################################################################################
# ## Prepare data
#

from keras.preprocessing import sequence
import reader as dataset

# vocab, train_x, test_x, overall_maxlen = dataset.get_data(args.domain, vocab_size=args.vocab_size, maxlen=args.maxlen)
vocab, train_x, overall_maxlen = dataset.get_data(args.input_path, args.out_dir_path,
                                                  vocab_size=args.vocab_size, maxlen=args.maxlen)
train_x = sequence.pad_sequences(train_x, maxlen=overall_maxlen)
# test_x = sequence.pad_sequences(test_x, maxlen=overall_maxlen)
print('Number of training examples: ', len(train_x))
print('Length of vocab: ', len(vocab))


def sentence_batch_generator(data, batch_size):
    n_batch = len(data) // batch_size
    batch_count = 0
    np.random.shuffle(data)

    while True:
def prepare_sentence_data(datapaths, embedding_path=None, embedding='word2vec', embedd_dim=100, prompt_id=1,
                          vocab_size=0, tokenize_text=True, to_lower=True, sort_by_len=False, vocab_path=None,
                          score_index=6):
    assert len(datapaths) == 4, "data paths should include train, dev, test and description path"
    (train_x, train_y, train_prompts, train_ids), (dev_x, dev_y, dev_prompts, dev_ids), \
        (test_x, test_y, test_prompts, test_ids), vocab, overal_maxlen, overal_maxnum = \
        reader.get_data(datapaths, prompt_id, vocab_size, tokenize_text=True, to_lower=True,
                        sort_by_len=False, vocab_path=None, score_index=6)

    train_d, max_sentnum = reader.read_description(datapaths[3], vocab, len(train_x), tokenize_text=True, to_lower=True)
    dev_d, max_sentnum = reader.read_description(datapaths[3], vocab, len(dev_x), tokenize_text=True, to_lower=True)
    test_d, max_sentnum = reader.read_description(datapaths[3], vocab, len(test_x), tokenize_text=True, to_lower=True)

    X_train, y_train, mask_train = utils.padding_sentence_sequences(train_x, train_y, overal_maxnum, overal_maxlen, post_padding=True)
    X_dev, y_dev, mask_dev = utils.padding_sentence_sequences(dev_x, dev_y, overal_maxnum, overal_maxlen, post_padding=True)
    X_test, y_test, mask_test = utils.padding_sentence_sequences(test_x, test_y, overal_maxnum, overal_maxlen, post_padding=True)

    D_train, mask_d_train = utils.padding_des_sequences(train_d, max_sentnum, overal_maxlen, post_padding=True)
    D_dev, mask_d_dev = utils.padding_des_sequences(dev_d, max_sentnum, overal_maxlen, post_padding=True)
    D_test, mask_d_test = utils.padding_des_sequences(test_d, max_sentnum, overal_maxlen, post_padding=True)

    if prompt_id:
        train_pmt = np.array(train_prompts, dtype='int32')
        dev_pmt = np.array(dev_prompts, dtype='int32')
        test_pmt = np.array(test_prompts, dtype='int32')

    train_mean = y_train.mean(axis=0)
    train_std = y_train.std(axis=0)
    dev_mean = y_dev.mean(axis=0)
    dev_std = y_dev.std(axis=0)
    test_mean = y_test.mean(axis=0)
    test_std = y_test.std(axis=0)

    # We need the dev and test sets in the original scale for evaluation
    # dev_y_org = y_dev.astype(reader.get_ref_dtype())
    # test_y_org = y_test.astype(reader.get_ref_dtype())
    # Convert scores to boundary of [0 1] for training and evaluation (loss calculation)
    Y_train = reader.get_model_friendly_scores(y_train, prompt_id)
    Y_dev = reader.get_model_friendly_scores(y_dev, prompt_id)
    Y_test = reader.get_model_friendly_scores(y_test, prompt_id)
    scaled_train_mean = reader.get_model_friendly_scores(train_mean, prompt_id)
    # print Y_train.shape

    logger.info('Statistics:')
    logger.info('  train X shape: ' + str(X_train.shape))
    logger.info('  dev X shape:   ' + str(X_dev.shape))
    logger.info('  test X shape:  ' + str(X_test.shape))
    logger.info('  train Y shape: ' + str(Y_train.shape))
    logger.info('  dev Y shape:   ' + str(Y_dev.shape))
    logger.info('  test Y shape:  ' + str(Y_test.shape))
    logger.info('  train_y mean: %s, stdev: %s, train_y mean after scaling: %s' %
                (str(train_mean), str(train_std), str(scaled_train_mean)))

    if embedding_path:
        embedd_dict, embedd_dim, _ = utils.load_word_embedding_dict(embedding, embedding_path, vocab, logger, embedd_dim)
        embedd_matrix = utils.build_embedd_table(vocab, embedd_dict, embedd_dim, logger, caseless=True)
    else:
        embedd_matrix = None

    return (X_train, Y_train, D_train, mask_train, train_ids), (X_dev, Y_dev, D_dev, mask_dev, dev_ids), \
        (X_test, Y_test, D_test, mask_test, test_ids), \
        vocab, len(vocab), embedd_matrix, overal_maxlen, overal_maxnum, max_sentnum, scaled_train_mean
import logging
import logging.config

import yaml

import calculator.flow
import config
import reader
import viewer.dash
import viewer.team_metrics

with open("log_config.yaml", "r") as f:
    log_config = yaml.safe_load(f.read())
logging.config.dictConfig(log_config)
logger = logging.getLogger(__name__)

logger.info("Starting seshat. Let's do team magic!")

config = config.get()

projects = []
for source_config in config["input"]:
    logging.info(f"Reading data for {source_config['name']}")
    data = reader.get_data(source_config)
    cycle_data = calculator.flow.cycle_data(data, source_config)
    team_metrics = viewer.team_metrics.Team_Metrics(cycle_data, source_config)
    projects.append(team_metrics)

dash = viewer.dash.Dash(projects, config)
server = dash.server

if __name__ == "__main__":
    dash.run()
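The script above expects log_config.yaml to deserialize into a logging.config.dictConfig mapping. A minimal sketch of such a configuration, written here as the equivalent Python dict (assumed for illustration, not taken from the project):

import logging.config

# Hypothetical minimal dictConfig that a log_config.yaml could deserialize to.
log_config = {
    "version": 1,
    "formatters": {"plain": {"format": "%(asctime)s %(name)s %(levelname)s %(message)s"}},
    "handlers": {"console": {"class": "logging.StreamHandler", "formatter": "plain"}},
    "root": {"level": "INFO", "handlers": ["console"]},
}
logging.config.dictConfig(log_config)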
def _main(config):
    base_filename = config.name
    logger_filename = 'logs/' + base_filename + '.log'
    logger = set_up_logger(logger_filename)
    title = '{}: {} ({})'.format(__file__, config.name, config.desc)
    logger.info('START ' + title + '\n\n{}\n'.format(config))

    data = get_data(
        word_emb_data_path_prefix=GLOVE_PREPROC_WITH_UNKS_PATH_PREFIX,
        tokenized_trn_json_path=TOKENIZED_TRN_JSON_PATH,
        tokenized_dev_json_path=TOKENIZED_DEV_JSON_PATH,
        max_ans_len=MAX_ANS_LEN,
        max_ctx_len=MAX_CTX_LEN)

    if config.device != 'cpu':
        assert 'theano' not in sys.modules
        import theano.sandbox.cuda
        theano.sandbox.cuda.use(config.device)
    from model import get_model
    model = get_model(config, data)

    lm_data = get_lm_data(config.lm_layer) if config.mode == 'LM' else None

    # Training loop
    epoch_results = []
    max_em = -np.inf
    max_f1 = -np.inf
    np_rng = np.random.RandomState(config.seed // 2)
    for epoch in range(1, config.max_num_epochs + 1):
        trn_loss, trn_acc, trn_samples_per_sec, trn_num_all_samples, trn_num_valid_samples, \
            trn_mean_grad_norm, trn_max_grad_norm, trn_min_grad_norm, trn_num_unsafe_samples = \
            _trn_epoch(config, model, data, lm_data, epoch, np_rng)
        dev_loss, dev_acc, dev_em, dev_f1, dev_num_all_samples, dev_num_valid_samples = \
            _dev_epoch(config, model, data, lm_data)

        best_filename = base_filename
        if dev_em > max_em:
            model.save('models/' + best_filename + '_best_em.pkl')
            max_em = dev_em
        if dev_f1 > max_f1:
            model.save('models/' + best_filename + '_best_f1.pkl')
            max_f1 = dev_f1
        if config.save_freq and epoch % config.save_freq == 0:
            model.save('models/' + base_filename + '_e{:03d}.pkl'.format(epoch))

        epoch_results.append(EpochResult(trn_loss, trn_acc, dev_loss, dev_acc, dev_em, dev_f1))
        if config.plot:
            plot_epoch_results(epoch_results, 'logs/' + base_filename + '.png')
        logger.info(
            ('\n\nEpc {} {}: (smp/sec: {:<.1f})'
             ' (trn: {}/{}) (dev: {}/{})'
             ' (grad: avg:{} max:{} min:{}) (low probability predictions:{})'
             '\n{}\n\nResults:\n{}\n\n').format(
                epoch, config.name, trn_samples_per_sec,
                trn_num_valid_samples, trn_num_all_samples,
                dev_num_valid_samples, dev_num_all_samples,
                trn_mean_grad_norm, trn_max_grad_norm, trn_min_grad_norm, trn_num_unsafe_samples,
                config.format_compared(), format_epoch_results(epoch_results)))
    logger.info('END ' + title)
###############################################################################################################################
# ## Prepare data
#

from keras.preprocessing import sequence
import reader as dataset

# reader exposes three functions: create_vocab, read_dataset, get_data.
# args = parser.parse_args() holds all command-line parameters.
# This step builds the vocabulary and loads the already-preprocessed data.
vocab, train_x, test_x, overall_maxlen = dataset.get_data(
    args.domain, vocab_size=args.vocab_size, maxlen=args.maxlen)
train_x = sequence.pad_sequences(train_x, maxlen=overall_maxlen)
test_x = sequence.pad_sequences(test_x, maxlen=overall_maxlen)
train_x = train_x[0:3000]  # cap the training size for a quick test run
print('Number of training examples: ', len(train_x))
print('Length of vocab: ', len(vocab))


def sentence_batch_generator(data, batch_size):
    n_batch = len(data) // batch_size
    batch_count = 0
    np.random.shuffle(data)

    while True:
def set_rss():
    for key in rss.keys():
        if key not in rss_contents.keys():
            rss_data = reader.get_data(rss[key])
            rss_contents[key] = rss_data
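set_rss relies on two module-level dictionaries that are not shown here: one mapping feed names to feed sources and one acting as a cache. A minimal hypothetical setup (names and URL are illustrative only):

# Hypothetical module-level state assumed by set_rss:
rss = {"news": "https://example.com/feed.xml"}   # feed name -> feed source
rss_contents = {}                                # feed name -> cached feed data

set_rss()                  # fetches only the feeds not already cached
print(list(rss_contents))  # ['news']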
def _main(config, config_idx, train):
    base_filename = config.name + '_cfg' + str(config_idx)
    logger = set_up_logger('logs/' + base_filename + '.log')
    title = '{}: {} ({}) config index {}'.format(__file__, config.name, config.desc, config_idx)
    logger.info('START ' + title + '\n\n{}\n'.format(config))

    data = get_data(config, train)

    if config.device != 'cpu':
        assert 'theano' not in sys.modules
        import theano.sandbox.cuda
        theano.sandbox.cuda.use(config.device)
    from model import get_model
    model = get_model(config, data)

    if not train:
        assert config.tst_load_model_path
        if not model.load(config.tst_load_model_path):
            raise AssertionError('Failed loading model weights from {}'.format(config.tst_load_model_path))
        ans_hats = _tst_epoch(config, model, data)
        write_test_predictions(ans_hats, config.pred_json_path)
        logger.info('END ' + title)
        return

    # Training loop
    epoch_results = []
    max_em = -np.inf
    max_f1 = -np.inf
    epochs_with_no_improvement = 0
    np_rng = np.random.RandomState(config.seed // 2)
    for epoch in range(1, config.max_num_epochs + 1):
        trn_loss, trn_acc, trn_samples_per_sec = _trn_epoch(config, model, data, epoch, np_rng)
        dev_loss, dev_acc, dev_em, dev_f1 = _dev_epoch(config, model, data)
        if dev_em > max_em:
            model.save('models/' + base_filename + '_best_em.pkl')
            max_em = dev_em
            # Best EM so far, reset epochs_with_no_improvement
            epochs_with_no_improvement = 0
        if dev_f1 > max_f1:
            model.save('models/' + base_filename + '_best_f1.pkl')
            max_f1 = dev_f1
            # Best F1 so far, reset epochs_with_no_improvement
            epochs_with_no_improvement = 0
        if dev_em <= max_em and dev_f1 <= max_f1:
            # Neither dev_em nor dev_f1 are better than max, increment epochs
            # with no improvement.
            epochs_with_no_improvement += 1
        if config.save_freq and epoch % config.save_freq == 0:
            model.save('models/' + base_filename + '_e{:03d}.pkl'.format(epoch))
        epoch_results.append(EpochResult(trn_loss, trn_acc, dev_loss, dev_acc, dev_em, dev_f1))
        if config.plot:
            plot_epoch_results(epoch_results, 'logs/' + base_filename + '.png')
        logger.info(
            '\n\nFinished epoch {} for: (config index {}) (samples/sec: {:<.1f})\n{}\n\nResults:\n{}\n\n'
            .format(epoch, config_idx, trn_samples_per_sec, config.format_compared(),
                    format_epoch_results(epoch_results)))

        # Check if we have to do early stopping.
        if epochs_with_no_improvement > config.patience:
            logger.info("Patience exceeded.")
            break
    logger.info('END ' + title)
    'rmsprop', 'sgd', 'adagrad', 'adadelta', 'adam', 'adamax'
}
# assert args.domain in {'restaurant', 'beer'}

if args.seed > 0:
    np.random.seed(args.seed)

###############################################################################################################################
# ## Prepare data
#

from keras.preprocessing import sequence
import reader as dataset

vocab, train_x, overall_maxlen = dataset.get_data(args.domain, vocab_size=args.vocab_size,
                                                  maxlen=args.maxlen, require_test=False)
train_x = sequence.pad_sequences(train_x, maxlen=overall_maxlen)
# train_x = train_x[0:30000]
print('Number of training examples: ', len(train_x))
print('Length of vocab: ', len(vocab))


def sentence_batch_generator(data, batch_size):
    n_batch = len(data) // batch_size
    batch_count = 0
    np.random.shuffle(data)

    while True:
        if batch_count == n_batch:
    anss_val = np.array([_np_ans_word_idxs_to_ans_idx(ans_stt, ans_end, max_ans_len)
                         for ans_stt, ans_end in anss], dtype=np.int32)
    ans_stts_val = anss[:, 0]
    ans_ends_val = anss[:, 1]

    gpu_anss = torch.from_numpy(anss_val)
    gpu_ans_stts = torch.from_numpy(ans_stts_val)
    gpu_ans_ends = torch.from_numpy(ans_ends_val)
    return gpu_anss, gpu_ans_stts, gpu_ans_ends


config = Config()
base_filename = config.name + '_cfg' + str(0)
logger = set_up_logger('logs/' + base_filename + '.log')
title = '{}: {}'.format(__file__, config.name)
logger.info('START ' + title + '\n\n{}\n'.format(config))

data = get_data(config, train=True)

emb_val = data.word_emb_data.word_emb  # (voc size, emb_dim)
first_known_word = data.word_emb_data.first_known_word
assert config.emb_dim == emb_val.shape[1]
assert first_known_word > 0
emb_val[:first_known_word] = 0
emb = torch.from_numpy(emb_val)

# load all the data, train and dev data
trn_ctxs, trn_ctx_masks, trn_ctx_lens, trn_qtns, trn_qtn_masks, trn_qtn_lens, trn_qtn_ctx_idxs, \
    trn_anss, trn_ans_stts, trn_ans_ends = _gpu_dataset('trn', data.trn, config)
dev_ctxs, dev_ctx_masks, dev_ctx_lens, dev_qtns, dev_qtn_masks, dev_qtn_lens, dev_qtn_ctx_idxs, \
    dev_anss, dev_ans_stts, dev_ans_ends = _gpu_dataset('dev', data.dev, config)


def print_param(model):
    for name, param in model.state_dict().items():
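The _np_ans_word_idxs_to_ans_idx helper is not shown in this snippet. A common way to flatten a (start, end) answer span into a single class index is to give each start position max_ans_len slots, one per allowed answer length; the sketch below assumes that scheme and is not taken from this project:

import numpy as np

def np_ans_word_idxs_to_ans_idx(ans_stt, ans_end, max_ans_len):
    # Hypothetical flattening of a (start, end) word-index span into one class id.
    assert ans_end - ans_stt < max_ans_len
    return ans_stt * max_ans_len + (ans_end - ans_stt)

def np_ans_idx_to_word_idxs(ans_idx, max_ans_len):
    # Inverse mapping, useful when decoding the argmax of a span classifier.
    ans_stt, length_minus_one = divmod(ans_idx, max_ans_len)
    return ans_stt, ans_stt + length_minus_one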
def prepare_sentence_data(datapaths, embedding_path=None, embedding='word2vec', emb_dim=100, prompt_id=1,
                          vocab_size=0, tokenize_text=True, to_lower=True, vocab_path=None, score_index=6,
                          need_context=True):
    assert len(datapaths) == 3, "data paths should include train, dev and test path"
    (train_x, train_y, train_prompts, train_text), \
        (dev_x, dev_y, dev_prompts, dev_text), \
        (test_x, test_y, test_prompts, test_text), \
        vocab, overall_maxlen, overall_maxnum = \
        reader.get_data(datapaths, prompt_id, vocab_size, tokenize_text, to_lower, vocab_path, score_index)

    X_train, y_train, mask_train = utils.padding_sentence_sequences(train_x, train_y, overall_maxnum, overall_maxlen, post_padding=True)
    X_dev, y_dev, mask_dev = utils.padding_sentence_sequences(dev_x, dev_y, overall_maxnum, overall_maxlen, post_padding=True)
    X_test, y_test, mask_test = utils.padding_sentence_sequences(test_x, test_y, overall_maxnum, overall_maxlen, post_padding=True)

    if need_context:
        context, context_len, context_num = reader.get_context(prompt_id, vocab, to_lower)
    else:
        # Dummy context
        context = [[0]]
        context_len = 1
        context_num = 1
    train_context = [context] * len(train_x)
    dev_context = [context] * len(dev_x)
    test_context = [context] * len(test_x)
    train_context, _, _ = utils.padding_sentence_sequences(train_context, train_y, context_num, context_len, post_padding=True)
    dev_context, _, _ = utils.padding_sentence_sequences(dev_context, dev_y, context_num, context_len, post_padding=True)
    test_context, _, _ = utils.padding_sentence_sequences(test_context, test_y, context_num, context_len, post_padding=True)

    train_mean = y_train.mean(axis=0)
    train_std = y_train.std(axis=0)
    dev_mean = y_dev.mean(axis=0)
    dev_std = y_dev.std(axis=0)
    test_mean = y_test.mean(axis=0)
    test_std = y_test.std(axis=0)

    # We need the dev and test sets in the original scale for evaluation
    # dev_y_org = y_dev.astype(reader.get_ref_dtype())
    # test_y_org = y_test.astype(reader.get_ref_dtype())
    # Convert scores to boundary of [0 1] for training and evaluation (loss calculation)
    Y_train = utils.get_model_friendly_scores(y_train, prompt_id)
    Y_dev = utils.get_model_friendly_scores(y_dev, prompt_id)
    Y_test = utils.get_model_friendly_scores(y_test, prompt_id)
    scaled_train_mean = utils.get_model_friendly_scores(train_mean, prompt_id)
    scaled_dev_mean = utils.get_model_friendly_scores(dev_mean, prompt_id)
    scaled_test_mean = utils.get_model_friendly_scores(test_mean, prompt_id)
    # print Y_train.shape

    logger.info('Statistics:')
    logger.info('  train X shape: ' + str(X_train.shape))
    logger.info('  dev X shape:   ' + str(X_dev.shape))
    logger.info('  test X shape:  ' + str(X_test.shape))
    if need_context:
        logger.info('  train context shape: ' + str(train_context.shape))
        logger.info('  dev context shape:   ' + str(dev_context.shape))
        logger.info('  test context shape:  ' + str(test_context.shape))
    logger.info('  train Y shape: ' + str(Y_train.shape))
    logger.info('  dev Y shape:   ' + str(Y_dev.shape))
    logger.info('  test Y shape:  ' + str(Y_test.shape))
    logger.info('  train_y mean: %s, stdev: %s, train_y mean after scaling: %s' %
                (str(train_mean), str(train_std), str(scaled_train_mean)))
    logger.info('  dev_y mean: %s, stdev: %s, dev_y mean after scaling: %s' %
                (str(dev_mean), str(dev_std), str(scaled_dev_mean)))
    logger.info('  test_y mean: %s, stdev: %s, test_y mean after scaling: %s' %
                (str(test_mean), str(test_std), str(scaled_test_mean)))

    if embedding_path:
        emb_dict, emb_dim, _ = utils.load_word_embedding_dict(embedding, embedding_path, vocab, logger, emb_dim)
        emb_matrix = utils.build_embedding_table(vocab, emb_dict, emb_dim, logger, caseless=True)
    else:
        emb_matrix = None

    return (X_train, Y_train, mask_train, train_context, train_text), \
        (X_dev, Y_dev, mask_dev, dev_context, dev_text), \
        (X_test, Y_test, mask_test, test_context, test_text), \
        vocab, len(vocab), emb_matrix, overall_maxlen, overall_maxnum, scaled_train_mean, context_len, context_num
def train_model_each_cluster(args, cluster_size, embtype):
    logger.info("Cluster Size: {}".format(cluster_size))
    args.aspect_size = cluster_size
    if args.seed > 0:
        np.random.seed(args.seed)

    aspect_file_name = config.aspect_file_name[args.lang].format(args.emb_technique, config.word_emb_training_type,
                                                                 args.epochs, config.filter_word_on, embtype, cluster_size)
    model_path = config.model_param_file[args.lang].format(args.emb_technique, config.word_emb_training_type,
                                                           args.epochs, config.filter_word_on)
    util.createPath(aspect_file_name)

    vocab, train_x, overall_maxlen = dataset.get_data(vocab_size=args.vocab_size, maxlen=args.maxlen, lang=args.lang)
    train_x = sequence.pad_sequences(train_x, maxlen=overall_maxlen)
    print('Number of training examples: ', len(train_x))
    print('Length of vocab: ', len(vocab))

    optimizer = get_optimizer(args.algorithm)
    logger.info('Building {} based model for {}'.format(args.emb_technique, args.lang))
    model = create_model(args, overall_maxlen, vocab)
    # freeze the word embedding layer
    model.get_layer('word_emb').trainable = False
    model.compile(optimizer=optimizer, loss=util.max_margin_loss, metrics=[util.max_margin_loss])

    logger.info("-" * 80)

    vocab_inv = {}
    for w, ind in vocab.items():
        vocab_inv[ind] = w

    sen_gen = sentence_batch_generator(train_x, args.batch_size)
    neg_gen = negative_batch_generator(train_x, args.batch_size, args.neg_size)
    batches_per_epoch = len(train_x) // args.batch_size

    min_loss = float('inf')
    for ii in range(args.epochs):
        t0 = time()
        loss, max_margin_loss = 0., 0.
        for b in tqdm(range(batches_per_epoch)):
            sen_input = next(sen_gen)
            neg_input = next(neg_gen)
            batch_loss, batch_max_margin_loss = model.train_on_batch([sen_input, neg_input],
                                                                     np.ones((args.batch_size, 1)))
            loss += batch_loss / batches_per_epoch
            max_margin_loss += batch_max_margin_loss / batches_per_epoch
        tr_time = time() - t0

        if loss < min_loss:
            min_loss = loss
            word_emb = K.get_value(model.get_layer('word_emb').embeddings)
            aspect_emb = K.get_value(model.get_layer('aspect_emb').W)
            word_emb = word_emb / np.linalg.norm(word_emb, axis=-1, keepdims=True)
            aspect_emb = aspect_emb / np.linalg.norm(aspect_emb, axis=-1, keepdims=True)
            aspect_file = open(aspect_file_name, 'wt', encoding='utf-8')
            model.save(model_path)
            for ind in range(len(aspect_emb)):
                desc = aspect_emb[ind]
                sims = word_emb.dot(desc.T)
                ordered_words = np.argsort(sims)[::-1]
                desc_list = [vocab_inv[w] + "|" + str(sims[w]) for w in ordered_words[:50]]
                # print('Aspect %d:' % ind)
                # print(desc_list)
                aspect_file.write('Aspect %d:\n' % ind)
                aspect_file.write(' '.join(desc_list) + '\n\n')

        per_cluster_train_loss = loss
        logger.info('Epoch %d, train: %is' % (ii, tr_time))
        logger.info('Total loss: %.4f, max_margin_loss: %.4f, ortho_reg: %.4f' % (
            loss, max_margin_loss, loss - max_margin_loss))

    return per_cluster_train_loss
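util.max_margin_loss is referenced above but not defined in these snippets. In ABAE-style aspect models the network itself computes the per-sample hinge loss, so the Keras loss function only averages the model output; a minimal sketch under that assumption (not confirmed to be this project's implementation):

import keras.backend as K

def max_margin_loss(y_true, y_pred):
    # Assumes the model output already is the per-sample max-margin (hinge) loss;
    # y_true (the ones passed to train_on_batch) is only a placeholder.
    return K.mean(y_pred)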
assert args.algorithm in {
    'rmsprop', 'sgd', 'adagrad', 'adadelta', 'adam', 'adamax'
}
# assert args.domain in {'restaurant', 'beer'}

if args.seed > 0:
    np.random.seed(args.seed)

###############################################################################################################################
# ## Prepare data
#

from keras.preprocessing import sequence
import reader as dataset

vocab, train_x, test_x, overall_maxlen = dataset.get_data(
    args.domain, vocab_size=args.vocab_size, maxlen=args.maxlen)
train_x = sequence.pad_sequences(train_x, maxlen=overall_maxlen)
test_x = sequence.pad_sequences(test_x, maxlen=overall_maxlen)
# train_x = train_x[0:30000]
print('Number of training examples: ', len(train_x))
print('Length of vocab: ', len(vocab))


def sentence_batch_generator(data, batch_size):
    n_batch = len(data) // batch_size
    batch_count = 0
    np.random.shuffle(data)

    while True:
        if batch_count == n_batch:
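The sentence_batch_generator definitions in these snippets are all cut off at the while loop. A self-contained sketch of how such an infinite mini-batch generator typically continues (an assumption, not the original code):

import numpy as np

def sentence_batch_generator(data, batch_size):
    # Yields mini-batches forever, reshuffling the data each time a full pass completes.
    n_batch = len(data) // batch_size
    batch_count = 0
    np.random.shuffle(data)

    while True:
        if batch_count == n_batch:
            np.random.shuffle(data)
            batch_count = 0
        batch = data[batch_count * batch_size:(batch_count + 1) * batch_size]
        batch_count += 1
        yield batch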
if args.train:
    print("Training...")
    df_train = pd.read_json(args.data_dir + "csqa.train.json")
    df_dev = pd.read_json(args.data_dir + "csqa.dev.json")

    if args.random_emb:
        train_embs = np.random.uniform(-1, 1, (406213, 768))
        dev_embs = np.random.uniform(-1, 1, (45076, 768))
    else:
        train_embs = reader.load_embs(args.data_dir + "csqa.train.embeddings.bin")
        dev_embs = reader.load_embs(args.data_dir + "csqa.dev.embeddings.bin")

    train_data = reader.get_data(
        df_train,
        train_embs,
        rel2idx,
        subset=int(config["parameters"]["subset_train"]),
        shuffle=False,
        random=args.random_emb,
    )
    dev_data = reader.get_data(
        df_dev,
        dev_embs,
        rel2idx,
        subset=int(config["parameters"]["subset_dev"]),
        shuffle=False,
        random=args.random_emb,
    )

    classifier.train(train_data, dev_data)

if args.test: