def compile_all_run_vars(list_dict, iter_var_idxs):
    """
    Grab all the run variables from the specifications file, and aggregate
    them as a complete dictionary of variables to be specified and/or
    overridden in the four_state_receptor object.

    Args:
        list_dict: dictionary containing 5 keys: iter_vars, rel_vars,
            fixed_vars, params, and run_specs. These are read through the
            read_specs_file(...) function in this module.
        iter_var_idxs: list of length len(list_dict['iter_vars']) which
            contains the indices of the iterated variable range at which to
            evaluate the iterated variables in this run.

    Returns:
        vars_to_pass: dictionary whose keys are all variables to be
            overridden in the four_state_receptor class when initialized.
    """

    vars_to_pass = dict()
    vars_to_pass = parse_iterated_vars(list_dict['iter_vars'], iter_var_idxs,
                                       vars_to_pass)
    vars_to_pass = parse_relative_vars(list_dict['rel_vars'],
                                       list_dict['iter_vars'], vars_to_pass)
    vars_to_pass = merge_two_dicts(vars_to_pass, list_dict['fixed_vars'])
    vars_to_pass = merge_two_dicts(vars_to_pass, list_dict['params'])

    return vars_to_pass
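# Note: every snippet in this section relies on a merge_two_dicts helper that
# is not shown here. A minimal sketch, assuming the usual copy-and-update
# semantics (keys in the second dict override keys in the first), would be:
def merge_two_dicts(x, y):
    z = x.copy()   # start from a shallow copy of the first dict
    z.update(y)    # second dict wins on key collisions
    return z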
def request_extra_list(self):
    if self.extra_request_path and self.listitems:
        self.extra_listitems = apis.tmdb_api_request(
            self.extra_request_path, **self.request_kwparams)
        self.extra_listitems = (
            self.extra_listitems.get(self.extra_request_key, [])
            if self.extra_request_key else self.extra_listitems)
        self.listitems[0] = utils.merge_two_dicts(self.extra_listitems,
                                                  self.listitems[0])
def test(opt, dset, model):
    dset.set_mode(opt.mode)
    torch.set_grad_enabled(False)
    model.eval()
    valid_loader = DataLoader(dset, batch_size=opt.test_bsz,
                              shuffle=False, collate_fn=pad_collate)

    qid2preds = {}
    qid2targets = {}
    for valid_idx, batch in tqdm(enumerate(valid_loader)):
        model_inputs, targets, qids = preprocess_inputs(
            batch, opt.max_sub_l, opt.max_vcpt_l, opt.max_vid_l,
            device=opt.device)
        outputs = model(*model_inputs)
        pred_ids = outputs.data.max(1)[1].cpu().numpy().tolist()
        cur_qid2preds = {qid: pred for qid, pred in zip(qids, pred_ids)}
        qid2preds = merge_two_dicts(qid2preds, cur_qid2preds)
        cur_qid2targets = {qid: target for qid, target in zip(qids, targets)}
        qid2targets = merge_two_dicts(qid2targets, cur_qid2targets)
    return qid2preds, qid2targets
def calculate_tuning_curves(data_flag):
    list_dict = read_specs_file(data_flag)
    for key in list_dict:
        exec("%s = list_dict[key]" % key)

    # Get the iterated variable dimensions
    iter_vars_dims = []
    for iter_var in iter_vars:
        iter_vars_dims.append(len(iter_vars[iter_var]))
    it = sp.nditer(sp.zeros(iter_vars_dims), flags=['multi_index'])

    # Set up array to hold tuning curves
    tuning_curve = sp.zeros((iter_vars_dims[0], iter_vars_dims[1],
                             params['Nn'], params['Mm']))

    # Set up arrays to hold epsilons and Kk2
    epsilons = sp.zeros((iter_vars_dims[0], iter_vars_dims[1], params['Mm']))
    Kk2s = sp.zeros((iter_vars_dims[0], iter_vars_dims[1],
                     params['Mm'], params['Nn']))

    # Iterate tuning curve calculation over all iterable variables
    while not it.finished:
        iter_var_idxs = it.multi_index
        vars_to_pass = dict()
        vars_to_pass = parse_iterated_vars(iter_vars, iter_var_idxs,
                                           vars_to_pass)
        vars_to_pass = parse_relative_vars(rel_vars, iter_vars, vars_to_pass)
        vars_to_pass = merge_two_dicts(vars_to_pass, fixed_vars)
        vars_to_pass = merge_two_dicts(vars_to_pass, params)

        # Calculate tuning curve
        for iN in range(vars_to_pass['Nn']):
            vars_to_pass['manual_dSs_idxs'] = sp.array([iN])
            obj = single_encode_CS(vars_to_pass, run_specs)
            tuning_curve[iter_var_idxs[0], iter_var_idxs[1], iN, :] = obj.dYy

        epsilons[it.multi_index] = obj.eps
        Kk2s[it.multi_index] = obj.Kk2
        it.iternext()

    save_tuning_curve(tuning_curve, epsilons, Kk2s, data_flag)
def show_with_brackets(tournament_name, event, tournament_params=[]):
    """Returns tournament meta information along with a list of bracketIds
    for an event."""
    tournament = show(tournament_name, tournament_params)
    brackets = event_brackets(tournament_name, event)
    return utils.merge_two_dicts(tournament, brackets)
def compile_all_run_vars(list_dict):
    """
    Grab all the run variables from the specifications file, and aggregate
    them as a complete dictionary of variables to be specified and/or
    overridden in the single_cell_FRET_VA object.

    Args:
        list_dict: dictionary containing at least 2 keys: data_vars and
            est_vars. These are read through the read_specs_file() function
            in this module. Only these two keys are read here; other keys
            may exist, but will be ignored.

    Returns:
        vars_to_pass: dictionary whose keys are all variables to be
            overridden in the single_cell_FRET_VA class when initialized.
    """

    vars_to_pass = dict()
    vars_to_pass = merge_two_dicts(vars_to_pass, list_dict['data_vars'])
    vars_to_pass = merge_two_dicts(vars_to_pass, list_dict['est_vars'])

    return vars_to_pass
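# Hypothetical usage sketch for compile_all_run_vars: the key names and values
# below are illustrative only and do not come from a real specifications file.
# Assuming merge_two_dicts lets the second dict win on key collisions, est_vars
# (merged last) overrides any overlapping data_vars entries.
example_list_dict = {
    'data_vars': {'stim_file': 'example_stim.txt', 'meas_noise': 0.01},
    'est_vars': {'nT': 500, 'meas_noise': 0.02},
}
example_vars = compile_all_run_vars(example_list_dict)
# example_vars == {'stim_file': 'example_stim.txt', 'meas_noise': 0.02, 'nT': 500}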
def translate_cities(tbl, col):
    u_cities = tbl[col].unique()
    print("Distinct number of cities: ", len(u_cities))
    print("Translating cities")
    if os.path.exists('trans_cities.json'):
        old_translations = json.load(open('trans_cities.json'))
        new_translations = list(set(u_cities) - old_translations.keys())
        print("Number of new translations needed for cities: ",
              len(new_translations))
        trans_cities = utils.merge_two_dicts(
            old_translations, translate_list(new_translations)
        ) if len(new_translations) > 0 else old_translations
    else:
        trans_cities = translate_list(u_cities)
    utils.save_dict_as_json(trans_cities, 'trans_cities')
    return tbl[col].map(lambda x: trans_cities[x] if x is not None else x)
def translate_guests(guests):
    print("Translating months")
    guests['membershipMonth'] = guests.membershipMonth.map(
        lambda x: s.months_translated[x] if x in s.months_translated.keys() else x)
    guests['linkedAccountVerified'] = guests['linkedAccountVerified'].apply(
        lambda x: format_verified(x) if x is not None else [])

    veri_set = set()
    for i in guests['linkedAccountVerified']:
        veri_set |= set(i)
    print("Distinct number of verifications: ", len(veri_set))
    print("Translating verifications")
    if os.path.exists('trans_verified.json'):
        old_translations = json.load(open('trans_verified.json'))
        new_translations = list(veri_set - old_translations.keys())
        print("Number of new translations needed for verifications: ",
              len(new_translations))
        trans_verified = utils.merge_two_dicts(
            old_translations, translate_list(new_translations)
        ) if len(new_translations) > 0 else old_translations
    else:
        u_verified = list(veri_set)
        trans_verified = translate_list(u_verified)
    utils.save_dict_as_json(trans_verified, 'trans_verified')
    guests['linkedAccountVerified'] = guests.linkedAccountVerified.map(
        lambda x: str(trans_verified_list(x, trans_verified))
        if x is not None else None)

    u_cities = guests['city'].unique()
    print("Distinct number of cities: ", len(u_cities))
    print("Translating cities")
    if os.path.exists('trans_cities.json'):
        old_translations = json.load(open('trans_cities.json'))
        new_translations = list(set(u_cities) - old_translations.keys())
        print("Number of new translations needed for cities: ",
              len(new_translations))
        trans_cities = utils.merge_two_dicts(
            old_translations, translate_list(new_translations)
        ) if len(new_translations) > 0 else old_translations
    else:
        trans_cities = translate_list(u_cities)
    utils.save_dict_as_json(trans_cities, 'trans_cities')
    guests['city'] = translate_cities(guests, 'city')
    return guests
def prepareColor(color):
    additionalInfo = {
        'lab': np.asarray(
            [color['l'] * 255 / 100, color['a'] + 128, color['b'] + 128],
            np.uint8),
        'lastPlayed': False,
        'pcX': 0,
        'pcY': 0
    }
    return utils.merge_two_dicts(color, additionalInfo)
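# Hypothetical usage sketch for prepareColor. The input dict is illustrative:
# 'l' is assumed to lie in [0, 100] and 'a'/'b' in [-128, 127] (CIELAB-style),
# which the function rescales into a uint8 Lab triple before merging.
example_color = {'l': 53.2, 'a': 80.1, 'b': 67.2}
prepared = prepareColor(example_color)
# prepared['lab'] -> array([135, 208, 195], dtype=uint8)
# Assuming copy-and-update merge semantics, prepared also keeps the original
# 'l', 'a', 'b' keys plus the added bookkeeping fields.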
def parse_sequence_example(example, features_config, truncate_sequence_length=20):
    # Define how to parse the example
    context_features = {}
    features_config_single = features_config['single_features']
    for feature_name in features_config_single:
        context_features[feature_name] = tf.FixedLenFeature(
            [], dtype=get_tf_dtype(features_config_single[feature_name]['dtype']))

    sequence_features = {}
    features_config_sequence = features_config['sequence_features']
    for feature_name in features_config_sequence:
        sequence_features[feature_name] = tf.FixedLenSequenceFeature(
            shape=[],
            dtype=get_tf_dtype(features_config_sequence[feature_name]['dtype']))

    context_parsed, sequence_parsed = tf.parse_single_sequence_example(
        example,
        sequence_features=sequence_features,
        context_features=context_features,
        example_name="example"
    )

    # Truncate long sessions to a limit
    context_parsed['session_size'] = tf.minimum(context_parsed['session_size'],
                                                truncate_sequence_length)
    for feature_name in sequence_parsed:
        sequence_parsed[feature_name] = \
            sequence_parsed[feature_name][:truncate_sequence_length]

    # Ignoring first click from labels
    sequence_parsed['label_next_item'] = sequence_parsed['item_clicked'][1:]
    # Making it easy to retrieve the last label
    sequence_parsed['label_last_item'] = sequence_parsed['item_clicked'][-1:]

    # Ignoring last clicked item from input
    for feature_key in sequence_features:
        if feature_key not in ['label_next_item', 'label_last_item']:
            sequence_parsed[feature_key] = sequence_parsed[feature_key][:-1]

    merged_features = merge_two_dicts(context_parsed, sequence_parsed)

    # In order to pad the dataset, this hack expands scalars to vectors.
    merged_expanded_features = expand_single_features(
        merged_features,
        features_to_expand=list(features_config['single_features'].keys()))

    return merged_expanded_features
def get_cached_data(self, item=None, tmdb_type=None):
    if tmdb_type and item:
        if item.get('show_id') or item.get('id'):
            if item.get('show_id'):
                my_id = item.get('show_id')
                my_request = 'tv'
            elif item.get('id'):
                my_id = item.get('id')
                my_request = tmdb_type
            request_path = '{0}/{1}'.format(my_request, my_id)
            kwparams = {}
            kwparams['append_to_response'] = APPEND_TO_RESPONSE
            self.detailed_info = apis.tmdb_api_only_cached(
                request_path, **kwparams)
            if self.detailed_info:
                item = utils.merge_two_dicts(self.detailed_info, item)
                if item.get('imdb_id') and my_request in ['movie', 'tv']:
                    self.omdb_info = apis.omdb_api_only_cached(
                        i=item.get('imdb_id'))
def get_possible_drags(player_1_board, player_2_board, player):
    """
    Returns the set of possible drags given the board state and current
    player. Parameter player can be either 1 or 2 of int dtype.

    A possible drag is a tuple of two tuples: the first tuple is the
    from-position and the second tuple is the to-position.
    """
    possible_drags = set()

    # set current_player_board according to the player parameter
    if player == 1:
        current_player_board = player_1_board
    else:
        current_player_board = player_2_board

    for i in current_player_board.values():
        for j in VALID_MOVES[POSITIONS_TO_INDEX[i]]:
            if INDEX_TO_POSITIONS[j] not in merge_two_dicts(
                    player_1_board, player_2_board).values():
                possible_drags.add((i, INDEX_TO_POSITIONS[j]))
    return possible_drags
    if (len(sys.argv) <= 3) and args.machine_name:
        command += " -F " + args.machine_name
    return command


# starting machine already existing in current lab
if args.hostlab:
    if os.path.exists(os.path.join(args.hostlab, "lab.conf")):
        (machines, links, options, metadata) = nc.lab_parse(args.hostlab)
        if machines.get(args.machine_name) != None:
            # creating and updating interfaces in lab.conf
            conf_lines = {}
            if args.eths != None:
                new_eths = eths_line_writer(args.eths)
                conf_lines = u.merge_two_dicts(
                    u.couple_list_to_dict(machines[args.machine_name]),
                    new_eths)
            else:
                conf_lines = u.couple_list_to_dict(machines[args.machine_name])
            conf_lines = conf_line_writer(conf_lines)
            create_lab(machine_path, args.machine_name, conf_lines)

            # copying and appending commands to startup file
            startup_path = os.path.join(args.hostlab,
                                        args.machine_name + ".startup")
            if os.path.exists(startup_path):
                shutil.copy(
                    startup_path,
                    os.path.join(machine_path,
                                 args.machine_name + ".startup"))
            if args.exe != None:
def main(): #global n_words # Prepare training and testing data opt = COptions(args) opt_t = COptions(args) loadpath = (opt.data_dir + "/" + opt.data_name) print "loadpath:" + loadpath x = cPickle.load(open(loadpath, "rb")) train, val, test = x[0], x[1], x[2] wordtoix, ixtoword = x[3], x[4] if opt.test: test_file = opt.data_dir + "/newdata2/test.txt" test = read_test(test_file, wordtoix) test = [ x for x in test if all( [2 < len(x[t]) < opt.maxlen - 4 for t in range(opt.num_turn)]) ] train_filtered = [ x for x in train if all([2 < len(x[t]) < opt.maxlen - 4 for t in range(opt.num_turn)]) ] val_filtered = [ x for x in val if all([2 < len(x[t]) < opt.maxlen - 4 for t in range(opt.num_turn)]) ] print("Train: %d => %d" % (len(train), len(train_filtered))) print("Val: %d => %d" % (len(val), len(val_filtered))) train, val = train_filtered, val_filtered del train_filtered, val_filtered opt.n_words = len(ixtoword) opt_t.n_words = len(ixtoword) opt_t.maxlen = opt_t.maxlen - opt_t.filter_shape + 1 opt_t.update_params(args) print datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y") print dict(opt) print('Total words: %d' % opt.n_words) for d in ['/gpu:0']: with tf.device(d): src_ = [ tf.placeholder(tf.int32, shape=[opt.batch_size, opt.sent_len]) for _ in range(opt.n_context) ] tgt_ = tf.placeholder(tf.int32, shape=[opt_t.batch_size, opt_t.sent_len]) is_train_ = tf.placeholder(tf.bool, name='is_train') res_, gan_cost_g_, train_op_g = conditional_s2s( src_, tgt_, is_train_, opt, opt_t) merged = tf.summary.merge_all() uidx = 0 graph_options = tf.GraphOptions(build_cost_model=1) config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True, graph_options=graph_options) config.gpu_options.per_process_gpu_memory_fraction = 0.90 np.set_printoptions(precision=3) np.set_printoptions(threshold=np.inf) saver = tf.train.Saver() run_metadata = tf.RunMetadata() with tf.Session(config=config) as sess: train_writer = tf.summary.FileWriter(opt.log_path + '/train', sess.graph) test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph) sess.run(tf.global_variables_initializer()) if opt.restore: try: t_vars = tf.trainable_variables() if opt.load_from_pretrain: d_vars = [ var for var in t_vars if var.name.startswith('d_') ] g_vars = [ var for var in t_vars if var.name.startswith('g_') ] l_vars = [ var for var in t_vars if var.name.startswith('l_') ] restore_from_save(d_vars, sess, opt, load_path=opt.restore_dir + "/save/" + opt.global_d) if opt.local_feature: restore_from_save(l_vars, sess, opt, load_path=opt.restore_dir + "/save/" + opt.local_d) else: loader = restore_from_save(t_vars, sess, opt, load_path=opt.save_path) except Exception as e: print 'Error: ' + str(e) print("No saving session, using random initialization") sess.run(tf.global_variables_initializer()) loss_d, loss_g = 0, 0 if opt.test: iter_num = np.int(np.floor(len(test) / opt.batch_size)) + 1 res_all = [] for i in range(iter_num): test_index = range(i * opt.batch_size, (i + 1) * opt.batch_size) sents = [val[t] for t in test_index] for idx in range(opt.n_context, opt.num_turn): src = [[ sents[i][idx - turn] for i in range(opt.batch_size) ] for turn in range(opt.n_context, 0, -1)] tgt = [sents[i][idx] for i in range(opt.batch_size)] x_batch = [ prepare_data_for_cnn(src_i, opt) for src_i in src ] # Batch L y_batch = prepare_data_for_rnn(tgt, opt_t, is_add_GO=False) feed = merge_two_dicts( {i: d for i, d in zip(src_, x_batch)}, { tgt_: y_batch, is_train_: 0 }) # do not use False res = sess.run(res_, feed_dict=feed) 
res_all.extend(res['syn_sent']) # bp() res_all = reshaping(res_all, opt) for idx in range(len(test) * (opt.num_turn - opt.n_context)): with open(opt.log_path + '.resp.txt', "a") as resp_f: resp_f.write(u' '.join([ ixtoword[x] for x in res_all[idx] if x != 0 and x != 2 ]).encode('utf-8').strip() + ( '\n' if idx % (opt.num_turn - opt.n_context) == 0 else '\t')) print("save to:" + opt.log_path + '.resp.txt') exit(0) for epoch in range(opt.max_epochs): print("Starting epoch %d" % epoch) kf = get_minibatches_idx(len(train), opt.batch_size, shuffle=True) for _, train_index in kf: uidx += 1 sents = [train[t] for t in train_index] for idx in range(opt.n_context, opt.num_turn): src = [[ sents[i][idx - turn] for i in range(opt.batch_size) ] for turn in range(opt.n_context, 0, -1)] tgt = [sents[i][idx] for i in range(opt.batch_size)] x_batch = [ prepare_data_for_cnn(src_i, opt) for src_i in src ] # Batch L y_batch = prepare_data_for_rnn(tgt, opt_t, is_add_GO=False) feed = merge_two_dicts( {i: d for i, d in zip(src_, x_batch)}, { tgt_: y_batch, is_train_: 1 }) _, loss_g = sess.run([train_op_g, gan_cost_g_], feed_dict=feed) if uidx % opt.print_freq == 0: print("Iteration %d: loss G %f" % (uidx, loss_g)) res = sess.run(res_, feed_dict=feed) if opt.global_feature: print "z loss: " + str(res['z_loss']) if "nn" in opt.agg_model: print "z pred_loss: " + str(res['z_loss_pred']) print "Source:" + u' '.join( [ixtoword[x] for s in x_batch for x in s[0] if x != 0]).encode('utf-8').strip() print "Target:" + u' '.join([ ixtoword[x] for x in y_batch[0] if x != 0 ]).encode('utf-8').strip() print "Generated:" + u' '.join([ ixtoword[x] for x in res['syn_sent'][0] if x != 0 ]).encode('utf-8').strip() print "" sys.stdout.flush() summary = sess.run(merged, feed_dict=feed) train_writer.add_summary(summary, uidx) if uidx % opt.valid_freq == 1: VALID_SIZE = 4096 valid_multiplier = np.int( np.floor(VALID_SIZE / opt.batch_size)) res_all, val_tgt_all, loss_val_g_all = [], [], [] if opt.global_feature: z_loss_all = [] for val_step in range(valid_multiplier): valid_index = np.random.choice(len(val), opt.batch_size) sents = [val[t] for t in valid_index] for idx in range(opt.n_context, opt.num_turn): src = [[ sents[i][idx - turn] for i in range(opt.batch_size) ] for turn in range(opt.n_context, 0, -1)] tgt = [ sents[i][idx] for i in range(opt.batch_size) ] val_tgt_all.extend(tgt) x_batch = [ prepare_data_for_cnn(src_i, opt) for src_i in src ] # Batch L y_batch = prepare_data_for_rnn(tgt, opt_t, is_add_GO=False) feed = merge_two_dicts( {i: d for i, d in zip(src_, x_batch)}, { tgt_: y_batch, is_train_: 0 }) # do not use False loss_val_g = sess.run([gan_cost_g_], feed_dict=feed) loss_val_g_all.append(loss_val_g) res = sess.run(res_, feed_dict=feed) res_all.extend(res['syn_sent']) if opt.global_feature: z_loss_all.append(res['z_loss']) print("Validation: loss G %f " % (np.mean(loss_val_g_all))) if opt.global_feature: print "z loss: " + str(np.mean(z_loss_all)) print "Val Source:" + u' '.join( [ixtoword[x] for s in x_batch for x in s[0] if x != 0]).encode('utf-8').strip() print "Val Target:" + u' '.join([ ixtoword[x] for x in y_batch[0] if x != 0 ]).encode('utf-8').strip() print "Val Generated:" + u' '.join([ ixtoword[x] for x in res['syn_sent'][0] if x != 0 ]).encode('utf-8').strip() print "" if opt.global_feature: with open(opt.log_path + '.z.txt', "a") as myfile: myfile.write("Iteration" + str(uidx) + "\n") myfile.write("z_loss %f" % (np.mean(z_loss_all)) + "\n") myfile.write("Val Source:" + u' '.join([ ixtoword[x] for s in x_batch 
for x in s[0] if x != 0 ]).encode('utf-8').strip() + "\n") myfile.write("Val Target:" + u' '.join( [ixtoword[x] for x in y_batch[0] if x != 0]).encode('utf-8').strip() + "\n") myfile.write("Val Generated:" + u' '.join([ ixtoword[x] for x in res['syn_sent'][0] if x != 0 ]).encode('utf-8').strip() + "\n") myfile.write("Z_input, Z_recon, Z_tgt") myfile.write( np.array2string(res['z'][0], formatter={ 'float_kind': lambda x: "%.2f" % x }) + "\n") myfile.write( np.array2string(res['z_hat'][0], formatter={ 'float_kind': lambda x: "%.2f" % x }) + "\n\n") myfile.write( np.array2string(res['z_tgt'][0], formatter={ 'float_kind': lambda x: "%.2f" % x }) + "\n\n") val_set = [prepare_for_bleu(s) for s in val_tgt_all] gen = [prepare_for_bleu(s) for s in res_all] [bleu1s, bleu2s, bleu3s, bleu4s] = cal_BLEU_4(gen, {0: val_set}, is_corpus=opt.is_corpus) etp_score, dist_score = cal_entropy(gen) print 'Val BLEU: ' + ' '.join([ str(round(it, 3)) for it in (bleu1s, bleu2s, bleu3s, bleu4s) ]) print 'Val Entropy: ' + ' '.join([ str(round(it, 3)) for it in (etp_score[0], etp_score[1], etp_score[2], etp_score[3]) ]) print 'Val Diversity: ' + ' '.join([ str(round(it, 3)) for it in (dist_score[0], dist_score[1], dist_score[2], dist_score[3]) ]) print 'Val Avg. length: ' + str( round( np.mean([ len([y for y in x if y != 0]) for x in res_all ]), 3)) print "" summary = sess.run(merged, feed_dict=feed) summary2 = tf.Summary(value=[ tf.Summary.Value(tag="bleu-2", simple_value=bleu2s), tf.Summary.Value(tag="etp-4", simple_value=etp_score[3]) ]) test_writer.add_summary(summary, uidx) test_writer.add_summary(summary2, uidx) if uidx % opt.save_freq == 0: saver.save(sess, opt.save_path)
def main(): #global n_words # Prepare training and testing data opt = COptions(args) opt_t = COptions(args) # opt_t.n_hid = opt.n_z loadpath = (opt.data_dir + "/" + opt.data_name) #if opt.not_philly else '/hdfs/msrlabs/xiag/pt-data/cons/data_cleaned/twitter_small.p' print "loadpath:" + loadpath x = cPickle.load(open(loadpath, "rb")) train, val, test = x[0], x[1], x[2] wordtoix, ixtoword = x[3], x[4] if opt.test: test_file = opt.data_dir + opt.test_file test = read_test(test_file, wordtoix) # test = [ x for x in test if all([2<len(x[t])<opt.maxlen - 4 for t in range(opt.num_turn)])] # train_filtered = [ x for x in train if all([2<len(x[t])<opt.maxlen - 4 for t in range(opt.num_turn)])] # val_filtered = [ x for x in val if all([2<len(x[t])<opt.maxlen - 4 for t in range(opt.num_turn)])] # print ("Train: %d => %d" % (len(train), len(train_filtered))) # print ("Val: %d => %d" % (len(val), len(val_filtered))) # train, val = train_filtered, val_filtered # del train_filtered, val_filtered opt.n_words = len(ixtoword) opt_t.n_words = len(ixtoword) opt_t.maxlen = opt_t.maxlen - opt_t.filter_shape + 1 opt_t.update_params(args) print datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y") print dict(opt) print('Total words: %d' % opt.n_words) # print dict(opt) # if opt.model == 'cnn_rnn': # opt_t.maxlen = opt_t.maxlen - opt_t.filter_shape + 1 # opt_t.update_params(args) # print dict(opt_t) #for d in ['/gpu:0', '/gpu:1', '/gpu:2', '/gpu:3']: for d in ['/gpu:0']: with tf.device(d): src_ = [tf.placeholder(tf.int32, shape=[opt.batch_size, opt.sent_len]) for _ in range(opt.n_context)] tgt_ = tf.placeholder(tf.int32, shape=[opt_t.batch_size, opt_t.sent_len]) z_ = tf.placeholder(tf.float32, shape=[opt_t.batch_size , opt.n_z * (2 if opt.local_feature else 1)]) is_train_ = tf.placeholder(tf.bool, name = 'is_train') res_1_ = get_features(src_, tgt_, is_train_, opt, opt_t) res_2_ = generate_resp(src_, tgt_, z_, is_train_, opt, opt_t) merged = tf.summary.merge_all() #tensorboard --logdir=run1:/tmp/tensorflow/ --port 6006 #writer = tf.train.SummaryWriter(opt.log_path, graph=tf.get_default_graph()) uidx = 0 graph_options=tf.GraphOptions(build_cost_model=1) #config = tf.ConfigProto(log_device_placement = False, allow_soft_placement=True, graph_options=tf.GraphOptions(build_cost_model=1)) config = tf.ConfigProto(log_device_placement = False, allow_soft_placement=True, graph_options=graph_options) # config.gpu_options.per_process_gpu_memory_fraction = 0.70 #config = tf.ConfigProto(device_count={'GPU':0}) #config.gpu_options.allow_growth = True np.set_printoptions(precision=3) np.set_printoptions(threshold=np.inf) saver = tf.train.Saver() run_metadata = tf.RunMetadata() with tf.Session(config = config) as sess: train_writer = tf.summary.FileWriter(opt.log_path + '/train', sess.graph) test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph) sess.run(tf.global_variables_initializer()) if opt.restore: try: #pdb.set_trace() t_vars = tf.trainable_variables() #t_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) #tf.trainable_variables() # if opt.load_from_pretrain: # d_vars = [var for var in t_vars if var.name.startswith('d_')] # g_vars = [var for var in t_vars if var.name.startswith('g_')] # g_vars = [var for var in t_vars if var.name.startswith('g_')] # g_vars = [var for var in t_vars if var.name.startswith('g_')] # g_vars = [var for var in t_vars if var.name.startswith('g_')] # g_vars = [var for var in t_vars if var.name.startswith('g_')] # l_vars = [var for var in t_vars if var.name.startswith('l_')] 
# #restore_from_save(g_vars, sess, opt, prefix = 'g_', load_path=opt.restore_dir + "/save/generator2") # restore_from_save(d_vars, sess, opt, load_path = opt.restore_dir + "/save/" + opt.global_d) # if opt.local_feature: # restore_from_save(l_vars, sess, opt, load_path = opt.restore_dir + "/save/" + opt.local_d) # else: loader = restore_from_save(t_vars, sess, opt, load_path = opt.save_path) except Exception as e: print 'Error: '+str(e) print("No saving session, using random initialization") sess.run(tf.global_variables_initializer()) loss_d , loss_g = 0, 0 if opt.test: iter_num = np.int(np.floor(len(test)/opt.batch_size))+1 res_all = [] val_tgt_all =[] for i in range(iter_num): test_index = range(i * opt.batch_size,(i+1) * opt.batch_size) sents = [test[t%len(test)] for t in test_index] for idx in range(opt.n_context,opt.num_turn): src = [[sents[i][idx-turn] for i in range(opt.batch_size)] for turn in range(opt.n_context,0,-1)] tgt = [sents[i][idx] for i in range(opt.batch_size)] val_tgt_all.extend(tgt) if opt.feed_generated and idx!= opt.n_context: src[-1] = [[x for x in p if x!=0] for p in res_all[-opt.batch_size:]] x_batch = [prepare_data_for_cnn(src_i, opt) for src_i in src] # Batch L y_batch = prepare_data_for_rnn(tgt, opt_t, is_add_GO = False) feed = merge_two_dicts( {i: d for i, d in zip(src_, x_batch)}, {tgt_: y_batch, is_train_: 0}) # do not use False res_1 = sess.run(res_1_, feed_dict=feed) z_all = np.array(res_1['z']) feed = merge_two_dicts( {i: d for i, d in zip(src_, x_batch)}, {tgt_: y_batch, z_: z_all, is_train_: 0}) # do not use False res_2 = sess.run(res_2_, feed_dict=feed) res_all.extend(res_2['syn_sent']) # bp() val_tgt_all = reshaping(val_tgt_all, opt) res_all = reshaping(res_all, opt) save_path = opt.log_path + '.resp.txt' if os.path.exists(save_path): os.remove(save_path) for idx in range(len(test)*(opt.num_turn-opt.n_context)): with open(save_path, "a") as resp_f: resp_f.write(u' '.join([ixtoword[x] for x in res_all[idx] if x != 0 and x != 2]).encode('utf-8').strip() + ('\n' if idx%(opt.num_turn-opt.n_context) == opt.num_turn-opt.n_context-1 else '\t') ) print ("save to:" + save_path) if opt.verbose: save_path = opt.log_path + '.tgt.txt' if os.path.exists(save_path): os.remove(save_path) for idx in range(len(test)*(opt.num_turn-opt.n_context)): with open(save_path, "a") as tgt_f: tgt_f.write(u' '.join([ixtoword[x] for x in val_tgt_all[idx] if x != 0 and x != 2]).encode('utf-8').strip() + ('\n' if idx%(opt.num_turn-opt.n_context) == opt.num_turn-opt.n_context-1 else '\t') ) print ("save to:" + save_path) val_set = [prepare_for_bleu(s) for s in val_tgt_all] gen = [prepare_for_bleu(s) for s in res_all] [bleu1s,bleu2s,bleu3s,bleu4s] = cal_BLEU_4(gen, {0: val_set}, is_corpus = opt.is_corpus) etp_score, dist_score = cal_entropy(gen) # print save_path print 'Val BLEU: ' + ' '.join([str(round(it,3)) for it in (bleu1s,bleu2s,bleu3s,bleu4s)]) # print 'Val Rouge: ' + ' '.join([str(round(it,3)) for it in (rouge1,rouge2,rouge3,rouge4)]) print 'Val Entropy: ' + ' '.join([str(round(it,3)) for it in (etp_score[0],etp_score[1],etp_score[2],etp_score[3])]) print 'Val Diversity: ' + ' '.join([str(round(it,3)) for it in (dist_score[0],dist_score[1],dist_score[2],dist_score[3])]) # print 'Val Relevance(G,A,E): ' + ' '.join([str(round(it,3)) for it in (rel_score[0],rel_score[1],rel_score[2])]) print 'Val Avg. 
length: ' + str(round(np.mean([len([y for y in x if y!=0]) for x in res_all]),3)) if opt.embedding_score: with open("../../ssd0/consistent_dialog/data/GoogleNews-vectors-negative300.bin.p", 'rb') as pfile: embedding = cPickle.load(pfile) rel_score = cal_relevance(gen, val_set, embedding) print 'Val Relevance(G,A,E): ' + ' '.join([str(round(it,3)) for it in (rel_score[0],rel_score[1],rel_score[2])]) if not opt.global_feature or opt.bit == None: exit(0) if opt.test: iter_num = np.int(np.floor(len(test)/opt.batch_size))+1 for int_idx in range(opt.int_num): res_all = [] z1,z2,z3 = [],[],[] val_tgt_all =[] for i in range(iter_num): test_index = range(i * opt.batch_size,(i+1) * opt.batch_size) sents = [test[t%len(test)] for t in test_index] for idx in range(opt.n_context,opt.num_turn): src = [[sents[i][idx-turn] for i in range(opt.batch_size)] for turn in range(opt.n_context,0,-1)] tgt = [sents[i][idx] for i in range(opt.batch_size)] val_tgt_all.extend(tgt) if opt.feed_generated and idx!= opt.n_context: src[-1] = [[x for x in p if x!=0] for p in res_all[-opt.batch_size:]] x_batch = [prepare_data_for_cnn(src_i, opt) for src_i in src] # Batch L y_batch = prepare_data_for_rnn(tgt, opt_t, is_add_GO = False) feed = merge_two_dicts( {i: d for i, d in zip(src_, x_batch)}, {tgt_: y_batch, is_train_: 0}) # do not use False res_1 = sess.run(res_1_, feed_dict=feed) z_all = np.array(res_1['z']) z_all[:,opt.bit] = np.array([1.0/np.float(opt.int_num-1) * int_idx for _ in range(opt.batch_size)]) feed = merge_two_dicts( {i: d for i, d in zip(src_, x_batch)}, {tgt_: y_batch, z_: z_all, is_train_: 0}) # do not use False res_2 = sess.run(res_2_, feed_dict=feed) res_all.extend(res_2['syn_sent']) z1.extend(res_1['z']) z2.extend(z_all) z3.extend(res_2['z_hat']) # bp() val_tgt_all = reshaping(val_tgt_all, opt) res_all = reshaping(res_all, opt) z1 = reshaping(z1, opt) z2 = reshaping(z2, opt) z3 = reshaping(z3, opt) save_path = opt.log_path + 'bit' + str(opt.bit) + '.'+ str(1.0/np.float(opt.int_num-1) * int_idx) +'.int.txt' if os.path.exists(save_path): os.remove(save_path) for idx in range(len(test)*(opt.num_turn-opt.n_context)): with open(save_path, "a") as resp_f: resp_f.write(u' '.join([ixtoword[x] for x in res_all[idx] if x != 0 and x != 2]).encode('utf-8').strip() + ('\n' if idx%(opt.num_turn-opt.n_context) == opt.num_turn-opt.n_context-1 else '\t') ) print ("save to:" + save_path) save_path_z = opt.log_path + 'bit' + str(opt.bit) + '.'+ str(1.0/np.float(opt.int_num-1) * int_idx) +'.z.txt' if os.path.exists(save_path_z): os.remove(save_path_z) for idx in range(len(test)*(opt.num_turn-opt.n_context)): with open(save_path_z, "a") as myfile: #ary = np.array([z1[idx][opt.bit], z2[idx][opt.bit], z3[idx][opt.bit]]) #myfile.write(np.array2string(ary, formatter={'float_kind':lambda x: "%.2f" % x}) + ('\n' if idx%(opt.num_turn-opt.n_context) == opt.num_turn-opt.n_context-1 else '\t')) myfile.write(str(z3[idx][opt.bit]) + ('\n' if idx%(opt.num_turn-opt.n_context) == opt.num_turn-opt.n_context-1 else '\t')) val_set = [prepare_for_bleu(s) for s in val_tgt_all] gen = [prepare_for_bleu(s) for s in res_all] [bleu1s,bleu2s,bleu3s,bleu4s] = cal_BLEU_4(gen, {0: val_set}, is_corpus = opt.is_corpus) etp_score, dist_score = cal_entropy(gen) print save_path print 'Val BLEU: ' + ' '.join([str(round(it,3)) for it in (bleu1s,bleu2s,bleu3s,bleu4s)]) # print 'Val Rouge: ' + ' '.join([str(round(it,3)) for it in (rouge1,rouge2,rouge3,rouge4)]) print 'Val Entropy: ' + ' '.join([str(round(it,3)) for it in 
(etp_score[0],etp_score[1],etp_score[2],etp_score[3])]) print 'Val Diversity: ' + ' '.join([str(round(it,3)) for it in (dist_score[0],dist_score[1],dist_score[2],dist_score[3])]) # print 'Val Relevance(G,A,E): ' + ' '.join([str(round(it,3)) for it in (rel_score[0],rel_score[1],rel_score[2])]) print 'Val Avg. length: ' + str(round(np.mean([len([y for y in x if y!=0]) for x in res_all]),3))
parser.add_argument("--model", type=str, required=True, help="model name") parser.add_argument("--config_filename", type=str, default="config", help="the filename of config file") args, _ = parser.parse_known_args() # get config model_config_name = args.model if "lstm" in args.model: model_config_name = "lstm" main_config = parser_config(args.config_filename, "main") model_config = parser_config(args.config_filename, model_config_name) # configs = {**main_config, **model_config} configs = merge_two_dicts(main_config, model_config) configs = Config(configs) texts, labels = read(args.data_path) with open("./tmp_test_content", "r") as f: pred = f.readlines() pred = list(map(lambda x: x.rstrip("\n"), pred)) # vocab = {} # # for tt in texts + pred: # for ttt in tt.split(): # if(ttt not in vocab): # vocab[ttt] = 1
def gen_data(path, datasets, vocab, test_vocab, is_train, max_query_len, max_doc_len, max_url_len, nb_classes, args): if is_train: vocab['word']['PAD_WORD_INDEX'] = PAD_WORD_INDEX vocab['word']['OOV_WORD_INDEX'] = OOV_WORD_INDEX vocab['3gram']['PAD_3GRAM_INDEX'] = PAD_WORD_INDEX vocab['3gram']['OOV_3GRAM_INDEX'] = OOV_WORD_INDEX query_word_list, doc_word_list, query_3gram_list, doc_3gram_list = [], [], [], [] all_url_list, all_ids_list, all_sim_list = [], [], [] for data_name in datasets: # there can be multiple data sets combined as the train or test data data_folder = "%s/%s" % (path, data_name) print('creating dataset %s' % data_name) t = time.time() q1_word_list, max_q1_word_len = read_sentences("%s/a.toks" % data_folder, vocab, is_train, "word", test_vocab=test_vocab) q2_word_list, max_q2_word_len = read_sentences("%s/b.toks" % data_folder, vocab, is_train, "word", test_vocab=test_vocab) q1_3gram_list, max_q1_3gram_len = read_sentences("%s/a.toks" % data_folder, vocab, is_train, "3gram", test_vocab=test_vocab) q2_3gram_list, max_q2_3gram_len = read_sentences("%s/b.toks" % data_folder, vocab, is_train, "3gram", test_vocab=test_vocab) url_list, max_url_len_dataset = [], 0 if os.path.exists("%s/url.txt" % data_folder): url_list, max_url_len_dataset = read_urls( "%s/url.txt" % data_folder, vocab, is_train, '3gram') ids_list = read_metadata("%s/id.txt" % data_folder) if is_train: max_query_len['word'] = max(max_query_len['word'], min(max_q1_word_len, MAX_WORD_LENGTH)) max_query_len['3gram'] = max( max_query_len['3gram'], min(max_q1_3gram_len, MAX_3GRAM_LENGTH)) max_doc_len['word'] = max(max_doc_len['word'], min(max_q2_word_len, MAX_WORD_LENGTH)) max_doc_len['3gram'] = max(max_doc_len['3gram'], min(max_q2_3gram_len, MAX_3GRAM_LENGTH)) max_url_len['url'] = max(max_url_len['url'], min(max_url_len_dataset, MAX_URL_LENGTH)) sim_list = read_relevance("%s/sim.txt" % data_folder) categorical_sim_list = np.zeros((len(sim_list), nb_classes), dtype='int') for i, sim in enumerate(sim_list): categorical_sim_list[i][sim] = 1 print(sim_list[:5], categorical_sim_list[:5]) query_word_list.extend(q1_word_list) doc_word_list.extend(q2_word_list) query_3gram_list.extend(q1_3gram_list) doc_3gram_list.extend(q2_3gram_list) all_url_list.extend(url_list) all_ids_list.extend(ids_list) all_sim_list.extend(categorical_sim_list) print("q1 max_word_len: %d, q2 max_word_len: %d, len limit: (%d, %d)" % (max_q1_word_len, max_q2_word_len, max_query_len['word'], max_doc_len['word'])) print( "q1 max_3gram_len: %d, q2 max_3gram_len: %d, len limit: (%d, %d)" % (max_q1_3gram_len, max_q2_3gram_len, max_query_len['3gram'], max_doc_len['3gram'])) print('max_url_len: %d, limit: %d' % (max_url_len_dataset, max_url_len['url'])) print('creating dataset done: %d' % (time.time() - t)) # question padding data = {'sim': np.array(all_sim_list), 'id': np.array(all_ids_list)} data['query_word_input'] = pad_sequences(query_word_list, maxlen=max_query_len['word'], value=PAD_WORD_INDEX, padding='post', truncating='post') data['query_word_mask'] = create_masks(data['query_word_input'], args) data['doc_word_input'] = pad_sequences(doc_word_list, maxlen=max_doc_len['word'], value=PAD_WORD_INDEX, padding='post', truncating='post') data['doc_word_mask'] = create_masks(data['doc_word_input'], args) data['query_3gram_input'] = pad_sequences(query_3gram_list, maxlen=max_query_len['3gram'], value=PAD_WORD_INDEX, padding='post', truncating='post') data['query_3gram_mask'] = create_masks(data['query_3gram_input'], args) data['doc_3gram_input'] = 
pad_sequences(doc_3gram_list, maxlen=max_doc_len['3gram'], value=PAD_WORD_INDEX, padding='post', truncating='post') data['doc_3gram_mask'] = create_masks(data['doc_3gram_input'], args) data['url_3gram_input'] = pad_sequences(all_url_list, maxlen=max_url_len['url'], value=PAD_WORD_INDEX, padding='post', truncating='pre') data['url_3gram_mask'] = create_masks(data['url_3gram_input'], args) if os.path.exists("%s/collection_ngram_idf.json" % path): t = time.time() weights = json.load(open("%s/collection_ngram_idf.json" % path, "r")) vocab_inv = invert_dict(vocab['3gram']) data['query_3gram_weight'] = inject_ngram_weight( data['query_3gram_input'], vocab_inv, weights) data['doc_3gram_weight'] = inject_ngram_weight(data['doc_3gram_input'], vocab_inv, weights) data['url_3gram_weight'] = inject_ngram_weight(data['url_3gram_input'], vocab_inv, weights) print('ngram weight injection done: %d' % (time.time() - t)) else: num_samples, max_query_len = data['query_3gram_input'].shape data['query_3gram_weight'] = np.ones( (num_samples, ATTENTION_DEEP_LEVEL, max_query_len)) data['doc_3gram_weight'] = np.ones((num_samples, ATTENTION_DEEP_LEVEL, data['doc_3gram_input'].shape[1])) if os.path.exists("%s/collection_word_idf.json" % path): t = time.time() weights = json.load(open("%s/collection_word_idf.json" % path, "r")) merge_vocab = merge_two_dicts(vocab['word'], test_vocab['word']) vocab_inv = invert_dict(merge_vocab) print('inject query IDF weights') data['query_word_weight'] = inject_word_weight( data['query_word_input'], vocab_inv, weights) print('inject doc IDF weights') data['doc_word_weight'] = inject_word_weight(data['doc_word_input'], vocab_inv, weights) data['overlap_feat'] = compute_overlap_feat(data['query_word_input'], data['doc_word_input'], vocab_inv, weights) print('word weight injection done: %d' % (time.time() - t)) return data
def main(options): args = get_default_args() set_args(args, options) print_args(args) mode = args['mode'] train_name, test_name = args['split']['train'], args['split']['test'] if train_name == 'train_all': train_set = ['train_2011', 'test_2011', 'train_2013', 'test_2013'] train_set.remove(test_name) else: train_set = [train_name] test_set = [test_name] print("train_set", train_set) print("test_set", test_set) max_query_len, max_doc_len, max_url_len = defaultdict(int), defaultdict( int), defaultdict(int) vocab = {'word': {}, '3gram': {}, 'url': {}} test_vocab = {'word': {}, '3gram': {}, 'url': {}} train_vocab_emb, test_vocab_emb = None, None ############################# LOAD DATA ################################## data_name = ("data_m%s_%s_%s" % (mode, train_name, test_name)).lower() if args["load_data"]: train_dataset, vocab, train_vocab_emb, max_query_len, max_doc_len, max_url_len = load_data( "%s/%s/%s" % (args["experimental_data"], data_name, train_name), True) test_dataset, test_vocab, test_vocab_emb, _, _, _ = load_data( "%s/%s/%s" % (args["experimental_data"], data_name, test_name), False) print('load dataset successfully') else: #vocab = build_vocab(args["raw_data"], train_set, test_set, vocab) #print('build vocab done. %d' % len(vocab['word'])) train_dataset = gen_data(args["raw_data"], train_set, vocab, test_vocab, True, max_query_len, max_doc_len, max_url_len, args) print("create training set successfully...") test_dataset = gen_data(args["raw_data"], test_set, vocab, test_vocab, False, max_query_len, max_doc_len, max_url_len, args) train_vocab_emb, test_vocab_emb = construct_vocab_emb( "%s/%s" % (args["experimental_data"], data_name), vocab['word'], test_vocab['word'], 300, "word", base_embed_path=args["base_embed_path"]) save_data( "%s/%s/%s" % (args["experimental_data"], data_name, train_name), True, train_dataset, max_query_len, max_doc_len, max_url_len, vocab, train_vocab_emb) print("save training set successfully...") save_data("%s/%s/%s" % (args["experimental_data"], data_name, test_name), False, test_dataset, vocab=test_vocab, vocab_emb=test_vocab_emb) print("save test set successfully...") if mode == 'dssm': train_dataset = convert_data_to_dssm_format(train_dataset, vocab, is_train_or_val=True) test_dataset = convert_data_to_dssm_format(test_dataset, vocab, is_train_or_val=False) print('data convertion done!') val_split = args['val_split'] num_samples, _ = train_dataset["query_word_input"].shape # randomly sample queries and all their documents if query_random is True # otherwise, query-doc pairs are randomly sampled query_random = True if query_random: val_indices = sample_val(train_set, num_samples=num_samples, val_split=val_split) else: val_indices, val_set = [], set() for i in range(int(num_samples * val_split)): val_index = np.random.randint(num_samples) while val_index in val_set: val_index = np.random.randint(num_samples) val_indices.append(val_index) val_set.add(val_index) print(val_indices[:5], np.sum(np.array(val_indices))) # sample validation set for debug purpose # val_indices = val_indices[:100] train_dataset["query_word_weight"] = train_dataset[ "query_word_weight"][:, :args['deeplevel']] train_dataset["query_3gram_weight"] = train_dataset[ "query_3gram_weight"][:, :args['deeplevel']] train_dataset["doc_word_weight"] = train_dataset[ "doc_word_weight"][:, :args['deeplevel']] train_dataset["doc_3gram_weight"] = train_dataset[ "doc_3gram_weight"][:, :args['deeplevel']] train_dataset["url_3gram_weight"] = train_dataset[ "url_3gram_weight"][:, 
:args['deeplevel']] test_dataset["query_word_weight"] = test_dataset[ "query_word_weight"][:, :args['deeplevel']] test_dataset["query_3gram_weight"] = test_dataset[ "query_3gram_weight"][:, :args['deeplevel']] test_dataset["doc_word_weight"] = test_dataset[ "doc_word_weight"][:, :args['deeplevel']] test_dataset["doc_3gram_weight"] = test_dataset[ "doc_3gram_weight"][:, :args['deeplevel']] test_dataset["url_3gram_weight"] = test_dataset[ "url_3gram_weight"][:, :args['deeplevel']] # print("SHAPEEEEEEEEEEEEEEEEEEEE: {}".format(len(train_dataset["query_word_weight"][100]))) val_dataset = {} for key in train_dataset: val_dataset[key] = train_dataset[key][val_indices] train_dataset[key] = np.delete(train_dataset[key], val_indices, 0) # shuffle the train dataset explicitly to make results reproducible # whether the performance will be affected remains a question keys, values = [], [] for key in train_dataset: keys.append(key) values.append(train_dataset[key]) zipped_values = list(zip(*values)) random.shuffle(zipped_values) shuffled_values = list(zip(*zipped_values)) for i, key in enumerate(keys): train_dataset[key] = np.array(shuffled_values[i]) print('after shuffle:', train_dataset['id'][:5], train_dataset['sim'][:5], train_dataset['query_word_input'][:5]) # sample training dataset for debug purpose # sample_num = 1000 # for key in train_dataset: # train_dataset[key] = train_dataset[key][:sample_num] # merge the vocabulory of train and test set print("TRAIN vocab: word(%d) 3gram(%d) url(%d)" % (len(vocab['word']), len(vocab['3gram']), len(vocab['url']))) print("TEST vocab: word(%d) 3gram(%d) url(%d)" % (len( test_vocab['word']), len(test_vocab['3gram']), len(test_vocab['url']))) merged_vocab = {'url': vocab['url'], '3gram': vocab['3gram']} merged_vocab['word'] = merge_two_dicts(vocab['word'], test_vocab['word']) print("merged vocab: word(%d) 3gram(%d) url(%d)" % (len(merged_vocab['word']), len( merged_vocab['3gram']), len(merged_vocab['url']))) vocab_inv, vocab_size = {}, {} vocab['char'] = merge_two_dicts(vocab['3gram'], vocab['url']) test_vocab['char'] = merge_two_dicts(test_vocab['3gram'], test_vocab['url']) merged_vocab['char'] = merge_two_dicts(vocab['char'], test_vocab['char']) for key in vocab: vocab_inv[key] = invert_dict(merged_vocab[key]) vocab_size[key] = len(vocab[key]) print(vocab_size) # Print data samples for debug purpose # print_dataset(mode, train_dataset, vocab_inv) # print_dataset(mode, test_dataset, vocab_inv) ############################ TRAIN MODEL ################################# model = None if mode == 'deep_twitter': model = create_attention_model(max_query_len, max_doc_len, max_url_len, vocab_size, train_vocab_emb, args["nb_filters"], embed_size=300, dropout_rate=args['dropout'], trainable=args["trainable"], weighting=args['weighting'], mask=args["mask"], conv_option=args['conv_option'], model_option=args['model_option'], external=args["external_feat"], norm_weight=args['norm_weight'], cos_norm=args['cos'], only_word=args['only_word'], only_char=args['only_char'], pooling=args['pooling'], deeplevel=args['deeplevel']) elif mode == 'dssm': model = create_dssm_model(max_query_len, max_doc_len, max_url_len, vocab_size, train_vocab_emb, args["nb_filters"], embed_size=300, dropout_rate=args['dropout'], trainable=args["trainable"]) model_name = ( "model_N%s_data%s_mo%s_c%s_NumFilter%d_T%s_D%.1f_W%s_M%s_B%d_Val%.2f" % (mode, train_name, args['model_option'], args['conv_option'], args["nb_filters"], args["trainable"], args['dropout'], args['weighting'], args['mask'], 
args['batch_size'], args['val_split'])).lower() model_path = "%s/%s/%s" % (args['experimental_data'], data_name, model_name) print(model_path) if args['optimizer'] == "adam": opt = optimizers.Adam(lr=args["learning_rate"], beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False) elif args['optimizer'] == "sgd": opt = optimizers.SGD(lr=args["learning_rate"], decay=1e-6, momentum=0.9, nesterov=True) model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy']) print(model.summary()) model_weights, parameter_num = get_model_weights(model) print('model init weights sum: {} of {} parameters'.format( model_weights, parameter_num)) # if not args['load_model']: early_stopping = EarlyStopping(monitor='val_loss', patience=4) checkpoint = ModelCheckpoint(filepath=model_path + ".best.weights", monitor='val_loss', save_best_only=True, verbose=1) lr_reducer = ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=3, min_lr=0.0001) fit_mode = "fit" if fit_mode == "fit": model.fit( train_dataset, train_dataset['sim'], # validation_split=0.05, batch_size=args['batch_size'], validation_data=(val_dataset, val_dataset['sim']), epochs=args['epochs'], shuffle=False, callbacks=[checkpoint, lr_reducer, early_stopping], verbose=2) else: train_steps, train_batches = batch_iter( train_dataset, train_dataset["sim"], batch_size=args['batch_size']) valid_steps, valid_batches = batch_iter( val_dataset, val_dataset["sim"], batch_size=args['batch_size']) model.fit_generator( train_batches, train_steps, epochs=args['epochs'], validation_data=valid_batches, validation_steps=valid_steps, callbacks=[checkpoint, lr_reducer, early_stopping], verbose=2) #plot_model(model, to_file='model.png') ############################ TEST MODEL ################################# print('load best model from %s.best.weights' % model_path) model.load_weights("%s.best.weights" % model_path) if mode == 'deep_twitter': # load trained vocab embedding. 
if args["only_char"]: merged_vocab_emb = None else: embedding_layer_name = 'word_embedding' trained_vocab_emb = model.get_layer( embedding_layer_name).get_weights()[0] # merge trained vocab embedding with test OOV word embeddings merged_vocab_emb = np.zeros(shape=(len(merged_vocab['word']), 300)) merged_vocab_emb[0:len(vocab['word']), :] = trained_vocab_emb merged_vocab_emb[len(vocab['word']):len(merged_vocab['word'] ), :] = test_vocab_emb for key in vocab: vocab_size[key] = len(merged_vocab[key]) print(vocab_size) new_model = create_attention_model(max_query_len, max_doc_len, max_url_len, vocab_size, merged_vocab_emb, args["nb_filters"], embed_size=300, dropout_rate=args['dropout'], trainable=args["trainable"], weighting=args['weighting'], mask=args["mask"], conv_option=args['conv_option'], model_option=args['model_option'], external=args["external_feat"], norm_weight=args['norm_weight'], cos_norm=args['cos'], only_word=args['only_word'], only_char=args['only_char'], pooling=args['pooling'], deeplevel=args['deeplevel']) new_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) # print(new_model.summary()) num_layers = 0 for layer in model.layers: num_layers += 1 for layer_id in range(num_layers): layer = model.layers[layer_id] if not args["only_char"] and layer.name != embedding_layer_name: new_model.layers[layer_id].set_weights(layer.get_weights()) print('copy weight done.') predictions = new_model.predict(test_dataset) elif mode == 'dssm': getter = K.function([model.layers[0].input, model.layers[1].input], model.layers[-2].output) print('create DSSM functional getter...') num_samples, _, _ = test_dataset['query_3gram_input'].shape batch_size = 128 num_batch = int(math.ceil(num_samples * 1.0 / batch_size)) predictions = np.zeros((num_samples, )) for i in range(num_batch): start_idx, end_idx = i * batch_size, min(num_samples, (i + 1) * batch_size) predictions[start_idx:end_idx] = getter([ test_dataset['query_3gram_input'][start_idx:end_idx], test_dataset['doc_3gram_input'][start_idx:end_idx] ])[:, 0] #predictions = getter([test_dataset['query_3gram_input'], test_dataset['doc_3gram_input']]) print(predictions[:10]) predictions_file = "%s/%s/predictions_%s.txt" % (args["experimental_data"], data_name, model_name) with open(predictions_file, 'w') as f: for i in range(test_dataset['id'].shape[0]): f.write("%s %.4f %s\n" % (test_dataset['id'][i], predictions[i], args['mode'])) print('write predictions with trec format to %s' % predictions_file) map, mrr, p30 = evaluate(predictions_file, args["qrels_file"]) print('MAP: %.4f P30: %.4f MRR: %.4f' % (map, p30, mrr))
def main(options): args = get_default_args() set_args(args, options) mode, dataset_name = args['mode'], args['dataset'] # default setting args['raw_data'] = "data/%s/" % args['dataset'] args['qrels_file'] = "data/%s/qrels.all.txt" % args['dataset'] print_args(args) # get train/val/test names for specific dataset train_name, val_name, test_name, train_set, val_set, test_set, num_classes, with_url = config_dataset( args) max_query_len, max_doc_len, max_url_len = defaultdict(int), defaultdict( int), defaultdict(int) vocab = {'word': {}, '3gram': {}} test_vocab = {'word': {}, '3gram': {}} train_vocab_emb, test_vocab_emb = None, None ############################# LOAD DATA ################################## data_name = ("data_m%s_%s_%s_%s" % (mode, dataset_name, train_name, test_name)).lower() if args["load_data"]: train_dataset, vocab, train_vocab_emb, max_query_len, max_doc_len, max_url_len = load_data( "%s/%s/%s" % (args["experimental_data"], data_name, train_name), True) test_dataset, test_vocab, test_vocab_emb, _, _, _ = load_data( "%s/%s/%s" % (args["experimental_data"], data_name, test_name), False) if dataset_name != 'twitter' and dataset_name != 'TwitterURL': val_dataset, _, _, _, _, _ = load_data( "%s/%s/%s" % (args["experimental_data"], data_name, val_name), False) if args['embedding'] == 'glove': train_vocab_emb, test_vocab_emb = construct_vocab_emb( "%s/%s" % (args["experimental_data"], data_name), vocab['word'], test_vocab['word'], 300, "word", base_embed_path=args["base_embed_path"], type=args["embedding"]) print('load dataset successfully') else: train_dataset = gen_data(args["raw_data"], train_set, vocab, test_vocab, True, max_query_len, max_doc_len, max_url_len, num_classes, args) print("create training set successfully...") if dataset_name != 'twitter' and dataset_name != 'TwitterURL': val_dataset = gen_data(args["raw_data"], val_set, vocab, test_vocab, False, max_query_len, max_doc_len, max_url_len, num_classes, args) print("create validation set successfully...") test_dataset = gen_data(args["raw_data"], test_set, vocab, test_vocab, False, max_query_len, max_doc_len, max_url_len, num_classes, args) train_vocab_emb, test_vocab_emb = construct_vocab_emb( "%s/%s" % (args["experimental_data"], data_name), vocab['word'], test_vocab['word'], 300, "word", base_embed_path=args["base_embed_path"]) save_data( "%s/%s/%s" % (args["experimental_data"], data_name, train_name), True, train_dataset, max_query_len, max_doc_len, max_url_len, vocab, train_vocab_emb) print("save training set successfully...") if dataset_name != 'twitter' and dataset_name != 'TwitterURL': save_data("%s/%s/%s" % (args["experimental_data"], data_name, val_name), False, val_dataset, vocab=test_vocab, vocab_emb=test_vocab_emb) print("save val set successfully...") save_data("%s/%s/%s" % (args["experimental_data"], data_name, test_name), False, test_dataset, vocab=test_vocab, vocab_emb=test_vocab_emb) print("save test set successfully...") if dataset_name == 'twitter' or dataset_name == 'TwitterURL': val_split = args['val_split'] num_samples, _ = train_dataset["query_word_input"].shape # randomly sample queries and all their documents if query_random is True # otherwise, query-doc pairs are randomly sampled query_random = True if dataset_name == 'twitter' else False if query_random: del train_dataset["overlap_feat"] val_indices = sample_aaai_val_set(args["raw_data"], train_set, val_split) else: val_split = 0.1 val_indices, val_set = [], set() for i in range(int(num_samples * val_split)): val_index = 
np.random.randint(num_samples) while val_index in val_set: val_index = np.random.randint(num_samples) val_indices.append(val_index) val_set.add(val_index) val_dataset = {} for key in train_dataset: #print(key, train_dataset[key].shape) val_dataset[key] = train_dataset[key][val_indices] train_dataset[key] = np.delete(train_dataset[key], val_indices, 0) # shuffle the train dataset explicitly to make results reproducible # whether the performance will be affected remains a question keys, values = [], [] for key in train_dataset: if train_dataset[key].size == 0: continue keys.append(key) values.append(train_dataset[key]) zipped_values = list(zip(*values)) random.shuffle(zipped_values) shuffled_values = list(zip(*zipped_values)) for i, key in enumerate(keys): train_dataset[key] = np.array(shuffled_values[i]) print('after shuffle:', train_dataset['id'][:5], train_dataset['sim'][:5], train_dataset['query_word_input'][:5]) # merge the vocabulory of train and test set merged_vocab = {} merged_vocab['word'] = merge_two_dicts(vocab['word'], test_vocab['word']) merged_vocab['3gram'] = merge_two_dicts(vocab['3gram'], test_vocab['3gram']) print("TRAIN vocab: word(%d) 3gram(%d)" % (len(vocab['word']), len(vocab['3gram']))) print("TEST vocab: word(%d) 3gram(%d)" % (len(test_vocab['word']), len(test_vocab['3gram']))) print("MERGED vocab: word(%d) 3gram(%d)" % (len(merged_vocab['word']), len(merged_vocab['3gram']))) vocab_inv, vocab_size = {}, {} for key in vocab: vocab_inv[key] = invert_dict(merged_vocab[key]) vocab_size[key] = len(vocab[key]) print(vocab_size) # Print data samples for debug purpose print_dataset(mode, train_dataset, vocab_inv) print_dataset(mode, test_dataset, vocab_inv) ############################ TRAIN MODEL ################################# # create model model = create_attention_model(max_query_len, max_doc_len, max_url_len, vocab_size, train_vocab_emb, args["nb_filters"], args["nb_layers"], embed_size=300, dropout_rate=args['dropout'], trainable=args["trainable"], weighting=args['weighting'], mask=args["mask"], conv_option=args['conv_option'], model_option=args['model_option'], join=args['join'], num_classes=num_classes, with_url=with_url, highway=args['highway'], att=args['co_attention'], ext_feat=args["external_feat"], encoder_option=args['encoder_option']) model_name = ( "model_N%s_data%s_mo%s_e%s_c%s_NumFilter%d_nblayer%d_T%s_D%.1f_W%s_M%s_B%d_Val%.2f_Join%s_H%s_Att%s" % (mode, train_name, args['model_option'], args["encoder_option"], args['conv_option'], args["nb_filters"], args["nb_layers"], args["trainable"], args['dropout'], args['weighting'], args['mask'], args['batch_size'], args['val_split'], args['join'], args['highway'], args['co_attention'])).lower() model_path = "%s/%s/%s" % (args['experimental_data'], data_name, model_name) print(model_path) if args['optimizer'] == "adam": opt = optimizers.Adam(lr=args["learning_rate"], beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=True) print('use Adam optimizer') elif args['optimizer'] == "sgd": opt = optimizers.SGD(lr=args["learning_rate"], decay=1e-6, momentum=0.9, nesterov=True) print('use SGD optimizer') elif args['optimizer'] == 'rmsprop': opt = optimizers.RMSprop(lr=args["learning_rate"], rho=0.9, epsilon=None, decay=0.0) print('use RMSprop optimizer') if num_classes <= 2: model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy']) else: print('compile model with categorical cross-entropy') model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy']) class_weight = None 
    if args['dataset'] == 'Quora':
        # class_weight = {0:1, 1:2}
        print('apply class weight:', class_weight)
    print(model.summary())
    print('model init weights sum: %.4f' % get_model_weights(model))
    if not args['load_model']:
        early_stopping = EarlyStopping(monitor='val_loss', patience=4)
        checkpoint = ModelCheckpoint(filepath=model_path + ".best.weights",
                                     monitor='val_loss',
                                     save_best_only=True,
                                     verbose=1)
        lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                       factor=0.5,
                                       patience=2,
                                       min_lr=0.0001,
                                       verbose=1)
        model.fit(
            train_dataset,
            train_dataset['sim'],
            # validation_split=0.05,
            batch_size=args['batch_size'],
            validation_data=(val_dataset, val_dataset['sim']),
            epochs=args['epochs'],
            shuffle=False,
            callbacks=[checkpoint, lr_reducer, early_stopping],
            class_weight=class_weight,
            verbose=args['verbose'])

    ############################ TEST MODEL #################################
    print('load best model from %s.best.weights' % model_path)
    model.load_weights("%s.best.weights" % model_path)
    # load trained vocab embedding.
    trained_vocab_emb = model.get_layer('word-embedding').get_weights()[0]
    # merge trained vocab embedding with test OOV word embeddings
    merged_vocab_emb = np.zeros(shape=(len(merged_vocab['word']), 300))
    merged_vocab_emb[0:len(vocab['word']), :] = trained_vocab_emb
    merged_vocab_emb[len(vocab['word']):len(merged_vocab['word']), :] = test_vocab_emb
    for key in vocab:
        vocab_size[key] = len(merged_vocab[key])
    print(vocab_size)
    new_model = create_attention_model(max_query_len, max_doc_len, max_url_len,
                                       vocab_size, merged_vocab_emb,
                                       args["nb_filters"], args["nb_layers"],
                                       embed_size=300,
                                       dropout_rate=args['dropout'],
                                       trainable=args["trainable"],
                                       weighting=args['weighting'],
                                       mask=args["mask"],
                                       conv_option=args['conv_option'],
                                       model_option=args['model_option'],
                                       join=args['join'],
                                       num_classes=num_classes,
                                       with_url=with_url,
                                       highway=args['highway'],
                                       att=args['co_attention'],
                                       ext_feat=args["external_feat"],
                                       encoder_option=args['encoder_option'])
    new_model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
    # print(new_model.summary())
    for layer_id in range(len(model.layers)):
        layer = model.layers[layer_id]
        if layer.name != 'word-embedding':
            new_model.layers[layer_id].set_weights(layer.get_weights())
    print('copy weight done.')
    val_predictions = new_model.predict(val_dataset)
    predictions = new_model.predict(test_dataset)
    if dataset_name == 'twitter' or dataset_name == 'TrecQA':
        val_predictions = val_predictions[:, 1]
        predictions = predictions[:, 1]
        print(predictions[:10])
        predictions_file = "%s/%s/predictions_%s.txt" % (
            args["experimental_data"], data_name, model_name)
        with open(predictions_file, 'w') as f:
            for i in range(test_dataset['id'].shape[0]):
                f.write("%s %.4f %s\n" % (test_dataset['id'][i],
                                          predictions[i], args['mode']))
        print('write predictions with trec format to %s' % predictions_file)
        val_predictions_file = "%s/%s/val_predictions_%s.txt" % (
            args["experimental_data"], data_name, model_name)
        with open(val_predictions_file, 'w') as f:
            for i in range(val_dataset['id'].shape[0]):
                f.write("%s %.4f %s\n" % (val_dataset['id'][i],
                                          val_predictions[i], args['mode']))
        map, mrr, p30 = evaluate(val_predictions_file, args["qrels_file"])
        print('write val predictions with trec format to %s' % val_predictions_file)
        print('Validation MAP: %.4f P30: %.4f MRR: %.4f' % (map, p30, mrr))
        map, mrr, p30 = evaluate(predictions_file, args["qrels_file"])
        print('MAP: %.4f P30: %.4f MRR: %.4f' % (map, p30, mrr))
    else:
        preds = np.argmax(predictions, axis=-1)
        labels = np.argmax(test_dataset['sim'], axis=-1)
        corrects = preds == labels
        predictions_file = "%s/%s/predictions_%s.txt" % (
            args["experimental_data"], data_name, model_name)
        with open(predictions_file, 'w') as f:
            f.write("id label pred prob model\n")
            for i in range(len(preds)):
                f.write("%s %s %s %.4f %s\n" % (test_dataset['id'][i],
                                                labels[i], preds[i],
                                                predictions[i][preds[i]],
                                                args['mode']))
        print('write predictions to %s' % predictions_file)
        val_preds = np.argmax(val_predictions, axis=-1)
        val_labels = np.argmax(val_dataset['sim'], axis=-1)
        val_corrects = val_preds == val_labels
        val_predictions_file = "%s/%s/val_predictions_%s.txt" % (
            args["experimental_data"], data_name, model_name)
        with open(val_predictions_file, 'w') as f:
            for i in range(val_dataset['id'].shape[0]):
                f.write("%s %s %s %.4f %s\n" % (val_dataset['id'][i],
                                                val_labels[i], val_preds[i],
                                                val_predictions[i][val_preds[i]],
                                                args['mode']))
        print('write val predictions to %s' % val_predictions_file)
        print('val accuracy: %.4f' %
              (np.count_nonzero(val_corrects) * 1.0 / len(val_preds)))
        print('accuracy: %.4f' % (np.count_nonzero(corrects) * 1.0 / len(preds)))
        macro_prec = precision_score(labels, preds, average="macro")
        macro_recall = recall_score(labels, preds, average="macro")
        print('Macro Precision: %.3f, Recall: %.3f, F1: %.3f' %
              (macro_prec, macro_recall,
               2 * macro_prec * macro_recall / (macro_prec + macro_recall)))
        print('Micro Precision: %.3f, Recall: %.3f, F1: %.3f' %
              (precision_score(labels, preds, average="micro"),
               recall_score(labels, preds, average="micro"),
               f1_score(labels, preds, average="micro")))
        print('Confusion matrix:', confusion_matrix(labels, preds))
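# The snippets above and below all rely on a merge_two_dicts helper. A minimal
# sketch is given here for reference, assuming the standard copy-then-update
# idiom (values from the second dict win on key collisions); the projects'
# actual implementations may differ in detail.
def merge_two_dicts(x, y):
    """Return a new dict containing x's items overridden by y's."""
    z = x.copy()
    z.update(y)
    return z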
def main(options): args = get_default_args() load_best_args(args, options, get_best_args()) set_args(args, options) print_args(args) mode = args['mode'] train_name, test_name = args['split']['train'], args['split']['test'] if train_name == 'train_all': train_set = ['trec-2011', 'trec-2012', 'trec-2013', 'trec-2014'] train_set.remove(test_name) else: train_set = [train_name] test_set = test_name print('train_set: {}, test_set: {}'.format(train_set, test_set)) max_query_len, max_doc_len, max_url_len = defaultdict(int), defaultdict( int), defaultdict(int) vocab = {'word': {}, '3gram': {}, 'url': {}} test_vocab = {'word': {}, '3gram': {}, 'url': {}} ############################# LOAD DATA ################################## data_name = ("data_m%s_%s_%s" % (mode, train_name, test_name)).lower() if args["load_data"]: train_dataset, vocab, train_vocab_emb, max_query_len, max_doc_len, max_url_len = load_data( "%s/%s/%s" % (args["experimental_data"], data_name, train_name), True) test_dataset, test_vocab, test_vocab_emb, _, _, _ = load_data( "%s/%s/%s" % (args["experimental_data"], data_name, test_name), False) print('load dataset successfully') else: train_dataset = gen_data(args["raw_data"], train_set, vocab, test_vocab, True, max_query_len, max_doc_len, max_url_len, args) print("create training set successfully...") test_dataset = gen_data(args["raw_data"], [test_set], vocab, test_vocab, False, max_query_len, max_doc_len, max_url_len, args) train_vocab_emb, test_vocab_emb = construct_vocab_emb( "%s/%s" % (args["experimental_data"], data_name), vocab['word'], test_vocab['word'], 300, "word", base_embed_path=args["base_embed_path"]) save_data( "%s/%s/%s" % (args["experimental_data"], data_name, train_name), True, train_dataset, max_query_len, max_doc_len, max_url_len, vocab, train_vocab_emb) print("save training set successfully...") save_data("%s/%s/%s" % (args["experimental_data"], data_name, test_name), False, test_dataset, vocab=test_vocab, vocab_emb=test_vocab_emb) print("save test set successfully...") val_split = args['val_split'] num_samples, _ = train_dataset["query_word_input"].shape # randomly sample queries and all their documents if query_random is True # otherwise, query-doc pairs are randomly sampled query_random = True if query_random: val_indices = sample_val_set(args["raw_data"], train_set, val_split) else: val_indices, val_set = [], set() for i in range(int(num_samples * val_split)): val_index = np.random.randint(num_samples) while val_index in val_set: val_index = np.random.randint(num_samples) val_indices.append(val_index) val_set.add(val_index) val_dataset = {} for key in train_dataset: val_dataset[key] = train_dataset[key][val_indices] train_dataset[key] = np.delete(train_dataset[key], val_indices, 0) # shuffle the train dataset explicitly to make results reproducible # whether the performance will be affected remains a question keys, values = [], [] for key in train_dataset: keys.append(key) values.append(train_dataset[key]) zipped_values = list(zip(*values)) random.shuffle(zipped_values) shuffled_values = list(zip(*zipped_values)) for i, key in enumerate(keys): train_dataset[key] = np.array(shuffled_values[i]) print('after shuffle: id {}, sim {}, query_word_input'.format( train_dataset['id'][:3], train_dataset['sim'][:3], train_dataset['query_word_input'][:3])) # merge the vocabulory of train and test set merged_vocab = {'url': vocab['url'], '3gram': vocab['3gram']} merged_vocab['word'] = merge_two_dicts(vocab['word'], test_vocab['word']) print("merged vocab: word(%d) 
3gram(%d)" % (len(merged_vocab['word']), len(test_vocab['3gram']))) vocab_inv, vocab_size = {}, {} vocab['char'] = merge_two_dicts(vocab['3gram'], vocab['url']) test_vocab['char'] = merge_two_dicts(test_vocab['3gram'], test_vocab['url']) merged_vocab['char'] = merge_two_dicts(vocab['char'], test_vocab['char']) for key in vocab: vocab_inv[key] = invert_dict(merged_vocab[key]) vocab_size[key] = len(vocab[key]) print(vocab_size) # Print data samples for debug purpose print_dataset(mode, train_dataset, vocab_inv) print_dataset(mode, test_dataset, vocab_inv) ############################ TRAIN MODEL ################################# model = None if mode == 'deep_twitter': model = create_attention_model(max_query_len, max_doc_len, max_url_len, vocab_size, train_vocab_emb, args["nb_filters"], embed_size=300, dropout_rate=args['dropout'], trainable=args["trainable"], weighting=args['weighting'], mask=args["mask"], conv_option=args['conv_option'], model_option=args['model_option']) model_name = ( "model_N%s_data%s_mo%s_c%s_NumFilter%d_T%s_D%.1f_W%s_M%s_B%d_Val%.2f" % (mode, train_name, args['model_option'], args['conv_option'], args["nb_filters"], args["trainable"], args['dropout'], args['weighting'], args['mask'], args['batch_size'], args['val_split'])).lower() model_path = "%s/%s/%s" % (args['experimental_data'], data_name, model_name) print(model_path) if args['optimizer'] == "adam": opt = optimizers.Adam(lr=args["learning_rate"], beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=True) print('use Adam optimizer') elif args['optimizer'] == "sgd": opt = optimizers.SGD(lr=args["learning_rate"], decay=1e-6, momentum=0.9, nesterov=True) print('use SGD optimizer') elif args['optimizer'] == 'rmsprop': opt = optimizers.RMSprop(lr=args["learning_rate"], rho=0.9, epsilon=None, decay=0.0) print('use RMSprop optimizer') model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy']) print(model.summary()) print('model init weights sum: %.4f' % get_model_weights(model)) if not args['load_model']: early_stopping = EarlyStopping(monitor='val_loss', patience=4) checkpoint = ModelCheckpoint(filepath=model_path + ".best.weights", monitor='val_loss', save_best_only=True, verbose=1) lr_reducer = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=0.0001, verbose=1) #print(train_dataset['id'][:3], val_dataset['id'][:3], val_dataset['id'][-3:]) model.fit(train_dataset, train_dataset['sim'], validation_data=(val_dataset, val_dataset['sim']), batch_size=args['batch_size'], epochs=args['epochs'], shuffle=False, callbacks=[checkpoint, lr_reducer, early_stopping], verbose=args['verbose']) ############################ TEST MODEL ################################# print('load best model from %s.best.weights' % model_path) model.load_weights("%s.best.weights" % model_path) if mode == 'deep_twitter': # load trained vocab embedding. 
trained_vocab_emb = model.get_layer('sequential_2').get_weights()[0] # merge trained vocab embedding with test OOV word embeddings merged_vocab_emb = np.zeros(shape=(len(merged_vocab['word']), 300)) merged_vocab_emb[0:len(vocab['word']), :] = trained_vocab_emb merged_vocab_emb[ len(vocab['word']):len(merged_vocab['word']), :] = test_vocab_emb for key in vocab: vocab_size[key] = len(merged_vocab[key]) print(vocab_size) new_model = create_attention_model(max_query_len, max_doc_len, max_url_len, vocab_size, merged_vocab_emb, args["nb_filters"], embed_size=300, dropout_rate=args['dropout'], trainable=args["trainable"], weighting=args['weighting'], mask=args["mask"], conv_option=args['conv_option'], model_option=args['model_option']) new_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) print(new_model.summary()) num_layers = 0 for layer in model.layers: num_layers += 1 for layer_id in range(num_layers): layer = model.layers[layer_id] if layer.name != 'sequential_2': new_model.layers[layer_id].set_weights(layer.get_weights()) print('copy weight done.') predictions = new_model.predict(test_dataset) print(predictions[:10]) predictions_file = "%s/%s/predictions_%s.txt" % (args["experimental_data"], data_name, model_name) with open(predictions_file, 'w') as f: for i in range(test_dataset['id'].shape[0]): f.write("%s %.4f %s\n" % (test_dataset['id'][i], predictions[i], args['mode'])) print('write predictions with trec format to %s' % predictions_file) map, mrr, p30 = evaluate(predictions_file, args["qrels_file"]) print('MAP: %.4f P30: %.4f MRR: %.4f' % (map, p30, mrr))
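# The test blocks above rebuild the model with a merged (train + test OOV)
# embedding matrix and then copy every other layer's trained weights into it.
# A hedged sketch of that transfer step follows; copy_weights_except is a
# hypothetical helper name, and it assumes both models share the same layer order.
def copy_weights_except(src_model, dst_model, skip_layer_name):
    for src_layer, dst_layer in zip(src_model.layers, dst_model.layers):
        if src_layer.name == skip_layer_name:
            continue  # the rebuilt model keeps its own, larger embedding weights
        dst_layer.set_weights(src_layer.get_weights())

# e.g. copy_weights_except(model, new_model, 'sequential_2')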
def main(argv): ''' Parse args, init dataloader ''' foldNum, dataset, subtitle, rating_file, usr2labels_file = parseArgs( argv[:4], **dict(arg.split('=') for arg in argv[4:])) if rating_file and usr2labels_file: dataloader = DATA2LOADER[dataset]( rating_file=rating_file, usr2labels_file=usr2labels_file, sub=subtitle, ) else: dataloader = DATA2LOADER[dataset]() ''' Load training conifgs ''' NEG_SAMPLE_NUM, \ ITEM_FIELDS_NUM, \ MAX_TRAIN_NUM, \ LEARNING_RATE, \ MOMENTUM, \ LAMBDA = dataloader.getTrainingConf() ''' Load each usr's BOI (and for valid data) ''' usr2itemsIndx, ind2itemNum = dataloader.load() usrs = map(lambda usr: usr, usr2itemsIndx) ''' Assert enough usrs ''' if foldNum > len(usrs): s = ' '.join(['foldNum: ', str(foldNum), '>', 'usrNums:', str(usrs)]) raise Exception(s) ''' Acquire (for all usrs) usr2labels & usr2NonzeroCols ''' usr2labels, usr2NonzeroCols = dataloader.get_labels(usrs) ''' Init Baseupdator ''' baseupdator = Baseupdator(*dataloader.getTrainingConf()) ''' K-fold validation ''' kfolds = splitKfolds(usr2itemsIndx, foldNum) for ind, fold in enumerate(kfolds): # Init train/valid folds usr2itemsIndxValid = fold usr2itemsIndxTrain = {} for tind, tfold in enumerate(kfolds): if ind != tind: usr2itemsIndxTrain = merge_two_dicts(usr2itemsIndxTrain, tfold) # Init statevalidator statevalidator = DATA2VALIDATOR[dataset]( dataset=dataset, datasetSub=dataloader.getDataSub(), curFold=ind, totalFolds=len(kfolds), usr2itemsIndxTrain=usr2itemsIndxTrain, usr2itemsIndxValid=usr2itemsIndxValid, MAX_TRAIN_NUM=MAX_TRAIN_NUM, ITEM_FIELDS_NUM=ITEM_FIELDS_NUM, ) statevalidator.logFoldInfo() ''' acquire (k times) usr2NegativeSamples & usr2negsNonzeroCols ''' cdfByLabels, labelsList = getDistribution(usr2labels) usr2NegativeSamples, usr2negsNonzeroCols = negativeSample( usr2labels, cdfByLabels, labelsList, k=NEG_SAMPLE_NUM) logging.info('usr2NegativeSamples, usr2negsNonzeroCols created') ''' init V to [-1, 1) ''' numOfItems = len(ind2itemNum) V = 2 * nprandom.rand(numOfItems, ITEM_FIELDS_NUM) - 1 logging.info('V inited, V.shape == ' + str(V.shape) + ' == (num items, itemFeatures length)') ''' init W to [-1, 1); init pooler''' # Warn: assume ITEM_FIELDS_NUM is the same as usr's representation's dimension # (No dimension reduction in pooler!) totalLabelsNum = dataloader.gettotalLabelsNum() W = 2 * nprandom.rand(ITEM_FIELDS_NUM, totalLabelsNum) - 1 pooler = sample_pooler() logging.info('W & pooler inited, W.shape == ' + str(W.shape) + ' == (itemFeatures length, total labels num)') logging.debug(' '.join(['W', str(W)])) logging.debug(' '.join(['V', str(V)])) ''' learn W, V ''' while statevalidator.notConv(): # Init next run statevalidator.nextRun() # NegSampling or not if statevalidator.shouldNegSample(): statevalidator.logStartNegSample() usr2NegativeSamples, usr2negsNonzeroCols = negativeSample( usr2labels, cdfByLabels, labelsList, k=NEG_SAMPLE_NUM) statevalidator.logNegSampleInfo(usr2NegativeSamples) for usrid in usr2itemsIndxTrain: # Pooling usr_rep = pooler.pool_all(usr2itemsIndxTrain[usrid], V) # Get y, sumedW(for y AND negs), sigmoids(for y AND negs) y, y_nonzeroCols, itemsIndx, sumedW_y, sigmoid_y, \ y_negsNonzeroCols, sumedW_negs, sigmoids_negs, \ sigmoidedSumedW = baseupdator.getTerms( usrid, usr2labels, usr2NonzeroCols, usr2itemsIndxTrain, W, usr_rep, usr2negsNonzeroCols,) # Get gradient of Wq (i.e. 
q-th column of W) gradsOfW = baseupdator.getGradsOfW( W, y_nonzeroCols, sigmoid_y, usr_rep, sigmoids_negs, y_negsNonzeroCols, ) # Get gradient of Vitem gradsOfV = baseupdator.getGradsOfV( V, itemsIndx, sumedW_y, sigmoid_y, sigmoidedSumedW, ) # Update W, V by usr, not by epoch # Update gradients to W, V W, V = baseupdator.updateByGradients( W, V, gradsOfW, gradsOfV, statevalidator.incrInd, ) # Reveal stats/predictions if statevalidator.shouldRevealStats(): # Cal loss if needed if statevalidator.shouldCalLoss(): loss = baseupdator.getLoss( W, V, usr2NonzeroCols, usr2negsNonzeroCols, usr2itemsIndxTrain, pooler, ) statevalidator.updateLossState(loss) statevalidator.logLossStates(W, V, loss) # Do predictions statevalidator.logStartPrediction() dataStats = statevalidator.getDataStats( usr2itemsIndxValid, usr2itemsIndxTrain, usr2NonzeroCols) for d in dataStats: usr2itemsIndx = d['usr2itemsIndx'] u2predictions = d['u2predictions'] for usrid in usr2itemsIndx: usr_rep = pooler.pool_all(usr2itemsIndx[usrid], V) bestCols = baseupdator.predictLabels( usr_rep, W, dataloader.getBds()) u2predictions[usrid] = bestCols # Collect Stats statevalidator.logCollectingStats() KPI2getters = { 'microF1': getMicroF1ByCol, 'oneError': getOneError, 'RL': getRL, 'coverage': getCoverage, 'avgPrec': getAvgPrecision, 'hammingLoss': getHammingLoss, } for d in dataStats: KPIArgs = { 'W': W, 'V': V, 'usr2itemsIndx': d['usr2itemsIndx'], 'usr2NonzeroCols': usr2NonzeroCols, 'u2predictions': d['u2predictions'], 'totalLabelsNum': dataloader.gettotalLabelsNum(), 'rlPairsCnt': dataloader.getRLPairsCnt(), } d['KPIs'] = { kpi: getter(KPIArgs) for kpi, getter in KPI2getters.iteritems() } # OR (no write): statevalidator.logStats(d) statevalidator.writeCSVStats(d) # Log real, predicted if not TEST_SNE: for d in dataStats: statevalidator.logRealPredictedVals(d) return 1
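# splitKfolds is used above to build the train/valid folds but is not defined in
# this snippet. A hypothetical sketch with round-robin assignment is shown here;
# the real helper may shuffle users or balance the folds differently.
def splitKfolds(usr2itemsIndx, foldNum):
    folds = [{} for _ in range(foldNum)]
    for i, usr in enumerate(usr2itemsIndx):
        folds[i % foldNum][usr] = usr2itemsIndx[usr]
    return folds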
def gen_data(path, datasets, vocab, test_vocab, is_train, max_query_len, max_doc_len, max_url_len, args): if is_train: vocab['word']['PAD_WORD_INDEX'] = PAD_WORD_INDEX vocab['word']['OOV_WORD_INDEX'] = OOV_WORD_INDEX vocab['3gram']['PAD_3GRAM_INDEX'] = PAD_WORD_INDEX vocab['3gram']['OOV_3GRAM_INDEX'] = OOV_WORD_INDEX vocab['url']['PAD_URL_INDEX'] = PAD_WORD_INDEX vocab['url']['OOV_URL_INDEX'] = OOV_WORD_INDEX query_word_list, doc_word_list, query_3gram_list, doc_3gram_list = [], [], [], [] all_url_list, all_ids_list, all_sim_list = [], [], [] t0 = time.time() for data_name in datasets: # there can be multiple data sets combined as the train or test data data_folder = "%s/%s" % (path, data_name) print('load dataset %s' % data_name) t = time.time() q1_word_list, max_q1_word_len = read_sentences("%s/a.toks" % data_folder, vocab, is_train, "word", test_vocab=test_vocab) q2_word_list, max_q2_word_len = read_sentences("%s/b.toks" % data_folder, vocab, is_train, "word", test_vocab=test_vocab) q1_3gram_list, max_q1_3gram_len = read_sentences("%s/a.toks" % data_folder, vocab, is_train, "3gram", test_vocab=test_vocab) q2_3gram_list, max_q2_3gram_len = read_sentences("%s/b.toks" % data_folder, vocab, is_train, "3gram", test_vocab=test_vocab) url_list, max_url_len_dataset = read_urls("%s/url.txt" % data_folder, vocab, is_train, '3gram') ids_list = read_metadata("%s/id.txt" % data_folder) if is_train: max_query_len['word'] = max(max_query_len['word'], max_q1_word_len) max_query_len['3gram'] = max(max_query_len['3gram'], max_q1_3gram_len) max_doc_len['word'] = max(max_doc_len['word'], max_q2_word_len) max_doc_len['3gram'] = max(max_doc_len['3gram'], min(max_q2_3gram_len, MAX_TWEET_LENGTH)) max_url_len['url'] = max(max_url_len['url'], min(max_url_len_dataset, MAX_URL_LENGTH)) sim_list = read_relevance("%s/sim.txt" % data_folder) query_word_list.extend(q1_word_list) doc_word_list.extend(q2_word_list) query_3gram_list.extend(q1_3gram_list) doc_3gram_list.extend(q2_3gram_list) all_url_list.extend(url_list) all_ids_list.extend(ids_list) all_sim_list.extend(sim_list) print("q1 max_word_len: %d, q2 max_word_len: %d, len limit: (%d, %d)" % (max_q1_word_len, max_q2_word_len, max_query_len['word'], max_doc_len['word'])) print( "q1 max_3gram_len: %d, q2 max_3gram_len: %d, len limit: (%d, %d)" % (max_q1_3gram_len, max_q2_3gram_len, max_query_len['3gram'], max_doc_len['3gram'])) print('max_url_len: %d, limit: %d' % (max_url_len_dataset, max_url_len['url'])) print('load dataset done: %d' % (time.time() - t)) # question padding data = {'sim': np.array(all_sim_list), 'id': np.array(all_ids_list)} data['query_word_input'] = pad_sequences(query_word_list, maxlen=max_query_len['word'], value=PAD_WORD_INDEX, padding='post', truncating='post') data['query_word_mask'] = create_masks(data['query_word_input'], args) data['doc_word_input'] = pad_sequences(doc_word_list, maxlen=max_doc_len['word'], value=PAD_WORD_INDEX, padding='post', truncating='post') data['doc_word_mask'] = create_masks(data['doc_word_input'], args) data['query_3gram_input'] = pad_sequences(query_3gram_list, maxlen=max_query_len['3gram'], value=PAD_WORD_INDEX, padding='post', truncating='post') data['query_3gram_mask'] = create_masks(data['query_3gram_input'], args) data['doc_3gram_input'] = pad_sequences(doc_3gram_list, maxlen=max_doc_len['3gram'], value=PAD_WORD_INDEX, padding='post', truncating='post') data['doc_3gram_mask'] = create_masks(data['doc_3gram_input'], args) data['url_3gram_input'] = pad_sequences(all_url_list, maxlen=max_url_len['url'], 
value=PAD_WORD_INDEX, padding='post', truncating='pre') data['url_3gram_mask'] = create_masks(data['url_3gram_input'], args) if os.path.exists("%s/collection_ngram_idf.json" % path): t = time.time() weights = json.load(open("%s/collection_ngram_idf.json" % path, "r")) vocab_inv = invert_dict(vocab['3gram']) data['query_3gram_weight'] = inject_ngram_weight( data['query_3gram_input'], vocab_inv, weights) data['doc_3gram_weight'] = inject_ngram_weight(data['doc_3gram_input'], vocab_inv, weights) vocab_inv = invert_dict(vocab['url']) data['url_3gram_weight'] = inject_ngram_weight(data['url_3gram_input'], vocab_inv, weights) print('ngram weight injection done: %d' % (time.time() - t)) if os.path.exists("%s/collection_word_idf.json" % path): t = time.time() weights = json.load(open("%s/collection_word_idf.json" % path, "r")) merge_vocab = merge_two_dicts(vocab['word'], test_vocab['word']) vocab_inv = invert_dict(merge_vocab) data['query_word_weight'] = inject_word_weight( data['query_word_input'], vocab_inv, weights) data['doc_word_weight'] = inject_word_weight(data['doc_word_input'], vocab_inv, weights) data['overlap_feat'] = compute_overlap_feat(data['query_word_input'], data['doc_word_input'], vocab_inv, weights) print('word weight injection done: %d' % (time.time() - t)) print('data creation is done: %d' % (time.time() - t0)) return data
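# gen_data above inverts the token->index vocabularies before injecting the idf
# weights. A minimal sketch of the assumed invert_dict helper:
def invert_dict(vocab):
    """Flip a token->index mapping into index->token."""
    return {index: token for token, index in vocab.items()}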
def main(): opt = COptions(args) opt_t = COptions(args) loadpath = (opt.data_dir + "/" + opt.data_name) print "loadpath:" + loadpath x = cPickle.load(open(loadpath, "rb")) train, val, test = x[0], x[1], x[2] wordtoix, ixtoword = x[3], x[4] if opt.test: test_file = opt.data_dir + opt.test_file test = read_test(test_file, wordtoix) opt.n_words = len(ixtoword) opt_t.n_words = len(ixtoword) opt_t.maxlen = opt_t.maxlen - opt_t.filter_shape + 1 opt_t.update_params(args) print datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y") print dict(opt) print('Total words: %d' % opt.n_words) for d in ['/gpu:0']: with tf.device(d): src_ = [tf.placeholder(tf.int32, shape=[opt.batch_size, opt.sent_len]) for _ in range(opt.n_context)] tgt_ = tf.placeholder(tf.int32, shape=[opt_t.batch_size, opt_t.sent_len]) is_train_ = tf.placeholder(tf.bool, name = 'is_train') res_1_ = get_features(src_, tgt_, is_train_, opt, opt_t) merged = tf.summary.merge_all() uidx = 0 graph_options=tf.GraphOptions(build_cost_model=1) config = tf.ConfigProto(log_device_placement = False, allow_soft_placement=True, graph_options=graph_options) np.set_printoptions(precision=3) np.set_printoptions(threshold=np.inf) saver = tf.train.Saver() run_metadata = tf.RunMetadata() with tf.Session(config = config) as sess: train_writer = tf.summary.FileWriter(opt.log_path + '/train', sess.graph) test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph) sess.run(tf.global_variables_initializer()) if opt.restore: try: t_vars = tf.trainable_variables() if opt.load_from_pretrain: d_vars = [var for var in t_vars if var.name.startswith('d_')] l_vars = [var for var in t_vars if var.name.startswith('l_')] restore_from_save(d_vars, sess, opt, load_path = opt.restore_dir + "/save/" + opt.global_d) if opt.local_feature: restore_from_save(l_vars, sess, opt, load_path = opt.restore_dir + "/save/" + opt.local_d) else: loader = restore_from_save(t_vars, sess, opt, load_path = opt.save_path) except Exception as e: print 'Error: '+str(e) print("No saving session, using random initialization") sess.run(tf.global_variables_initializer()) loss_d , loss_g = 0, 0 if opt.test: iter_num = np.int(np.floor(len(test)/opt.batch_size))+1 z_all, z_all_l = [], [] for i in range(iter_num): test_index = range(i * opt.batch_size,(i+1) * opt.batch_size) sents = [test[t%len(test)] for t in test_index] src = [[sents[i][0] for i in range(opt.batch_size)]] tgt = [sents[i][0] for i in range(opt.batch_size)] x_batch = [prepare_data_for_cnn(src_i, opt) for src_i in src] print "Source:" + u' '.join([ixtoword[x] for s in x_batch for x in s[0] if x != 0]).encode('utf-8').strip() y_batch = prepare_data_for_rnn(tgt, opt_t, is_add_GO = False) feed = merge_two_dicts( {i: d for i, d in zip(src_, x_batch)}, {tgt_: y_batch, is_train_: 0}) res_1 = sess.run(res_1_, feed_dict=feed) z_all.extend(res_1['z']) z_all_l.extend(res_1['z_l']) save_path_z = opt.log_path + '.global.z.txt' print save_path_z if os.path.exists(save_path_z): os.remove(save_path_z) with open(save_path_z, "a") as myfile: for line in z_all[:len(test)]: for z_it in line: myfile.write(str(z_it) + '\t') myfile.write('\n') save_path_z = opt.log_path + '.local.z.txt' print save_path_z if os.path.exists(save_path_z): os.remove(save_path_z) with open(save_path_z, "a") as myfile: for line in z_all_l[:len(test)]: for z_it in line: myfile.write(str(z_it) + '\t') myfile.write('\n')
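# The feature-dump loops above write one tab-separated row of z values per test
# example. A small sketch of that pattern factored into a helper; write_features
# is a hypothetical name and is not part of the original script.
def write_features(path, rows):
    with open(path, "w") as f:
        for row in rows:
            f.write("\t".join(str(z) for z in row) + "\n")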