def compile_all_run_vars(list_dict, iter_var_idxs):
    """
	Grab all the run variables from the specifications file, and aggregate
	as a complete dictionary of variables to be specified and/or overriden
	in the four_state_receptor object.
	
	Args:
		list_dict: dictionary containing 5 keys; iter_vars, rel_vars, 
			iter_vars, fixed_vars, params, and run_specs. These are read 
			through read_specs_file(...) function in this module.
		iter_var_idxs: list of length len(list_dict['iter_vars']) which 
			contains the indices of the iterated variable range at which
			to evaluate the iterated variables in this run.
			
	Returns:
		vars_to_pass: dictionary whose keys are all variables to be overriden
			in the four_state_receptor class when initialized.
	"""

    vars_to_pass = dict()
    vars_to_pass = parse_iterated_vars(list_dict['iter_vars'], iter_var_idxs,
                                       vars_to_pass)
    vars_to_pass = parse_relative_vars(list_dict['rel_vars'],
                                       list_dict['iter_vars'], vars_to_pass)
    vars_to_pass = merge_two_dicts(vars_to_pass, list_dict['fixed_vars'])
    vars_to_pass = merge_two_dicts(vars_to_pass, list_dict['params'])

    return vars_to_pass
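
All of the snippets on this page rely on a merge_two_dicts helper (also called as utils.merge_two_dicts or u.merge_two_dicts) whose implementation is not shown here. A minimal sketch, assuming the usual copy-and-update idiom in which the second dict's values win on key collisions:

def merge_two_dicts(x, y):
    """Return a new dict with x's items, updated (overridden) by y's items."""
    z = x.copy()
    z.update(y)
    return z
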
    def request_extra_list(self):
        if self.extra_request_path and self.listitems:
            self.extra_listitems = apis.tmdb_api_request(
                self.extra_request_path, **self.request_kwparams)
            self.extra_listitems = self.extra_listitems.get(
                self.extra_request_key,
                []) if self.extra_request_key else self.extra_listitems
            self.listitems[0] = utils.merge_two_dicts(self.extra_listitems,
                                                      self.listitems[0])
Example #3
def test(opt, dset, model):
    dset.set_mode(opt.mode)
    torch.set_grad_enabled(False)
    model.eval()
    valid_loader = DataLoader(dset, batch_size=opt.test_bsz, shuffle=False, collate_fn=pad_collate)

    qid2preds = {}
    qid2targets = {}
    for valid_idx, batch in tqdm(enumerate(valid_loader)):
        model_inputs, targets, qids = preprocess_inputs(batch, opt.max_sub_l, opt.max_vcpt_l, opt.max_vid_l,
                                                        device=opt.device)
        outputs = model(*model_inputs)
        pred_ids = outputs.data.max(1)[1].cpu().numpy().tolist()
        cur_qid2preds = {qid: pred for qid, pred in zip(qids, pred_ids)}
        qid2preds = merge_two_dicts(qid2preds, cur_qid2preds)
        cur_qid2targets = {qid: target for qid, target in zip(qids, targets)}
        qid2targets = merge_two_dicts(qid2targets, cur_qid2targets)
    return qid2preds, qid2targets
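
For context, here is a sketch of how the two returned dicts might be consumed downstream; the compute_accuracy helper below is illustrative and not part of the original snippet:

def compute_accuracy(qid2preds, qid2targets):
    # Fraction of question ids whose predicted answer matches the target
    correct = sum(1 for qid, pred in qid2preds.items()
                  if pred == qid2targets[qid])
    return correct / float(len(qid2preds))
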
Example #4
def calculate_tuning_curves(data_flag):

    list_dict = read_specs_file(data_flag)
    # Unpack the specs explicitly; exec() cannot bind new function locals in Python 3
    iter_vars, rel_vars = list_dict['iter_vars'], list_dict['rel_vars']
    fixed_vars, params = list_dict['fixed_vars'], list_dict['params']
    run_specs = list_dict['run_specs']

    # Get the iterated variable dimensions
    iter_vars_dims = []
    for iter_var in iter_vars:
        iter_vars_dims.append(len(iter_vars[iter_var]))
    it = sp.nditer(sp.zeros(iter_vars_dims), flags=['multi_index'])

    # Set up array to hold tuning curves
    tuning_curve = sp.zeros(
        (iter_vars_dims[0], iter_vars_dims[1], params['Nn'], params['Mm']))

    # Set up arrays to hold epsilons and Kk2s
    epsilons = sp.zeros((iter_vars_dims[0], iter_vars_dims[1], params['Mm']))
    Kk2s = sp.zeros(
        (iter_vars_dims[0], iter_vars_dims[1], params['Mm'], params['Nn']))

    # Iterate tuning curve calculation over all iterable variables
    while not it.finished:
        iter_var_idxs = it.multi_index

        vars_to_pass = dict()
        vars_to_pass = parse_iterated_vars(iter_vars, iter_var_idxs,
                                           vars_to_pass)
        vars_to_pass = parse_relative_vars(rel_vars, iter_vars, vars_to_pass)
        vars_to_pass = merge_two_dicts(vars_to_pass, fixed_vars)
        vars_to_pass = merge_two_dicts(vars_to_pass, params)

        # Calculate tuning curve
        for iN in range(vars_to_pass['Nn']):
            vars_to_pass['manual_dSs_idxs'] = sp.array([iN])
            obj = single_encode_CS(vars_to_pass, run_specs)
            tuning_curve[iter_var_idxs[0], iter_var_idxs[1], iN, :] = obj.dYy

        epsilons[it.multi_index] = obj.eps
        Kk2s[it.multi_index] = obj.Kk2

        it.iternext()

    save_tuning_curve(tuning_curve, epsilons, Kk2s, data_flag)
def show_with_brackets(
    tournament_name,
    event,
    tournament_params=[],
):
    """Returns tournament meta information along with a list of bracketIds for an event"""
    tournament = show(tournament_name, tournament_params)
    brackets = event_brackets(tournament_name, event)

    return utils.merge_two_dicts(tournament, brackets)
Example #6
def compile_all_run_vars(list_dict):
    """
	Grab all the run variables from the specifications file, and aggregate
	as a complete dictionary of variables to be specified and/or overriden
	in the single_cell_FRET_VA object.
	
	Args:
		list_dict: dictionary containing at least 2 keys; data_vars and 
			est_vars. These are read through read_specs_file()
			function in this module. Only these two keys are read
			in this module; other keys may exist, but will be ignored.
		
	Returns:
		vars_to_pass: dictionary whose keys are all variables to be overriden
			in the single_cell_FRET_VA class when initialized.
	"""

    vars_to_pass = dict()
    vars_to_pass = merge_two_dicts(vars_to_pass, list_dict['data_vars'])
    vars_to_pass = merge_two_dicts(vars_to_pass, list_dict['est_vars'])

    return vars_to_pass
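
As an illustration of the merge order (the keys inside data_vars and est_vars below are hypothetical): est_vars is merged last, so its entries override data_vars entries on key collisions.

list_dict = {'data_vars': {'dt': 0.1, 'n_cells': 5},
             'est_vars': {'dt': 0.05}}
vars_to_pass = compile_all_run_vars(list_dict)
# vars_to_pass == {'dt': 0.05, 'n_cells': 5}
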
Example #7
def translate_cities(tbl, col):
    u_cities = tbl[col].unique()
    print("Distinct number of cities: ", len(u_cities))
    print("Translating cities")
    if os.path.exists('trans_cities.json'):
        with open('trans_cities.json') as f:
            old_translations = json.load(f)
        new_translations = list(set(u_cities) - old_translations.keys())
        print("Number of new translations needed for cities: ", len(new_translations))
        trans_cities = utils.merge_two_dicts(old_translations, translate_list(new_translations)) if len(new_translations) > 0 else old_translations
    else:
        trans_cities = translate_list(u_cities)
    utils.save_dict_as_json(trans_cities, 'trans_cities')
    return tbl[col].map(lambda x: trans_cities[x] if x is not None else x)
Example #8
def translate_guests(guests):

    print("Translating months")
    guests['membershipMonth'] = guests.membershipMonth.map(lambda x: s.months_translated[x] if x in s.months_translated.keys() else x)

    guests['linkedAccountVerified'] = guests['linkedAccountVerified'].apply(lambda x: format_verified(x) if x is not None else [])
    veri_set = set()
    for i in guests['linkedAccountVerified']:
        veri_set |= set(i)

    print("Distinct number of verifications: ", len(veri_set))
    print("Translating verifications")
    if os.path.exists('trans_verified.json'):
        with open('trans_verified.json') as f:
            old_translations = json.load(f)
        new_translations = list(veri_set - old_translations.keys())
        print("Number of new translations needed for verifications: ", len(new_translations))
        trans_verified = utils.merge_two_dicts(old_translations, translate_list(new_translations)) if len(new_translations) > 0 else old_translations
    else:
        u_verified = list(veri_set)
        trans_verified = translate_list(u_verified)
    utils.save_dict_as_json(trans_verified, 'trans_verified')
    guests['linkedAccountVerified'] = guests.linkedAccountVerified.map(lambda x: str(trans_verified_list(x, trans_verified)) if x is not None else None)

    u_cities = guests['city'].unique()
    print("Distinct number of cities: ", len(u_cities))
    print("Translating cities")
    if os.path.exists('trans_cities.json'):
        with open('trans_cities.json') as f:
            old_translations = json.load(f)
        new_translations = list(set(u_cities) - old_translations.keys())
        print("Number of new translations needed for cities: ", len(new_translations))
        trans_cities = utils.merge_two_dicts(old_translations, translate_list(new_translations)) if len(new_translations) > 0 else old_translations
    else:
        trans_cities = translate_list(u_cities)
    utils.save_dict_as_json(trans_cities, 'trans_cities')
    guests['city'] = translate_cities(guests, 'city')

    return guests
Example #9
def prepareColor(color):
    additionalInfo = {
        'lab': np.asarray(
            [color['l'] * 255 / 100, color['a'] + 128, color['b'] + 128],
            np.uint8),
        'lastPlayed': False,
        'pcX': 0,
        'pcY': 0,
    }

    return utils.merge_two_dicts(color, additionalInfo)
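
A hypothetical call, assuming the input color dict carries CIELAB components under the 'l', 'a', and 'b' keys that the function itself reads:

color = {'l': 53.0, 'a': 80.0, 'b': 67.0}   # illustrative values only
prepared = prepareColor(color)
# prepared keeps the original keys and gains 'lab', 'lastPlayed', 'pcX', 'pcY'
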
def parse_sequence_example(example, features_config, truncate_sequence_length=20):

    # Define how to parse the example
    context_features = {}
    features_config_single = features_config['single_features']
    for feature_name in features_config_single:        
        context_features[feature_name] = tf.FixedLenFeature([], 
                      dtype=get_tf_dtype(features_config_single[feature_name]['dtype']))

    
    sequence_features = {}
    features_config_sequence = features_config['sequence_features']
    for feature_name in features_config_sequence:        
        sequence_features[feature_name] = tf.FixedLenSequenceFeature(shape=[], 
                      dtype=get_tf_dtype(features_config_sequence[feature_name]['dtype']))

    context_parsed, sequence_parsed = tf.parse_single_sequence_example(
        example, 
        sequence_features=sequence_features,
        context_features=context_features,
        example_name="example"
    )


    # Truncate long sessions to a limit
    context_parsed['session_size'] = tf.minimum(context_parsed['session_size'], 
                                                truncate_sequence_length)
    for feature_name in sequence_parsed:
        sequence_parsed[feature_name] = sequence_parsed[feature_name][:truncate_sequence_length] 
    
    
    # Ignoring first click from labels
    sequence_parsed['label_next_item'] = sequence_parsed['item_clicked'][1:]
    # Making it easy to retrieve the last label
    sequence_parsed['label_last_item'] = sequence_parsed['item_clicked'][-1:]

    # Ignoring last clicked item from input
    for feature_key in sequence_features:
        if feature_key not in ['label_next_item', 'label_last_item']:
            sequence_parsed[feature_key] = sequence_parsed[feature_key][:-1]
    
    merged_features = merge_two_dicts(context_parsed, sequence_parsed)

    # In order to pad the dataset, I had to use this hack to expand scalars to vectors.
    merged_expanded_features = expand_single_features(merged_features,
                                      features_to_expand=list(features_config['single_features'].keys()))

    return merged_expanded_features
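
parse_sequence_example expects features_config to provide 'single_features' and 'sequence_features' maps keyed by feature name, each entry carrying a 'dtype' string understood by get_tf_dtype. A minimal illustrative shape (the exact dtype strings, and any feature names other than 'session_size' and 'item_clicked', are assumptions):

features_config = {
    'single_features': {
        'session_size': {'dtype': 'int'},   # required above: used for truncation
        'user_id': {'dtype': 'int'},        # hypothetical extra context feature
    },
    'sequence_features': {
        'item_clicked': {'dtype': 'int'},   # required above: labels derive from it
        'category_id': {'dtype': 'int'},    # hypothetical extra sequence feature
    },
}
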
Example #11
    def get_cached_data(self, item=None, tmdb_type=None):
        if tmdb_type and item:
            if item.get('show_id') or item.get('id'):
                if item.get('show_id'):
                    my_id = item.get('show_id')
                    my_request = 'tv'
                elif item.get('id'):
                    my_id = item.get('id')
                    my_request = tmdb_type
                request_path = '{0}/{1}'.format(my_request, my_id)
                kwparams = {}
                kwparams['append_to_response'] = APPEND_TO_RESPONSE
                self.detailed_info = apis.tmdb_api_only_cached(
                    request_path, **kwparams)
                if self.detailed_info:
                    item = utils.merge_two_dicts(self.detailed_info, item)
                    if item.get('imdb_id') and my_request in ['movie', 'tv']:
                        self.omdb_info = apis.omdb_api_only_cached(
                            i=item.get('imdb_id'))
Example #12
def get_possible_drags(player_1_board, player_2_board, player):
    """
    Returns the set of possible drags given the board state and current player.
    Parameter player can be either 1 or 2 of int dtype.

    A possible drag has a tuple having two tuples.  First tuple is from-position and second tuple is to-position.
    """
    possible_drags = set()

    # set current_player_board according to the player parameter
    if player == 1:
        current_player_board = player_1_board
    else:
        current_player_board = player_2_board

    for i in current_player_board.values():
        for j in VALID_MOVES[POSITIONS_TO_INDEX[i]]:

            if INDEX_TO_POSITIONS[j] not in merge_two_dicts(
                    player_1_board, player_2_board).values():
                possible_drags.add((i, INDEX_TO_POSITIONS[j]))
    return possible_drags
Example #13
    if (len(sys.argv) <= 3) and args.machine_name:
        command += " -F " + args.machine_name
    return command


# starting a machine that already exists in the current lab
if args.hostlab:
    if os.path.exists(os.path.join(args.hostlab, "lab.conf")):
        (machines, links, options, metadata) = nc.lab_parse(args.hostlab)
        if machines.get(args.machine_name) is not None:
            # creating and updating interfaces in lab.conf
            conf_lines = {}
            if args.eths is not None:
                new_eths = eths_line_writer(args.eths)
                conf_lines = u.merge_two_dicts(
                    u.couple_list_to_dict(machines[args.machine_name]),
                    new_eths)
            else:
                conf_lines = u.couple_list_to_dict(machines[args.machine_name])
            conf_lines = conf_line_writer(conf_lines)

            create_lab(machine_path, args.machine_name, conf_lines)

            #copying and appending commands to startup file
            startup_path = os.path.join(args.hostlab,
                                        args.machine_name + ".startup")
            if os.path.exists(startup_path):
                shutil.copy(
                    startup_path,
                    os.path.join(machine_path, args.machine_name + ".startup"))
            if args.exe is not None:
Example #14
def main():
    #global n_words
    # Prepare training and testing data

    opt = COptions(args)
    opt_t = COptions(args)

    loadpath = (opt.data_dir + "/" + opt.data_name)
    print "loadpath:" + loadpath
    x = cPickle.load(open(loadpath, "rb"))
    train, val, test = x[0], x[1], x[2]
    wordtoix, ixtoword = x[3], x[4]

    if opt.test:
        test_file = opt.data_dir + "/newdata2/test.txt"
        test = read_test(test_file, wordtoix)
        test = [
            x for x in test if all(
                [2 < len(x[t]) < opt.maxlen - 4 for t in range(opt.num_turn)])
        ]
    train_filtered = [
        x for x in train
        if all([2 < len(x[t]) < opt.maxlen - 4 for t in range(opt.num_turn)])
    ]
    val_filtered = [
        x for x in val
        if all([2 < len(x[t]) < opt.maxlen - 4 for t in range(opt.num_turn)])
    ]
    print("Train: %d => %d" % (len(train), len(train_filtered)))
    print("Val: %d => %d" % (len(val), len(val_filtered)))
    train, val = train_filtered, val_filtered
    del train_filtered, val_filtered

    opt.n_words = len(ixtoword)
    opt_t.n_words = len(ixtoword)
    opt_t.maxlen = opt_t.maxlen - opt_t.filter_shape + 1
    opt_t.update_params(args)
    print datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y")
    print dict(opt)
    print('Total words: %d' % opt.n_words)

    for d in ['/gpu:0']:
        with tf.device(d):
            src_ = [
                tf.placeholder(tf.int32, shape=[opt.batch_size, opt.sent_len])
                for _ in range(opt.n_context)
            ]
            tgt_ = tf.placeholder(tf.int32,
                                  shape=[opt_t.batch_size, opt_t.sent_len])
            is_train_ = tf.placeholder(tf.bool, name='is_train')
            res_, gan_cost_g_, train_op_g = conditional_s2s(
                src_, tgt_, is_train_, opt, opt_t)
            merged = tf.summary.merge_all()

    uidx = 0
    graph_options = tf.GraphOptions(build_cost_model=1)
    config = tf.ConfigProto(log_device_placement=False,
                            allow_soft_placement=True,
                            graph_options=graph_options)
    config.gpu_options.per_process_gpu_memory_fraction = 0.90

    np.set_printoptions(precision=3)
    np.set_printoptions(threshold=np.inf)
    saver = tf.train.Saver()

    run_metadata = tf.RunMetadata()

    with tf.Session(config=config) as sess:
        train_writer = tf.summary.FileWriter(opt.log_path + '/train',
                                             sess.graph)
        test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph)
        sess.run(tf.global_variables_initializer())
        if opt.restore:
            try:

                t_vars = tf.trainable_variables()

                if opt.load_from_pretrain:
                    d_vars = [
                        var for var in t_vars if var.name.startswith('d_')
                    ]
                    g_vars = [
                        var for var in t_vars if var.name.startswith('g_')
                    ]
                    l_vars = [
                        var for var in t_vars if var.name.startswith('l_')
                    ]
                    restore_from_save(d_vars,
                                      sess,
                                      opt,
                                      load_path=opt.restore_dir + "/save/" +
                                      opt.global_d)
                    if opt.local_feature:
                        restore_from_save(l_vars,
                                          sess,
                                          opt,
                                          load_path=opt.restore_dir +
                                          "/save/" + opt.local_d)
                else:
                    loader = restore_from_save(t_vars,
                                               sess,
                                               opt,
                                               load_path=opt.save_path)

            except Exception as e:
                print 'Error: ' + str(e)
                print("No saving session, using random initialization")
                sess.run(tf.global_variables_initializer())
        loss_d, loss_g = 0, 0

        if opt.test:
            iter_num = np.int(np.floor(len(test) / opt.batch_size)) + 1
            res_all = []
            for i in range(iter_num):
                test_index = range(i * opt.batch_size,
                                   (i + 1) * opt.batch_size)
                sents = [test[t % len(test)] for t in test_index]  # index into the test set (wraps on the final partial batch)
                for idx in range(opt.n_context, opt.num_turn):
                    src = [[
                        sents[i][idx - turn] for i in range(opt.batch_size)
                    ] for turn in range(opt.n_context, 0, -1)]
                    tgt = [sents[i][idx] for i in range(opt.batch_size)]
                    x_batch = [
                        prepare_data_for_cnn(src_i, opt) for src_i in src
                    ]  # Batch L
                    y_batch = prepare_data_for_rnn(tgt, opt_t, is_add_GO=False)
                    feed = merge_two_dicts(
                        {i: d
                         for i, d in zip(src_, x_batch)}, {
                             tgt_: y_batch,
                             is_train_: 0
                         })  # do not use False
                    res = sess.run(res_, feed_dict=feed)
                    res_all.extend(res['syn_sent'])

            # bp()
            res_all = reshaping(res_all, opt)

            for idx in range(len(test) * (opt.num_turn - opt.n_context)):
                with open(opt.log_path + '.resp.txt', "a") as resp_f:
                    resp_f.write(u' '.join([
                        ixtoword[x] for x in res_all[idx] if x != 0 and x != 2
                    ]).encode('utf-8').strip() + (
                        '\n' if idx %
                        (opt.num_turn - opt.n_context) == 0 else '\t'))
            print("save to:" + opt.log_path + '.resp.txt')
            exit(0)

        for epoch in range(opt.max_epochs):
            print("Starting epoch %d" % epoch)
            kf = get_minibatches_idx(len(train), opt.batch_size, shuffle=True)
            for _, train_index in kf:
                uidx += 1
                sents = [train[t] for t in train_index]
                for idx in range(opt.n_context, opt.num_turn):
                    src = [[
                        sents[i][idx - turn] for i in range(opt.batch_size)
                    ] for turn in range(opt.n_context, 0, -1)]
                    tgt = [sents[i][idx] for i in range(opt.batch_size)]

                    x_batch = [
                        prepare_data_for_cnn(src_i, opt) for src_i in src
                    ]  # Batch L

                    y_batch = prepare_data_for_rnn(tgt, opt_t, is_add_GO=False)

                    feed = merge_two_dicts(
                        {i: d
                         for i, d in zip(src_, x_batch)}, {
                             tgt_: y_batch,
                             is_train_: 1
                         })

                    _, loss_g = sess.run([train_op_g, gan_cost_g_],
                                         feed_dict=feed)

                if uidx % opt.print_freq == 0:
                    print("Iteration %d: loss G %f" % (uidx, loss_g))
                    res = sess.run(res_, feed_dict=feed)
                    if opt.global_feature:
                        print "z loss: " + str(res['z_loss'])
                    if "nn" in opt.agg_model:
                        print "z pred_loss: " + str(res['z_loss_pred'])
                    print "Source:" + u' '.join(
                        [ixtoword[x] for s in x_batch
                         for x in s[0] if x != 0]).encode('utf-8').strip()
                    print "Target:" + u' '.join([
                        ixtoword[x] for x in y_batch[0] if x != 0
                    ]).encode('utf-8').strip()
                    print "Generated:" + u' '.join([
                        ixtoword[x] for x in res['syn_sent'][0] if x != 0
                    ]).encode('utf-8').strip()
                    print ""

                    sys.stdout.flush()
                    summary = sess.run(merged, feed_dict=feed)
                    train_writer.add_summary(summary, uidx)

                if uidx % opt.valid_freq == 1:
                    VALID_SIZE = 4096
                    valid_multiplier = np.int(
                        np.floor(VALID_SIZE / opt.batch_size))
                    res_all, val_tgt_all, loss_val_g_all = [], [], []
                    if opt.global_feature:
                        z_loss_all = []
                    for val_step in range(valid_multiplier):
                        valid_index = np.random.choice(len(val),
                                                       opt.batch_size)
                        sents = [val[t] for t in valid_index]
                        for idx in range(opt.n_context, opt.num_turn):
                            src = [[
                                sents[i][idx - turn]
                                for i in range(opt.batch_size)
                            ] for turn in range(opt.n_context, 0, -1)]
                            tgt = [
                                sents[i][idx] for i in range(opt.batch_size)
                            ]

                            val_tgt_all.extend(tgt)

                            x_batch = [
                                prepare_data_for_cnn(src_i, opt)
                                for src_i in src
                            ]  # Batch L

                            y_batch = prepare_data_for_rnn(tgt,
                                                           opt_t,
                                                           is_add_GO=False)

                            feed = merge_two_dicts(
                                {i: d
                                 for i, d in zip(src_, x_batch)}, {
                                     tgt_: y_batch,
                                     is_train_: 0
                                 })  # do not use False

                            loss_val_g = sess.run([gan_cost_g_],
                                                  feed_dict=feed)
                            loss_val_g_all.append(loss_val_g)

                            res = sess.run(res_, feed_dict=feed)
                            res_all.extend(res['syn_sent'])
                        if opt.global_feature:
                            z_loss_all.append(res['z_loss'])

                    print("Validation:  loss G %f " %
                          (np.mean(loss_val_g_all)))
                    if opt.global_feature:
                        print "z loss: " + str(np.mean(z_loss_all))
                    print "Val Source:" + u' '.join(
                        [ixtoword[x] for s in x_batch
                         for x in s[0] if x != 0]).encode('utf-8').strip()
                    print "Val Target:" + u' '.join([
                        ixtoword[x] for x in y_batch[0] if x != 0
                    ]).encode('utf-8').strip()
                    print "Val Generated:" + u' '.join([
                        ixtoword[x] for x in res['syn_sent'][0] if x != 0
                    ]).encode('utf-8').strip()
                    print ""
                    if opt.global_feature:
                        with open(opt.log_path + '.z.txt', "a") as myfile:
                            myfile.write("Iteration" + str(uidx) + "\n")
                            myfile.write("z_loss %f" % (np.mean(z_loss_all)) +
                                         "\n")
                            myfile.write("Val Source:" + u' '.join([
                                ixtoword[x] for s in x_batch
                                for x in s[0] if x != 0
                            ]).encode('utf-8').strip() + "\n")
                            myfile.write("Val Target:" + u' '.join(
                                [ixtoword[x] for x in y_batch[0]
                                 if x != 0]).encode('utf-8').strip() + "\n")
                            myfile.write("Val Generated:" + u' '.join([
                                ixtoword[x]
                                for x in res['syn_sent'][0] if x != 0
                            ]).encode('utf-8').strip() + "\n")
                            myfile.write("Z_input, Z_recon, Z_tgt")
                            myfile.write(
                                np.array2string(res['z'][0],
                                                formatter={
                                                    'float_kind':
                                                    lambda x: "%.2f" % x
                                                }) + "\n")
                            myfile.write(
                                np.array2string(res['z_hat'][0],
                                                formatter={
                                                    'float_kind':
                                                    lambda x: "%.2f" % x
                                                }) + "\n\n")
                            myfile.write(
                                np.array2string(res['z_tgt'][0],
                                                formatter={
                                                    'float_kind':
                                                    lambda x: "%.2f" % x
                                                }) + "\n\n")

                    val_set = [prepare_for_bleu(s) for s in val_tgt_all]
                    gen = [prepare_for_bleu(s) for s in res_all]
                    [bleu1s, bleu2s, bleu3s,
                     bleu4s] = cal_BLEU_4(gen, {0: val_set},
                                          is_corpus=opt.is_corpus)
                    etp_score, dist_score = cal_entropy(gen)

                    print 'Val BLEU: ' + ' '.join([
                        str(round(it, 3))
                        for it in (bleu1s, bleu2s, bleu3s, bleu4s)
                    ])
                    print 'Val Entropy: ' + ' '.join([
                        str(round(it, 3))
                        for it in (etp_score[0], etp_score[1], etp_score[2],
                                   etp_score[3])
                    ])
                    print 'Val Diversity: ' + ' '.join([
                        str(round(it, 3))
                        for it in (dist_score[0], dist_score[1], dist_score[2],
                                   dist_score[3])
                    ])
                    print 'Val Avg. length: ' + str(
                        round(
                            np.mean([
                                len([y for y in x if y != 0]) for x in res_all
                            ]), 3))
                    print ""
                    summary = sess.run(merged, feed_dict=feed)
                    summary2 = tf.Summary(value=[
                        tf.Summary.Value(tag="bleu-2", simple_value=bleu2s),
                        tf.Summary.Value(tag="etp-4",
                                         simple_value=etp_score[3])
                    ])

                    test_writer.add_summary(summary, uidx)
                    test_writer.add_summary(summary2, uidx)

                if uidx % opt.save_freq == 0:
                    saver.save(sess, opt.save_path)
Example #15
def main():
    #global n_words
    # Prepare training and testing data

    opt = COptions(args)
    opt_t = COptions(args)
    # opt_t.n_hid = opt.n_z

    loadpath = (opt.data_dir + "/" + opt.data_name) #if opt.not_philly else '/hdfs/msrlabs/xiag/pt-data/cons/data_cleaned/twitter_small.p'
    print "loadpath:" + loadpath
    x = cPickle.load(open(loadpath, "rb"))
    train, val, test = x[0], x[1], x[2]
    wordtoix, ixtoword = x[3], x[4]

    if opt.test:
        test_file = opt.data_dir + opt.test_file 
        test = read_test(test_file, wordtoix)
        # test = [ x for x in test if all([2<len(x[t])<opt.maxlen - 4 for t in range(opt.num_turn)])]
    # train_filtered = [ x for x in train if all([2<len(x[t])<opt.maxlen - 4 for t in range(opt.num_turn)])]
    # val_filtered = [ x for x in val if all([2<len(x[t])<opt.maxlen - 4 for t in range(opt.num_turn)])]
    # print ("Train: %d => %d" % (len(train), len(train_filtered)))
    # print ("Val: %d => %d" % (len(val), len(val_filtered)))
    # train, val = train_filtered, val_filtered
    # del train_filtered, val_filtered

    opt.n_words = len(ixtoword) 
    opt_t.n_words = len(ixtoword)
    opt_t.maxlen = opt_t.maxlen - opt_t.filter_shape + 1
    opt_t.update_params(args)
    print datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y")
    print dict(opt)
    print('Total words: %d' % opt.n_words)

    # print dict(opt)
    # if opt.model == 'cnn_rnn':
    #     opt_t.maxlen = opt_t.maxlen - opt_t.filter_shape + 1
    #     opt_t.update_params(args)
        # print dict(opt_t)


    #for d in ['/gpu:0', '/gpu:1', '/gpu:2', '/gpu:3']:
    for d in ['/gpu:0']:
        with tf.device(d):
            src_ = [tf.placeholder(tf.int32, shape=[opt.batch_size, opt.sent_len]) for _ in range(opt.n_context)]
            tgt_ = tf.placeholder(tf.int32, shape=[opt_t.batch_size, opt_t.sent_len])
            z_ = tf.placeholder(tf.float32, shape=[opt_t.batch_size , opt.n_z * (2 if opt.local_feature else 1)])
            is_train_ = tf.placeholder(tf.bool, name = 'is_train')
            res_1_ = get_features(src_, tgt_, is_train_, opt, opt_t)
            res_2_ = generate_resp(src_, tgt_, z_, is_train_, opt, opt_t)
            merged = tf.summary.merge_all()

    #tensorboard --logdir=run1:/tmp/tensorflow/ --port 6006
    #writer = tf.train.SummaryWriter(opt.log_path, graph=tf.get_default_graph())

    uidx = 0
    graph_options=tf.GraphOptions(build_cost_model=1)
    #config = tf.ConfigProto(log_device_placement = False, allow_soft_placement=True, graph_options=tf.GraphOptions(build_cost_model=1))
    config = tf.ConfigProto(log_device_placement = False, allow_soft_placement=True, graph_options=graph_options)
    # config.gpu_options.per_process_gpu_memory_fraction = 0.70
    #config = tf.ConfigProto(device_count={'GPU':0})
    #config.gpu_options.allow_growth = True

    np.set_printoptions(precision=3)
    np.set_printoptions(threshold=np.inf)
    saver = tf.train.Saver()

    run_metadata = tf.RunMetadata()

    with tf.Session(config = config) as sess:
        train_writer = tf.summary.FileWriter(opt.log_path + '/train', sess.graph)
        test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph)
        sess.run(tf.global_variables_initializer())
        if opt.restore:
            try:
                #pdb.set_trace()
                t_vars = tf.trainable_variables()  
                #t_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) #tf.trainable_variables()

                # if opt.load_from_pretrain:
                #     d_vars = [var for var in t_vars if var.name.startswith('d_')]
                #     g_vars = [var for var in t_vars if var.name.startswith('g_')]
                #     g_vars = [var for var in t_vars if var.name.startswith('g_')]
                #     g_vars = [var for var in t_vars if var.name.startswith('g_')]
                #     g_vars = [var for var in t_vars if var.name.startswith('g_')]
                #     g_vars = [var for var in t_vars if var.name.startswith('g_')]
                #     l_vars = [var for var in t_vars if var.name.startswith('l_')]
                #     #restore_from_save(g_vars, sess, opt, prefix = 'g_', load_path=opt.restore_dir + "/save/generator2")
                #     restore_from_save(d_vars, sess, opt, load_path = opt.restore_dir + "/save/" + opt.global_d)
                #     if opt.local_feature:
                #         restore_from_save(l_vars, sess, opt, load_path = opt.restore_dir + "/save/" + opt.local_d)
                # else:
                loader = restore_from_save(t_vars, sess, opt, load_path = opt.save_path)


            except Exception as e:
                print 'Error: '+str(e)
                print("No saving session, using random initialization")
                sess.run(tf.global_variables_initializer())
        loss_d , loss_g = 0, 0

        if opt.test:
            iter_num = np.int(np.floor(len(test)/opt.batch_size))+1
            res_all = []
            val_tgt_all =[]
            for i in range(iter_num):
                test_index = range(i * opt.batch_size,(i+1) * opt.batch_size)
                sents = [test[t%len(test)] for t in test_index]
                for idx in range(opt.n_context,opt.num_turn):
                    src = [[sents[i][idx-turn] for i in range(opt.batch_size)] for turn in range(opt.n_context,0,-1)]
                    tgt = [sents[i][idx] for i in range(opt.batch_size)] 
                    val_tgt_all.extend(tgt)
                    if opt.feed_generated and idx != opt.n_context:
                        src[-1] = [[x for x in p if x!=0] for p in res_all[-opt.batch_size:]]

                    x_batch = [prepare_data_for_cnn(src_i, opt) for src_i in src] # Batch L
                    y_batch = prepare_data_for_rnn(tgt, opt_t, is_add_GO = False) 
                    
                    feed = merge_two_dicts( {i: d for i, d in zip(src_, x_batch)}, {tgt_: y_batch, is_train_: 0}) # do not use False
                    res_1 = sess.run(res_1_, feed_dict=feed)
                    z_all = np.array(res_1['z'])

                    
                    feed = merge_two_dicts( {i: d for i, d in zip(src_, x_batch)}, {tgt_: y_batch, z_: z_all, is_train_: 0}) # do not use False
                    res_2 = sess.run(res_2_, feed_dict=feed)
                    res_all.extend(res_2['syn_sent'])

                    # bp()
   
            val_tgt_all = reshaping(val_tgt_all, opt)
            res_all = reshaping(res_all, opt)
            
            save_path = opt.log_path + '.resp.txt'
            if os.path.exists(save_path):
                os.remove(save_path) 
            for idx in range(len(test)*(opt.num_turn-opt.n_context)):
                with open(save_path, "a") as resp_f:
                    resp_f.write(u' '.join([ixtoword[x] for x in res_all[idx] if x != 0 and x != 2]).encode('utf-8').strip() + ('\n' if idx%(opt.num_turn-opt.n_context) == opt.num_turn-opt.n_context-1 else '\t') )
            print ("save to:" + save_path)

            if opt.verbose:
                save_path = opt.log_path + '.tgt.txt'
                if os.path.exists(save_path):
                    os.remove(save_path) 
                for idx in range(len(test)*(opt.num_turn-opt.n_context)):
                    with open(save_path, "a") as tgt_f:
                        tgt_f.write(u' '.join([ixtoword[x] for x in val_tgt_all[idx] if x != 0 and x != 2]).encode('utf-8').strip() + ('\n' if idx%(opt.num_turn-opt.n_context) == opt.num_turn-opt.n_context-1 else '\t') )
                print ("save to:" + save_path)

            val_set = [prepare_for_bleu(s) for s in val_tgt_all]
            gen = [prepare_for_bleu(s) for s in res_all]
            [bleu1s,bleu2s,bleu3s,bleu4s] = cal_BLEU_4(gen, {0: val_set}, is_corpus = opt.is_corpus)
            etp_score, dist_score = cal_entropy(gen)

            # print save_path
            print 'Val BLEU: ' + ' '.join([str(round(it,3)) for it in (bleu1s,bleu2s,bleu3s,bleu4s)])
            # print 'Val Rouge: ' + ' '.join([str(round(it,3)) for it in (rouge1,rouge2,rouge3,rouge4)])
            print 'Val Entropy: ' + ' '.join([str(round(it,3)) for it in (etp_score[0],etp_score[1],etp_score[2],etp_score[3])])
            print 'Val Diversity: ' + ' '.join([str(round(it,3)) for it in (dist_score[0],dist_score[1],dist_score[2],dist_score[3])])
            # print 'Val Relevance(G,A,E): ' + ' '.join([str(round(it,3)) for it in (rel_score[0],rel_score[1],rel_score[2])])
            print 'Val Avg. length: ' + str(round(np.mean([len([y for y in x if y!=0]) for x in res_all]),3)) 
            if opt.embedding_score:
                with open("../../ssd0/consistent_dialog/data/GoogleNews-vectors-negative300.bin.p", 'rb') as pfile:
                    embedding = cPickle.load(pfile)
                rel_score = cal_relevance(gen, val_set, embedding)
                print 'Val Relevance(G,A,E): ' + ' '.join([str(round(it,3)) for it in (rel_score[0],rel_score[1],rel_score[2])])


            if not opt.global_feature or opt.bit is None: exit(0)

        if opt.test:
            iter_num = np.int(np.floor(len(test)/opt.batch_size))+1 
            for int_idx in range(opt.int_num):
                res_all = []
                z1,z2,z3 = [],[],[]
                val_tgt_all =[]
                for i in range(iter_num):
                    test_index = range(i * opt.batch_size,(i+1) * opt.batch_size)
                    sents = [test[t%len(test)] for t in test_index]
                    for idx in range(opt.n_context,opt.num_turn):
                        src = [[sents[i][idx-turn] for i in range(opt.batch_size)] for turn in range(opt.n_context,0,-1)]
                        tgt = [sents[i][idx] for i in range(opt.batch_size)]
                        val_tgt_all.extend(tgt)
                        if opt.feed_generated and idx != opt.n_context:
                            src[-1] = [[x for x in p if x!=0] for p in res_all[-opt.batch_size:]]

                        x_batch = [prepare_data_for_cnn(src_i, opt) for src_i in src] # Batch L
                        y_batch = prepare_data_for_rnn(tgt, opt_t, is_add_GO = False) 
                        feed = merge_two_dicts( {i: d for i, d in zip(src_, x_batch)}, {tgt_: y_batch, is_train_: 0}) # do not use False
                        res_1 = sess.run(res_1_, feed_dict=feed)
                        z_all = np.array(res_1['z'])
                        z_all[:,opt.bit] = np.array([1.0/np.float(opt.int_num-1) * int_idx for _ in range(opt.batch_size)])
                        
                        feed = merge_two_dicts( {i: d for i, d in zip(src_, x_batch)}, {tgt_: y_batch, z_: z_all, is_train_: 0}) # do not use False
                        res_2 = sess.run(res_2_, feed_dict=feed)
                        res_all.extend(res_2['syn_sent'])
                        z1.extend(res_1['z'])                        
                        z2.extend(z_all)
                        z3.extend(res_2['z_hat'])
                        
                        # bp()

                val_tgt_all = reshaping(val_tgt_all, opt)
                res_all = reshaping(res_all, opt)
                z1 = reshaping(z1, opt)
                z2 = reshaping(z2, opt)
                z3 = reshaping(z3, opt)
                
                save_path = opt.log_path  + 'bit' + str(opt.bit) + '.'+ str(1.0/np.float(opt.int_num-1) * int_idx) +'.int.txt'
                if os.path.exists(save_path):
                    os.remove(save_path) 
                for idx in range(len(test)*(opt.num_turn-opt.n_context)):
                    with open(save_path, "a") as resp_f:
                        resp_f.write(u' '.join([ixtoword[x] for x in res_all[idx] if x != 0 and x != 2]).encode('utf-8').strip() + ('\n' if idx%(opt.num_turn-opt.n_context) == opt.num_turn-opt.n_context-1 else '\t') )
                print ("save to:" + save_path)

                save_path_z = opt.log_path  + 'bit' + str(opt.bit) + '.'+ str(1.0/np.float(opt.int_num-1) * int_idx) +'.z.txt'
                if os.path.exists(save_path_z):
                    os.remove(save_path_z) 
                for idx in range(len(test)*(opt.num_turn-opt.n_context)):
                    with open(save_path_z, "a") as myfile:
                        #ary = np.array([z1[idx][opt.bit], z2[idx][opt.bit], z3[idx][opt.bit]])
                        #myfile.write(np.array2string(ary, formatter={'float_kind':lambda x: "%.2f" % x}) + ('\n' if idx%(opt.num_turn-opt.n_context) == opt.num_turn-opt.n_context-1 else '\t'))
                        myfile.write(str(z3[idx][opt.bit]) + ('\n' if idx%(opt.num_turn-opt.n_context) == opt.num_turn-opt.n_context-1 else '\t'))

                
                val_set = [prepare_for_bleu(s) for s in val_tgt_all]
                gen = [prepare_for_bleu(s) for s in res_all]
                [bleu1s,bleu2s,bleu3s,bleu4s] = cal_BLEU_4(gen, {0: val_set}, is_corpus = opt.is_corpus)
                etp_score, dist_score = cal_entropy(gen)

                print save_path
                print 'Val BLEU: ' + ' '.join([str(round(it,3)) for it in (bleu1s,bleu2s,bleu3s,bleu4s)])
                # print 'Val Rouge: ' + ' '.join([str(round(it,3)) for it in (rouge1,rouge2,rouge3,rouge4)])
                print 'Val Entropy: ' + ' '.join([str(round(it,3)) for it in (etp_score[0],etp_score[1],etp_score[2],etp_score[3])])
                print 'Val Diversity: ' + ' '.join([str(round(it,3)) for it in (dist_score[0],dist_score[1],dist_score[2],dist_score[3])])
                # print 'Val Relevance(G,A,E): ' + ' '.join([str(round(it,3)) for it in (rel_score[0],rel_score[1],rel_score[2])])
                print 'Val Avg. length: ' + str(round(np.mean([len([y for y in x if y!=0]) for x in res_all]),3)) 
    parser.add_argument("--model", type=str, required=True, help="model name")
    parser.add_argument("--config_filename", type=str, default="config", help="the filename of config file")

    args, _ = parser.parse_known_args()

    # get config

    model_config_name = args.model

    if "lstm" in args.model:
        model_config_name = "lstm"
    main_config = parser_config(args.config_filename, "main")
    model_config = parser_config(args.config_filename, model_config_name)

    #  configs = {**main_config, **model_config}
    configs = merge_two_dicts(main_config, model_config)
    configs = Config(configs)


    texts, labels = read(args.data_path)

    with open("./tmp_test_content", "r") as f:
        pred = f.readlines()
        pred = list(map(lambda x: x.rstrip("\n"), pred))

    #  vocab = {}
#
    #  for tt in texts + pred:
        #  for ttt in tt.split():
            #  if(ttt not in vocab):
                #  vocab[ttt] = 1
Example #17
def gen_data(path, datasets, vocab, test_vocab, is_train, max_query_len,
             max_doc_len, max_url_len, nb_classes, args):
    if is_train:
        vocab['word']['PAD_WORD_INDEX'] = PAD_WORD_INDEX
        vocab['word']['OOV_WORD_INDEX'] = OOV_WORD_INDEX
        vocab['3gram']['PAD_3GRAM_INDEX'] = PAD_WORD_INDEX
        vocab['3gram']['OOV_3GRAM_INDEX'] = OOV_WORD_INDEX
    query_word_list, doc_word_list, query_3gram_list, doc_3gram_list = [], [], [], []
    all_url_list, all_ids_list, all_sim_list = [], [], []
    for data_name in datasets:  # there can be multiple data sets combined as the train or test data
        data_folder = "%s/%s" % (path, data_name)
        print('creating dataset %s' % data_name)
        t = time.time()
        q1_word_list, max_q1_word_len = read_sentences("%s/a.toks" %
                                                       data_folder,
                                                       vocab,
                                                       is_train,
                                                       "word",
                                                       test_vocab=test_vocab)
        q2_word_list, max_q2_word_len = read_sentences("%s/b.toks" %
                                                       data_folder,
                                                       vocab,
                                                       is_train,
                                                       "word",
                                                       test_vocab=test_vocab)
        q1_3gram_list, max_q1_3gram_len = read_sentences("%s/a.toks" %
                                                         data_folder,
                                                         vocab,
                                                         is_train,
                                                         "3gram",
                                                         test_vocab=test_vocab)
        q2_3gram_list, max_q2_3gram_len = read_sentences("%s/b.toks" %
                                                         data_folder,
                                                         vocab,
                                                         is_train,
                                                         "3gram",
                                                         test_vocab=test_vocab)
        url_list, max_url_len_dataset = [], 0
        if os.path.exists("%s/url.txt" % data_folder):
            url_list, max_url_len_dataset = read_urls(
                "%s/url.txt" % data_folder, vocab, is_train, '3gram')
        ids_list = read_metadata("%s/id.txt" % data_folder)
        if is_train:
            max_query_len['word'] = max(max_query_len['word'],
                                        min(max_q1_word_len, MAX_WORD_LENGTH))
            max_query_len['3gram'] = max(
                max_query_len['3gram'], min(max_q1_3gram_len,
                                            MAX_3GRAM_LENGTH))
            max_doc_len['word'] = max(max_doc_len['word'],
                                      min(max_q2_word_len, MAX_WORD_LENGTH))
            max_doc_len['3gram'] = max(max_doc_len['3gram'],
                                       min(max_q2_3gram_len, MAX_3GRAM_LENGTH))
            max_url_len['url'] = max(max_url_len['url'],
                                     min(max_url_len_dataset, MAX_URL_LENGTH))
        sim_list = read_relevance("%s/sim.txt" % data_folder)
        categorical_sim_list = np.zeros((len(sim_list), nb_classes),
                                        dtype='int')
        for i, sim in enumerate(sim_list):
            categorical_sim_list[i][sim] = 1
        print(sim_list[:5], categorical_sim_list[:5])
        query_word_list.extend(q1_word_list)
        doc_word_list.extend(q2_word_list)
        query_3gram_list.extend(q1_3gram_list)
        doc_3gram_list.extend(q2_3gram_list)
        all_url_list.extend(url_list)
        all_ids_list.extend(ids_list)
        all_sim_list.extend(categorical_sim_list)
        print("q1 max_word_len: %d, q2 max_word_len: %d, len limit: (%d, %d)" %
              (max_q1_word_len, max_q2_word_len, max_query_len['word'],
               max_doc_len['word']))
        print(
            "q1 max_3gram_len: %d, q2 max_3gram_len: %d, len limit: (%d, %d)" %
            (max_q1_3gram_len, max_q2_3gram_len, max_query_len['3gram'],
             max_doc_len['3gram']))
        print('max_url_len: %d, limit: %d' %
              (max_url_len_dataset, max_url_len['url']))
        print('creating dataset done: %d' % (time.time() - t))

    # question padding
    data = {'sim': np.array(all_sim_list), 'id': np.array(all_ids_list)}
    data['query_word_input'] = pad_sequences(query_word_list,
                                             maxlen=max_query_len['word'],
                                             value=PAD_WORD_INDEX,
                                             padding='post',
                                             truncating='post')
    data['query_word_mask'] = create_masks(data['query_word_input'], args)
    data['doc_word_input'] = pad_sequences(doc_word_list,
                                           maxlen=max_doc_len['word'],
                                           value=PAD_WORD_INDEX,
                                           padding='post',
                                           truncating='post')
    data['doc_word_mask'] = create_masks(data['doc_word_input'], args)
    data['query_3gram_input'] = pad_sequences(query_3gram_list,
                                              maxlen=max_query_len['3gram'],
                                              value=PAD_WORD_INDEX,
                                              padding='post',
                                              truncating='post')
    data['query_3gram_mask'] = create_masks(data['query_3gram_input'], args)
    data['doc_3gram_input'] = pad_sequences(doc_3gram_list,
                                            maxlen=max_doc_len['3gram'],
                                            value=PAD_WORD_INDEX,
                                            padding='post',
                                            truncating='post')
    data['doc_3gram_mask'] = create_masks(data['doc_3gram_input'], args)
    data['url_3gram_input'] = pad_sequences(all_url_list,
                                            maxlen=max_url_len['url'],
                                            value=PAD_WORD_INDEX,
                                            padding='post',
                                            truncating='pre')
    data['url_3gram_mask'] = create_masks(data['url_3gram_input'], args)

    if os.path.exists("%s/collection_ngram_idf.json" % path):
        t = time.time()
        weights = json.load(open("%s/collection_ngram_idf.json" % path, "r"))
        vocab_inv = invert_dict(vocab['3gram'])
        data['query_3gram_weight'] = inject_ngram_weight(
            data['query_3gram_input'], vocab_inv, weights)
        data['doc_3gram_weight'] = inject_ngram_weight(data['doc_3gram_input'],
                                                       vocab_inv, weights)
        data['url_3gram_weight'] = inject_ngram_weight(data['url_3gram_input'],
                                                       vocab_inv, weights)
        print('ngram weight injection done: %d' % (time.time() - t))
    else:
        num_samples, max_query_len = data['query_3gram_input'].shape
        data['query_3gram_weight'] = np.ones(
            (num_samples, ATTENTION_DEEP_LEVEL, max_query_len))
        data['doc_3gram_weight'] = np.ones((num_samples, ATTENTION_DEEP_LEVEL,
                                            data['doc_3gram_input'].shape[1]))

    if os.path.exists("%s/collection_word_idf.json" % path):
        t = time.time()
        weights = json.load(open("%s/collection_word_idf.json" % path, "r"))
        merge_vocab = merge_two_dicts(vocab['word'], test_vocab['word'])
        vocab_inv = invert_dict(merge_vocab)
        print('inject query IDF weights')
        data['query_word_weight'] = inject_word_weight(
            data['query_word_input'], vocab_inv, weights)
        print('inject doc IDF weights')
        data['doc_word_weight'] = inject_word_weight(data['doc_word_input'],
                                                     vocab_inv, weights)
        data['overlap_feat'] = compute_overlap_feat(data['query_word_input'],
                                                    data['doc_word_input'],
                                                    vocab_inv, weights)
        print('word weight injection done: %d' % (time.time() - t))

    return data
Example #18
def main(options):
    args = get_default_args()
    set_args(args, options)
    print_args(args)
    mode = args['mode']
    train_name, test_name = args['split']['train'], args['split']['test']
    if train_name == 'train_all':
        train_set = ['train_2011', 'test_2011', 'train_2013', 'test_2013']
        train_set.remove(test_name)
    else:
        train_set = [train_name]
    test_set = [test_name]
    print("train_set", train_set)
    print("test_set", test_set)
    max_query_len, max_doc_len, max_url_len = defaultdict(int), defaultdict(
        int), defaultdict(int)
    vocab = {'word': {}, '3gram': {}, 'url': {}}
    test_vocab = {'word': {}, '3gram': {}, 'url': {}}
    train_vocab_emb, test_vocab_emb = None, None

    ############################# LOAD DATA ##################################
    data_name = ("data_m%s_%s_%s" % (mode, train_name, test_name)).lower()
    if args["load_data"]:
        train_dataset, vocab, train_vocab_emb, max_query_len, max_doc_len, max_url_len = load_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, train_name),
            True)
        test_dataset, test_vocab, test_vocab_emb, _, _, _ = load_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, test_name),
            False)
        print('load dataset successfully')
    else:
        #vocab = build_vocab(args["raw_data"], train_set, test_set, vocab)
        #print('build vocab done. %d' % len(vocab['word']))
        train_dataset = gen_data(args["raw_data"], train_set, vocab,
                                 test_vocab, True, max_query_len, max_doc_len,
                                 max_url_len, args)
        print("create training set successfully...")
        test_dataset = gen_data(args["raw_data"], test_set, vocab, test_vocab,
                                False, max_query_len, max_doc_len, max_url_len,
                                args)
        train_vocab_emb, test_vocab_emb = construct_vocab_emb(
            "%s/%s" % (args["experimental_data"], data_name),
            vocab['word'],
            test_vocab['word'],
            300,
            "word",
            base_embed_path=args["base_embed_path"])
        save_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, train_name),
            True, train_dataset, max_query_len, max_doc_len, max_url_len,
            vocab, train_vocab_emb)
        print("save training set successfully...")
        save_data("%s/%s/%s" %
                  (args["experimental_data"], data_name, test_name),
                  False,
                  test_dataset,
                  vocab=test_vocab,
                  vocab_emb=test_vocab_emb)
        print("save test set successfully...")

    if mode == 'dssm':
        train_dataset = convert_data_to_dssm_format(train_dataset,
                                                    vocab,
                                                    is_train_or_val=True)
        test_dataset = convert_data_to_dssm_format(test_dataset,
                                                   vocab,
                                                   is_train_or_val=False)
        print('data convertion done!')

    val_split = args['val_split']
    num_samples, _ = train_dataset["query_word_input"].shape
    # randomly sample queries and all their documents if query_random is True
    # otherwise, query-doc pairs are randomly sampled
    query_random = True
    if query_random:
        val_indices = sample_val(train_set,
                                 num_samples=num_samples,
                                 val_split=val_split)
    else:
        val_indices, val_set = [], set()
        for i in range(int(num_samples * val_split)):
            val_index = np.random.randint(num_samples)
            while val_index in val_set:
                val_index = np.random.randint(num_samples)
            val_indices.append(val_index)
            val_set.add(val_index)

    print(val_indices[:5], np.sum(np.array(val_indices)))

    # sample validation set for debug purpose
    # val_indices = val_indices[:100]

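    # Keep only the first args['deeplevel'] levels of the precomputed term-weight tensors
    # so their depth matches the model's configured deeplevel.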
    train_dataset["query_word_weight"] = train_dataset[
        "query_word_weight"][:, :args['deeplevel']]
    train_dataset["query_3gram_weight"] = train_dataset[
        "query_3gram_weight"][:, :args['deeplevel']]
    train_dataset["doc_word_weight"] = train_dataset[
        "doc_word_weight"][:, :args['deeplevel']]
    train_dataset["doc_3gram_weight"] = train_dataset[
        "doc_3gram_weight"][:, :args['deeplevel']]
    train_dataset["url_3gram_weight"] = train_dataset[
        "url_3gram_weight"][:, :args['deeplevel']]
    test_dataset["query_word_weight"] = test_dataset[
        "query_word_weight"][:, :args['deeplevel']]
    test_dataset["query_3gram_weight"] = test_dataset[
        "query_3gram_weight"][:, :args['deeplevel']]
    test_dataset["doc_word_weight"] = test_dataset[
        "doc_word_weight"][:, :args['deeplevel']]
    test_dataset["doc_3gram_weight"] = test_dataset[
        "doc_3gram_weight"][:, :args['deeplevel']]
    test_dataset["url_3gram_weight"] = test_dataset[
        "url_3gram_weight"][:, :args['deeplevel']]
    # print("SHAPEEEEEEEEEEEEEEEEEEEE: {}".format(len(train_dataset["query_word_weight"][100])))

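    # Move the sampled validation rows out of the training arrays so the two splits are disjoint.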
    val_dataset = {}
    for key in train_dataset:
        val_dataset[key] = train_dataset[key][val_indices]
        train_dataset[key] = np.delete(train_dataset[key], val_indices, 0)

    # shuffle the train dataset explicitly to make results reproducible
    # whether this affects performance remains an open question
    keys, values = [], []
    for key in train_dataset:
        keys.append(key)
        values.append(train_dataset[key])
    zipped_values = list(zip(*values))
    random.shuffle(zipped_values)
    shuffled_values = list(zip(*zipped_values))
    for i, key in enumerate(keys):
        train_dataset[key] = np.array(shuffled_values[i])
    print('after shuffle:', train_dataset['id'][:5], train_dataset['sim'][:5],
          train_dataset['query_word_input'][:5])

    # sample training dataset for debug purpose
    # sample_num = 1000
    # for key in train_dataset:
    #     train_dataset[key] = train_dataset[key][:sample_num]

    # merge the vocabulary of the train and test sets
    print("TRAIN vocab: word(%d) 3gram(%d) url(%d)" %
          (len(vocab['word']), len(vocab['3gram']), len(vocab['url'])))
    print("TEST vocab: word(%d) 3gram(%d) url(%d)" % (len(
        test_vocab['word']), len(test_vocab['3gram']), len(test_vocab['url'])))
    merged_vocab = {'url': vocab['url'], '3gram': vocab['3gram']}
    merged_vocab['word'] = merge_two_dicts(vocab['word'], test_vocab['word'])
    print("merged vocab: word(%d) 3gram(%d) url(%d)" %
          (len(merged_vocab['word']), len(
              merged_vocab['3gram']), len(merged_vocab['url'])))
    vocab_inv, vocab_size = {}, {}
    vocab['char'] = merge_two_dicts(vocab['3gram'], vocab['url'])
    test_vocab['char'] = merge_two_dicts(test_vocab['3gram'],
                                         test_vocab['url'])
    merged_vocab['char'] = merge_two_dicts(vocab['char'], test_vocab['char'])

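    # Build inverse (index -> token) lookups over the merged vocabularies for decoding/debugging;
    # vocab_size keeps the train-only sizes used to build the initial embedding layers.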
    for key in vocab:
        vocab_inv[key] = invert_dict(merged_vocab[key])
        vocab_size[key] = len(vocab[key])

    print(vocab_size)
    # Print data samples for debug purpose
    # print_dataset(mode, train_dataset, vocab_inv)
    # print_dataset(mode, test_dataset, vocab_inv)

    ############################ TRAIN MODEL #################################
    model = None
    if mode == 'deep_twitter':
        model = create_attention_model(max_query_len,
                                       max_doc_len,
                                       max_url_len,
                                       vocab_size,
                                       train_vocab_emb,
                                       args["nb_filters"],
                                       embed_size=300,
                                       dropout_rate=args['dropout'],
                                       trainable=args["trainable"],
                                       weighting=args['weighting'],
                                       mask=args["mask"],
                                       conv_option=args['conv_option'],
                                       model_option=args['model_option'],
                                       external=args["external_feat"],
                                       norm_weight=args['norm_weight'],
                                       cos_norm=args['cos'],
                                       only_word=args['only_word'],
                                       only_char=args['only_char'],
                                       pooling=args['pooling'],
                                       deeplevel=args['deeplevel'])
    elif mode == 'dssm':
        model = create_dssm_model(max_query_len,
                                  max_doc_len,
                                  max_url_len,
                                  vocab_size,
                                  train_vocab_emb,
                                  args["nb_filters"],
                                  embed_size=300,
                                  dropout_rate=args['dropout'],
                                  trainable=args["trainable"])
    model_name = (
        "model_N%s_data%s_mo%s_c%s_NumFilter%d_T%s_D%.1f_W%s_M%s_B%d_Val%.2f" %
        (mode, train_name, args['model_option'], args['conv_option'],
         args["nb_filters"], args["trainable"], args['dropout'],
         args['weighting'], args['mask'], args['batch_size'],
         args['val_split'])).lower()
    model_path = "%s/%s/%s" % (args['experimental_data'], data_name,
                               model_name)
    print(model_path)

    if args['optimizer'] == "adam":
        opt = optimizers.Adam(lr=args["learning_rate"],
                              beta_1=0.9,
                              beta_2=0.999,
                              epsilon=None,
                              decay=0.0,
                              amsgrad=False)
    elif args['optimizer'] == "sgd":
        opt = optimizers.SGD(lr=args["learning_rate"],
                             decay=1e-6,
                             momentum=0.9,
                             nesterov=True)

    model.compile(loss='binary_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    print(model.summary())
    model_weights, parameter_num = get_model_weights(model)
    print('model init weights sum: {} of {} parameters'.format(
        model_weights, parameter_num))

    if not args['load_model']:
        early_stopping = EarlyStopping(monitor='val_loss', patience=4)
        checkpoint = ModelCheckpoint(filepath=model_path + ".best.weights",
                                     monitor='val_loss',
                                     save_best_only=True,
                                     verbose=1)
        lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                       factor=0.3,
                                       patience=3,
                                       min_lr=0.0001)

        fit_mode = "fit"
        if fit_mode == "fit":
            model.fit(
                train_dataset,
                train_dataset['sim'],  # validation_split=0.05,
                batch_size=args['batch_size'],
                validation_data=(val_dataset, val_dataset['sim']),
                epochs=args['epochs'],
                shuffle=False,
                callbacks=[checkpoint, lr_reducer, early_stopping],
                verbose=2)
        else:
            train_steps, train_batches = batch_iter(
                train_dataset,
                train_dataset["sim"],
                batch_size=args['batch_size'])
            valid_steps, valid_batches = batch_iter(
                val_dataset, val_dataset["sim"], batch_size=args['batch_size'])
            model.fit_generator(
                train_batches,
                train_steps,
                epochs=args['epochs'],
                validation_data=valid_batches,
                validation_steps=valid_steps,
                callbacks=[checkpoint, lr_reducer, early_stopping],
                verbose=2)

    #plot_model(model, to_file='model.png')
    ############################ TEST MODEL #################################
    print('load best model from %s.best.weights' % model_path)
    model.load_weights("%s.best.weights" % model_path)
    if mode == 'deep_twitter':
        # load trained vocab embedding.
        if args["only_char"]:
            merged_vocab_emb = None
        else:
            embedding_layer_name = 'word_embedding'
            trained_vocab_emb = model.get_layer(
                embedding_layer_name).get_weights()[0]
            # merge trained vocab embedding with test OOV word embeddings
            merged_vocab_emb = np.zeros(shape=(len(merged_vocab['word']), 300))
            merged_vocab_emb[0:len(vocab['word']), :] = trained_vocab_emb
            merged_vocab_emb[len(vocab['word']):len(merged_vocab['word']
                                                    ), :] = test_vocab_emb
            for key in vocab:
                vocab_size[key] = len(merged_vocab[key])
            print(vocab_size)

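        # Rebuild the model with the merged (train + test OOV) embedding matrix and the
        # enlarged vocab sizes; all non-embedding weights are copied from the trained model below.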
        new_model = create_attention_model(max_query_len,
                                           max_doc_len,
                                           max_url_len,
                                           vocab_size,
                                           merged_vocab_emb,
                                           args["nb_filters"],
                                           embed_size=300,
                                           dropout_rate=args['dropout'],
                                           trainable=args["trainable"],
                                           weighting=args['weighting'],
                                           mask=args["mask"],
                                           conv_option=args['conv_option'],
                                           model_option=args['model_option'],
                                           external=args["external_feat"],
                                           norm_weight=args['norm_weight'],
                                           cos_norm=args['cos'],
                                           only_word=args['only_word'],
                                           only_char=args['only_char'],
                                           pooling=args['pooling'],
                                           deeplevel=args['deeplevel'])
        new_model.compile(loss='binary_crossentropy',
                          optimizer='adam',
                          metrics=['accuracy'])
        # print(new_model.summary())

        num_layers = 0
        for layer in model.layers:
            num_layers += 1
        for layer_id in range(num_layers):
            layer = model.layers[layer_id]
            if not args["only_char"] and layer.name != embedding_layer_name:
                new_model.layers[layer_id].set_weights(layer.get_weights())
        print('copy weight done.')
        predictions = new_model.predict(test_dataset)
    elif mode == 'dssm':
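        # Read the penultimate-layer output in mini-batches via a Keras backend function
        # (instead of model.predict) and use its first column as the prediction score.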
        getter = K.function([model.layers[0].input, model.layers[1].input],
                            model.layers[-2].output)
        print('create DSSM functional getter...')
        num_samples, _, _ = test_dataset['query_3gram_input'].shape
        batch_size = 128
        num_batch = int(math.ceil(num_samples * 1.0 / batch_size))
        predictions = np.zeros((num_samples, ))
        for i in range(num_batch):
            start_idx, end_idx = i * batch_size, min(num_samples,
                                                     (i + 1) * batch_size)
            predictions[start_idx:end_idx] = getter([
                test_dataset['query_3gram_input'][start_idx:end_idx],
                test_dataset['doc_3gram_input'][start_idx:end_idx]
            ])[:, 0]

    #predictions = getter([test_dataset['query_3gram_input'], test_dataset['doc_3gram_input']])
    print(predictions[:10])
    predictions_file = "%s/%s/predictions_%s.txt" % (args["experimental_data"],
                                                     data_name, model_name)
    with open(predictions_file, 'w') as f:
        for i in range(test_dataset['id'].shape[0]):
            f.write("%s %.4f %s\n" %
                    (test_dataset['id'][i], predictions[i], args['mode']))
    print('write predictions with trec format to %s' % predictions_file)
    map, mrr, p30 = evaluate(predictions_file, args["qrels_file"])
    print('MAP: %.4f P30: %.4f MRR: %.4f' % (map, p30, mrr))
Beispiel #19
def main(options):
    args = get_default_args()
    set_args(args, options)
    mode, dataset_name = args['mode'], args['dataset']

    # default setting
    args['raw_data'] = "data/%s/" % args['dataset']
    args['qrels_file'] = "data/%s/qrels.all.txt" % args['dataset']
    print_args(args)

    # get train/val/test names for specific dataset
    train_name, val_name, test_name, train_set, val_set, test_set, num_classes, with_url = config_dataset(
        args)

    max_query_len, max_doc_len, max_url_len = defaultdict(int), defaultdict(
        int), defaultdict(int)
    vocab = {'word': {}, '3gram': {}}
    test_vocab = {'word': {}, '3gram': {}}
    train_vocab_emb, test_vocab_emb = None, None

    ############################# LOAD DATA ##################################
    data_name = ("data_m%s_%s_%s_%s" %
                 (mode, dataset_name, train_name, test_name)).lower()
    if args["load_data"]:
        train_dataset, vocab, train_vocab_emb, max_query_len, max_doc_len, max_url_len = load_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, train_name),
            True)
        test_dataset, test_vocab, test_vocab_emb, _, _, _ = load_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, test_name),
            False)
        if dataset_name != 'twitter' and dataset_name != 'TwitterURL':
            val_dataset, _, _, _, _, _ = load_data(
                "%s/%s/%s" % (args["experimental_data"], data_name, val_name),
                False)
        if args['embedding'] == 'glove':
            train_vocab_emb, test_vocab_emb = construct_vocab_emb(
                "%s/%s" % (args["experimental_data"], data_name),
                vocab['word'],
                test_vocab['word'],
                300,
                "word",
                base_embed_path=args["base_embed_path"],
                type=args["embedding"])
        print('load dataset successfully')
    else:
        train_dataset = gen_data(args["raw_data"], train_set, vocab,
                                 test_vocab, True, max_query_len, max_doc_len,
                                 max_url_len, num_classes, args)
        print("create training set successfully...")
        if dataset_name != 'twitter' and dataset_name != 'TwitterURL':
            val_dataset = gen_data(args["raw_data"], val_set, vocab,
                                   test_vocab, False, max_query_len,
                                   max_doc_len, max_url_len, num_classes, args)
            print("create validation set successfully...")

        test_dataset = gen_data(args["raw_data"], test_set, vocab, test_vocab,
                                False, max_query_len, max_doc_len, max_url_len,
                                num_classes, args)
        train_vocab_emb, test_vocab_emb = construct_vocab_emb(
            "%s/%s" % (args["experimental_data"], data_name),
            vocab['word'],
            test_vocab['word'],
            300,
            "word",
            base_embed_path=args["base_embed_path"])
        save_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, train_name),
            True, train_dataset, max_query_len, max_doc_len, max_url_len,
            vocab, train_vocab_emb)
        print("save training set successfully...")
        if dataset_name != 'twitter' and dataset_name != 'TwitterURL':
            save_data("%s/%s/%s" %
                      (args["experimental_data"], data_name, val_name),
                      False,
                      val_dataset,
                      vocab=test_vocab,
                      vocab_emb=test_vocab_emb)
            print("save val set successfully...")
        save_data("%s/%s/%s" %
                  (args["experimental_data"], data_name, test_name),
                  False,
                  test_dataset,
                  vocab=test_vocab,
                  vocab_emb=test_vocab_emb)
        print("save test set successfully...")

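    # twitter/TwitterURL ship no separate validation split, so carve one out of the training data here.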
    if dataset_name == 'twitter' or dataset_name == 'TwitterURL':
        val_split = args['val_split']
        num_samples, _ = train_dataset["query_word_input"].shape
        # randomly sample queries and all their documents if query_random is True
        # otherwise, query-doc pairs are randomly sampled
        query_random = dataset_name == 'twitter'
        if query_random:
            del train_dataset["overlap_feat"]
            val_indices = sample_aaai_val_set(args["raw_data"], train_set,
                                              val_split)
        else:
            val_split = 0.1
            val_indices, val_set = [], set()
            for i in range(int(num_samples * val_split)):
                val_index = np.random.randint(num_samples)
                while val_index in val_set:
                    val_index = np.random.randint(num_samples)
                val_indices.append(val_index)
                val_set.add(val_index)

        val_dataset = {}
        for key in train_dataset:
            #print(key, train_dataset[key].shape)
            val_dataset[key] = train_dataset[key][val_indices]
            train_dataset[key] = np.delete(train_dataset[key], val_indices, 0)

    # shuffle the train dataset explicitly to make results reproducible
    # whether this affects performance remains an open question
    keys, values = [], []
    for key in train_dataset:
        if train_dataset[key].size == 0:
            continue
        keys.append(key)
        values.append(train_dataset[key])
    zipped_values = list(zip(*values))
    random.shuffle(zipped_values)
    shuffled_values = list(zip(*zipped_values))
    for i, key in enumerate(keys):
        train_dataset[key] = np.array(shuffled_values[i])
    print('after shuffle:', train_dataset['id'][:5], train_dataset['sim'][:5],
          train_dataset['query_word_input'][:5])

    # merge the vocabulary of the train and test sets
    merged_vocab = {}
    merged_vocab['word'] = merge_two_dicts(vocab['word'], test_vocab['word'])
    merged_vocab['3gram'] = merge_two_dicts(vocab['3gram'],
                                            test_vocab['3gram'])
    print("TRAIN vocab: word(%d) 3gram(%d)" %
          (len(vocab['word']), len(vocab['3gram'])))
    print("TEST vocab: word(%d) 3gram(%d)" %
          (len(test_vocab['word']), len(test_vocab['3gram'])))
    print("MERGED vocab: word(%d) 3gram(%d)" %
          (len(merged_vocab['word']), len(merged_vocab['3gram'])))

    vocab_inv, vocab_size = {}, {}
    for key in vocab:
        vocab_inv[key] = invert_dict(merged_vocab[key])
        vocab_size[key] = len(vocab[key])
    print(vocab_size)

    # Print data samples for debug purpose
    print_dataset(mode, train_dataset, vocab_inv)
    print_dataset(mode, test_dataset, vocab_inv)

    ############################ TRAIN MODEL #################################
    # create model
    model = create_attention_model(max_query_len,
                                   max_doc_len,
                                   max_url_len,
                                   vocab_size,
                                   train_vocab_emb,
                                   args["nb_filters"],
                                   args["nb_layers"],
                                   embed_size=300,
                                   dropout_rate=args['dropout'],
                                   trainable=args["trainable"],
                                   weighting=args['weighting'],
                                   mask=args["mask"],
                                   conv_option=args['conv_option'],
                                   model_option=args['model_option'],
                                   join=args['join'],
                                   num_classes=num_classes,
                                   with_url=with_url,
                                   highway=args['highway'],
                                   att=args['co_attention'],
                                   ext_feat=args["external_feat"],
                                   encoder_option=args['encoder_option'])
    model_name = (
        "model_N%s_data%s_mo%s_e%s_c%s_NumFilter%d_nblayer%d_T%s_D%.1f_W%s_M%s_B%d_Val%.2f_Join%s_H%s_Att%s"
        % (mode, train_name, args['model_option'], args["encoder_option"],
           args['conv_option'], args["nb_filters"], args["nb_layers"],
           args["trainable"], args['dropout'], args['weighting'], args['mask'],
           args['batch_size'], args['val_split'], args['join'],
           args['highway'], args['co_attention'])).lower()
    model_path = "%s/%s/%s" % (args['experimental_data'], data_name,
                               model_name)
    print(model_path)

    if args['optimizer'] == "adam":
        opt = optimizers.Adam(lr=args["learning_rate"],
                              beta_1=0.9,
                              beta_2=0.999,
                              epsilon=None,
                              decay=0.0,
                              amsgrad=True)
        print('use Adam optimizer')
    elif args['optimizer'] == "sgd":
        opt = optimizers.SGD(lr=args["learning_rate"],
                             decay=1e-6,
                             momentum=0.9,
                             nesterov=True)
        print('use SGD optimizer')
    elif args['optimizer'] == 'rmsprop':
        opt = optimizers.RMSprop(lr=args["learning_rate"],
                                 rho=0.9,
                                 epsilon=None,
                                 decay=0.0)
        print('use RMSprop optimizer')

    if num_classes <= 2:
        model.compile(loss='binary_crossentropy',
                      optimizer=opt,
                      metrics=['accuracy'])
    else:
        print('compile model with categorical cross-entropy')
        model.compile(loss='categorical_crossentropy',
                      optimizer=opt,
                      metrics=['accuracy'])
    class_weight = None
    if args['dataset'] == 'Quora':
        #class_weight = {0:1, 1:2}
        print('apply class weight:', class_weight)

    print(model.summary())
    print('model init weights sum: %.4f' % get_model_weights(model))
    if not args['load_model']:
        early_stopping = EarlyStopping(monitor='val_loss', patience=4)
        checkpoint = ModelCheckpoint(filepath=model_path + ".best.weights",
                                     monitor='val_loss',
                                     save_best_only=True,
                                     verbose=1)
        lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                       factor=0.5,
                                       patience=2,
                                       min_lr=0.0001,
                                       verbose=1)
        model.fit(
            train_dataset,
            train_dataset['sim'],  #validation_split=0.05,
            batch_size=args['batch_size'],
            validation_data=(val_dataset, val_dataset['sim']),
            epochs=args['epochs'],
            shuffle=False,
            callbacks=[checkpoint, lr_reducer, early_stopping],
            class_weight=class_weight,
            verbose=args['verbose'])

    ############################ TEST MODEL #################################
    print('load best model from %s.best.weights' % model_path)
    model.load_weights("%s.best.weights" % model_path)
    # load trained vocab embedding.
    trained_vocab_emb = model.get_layer('word-embedding').get_weights()[0]
    # merge trained vocab embedding with test OOV word embeddings
    merged_vocab_emb = np.zeros(shape=(len(merged_vocab['word']), 300))
    merged_vocab_emb[0:len(vocab['word']), :] = trained_vocab_emb
    merged_vocab_emb[
        len(vocab['word']):len(merged_vocab['word']), :] = test_vocab_emb
    for key in vocab:
        vocab_size[key] = len(merged_vocab[key])
    print(vocab_size)

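    # Rebuild the model with the merged embedding matrix so test-only words get their
    # pretrained vectors; every other layer's weights are copied from the trained model below.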
    new_model = create_attention_model(max_query_len,
                                       max_doc_len,
                                       max_url_len,
                                       vocab_size,
                                       merged_vocab_emb,
                                       args["nb_filters"],
                                       args["nb_layers"],
                                       embed_size=300,
                                       dropout_rate=args['dropout'],
                                       trainable=args["trainable"],
                                       weighting=args['weighting'],
                                       mask=args["mask"],
                                       conv_option=args['conv_option'],
                                       model_option=args['model_option'],
                                       join=args['join'],
                                       num_classes=num_classes,
                                       with_url=with_url,
                                       highway=args['highway'],
                                       att=args['co_attention'],
                                       ext_feat=args["external_feat"],
                                       encoder_option=args['encoder_option'])
    new_model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
    #print(new_model.summary())
    for layer_id in range(len(model.layers)):
        layer = model.layers[layer_id]
        if layer.name != 'word-embedding':
            new_model.layers[layer_id].set_weights(layer.get_weights())
    print('copy weight done.')
    val_predictions = new_model.predict(val_dataset)
    predictions = new_model.predict(test_dataset)

    if dataset_name == 'twitter' or dataset_name == 'TrecQA':
        val_predictions = val_predictions[:, 1]
        predictions = predictions[:, 1]
        print(predictions[:10])
        predictions_file = "%s/%s/predictions_%s.txt" % (
            args["experimental_data"], data_name, model_name)
        with open(predictions_file, 'w') as f:
            for i in range(test_dataset['id'].shape[0]):
                f.write("%s %.4f %s\n" %
                        (test_dataset['id'][i], predictions[i], args['mode']))
        print('write predictions with trec format to %s' % predictions_file)
        val_predictions_file = "%s/%s/val_predictions_%s.txt" % (
            args["experimental_data"], data_name, model_name)
        with open(val_predictions_file, 'w') as f:
            for i in range(val_dataset['id'].shape[0]):
                f.write(
                    "%s %.4f %s\n" %
                    (val_dataset['id'][i], val_predictions[i], args['mode']))
        map, mrr, p30 = evaluate(val_predictions_file, args["qrels_file"])
        print('write val predictions with trec format to %s' %
              val_predictions_file)
        print('Validation MAP: %.4f P30: %.4f MRR: %.4f' % (map, p30, mrr))
        map, mrr, p30 = evaluate(predictions_file, args["qrels_file"])
        print('MAP: %.4f P30: %.4f MRR: %.4f' % (map, p30, mrr))
    else:
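        # Multi-class case: take the argmax over class probabilities, write per-example
        # predictions, and report accuracy plus macro/micro precision/recall/F1 and the
        # confusion matrix.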
        preds = np.argmax(predictions, axis=-1)
        labels = np.argmax(test_dataset['sim'], axis=-1)
        corrects = preds == labels
        predictions_file = "%s/%s/predictions_%s.txt" % (
            args["experimental_data"], data_name, model_name)
        with open(predictions_file, 'w') as f:
            f.write("id label pred prob model\n")
            for i in range(len(preds)):
                f.write("%s %s %s %.4f %s\n" %
                        (test_dataset['id'][i], labels[i], preds[i],
                         predictions[i][preds[i]], args['mode']))
        print('write predictions with trec format to %s' % predictions_file)
        val_preds = np.argmax(val_predictions, axis=-1)
        val_labels = np.argmax(val_dataset['sim'], axis=-1)
        val_corrects = val_preds == val_labels
        val_predictions_file = "%s/%s/val_predictions_%s.txt" % (
            args["experimental_data"], data_name, model_name)
        with open(val_predictions_file, 'w') as f:
            for i in range(val_dataset['id'].shape[0]):
                f.write("%s %s %s %.4f %s\n" %
                        (val_dataset['id'][i], val_labels[i], val_preds[i],
                         val_predictions[i][val_preds[i]], args['mode']))
        print('write val predictions with trec format to %s' %
              val_predictions_file)

        print('val accuracy: %.4f' %
              (np.count_nonzero(val_corrects) * 1.0 / len(val_preds)))
        print('accuracy: %.4f' %
              (np.count_nonzero(corrects) * 1.0 / len(preds)))
        macro_prec = precision_score(labels, preds, average="macro")
        macro_recall = recall_score(labels, preds, average="macro")
        print('Macro Precision: %.3f, Recall: %.3f, F1: %.3f' %
              (macro_prec, macro_recall, 2 * macro_prec * macro_recall /
               (macro_prec + macro_recall)))
        print('Micro Precision: %.3f, Recall: %.3f, F1: %.3f' %
              (precision_score(labels, preds, average="micro"),
               recall_score(labels, preds, average="micro"),
               f1_score(labels, preds, average="micro")))
        print('Confusion matrix:', confusion_matrix(labels, preds))
Beispiel #20
def main(options):
    args = get_default_args()
    load_best_args(args, options, get_best_args())
    set_args(args, options)
    print_args(args)
    mode = args['mode']
    train_name, test_name = args['split']['train'], args['split']['test']
    if train_name == 'train_all':
        train_set = ['trec-2011', 'trec-2012', 'trec-2013', 'trec-2014']
        train_set.remove(test_name)
    else:
        train_set = [train_name]
    test_set = test_name
    print('train_set: {}, test_set: {}'.format(train_set, test_set))
    max_query_len, max_doc_len, max_url_len = defaultdict(int), defaultdict(
        int), defaultdict(int)
    vocab = {'word': {}, '3gram': {}, 'url': {}}
    test_vocab = {'word': {}, '3gram': {}, 'url': {}}

    ############################# LOAD DATA ##################################
    data_name = ("data_m%s_%s_%s" % (mode, train_name, test_name)).lower()
    if args["load_data"]:
        train_dataset, vocab, train_vocab_emb, max_query_len, max_doc_len, max_url_len = load_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, train_name),
            True)
        test_dataset, test_vocab, test_vocab_emb, _, _, _ = load_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, test_name),
            False)
        print('load dataset successfully')
    else:
        train_dataset = gen_data(args["raw_data"], train_set, vocab,
                                 test_vocab, True, max_query_len, max_doc_len,
                                 max_url_len, args)
        print("create training set successfully...")
        test_dataset = gen_data(args["raw_data"], [test_set], vocab,
                                test_vocab, False, max_query_len, max_doc_len,
                                max_url_len, args)
        train_vocab_emb, test_vocab_emb = construct_vocab_emb(
            "%s/%s" % (args["experimental_data"], data_name),
            vocab['word'],
            test_vocab['word'],
            300,
            "word",
            base_embed_path=args["base_embed_path"])
        save_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, train_name),
            True, train_dataset, max_query_len, max_doc_len, max_url_len,
            vocab, train_vocab_emb)
        print("save training set successfully...")
        save_data("%s/%s/%s" %
                  (args["experimental_data"], data_name, test_name),
                  False,
                  test_dataset,
                  vocab=test_vocab,
                  vocab_emb=test_vocab_emb)
        print("save test set successfully...")

    val_split = args['val_split']
    num_samples, _ = train_dataset["query_word_input"].shape
    # randomly sample queries and all their documents if query_random is True
    # otherwise, query-doc pairs are randomly sampled
    query_random = True
    if query_random:
        val_indices = sample_val_set(args["raw_data"], train_set, val_split)
    else:
        val_indices, val_set = [], set()
        for i in range(int(num_samples * val_split)):
            val_index = np.random.randint(num_samples)
            while val_index in val_set:
                val_index = np.random.randint(num_samples)
            val_indices.append(val_index)
            val_set.add(val_index)

    val_dataset = {}
    for key in train_dataset:
        val_dataset[key] = train_dataset[key][val_indices]
        train_dataset[key] = np.delete(train_dataset[key], val_indices, 0)

    # shuffle the train dataset explicitly to make results reproducible
    # whether this affects performance remains an open question
    keys, values = [], []
    for key in train_dataset:
        keys.append(key)
        values.append(train_dataset[key])
    zipped_values = list(zip(*values))
    random.shuffle(zipped_values)
    shuffled_values = list(zip(*zipped_values))
    for i, key in enumerate(keys):
        train_dataset[key] = np.array(shuffled_values[i])
    print('after shuffle: id {}, sim {}, query_word_input'.format(
        train_dataset['id'][:3], train_dataset['sim'][:3],
        train_dataset['query_word_input'][:3]))

    # merge the vocabulary of the train and test sets
    merged_vocab = {'url': vocab['url'], '3gram': vocab['3gram']}
    merged_vocab['word'] = merge_two_dicts(vocab['word'], test_vocab['word'])
    print("merged vocab: word(%d) 3gram(%d)" %
          (len(merged_vocab['word']), len(test_vocab['3gram'])))
    vocab_inv, vocab_size = {}, {}
    vocab['char'] = merge_two_dicts(vocab['3gram'], vocab['url'])
    test_vocab['char'] = merge_two_dicts(test_vocab['3gram'],
                                         test_vocab['url'])
    merged_vocab['char'] = merge_two_dicts(vocab['char'], test_vocab['char'])

    for key in vocab:
        vocab_inv[key] = invert_dict(merged_vocab[key])
        vocab_size[key] = len(vocab[key])
    print(vocab_size)

    # Print data samples for debug purpose
    print_dataset(mode, train_dataset, vocab_inv)
    print_dataset(mode, test_dataset, vocab_inv)

    ############################ TRAIN MODEL #################################
    model = None
    if mode == 'deep_twitter':
        model = create_attention_model(max_query_len,
                                       max_doc_len,
                                       max_url_len,
                                       vocab_size,
                                       train_vocab_emb,
                                       args["nb_filters"],
                                       embed_size=300,
                                       dropout_rate=args['dropout'],
                                       trainable=args["trainable"],
                                       weighting=args['weighting'],
                                       mask=args["mask"],
                                       conv_option=args['conv_option'],
                                       model_option=args['model_option'])
    model_name = (
        "model_N%s_data%s_mo%s_c%s_NumFilter%d_T%s_D%.1f_W%s_M%s_B%d_Val%.2f" %
        (mode, train_name, args['model_option'], args['conv_option'],
         args["nb_filters"], args["trainable"], args['dropout'],
         args['weighting'], args['mask'], args['batch_size'],
         args['val_split'])).lower()
    model_path = "%s/%s/%s" % (args['experimental_data'], data_name,
                               model_name)
    print(model_path)

    if args['optimizer'] == "adam":
        opt = optimizers.Adam(lr=args["learning_rate"],
                              beta_1=0.9,
                              beta_2=0.999,
                              epsilon=None,
                              decay=0.0,
                              amsgrad=True)
        print('use Adam optimizer')
    elif args['optimizer'] == "sgd":
        opt = optimizers.SGD(lr=args["learning_rate"],
                             decay=1e-6,
                             momentum=0.9,
                             nesterov=True)
        print('use SGD optimizer')
    elif args['optimizer'] == 'rmsprop':
        opt = optimizers.RMSprop(lr=args["learning_rate"],
                                 rho=0.9,
                                 epsilon=None,
                                 decay=0.0)
        print('use RMSprop optimizer')

    model.compile(loss='binary_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    print(model.summary())
    print('model init weights sum: %.4f' % get_model_weights(model))
    if not args['load_model']:
        early_stopping = EarlyStopping(monitor='val_loss', patience=4)
        checkpoint = ModelCheckpoint(filepath=model_path + ".best.weights",
                                     monitor='val_loss',
                                     save_best_only=True,
                                     verbose=1)
        lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                       factor=0.5,
                                       patience=2,
                                       min_lr=0.0001,
                                       verbose=1)
        #print(train_dataset['id'][:3], val_dataset['id'][:3], val_dataset['id'][-3:])
        model.fit(train_dataset,
                  train_dataset['sim'],
                  validation_data=(val_dataset, val_dataset['sim']),
                  batch_size=args['batch_size'],
                  epochs=args['epochs'],
                  shuffle=False,
                  callbacks=[checkpoint, lr_reducer, early_stopping],
                  verbose=args['verbose'])

    ############################ TEST MODEL #################################
    print('load best model from %s.best.weights' % model_path)
    model.load_weights("%s.best.weights" % model_path)
    if mode == 'deep_twitter':
        # load trained vocab embedding.
        trained_vocab_emb = model.get_layer('sequential_2').get_weights()[0]
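        # ('sequential_2' is presumably the auto-generated Keras name of the word-embedding
        #  sub-model; the same name is skipped when the remaining weights are copied below.)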
        # merge trained vocab embedding with test OOV word embeddings
        merged_vocab_emb = np.zeros(shape=(len(merged_vocab['word']), 300))
        merged_vocab_emb[0:len(vocab['word']), :] = trained_vocab_emb
        merged_vocab_emb[
            len(vocab['word']):len(merged_vocab['word']), :] = test_vocab_emb
        for key in vocab:
            vocab_size[key] = len(merged_vocab[key])
        print(vocab_size)

        new_model = create_attention_model(max_query_len,
                                           max_doc_len,
                                           max_url_len,
                                           vocab_size,
                                           merged_vocab_emb,
                                           args["nb_filters"],
                                           embed_size=300,
                                           dropout_rate=args['dropout'],
                                           trainable=args["trainable"],
                                           weighting=args['weighting'],
                                           mask=args["mask"],
                                           conv_option=args['conv_option'],
                                           model_option=args['model_option'])
        new_model.compile(loss='binary_crossentropy',
                          optimizer='adam',
                          metrics=['accuracy'])
        print(new_model.summary())
        num_layers = 0
        for layer in model.layers:
            num_layers += 1
        for layer_id in range(num_layers):
            layer = model.layers[layer_id]
            if layer.name != 'sequential_2':
                new_model.layers[layer_id].set_weights(layer.get_weights())
        print('copy weight done.')
        predictions = new_model.predict(test_dataset)

    print(predictions[:10])
    predictions_file = "%s/%s/predictions_%s.txt" % (args["experimental_data"],
                                                     data_name, model_name)
    with open(predictions_file, 'w') as f:
        for i in range(test_dataset['id'].shape[0]):
            f.write("%s %.4f %s\n" %
                    (test_dataset['id'][i], predictions[i], args['mode']))
    print('write predictions with trec format to %s' % predictions_file)
    map, mrr, p30 = evaluate(predictions_file, args["qrels_file"])
    print('MAP: %.4f P30: %.4f MRR: %.4f' % (map, p30, mrr))
Beispiel #21
def main(argv):
    ''' Parse args, init dataloader '''
    foldNum, dataset, subtitle, rating_file, usr2labels_file = parseArgs(
        argv[:4], **dict(arg.split('=') for arg in argv[4:]))
    if rating_file and usr2labels_file:
        dataloader = DATA2LOADER[dataset](
            rating_file=rating_file,
            usr2labels_file=usr2labels_file,
            sub=subtitle,
        )
    else:
        dataloader = DATA2LOADER[dataset]()
    ''' Load training configs '''
    NEG_SAMPLE_NUM, \
        ITEM_FIELDS_NUM, \
        MAX_TRAIN_NUM, \
        LEARNING_RATE, \
        MOMENTUM, \
        LAMBDA = dataloader.getTrainingConf()
    ''' Load each usr's BOI (and for valid data) '''
    usr2itemsIndx, ind2itemNum = dataloader.load()
    usrs = list(usr2itemsIndx)
    ''' Assert enough usrs '''
    if foldNum > len(usrs):
        s = ' '.join(['foldNum: ', str(foldNum), '>', 'usrNums:', str(len(usrs))])
        raise Exception(s)
    ''' Acquire (for all usrs) usr2labels & usr2NonzeroCols '''
    usr2labels, usr2NonzeroCols = dataloader.get_labels(usrs)
    ''' Init Baseupdator '''
    baseupdator = Baseupdator(*dataloader.getTrainingConf())
    ''' K-fold validation '''
    kfolds = splitKfolds(usr2itemsIndx, foldNum)
    for ind, fold in enumerate(kfolds):
        # Init train/valid folds
        usr2itemsIndxValid = fold
        usr2itemsIndxTrain = {}
        for tind, tfold in enumerate(kfolds):
            if ind != tind:
                usr2itemsIndxTrain = merge_two_dicts(usr2itemsIndxTrain, tfold)

        # Init statevalidator
        statevalidator = DATA2VALIDATOR[dataset](
            dataset=dataset,
            datasetSub=dataloader.getDataSub(),
            curFold=ind,
            totalFolds=len(kfolds),
            usr2itemsIndxTrain=usr2itemsIndxTrain,
            usr2itemsIndxValid=usr2itemsIndxValid,
            MAX_TRAIN_NUM=MAX_TRAIN_NUM,
            ITEM_FIELDS_NUM=ITEM_FIELDS_NUM,
        )
        statevalidator.logFoldInfo()
        ''' acquire (k times) usr2NegativeSamples & usr2negsNonzeroCols '''
        cdfByLabels, labelsList = getDistribution(usr2labels)
        usr2NegativeSamples, usr2negsNonzeroCols = negativeSample(
            usr2labels, cdfByLabels, labelsList, k=NEG_SAMPLE_NUM)
        logging.info('usr2NegativeSamples, usr2negsNonzeroCols created')
        ''' init V to [-1, 1) '''
        numOfItems = len(ind2itemNum)
        V = 2 * nprandom.rand(numOfItems, ITEM_FIELDS_NUM) - 1
        logging.info('V inited, V.shape == ' + str(V.shape) +
                     ' == (num items, itemFeatures length)')
        ''' init W to [-1, 1); init pooler'''
        # Warning: ITEM_FIELDS_NUM is assumed to equal the dimension of the user representation
        # (no dimension reduction in the pooler!)
        totalLabelsNum = dataloader.gettotalLabelsNum()
        W = 2 * nprandom.rand(ITEM_FIELDS_NUM, totalLabelsNum) - 1
        pooler = sample_pooler()
        logging.info('W & pooler inited, W.shape == ' + str(W.shape) +
                     ' == (itemFeatures length, total labels num)')
        logging.debug(' '.join(['W', str(W)]))
        logging.debug(' '.join(['V', str(V)]))
        ''' learn W, V '''
        while statevalidator.notConv():
            # Init next run
            statevalidator.nextRun()

            # NegSampling or not
            if statevalidator.shouldNegSample():
                statevalidator.logStartNegSample()
                usr2NegativeSamples, usr2negsNonzeroCols = negativeSample(
                    usr2labels, cdfByLabels, labelsList, k=NEG_SAMPLE_NUM)
                statevalidator.logNegSampleInfo(usr2NegativeSamples)

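            # Per-user SGD pass: pool item vectors into a user representation, then update
            # W and V from the positive label columns and the sampled negative labels.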
            for usrid in usr2itemsIndxTrain:
                # Pooling
                usr_rep = pooler.pool_all(usr2itemsIndxTrain[usrid], V)

                # Get y, sumedW(for y AND negs), sigmoids(for y AND negs)
                y, y_nonzeroCols, itemsIndx, sumedW_y, sigmoid_y, \
                    y_negsNonzeroCols, sumedW_negs, sigmoids_negs, \
                    sigmoidedSumedW = baseupdator.getTerms(
                        usrid,
                        usr2labels,
                        usr2NonzeroCols,
                        usr2itemsIndxTrain,
                        W,
                        usr_rep,
                        usr2negsNonzeroCols,)

                # Get gradient of Wq (i.e. q-th column of W)
                gradsOfW = baseupdator.getGradsOfW(
                    W,
                    y_nonzeroCols,
                    sigmoid_y,
                    usr_rep,
                    sigmoids_negs,
                    y_negsNonzeroCols,
                )

                # Get gradient of Vitem
                gradsOfV = baseupdator.getGradsOfV(
                    V,
                    itemsIndx,
                    sumedW_y,
                    sigmoid_y,
                    sigmoidedSumedW,
                )

                # Update W, V by usr, not by epoch
                # Update gradients to W, V
                W, V = baseupdator.updateByGradients(
                    W,
                    V,
                    gradsOfW,
                    gradsOfV,
                    statevalidator.incrInd,
                )

            # Reveal stats/predictions
            if statevalidator.shouldRevealStats():
                # Cal loss if needed
                if statevalidator.shouldCalLoss():
                    loss = baseupdator.getLoss(
                        W,
                        V,
                        usr2NonzeroCols,
                        usr2negsNonzeroCols,
                        usr2itemsIndxTrain,
                        pooler,
                    )
                    statevalidator.updateLossState(loss)
                    statevalidator.logLossStates(W, V, loss)

                # Do predictions
                statevalidator.logStartPrediction()
                dataStats = statevalidator.getDataStats(
                    usr2itemsIndxValid, usr2itemsIndxTrain, usr2NonzeroCols)
                for d in dataStats:
                    usr2itemsIndx = d['usr2itemsIndx']
                    u2predictions = d['u2predictions']
                    for usrid in usr2itemsIndx:
                        usr_rep = pooler.pool_all(usr2itemsIndx[usrid], V)
                        bestCols = baseupdator.predictLabels(
                            usr_rep, W, dataloader.getBds())
                        u2predictions[usrid] = bestCols

                # Collect Stats
                statevalidator.logCollectingStats()
                KPI2getters = {
                    'microF1': getMicroF1ByCol,
                    'oneError': getOneError,
                    'RL': getRL,
                    'coverage': getCoverage,
                    'avgPrec': getAvgPrecision,
                    'hammingLoss': getHammingLoss,
                }
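                # Compute every KPI for each entry in dataStats and persist the results
                # via writeCSVStats.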
                for d in dataStats:
                    KPIArgs = {
                        'W': W,
                        'V': V,
                        'usr2itemsIndx': d['usr2itemsIndx'],
                        'usr2NonzeroCols': usr2NonzeroCols,
                        'u2predictions': d['u2predictions'],
                        'totalLabelsNum': dataloader.gettotalLabelsNum(),
                        'rlPairsCnt': dataloader.getRLPairsCnt(),
                    }
                    d['KPIs'] = {
                        kpi: getter(KPIArgs)
                        for kpi, getter in KPI2getters.iteritems()
                    }
                    # OR (no write): statevalidator.logStats(d)
                    statevalidator.writeCSVStats(d)

                # Log real, predicted
                if not TEST_SNE:
                    for d in dataStats:
                        statevalidator.logRealPredictedVals(d)
    return 1
Beispiel #22
def gen_data(path, datasets, vocab, test_vocab, is_train, max_query_len,
             max_doc_len, max_url_len, args):
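    # Build padded word / character-3gram / URL-3gram tensors, masks, and (if available)
    # IDF-based term weights for one or more datasets combined into a single split.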
    if is_train:
        vocab['word']['PAD_WORD_INDEX'] = PAD_WORD_INDEX
        vocab['word']['OOV_WORD_INDEX'] = OOV_WORD_INDEX
        vocab['3gram']['PAD_3GRAM_INDEX'] = PAD_WORD_INDEX
        vocab['3gram']['OOV_3GRAM_INDEX'] = OOV_WORD_INDEX
        vocab['url']['PAD_URL_INDEX'] = PAD_WORD_INDEX
        vocab['url']['OOV_URL_INDEX'] = OOV_WORD_INDEX
    query_word_list, doc_word_list, query_3gram_list, doc_3gram_list = [], [], [], []
    all_url_list, all_ids_list, all_sim_list = [], [], []
    t0 = time.time()
    for data_name in datasets:  # there can be multiple data sets combined as the train or test data
        data_folder = "%s/%s" % (path, data_name)
        print('load dataset %s' % data_name)
        t = time.time()
        q1_word_list, max_q1_word_len = read_sentences("%s/a.toks" %
                                                       data_folder,
                                                       vocab,
                                                       is_train,
                                                       "word",
                                                       test_vocab=test_vocab)
        q2_word_list, max_q2_word_len = read_sentences("%s/b.toks" %
                                                       data_folder,
                                                       vocab,
                                                       is_train,
                                                       "word",
                                                       test_vocab=test_vocab)
        q1_3gram_list, max_q1_3gram_len = read_sentences("%s/a.toks" %
                                                         data_folder,
                                                         vocab,
                                                         is_train,
                                                         "3gram",
                                                         test_vocab=test_vocab)
        q2_3gram_list, max_q2_3gram_len = read_sentences("%s/b.toks" %
                                                         data_folder,
                                                         vocab,
                                                         is_train,
                                                         "3gram",
                                                         test_vocab=test_vocab)
        url_list, max_url_len_dataset = read_urls("%s/url.txt" % data_folder,
                                                  vocab, is_train, '3gram')
        ids_list = read_metadata("%s/id.txt" % data_folder)
        if is_train:
            max_query_len['word'] = max(max_query_len['word'], max_q1_word_len)
            max_query_len['3gram'] = max(max_query_len['3gram'],
                                         max_q1_3gram_len)
            max_doc_len['word'] = max(max_doc_len['word'], max_q2_word_len)
            max_doc_len['3gram'] = max(max_doc_len['3gram'],
                                       min(max_q2_3gram_len, MAX_TWEET_LENGTH))
            max_url_len['url'] = max(max_url_len['url'],
                                     min(max_url_len_dataset, MAX_URL_LENGTH))
        sim_list = read_relevance("%s/sim.txt" % data_folder)
        query_word_list.extend(q1_word_list)
        doc_word_list.extend(q2_word_list)
        query_3gram_list.extend(q1_3gram_list)
        doc_3gram_list.extend(q2_3gram_list)
        all_url_list.extend(url_list)
        all_ids_list.extend(ids_list)
        all_sim_list.extend(sim_list)
        print("q1 max_word_len: %d, q2 max_word_len: %d, len limit: (%d, %d)" %
              (max_q1_word_len, max_q2_word_len, max_query_len['word'],
               max_doc_len['word']))
        print(
            "q1 max_3gram_len: %d, q2 max_3gram_len: %d, len limit: (%d, %d)" %
            (max_q1_3gram_len, max_q2_3gram_len, max_query_len['3gram'],
             max_doc_len['3gram']))
        print('max_url_len: %d, limit: %d' %
              (max_url_len_dataset, max_url_len['url']))
        print('load dataset done: %d' % (time.time() - t))

    # question padding
    data = {'sim': np.array(all_sim_list), 'id': np.array(all_ids_list)}
    data['query_word_input'] = pad_sequences(query_word_list,
                                             maxlen=max_query_len['word'],
                                             value=PAD_WORD_INDEX,
                                             padding='post',
                                             truncating='post')
    data['query_word_mask'] = create_masks(data['query_word_input'], args)
    data['doc_word_input'] = pad_sequences(doc_word_list,
                                           maxlen=max_doc_len['word'],
                                           value=PAD_WORD_INDEX,
                                           padding='post',
                                           truncating='post')
    data['doc_word_mask'] = create_masks(data['doc_word_input'], args)
    data['query_3gram_input'] = pad_sequences(query_3gram_list,
                                              maxlen=max_query_len['3gram'],
                                              value=PAD_WORD_INDEX,
                                              padding='post',
                                              truncating='post')
    data['query_3gram_mask'] = create_masks(data['query_3gram_input'], args)
    data['doc_3gram_input'] = pad_sequences(doc_3gram_list,
                                            maxlen=max_doc_len['3gram'],
                                            value=PAD_WORD_INDEX,
                                            padding='post',
                                            truncating='post')
    data['doc_3gram_mask'] = create_masks(data['doc_3gram_input'], args)
    data['url_3gram_input'] = pad_sequences(all_url_list,
                                            maxlen=max_url_len['url'],
                                            value=PAD_WORD_INDEX,
                                            padding='post',
                                            truncating='pre')
    data['url_3gram_mask'] = create_masks(data['url_3gram_input'], args)

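    # Attach collection-level IDF weights for 3-grams and words when the precomputed JSON files exist.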
    if os.path.exists("%s/collection_ngram_idf.json" % path):
        t = time.time()
        weights = json.load(open("%s/collection_ngram_idf.json" % path, "r"))
        vocab_inv = invert_dict(vocab['3gram'])
        data['query_3gram_weight'] = inject_ngram_weight(
            data['query_3gram_input'], vocab_inv, weights)
        data['doc_3gram_weight'] = inject_ngram_weight(data['doc_3gram_input'],
                                                       vocab_inv, weights)
        vocab_inv = invert_dict(vocab['url'])
        data['url_3gram_weight'] = inject_ngram_weight(data['url_3gram_input'],
                                                       vocab_inv, weights)
        print('ngram weight injection done: %d' % (time.time() - t))

    if os.path.exists("%s/collection_word_idf.json" % path):
        t = time.time()
        weights = json.load(open("%s/collection_word_idf.json" % path, "r"))
        merge_vocab = merge_two_dicts(vocab['word'], test_vocab['word'])
        vocab_inv = invert_dict(merge_vocab)
        data['query_word_weight'] = inject_word_weight(
            data['query_word_input'], vocab_inv, weights)
        data['doc_word_weight'] = inject_word_weight(data['doc_word_input'],
                                                     vocab_inv, weights)
        data['overlap_feat'] = compute_overlap_feat(data['query_word_input'],
                                                    data['doc_word_input'],
                                                    vocab_inv, weights)
        print('word weight injection done: %d' % (time.time() - t))

    print('data creation is done: %d' % (time.time() - t0))
    return data
Beispiel #23
def main():
    # `args` is assumed to be parsed at module level (not shown in this snippet).
    opt = COptions(args)
    opt_t = COptions(args)

    loadpath = (opt.data_dir + "/" + opt.data_name) 
    print "loadpath:" + loadpath
    x = cPickle.load(open(loadpath, "rb"))
    train, val, test = x[0], x[1], x[2]
    wordtoix, ixtoword = x[3], x[4]

    if opt.test:
        test_file = opt.data_dir + opt.test_file 
        test = read_test(test_file, wordtoix)
        
    opt.n_words = len(ixtoword) 
    opt_t.n_words = len(ixtoword)
    opt_t.maxlen = opt_t.maxlen - opt_t.filter_shape + 1
    opt_t.update_params(args)
    print datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y")
    print dict(opt)
    print('Total words: %d' % opt.n_words)

  
    for d in ['/gpu:0']:
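        # Placeholders for the n_context source sentences, the target sentence, and a train/test
        # flag; get_features builds the feature-extraction graph on top of them.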
        with tf.device(d):
            src_ = [tf.placeholder(tf.int32, shape=[opt.batch_size, opt.sent_len]) for _ in range(opt.n_context)]
            tgt_ = tf.placeholder(tf.int32, shape=[opt_t.batch_size, opt_t.sent_len])
            
            is_train_ = tf.placeholder(tf.bool, name='is_train')
            res_1_ = get_features(src_, tgt_, is_train_, opt, opt_t)
            merged = tf.summary.merge_all()

    uidx = 0
    graph_options = tf.GraphOptions(build_cost_model=1)
    config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True,
                            graph_options=graph_options)
    np.set_printoptions(precision=3, threshold=np.inf)
    saver = tf.train.Saver()

    run_metadata = tf.RunMetadata()

    with tf.Session(config=config) as sess:
        train_writer = tf.summary.FileWriter(opt.log_path + '/train', sess.graph)
        test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph)
        sess.run(tf.global_variables_initializer())
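        # Optionally restore pretrained weights: either the separate global ('d_') and local ('l_')
        # variable groups, or a full checkpoint; fall back to random initialization on failure.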
        if opt.restore:
            try:       
                t_vars = tf.trainable_variables()  
                if opt.load_from_pretrain:
                    d_vars = [var for var in t_vars if var.name.startswith('d_')]
                    l_vars = [var for var in t_vars if var.name.startswith('l_')]
                    restore_from_save(d_vars, sess, opt, load_path = opt.restore_dir + "/save/" + opt.global_d)
                    if opt.local_feature:
                        restore_from_save(l_vars, sess, opt, load_path = opt.restore_dir + "/save/" + opt.local_d)
                else:
                    loader = restore_from_save(t_vars, sess, opt, load_path = opt.save_path)

            except Exception as e:
                print 'Error: '+str(e)
                print("No saving session, using random initialization")
                sess.run(tf.global_variables_initializer())
        loss_d, loss_g = 0, 0

        if opt.test:
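            # Feed the test set through in batches (wrapping the index so the last partial batch
            # is filled) and collect the global (z) and local (z_l) feature vectors.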
            iter_num = int(np.floor(len(test) / opt.batch_size)) + 1
            z_all, z_all_l = [], []
            for i in range(iter_num):
                test_index = range(i * opt.batch_size, (i + 1) * opt.batch_size)
                sents = [test[t % len(test)] for t in test_index]
                # Use `j` for the inner index so the outer batch counter `i` is not shadowed.
                src = [[sents[j][0] for j in range(opt.batch_size)]]
                tgt = [sents[j][0] for j in range(opt.batch_size)]
                x_batch = [prepare_data_for_cnn(src_i, opt) for src_i in src]
                print "Source:" + u' '.join([ixtoword[x] for s in x_batch for x in s[0] if x != 0]).encode('utf-8').strip()
                y_batch = prepare_data_for_rnn(tgt, opt_t, is_add_GO=False)
                # Merge the per-context source feeds with the target and train-flag feeds into one feed dict.
                feed = merge_two_dicts({ph: x for ph, x in zip(src_, x_batch)},
                                       {tgt_: y_batch, is_train_: 0})
                res_1 = sess.run(res_1_, feed_dict=feed)
                z_all.extend(res_1['z'])  
                z_all_l.extend(res_1['z_l'])                        

            # Write the global features (z), one tab-separated vector per test example.
            save_path_z = opt.log_path + '.global.z.txt'
            print save_path_z
            if os.path.exists(save_path_z):
                os.remove(save_path_z)
            with open(save_path_z, "a") as myfile:
                for line in z_all[:len(test)]:
                    for z_it in line:
                        myfile.write(str(z_it) + '\t')
                    myfile.write('\n')
            
            # Write the local features (z_l) in the same format.
            save_path_z = opt.log_path + '.local.z.txt'
            print save_path_z
            if os.path.exists(save_path_z):
                os.remove(save_path_z)
            with open(save_path_z, "a") as myfile:
                for line in z_all_l[:len(test)]:
                    for z_it in line:
                        myfile.write(str(z_it) + '\t')
                    myfile.write('\n')