Example 1
def create_model(session, data_set, forward_only):
	"""Create model and initialize or load parameters in session."""
	click_model = None
	with open(FLAGS.click_model_json) as fin:
		model_desc = json.load(fin)
		click_model = cm.loadModelFromJson(model_desc)
	p_estimator = None
	with open(FLAGS.estimator_json) as fin:
		data = json.load(fin)
		if 'IPW_list' in data: # Randomized estimator
			p_estimator = pe.RandomizedPropensityEstimator(FLAGS.estimator_json)
		else: # Oracle estimator
			p_estimator = pe.OraclePropensityEstimator(cm.loadModelFromJson(data))
	
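	# Build the IPW-based ranking model.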
	model = IPWrank(click_model, p_estimator, FLAGS.use_non_clicked_data, data_set.rank_list_size, 
		data_set.embed_size, FLAGS.batch_size, FLAGS.hparams, forward_only, FLAGS.feed_previous)

	ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
	if ckpt:
		print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
		model.saver.restore(session, ckpt.model_checkpoint_path)
	else:
		print("Created model with fresh parameters.")
		session.run(tf.global_variables_initializer())
	return model
Example 2
def main():
    click_model_json_file = sys.argv[1]
    data_dir = sys.argv[2]
    output_file = sys.argv[3]

    print("Load data from " + data_dir)
    train_set = data_utils.read_data(data_dir, 'train')
    click_model = None
    with open(click_model_json_file) as fin:
        model_desc = json.load(fin)
        click_model = CM.loadModelFromJson(model_desc)
    print("Estimating...")
    estimator = RandomizedPropensityEstimator()
    estimator.estimateParametersFromModel(click_model, train_set)
    print("Output results...")
    estimator.outputEstimatorToFile(output_file)
Example 3
def create_model(session, data_set, forward_only):
	"""Create model and initialize or load parameters in session."""
	click_model = None
	with open(FLAGS.click_model_json) as fin:
		model_desc = json.load(fin)
		click_model = cm.loadModelFromJson(model_desc)
	
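	# Build the DLA ranking model on top of the loaded click model.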
	model = DLA(click_model, data_set.rank_list_size, 
		data_set.embed_size, FLAGS.batch_size, FLAGS.hparams, forward_only, FLAGS.feed_previous)

	ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
	if ckpt:
		print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
		model.saver.restore(session, ckpt.model_checkpoint_path)
	else:
		print("Created model with fresh parameters.")
		session.run(tf.global_variables_initializer())
	return model
Example 4
def main():
    CLICK_MODEL_JSON = sys.argv[1]
    # the folder where the input data can be found
    INPUT_DATA_PATH = sys.argv[2]
    # the folder where output should be stored
    OUTPUT_PATH = sys.argv[3]
    # how many results to show in the results page of the ranker
    # this should be equal or smaller than the rank cut when creating the data
    RANK_CUT = int(sys.argv[4])

    with open(CLICK_MODEL_JSON) as fin:
        model_desc = json.load(fin)
        click_model = cm.loadModelFromJson(model_desc)

    # process dataset from file
    train_set = data_utils.read_data(INPUT_DATA_PATH, 'train', RANK_CUT)
    click_log, relevance_log, feature_log = generate_clicks(
        1000000, click_model, train_set.gold_weights, train_set.featuredids)
    timeit_results = timeit.Timer(
        partial(generate_clicks, 1000000, click_model, train_set.gold_weights,
                train_set.featuredids)).repeat(10, 1)
Example 5
import click_models
import data_utils
import json
import os
import sys

if __name__ == "__main__":
    data_dir = '../'

    click_model_file = sys.argv[1]
    click_model_fname = os.path.basename(click_model_file)
    click_model_fname = os.path.splitext(click_model_fname)[0]
    print(click_model_file, click_model_fname)

    click_model = click_models.loadModelFromJson(click_model_file)
    target = '../test_data/'

    train_set = data_utils.parse_data(click_model=click_model,
                                      data_dir=data_dir + 'generate_dataset/',
                                      task='eval',
                                      ti='train',
                                      tp=click_model_fname + '_train',
                                      rank_cut=100000,
                                      target=target)

    test_set = data_utils.parse_data(click_model=click_model,
                                     data_dir=data_dir + 'generate_dataset/',
                                     task='eval',
                                     ti='test',
                                     tp=click_model_fname + '_test',
                                     rank_cut=100000,
                                     target=target)
Example 6
    fout1.close()
    fout2.close()
    fout3.close()
    fout4.close()

    return train_set


if __name__ == "__main__":
    data_dir = '../'
    fi = sys.argv[1]
    name = fi.split('/')[-1]
    name = name[:name.find('.')]
    print(fi, name)
    model_desc = json.load(open(fi))
    click_model = cm.loadModelFromJson(model_desc)
    target = './tmp/'

    train_set = parse_data(click_model=click_model,
                           data_dir=data_dir + 'generate_dataset/',
                           ti='train',
                           tp=name + '_train',
                           rank_cut=100000,
                           target=target)
    test_set = parse_data(click_model=click_model,
                          data_dir=data_dir + 'generate_dataset/',
                          ti='test',
                          tp=name + '_test',
                          rank_cut=100000,
                          target=target)
Example 7
    def loadEstimatorFromFile(self, file_name):
        with open(file_name) as data_file:
            data = json.load(data_file)
            self.click_model = CM.loadModelFromJson(data['click_model'])
            self.IPW_list = data['IPW_list']
        return
Example 8
File: main.py Project: TvanZ/IR2
def main():
    # the click model in json format as exported when creating a model with the click_models.py
    CLICK_MODEL_JSON = sys.argv[1]
    MODEL_NAME = 'GANoN'
    DATASET_NAME = 'set1'
    # the folder where the input data can be found
    INPUT_DATA_PATH = sys.argv[2]
    # the folder where output should be stored
    OUTPUT_PATH = sys.argv[3]
    # how many results to show in the results page of the ranker
    # this should be equal or smaller than the rank cut when creating the data
    RANK_CUT = int(sys.argv[4])
    SET_NAME = ['train', 'test', 'valid']
    BATCH_SIZE = 1024
    EMBED_SIZE = 700
    with open(CLICK_MODEL_JSON) as fin:
        model_desc = json.load(fin)
        click_model = cm.loadModelFromJson(model_desc)

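    # Create an output directory for each data split if it does not exist yet.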
    for set_name in SET_NAME:
        if not os.path.exists(OUTPUT_PATH + set_name + '/'):
            os.makedirs(OUTPUT_PATH + set_name + '/')

    # Determine if a gpu is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # process dataset from file
    train_set = data_utils.read_data(INPUT_DATA_PATH, 'train', RANK_CUT)
    valid_set = data_utils.read_data(INPUT_DATA_PATH, 'valid', RANK_CUT)

    # Open clicks pickle if it exists and generate is set to False
    # otherwise generate new clicks
    GENERATE = False
    try:
        assert not GENERATE
        pickled_clicks = pickle.load(open("train_clicks.p", "rb"))
        click_logs, rankings, features = zip(*pickled_clicks)
        print("Opened the train pickle!")
        pickled_clicks = pickle.load(open("valid_clicks.p", "rb"))
        v_click_logs, v_rankings, v_features = zip(*pickled_clicks)
        print("Opened the validation pickle!")
    except:
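        # No usable pickle was found (or GENERATE is True), so generate fresh click logs and cache them.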
        click_logs, rankings, features = generate_clicks(
            1000000, click_model, train_set.gold_weights,
            train_set.featuredids)
        print("Train clicks generated!")
        # Materialise the zipped click logs as a list before pickling.
        zipped_all = list(zip(click_logs, rankings, features))
        pickle.dump(zipped_all, open("train_clicks.p", "wb"))
        print("Saved train clicks in a pickle!")
        v_click_logs, v_rankings, v_features = generate_clicks(
            1000000, click_model, valid_set.gold_weights,
            valid_set.featuredids)
        print("Valid clicks generated!")
        # Materialise the zipped click logs as a list before pickling.
        zipped_all = list(zip(v_click_logs, v_rankings, v_features))
        pickle.dump(zipped_all, open("valid_clicks.p", "wb"))
        print("Saved valid clicks in a pickle!")

    PBM_settings_Hardcoded = {
        "model_filename": "gan_hardcoded.p",
        "g": {
            'generator': Generator1,
            'input_size': 2,
            'hidden_size': 2,
            'output_size': 10,
            'fn': nn.Sigmoid
        },
        "d": {
            'hidden_size': 50,
            'output_size': 1,
            'fn': nn.Sigmoid,
            'feature_size': 1,
            'embed_size': 1,
        },
        "feature": False
    }

    PBM_settings_Learned_Relevance = {
        "model_filename": "gan_learned_relevance.p",
        "g": {
            'generator': Generator2,
            'input_size': 1,
            'hidden_size': 20,
            'output_size': 2,
            'fn': nn.Sigmoid,
        },
        "d": {
            'hidden_size': 50,
            'output_size': 1,
            'fn': nn.Sigmoid,
            'feature_size': 1,
            'embed_size': 1,
        },
        "feature": False
    }
    PBM_settings_Learned_Features = {
        "model_filename": "gan_learned_features.p",
        "g": {
            'generator': Generator2,
            'input_size': 700,
            'hidden_size': 32,
            'output_size': 2,
            'fn': nn.Sigmoid
        },
        "d": {
            'hidden_size': 32,
            'output_size': 1,
            'fn': nn.Sigmoid,
            'feature_size': 700,
            'embed_size': 4,
        },
        "feature": True
    }

    def run(model_settings,
            load_from_file=False,
            BATCH_SIZE=BATCH_SIZE,
            EMBED_SIZE=EMBED_SIZE,
            RANK_CUT=RANK_CUT,
            click_logs=click_logs,
            rankings=rankings,
            features=features,
            v_click_logs=v_click_logs,
            v_rankings=v_rankings,
            v_features=v_features):

        current_best = np.inf
        model_filename = model_settings["model_filename"]

        g_optimizer = optim.Adam
        d_optimizer = optim.Adam

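        # Build the GAN, using Adam for both the generator and the discriminator.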
        gan = GAN(click_model, 10, BATCH_SIZE, model_settings, g_optimizer,
                  d_optimizer)
        if load_from_file:
            ckpt = torch.load(model_filename)
            gan.G.load_state_dict(ckpt["g_state_dict"])
            gan.D.load_state_dict(ckpt["d_state_dict"])
            current_best = ckpt["best_eval"]
        gan.to(device)
        print(
            'perfect observations: tensor([1, 0.5, 0.333, 0.25, 0.2, 0.167, 0.1429, 0.125, 0.1111, 0.1])'
        )
        num_epochs = 100
        real_errors, fake_errors, g_errors, eval_errors = [], [], [], []
        for epoch in range(num_epochs):
            # Train the model
            for mini_batch in get_minibatch(
                    BATCH_SIZE, EMBED_SIZE, RANK_CUT,
                    list(zip(click_logs, rankings, features))):
                click_logs_T, rankings_T, features_T = mini_batch
                if not model_settings['feature']:
                    real_error, fake_error, g_error = gan.train(
                        click_logs_T, rankings_T)
                else:
                    real_error, fake_error, g_error = gan.train(
                        click_logs_T, features_T)

            # Evaluate the model and store it if it performs better than previous models
            eval_error = 0
            nr_of_batches = 0
            for mini_batch in get_minibatch(
                    BATCH_SIZE, EMBED_SIZE, RANK_CUT,
                    list(zip(v_click_logs, v_rankings, v_features))):
                nr_of_batches += 1
                click_logs_T, rankings_T, features_T = mini_batch
                if not model_settings['feature']:
                    eval_error += gan.evaluate(click_logs_T, rankings_T)
                else:
                    eval_error += gan.evaluate(click_logs_T, features_T)
            eval_error = -eval_error / nr_of_batches

            # Save the model parameters.
            # Important! Saves parameters for G and D separately
            # When loading the parameters, also do this for G and D separately
            if eval_error < current_best:
                current_best = eval_error
                ckpt = {
                    "g_state_dict": gan.G.state_dict(),
                    "d_state_dict": gan.D.state_dict(),
                    "best_eval": current_best,
                    "best_epoch": epoch
                }
                torch.save(ckpt, model_filename)

            # Get the current observations model from the gan
            observation_alphas = gan.G.binary_approximator.alpha.data
            observation_betas = gan.G.binary_approximator.beta.data
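            # Turn alpha/beta into observation probabilities: 1 - CDF(0.5), i.e. P(latent sample > 0.5).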
            observations = 1 - CDFKuma(
                observation_alphas, observation_betas, threshold=0.5)
            # OLD METHOD: frequentist approach
            # with torch.no_grad():
            # 	if model_settings['feature']:
            # 		rankings_T = features_T
            # 	observations, clicks = gan.G(rankings_T)
            # observations=torch.mean(observations,dim=0)
            print('observations:', observations)

            print(
                f"[{epoch + 1}/{num_epochs}] | Loss D: {(real_error + fake_error)/2} | Loss G: {g_error} | Eval Loss: {eval_error}"
            )
            real_errors.append(real_error)
            fake_errors.append(fake_error)
            g_errors.append(g_error)
            eval_errors.append(eval_error)

        print('real_errors', real_errors)
        print('fake_errors', fake_errors)
        print('g_errors', g_errors)

        return real_errors, fake_errors, g_errors

    real, fake, g = run(PBM_settings_Learned_Features)
    print(real, fake, g)