def create_model(session, data_set, forward_only):
    """Build an IPWrank model and either restore a checkpoint or start fresh.

    Loads the click model and the propensity estimator described by the
    FLAGS JSON files, constructs the ranker, then restores parameters from
    FLAGS.train_dir when a checkpoint exists, otherwise initializes fresh
    variables in the given session.
    """
    with open(FLAGS.click_model_json) as fin:
        click_model = cm.loadModelFromJson(json.load(fin))

    with open(FLAGS.estimator_json) as fin:
        estimator_desc = json.load(fin)
    if 'IPW_list' in estimator_desc:
        # Randomized estimator: serialized file carries precomputed IPW weights.
        p_estimator = pe.RandomizedPropensityEstimator(FLAGS.estimator_json)
    else:
        # Oracle estimator: file holds a click-model description instead.
        p_estimator = pe.OraclePropensityEstimator(
            cm.loadModelFromJson(estimator_desc))

    model = IPWrank(click_model, p_estimator, FLAGS.use_non_clicked_data,
                    data_set.rank_list_size, data_set.embed_size,
                    FLAGS.batch_size, FLAGS.hparams, forward_only,
                    FLAGS.feed_previous)

    ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
    if ckpt:
        print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        model.saver.restore(session, ckpt.model_checkpoint_path)
    else:
        print("Created model with fresh parameters.")
        session.run(tf.global_variables_initializer())
    return model
def main():
    """Estimate propensity parameters from a click model and write them to a file.

    CLI arguments: 1) click-model JSON path, 2) data directory, 3) output file.
    """
    model_json_path = sys.argv[1]
    data_dir = sys.argv[2]
    output_path = sys.argv[3]

    print("Load data from " + data_dir)
    train_set = data_utils.read_data(data_dir, 'train')

    with open(model_json_path) as fin:
        click_model = CM.loadModelFromJson(json.load(fin))

    print("Estimating...")
    estimator = RandomizedPropensityEstimator()
    estimator.estimateParametersFromModel(click_model, train_set)

    print("Output results...")
    estimator.outputEstimatorToFile(output_path)
def create_model(session, data_set, forward_only):
    """Build a DLA model and either restore a checkpoint or start fresh.

    Loads the click model described by FLAGS.click_model_json, constructs the
    DLA ranker, then restores parameters from FLAGS.train_dir when a
    checkpoint exists, otherwise initializes fresh variables in the session.
    """
    with open(FLAGS.click_model_json) as fin:
        click_model = cm.loadModelFromJson(json.load(fin))

    model = DLA(click_model, data_set.rank_list_size, data_set.embed_size,
                FLAGS.batch_size, FLAGS.hparams, forward_only,
                FLAGS.feed_previous)

    ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
    if ckpt:
        print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        model.saver.restore(session, ckpt.model_checkpoint_path)
    else:
        print("Created model with fresh parameters.")
        session.run(tf.global_variables_initializer())
    return model
def main(): CLICK_MODEL_JSON = sys.argv[1] # the folder where the input data can be found INPUT_DATA_PATH = sys.argv[2] # the folder where output should be stored OUTPUT_PATH = sys.argv[3] # how many results to show in the results page of the ranker # this should be equal or smaller than the rank cut when creating the data RANK_CUT = int(sys.argv[4]) with open(CLICK_MODEL_JSON) as fin: model_desc = json.load(fin) click_model = cm.loadModelFromJson(model_desc) # process dataset from file train_set = data_utils.read_data(INPUT_DATA_PATH, 'train', RANK_CUT) click_log, relevance_log, feature_log = generate_clicks( 1000000, click_model, train_set.gold_weights, train_set.featuredids) timeit_results = timeit.Timer( partial(generate_clicks, 1000000, click_model, train_set.gold_weights, train_set.featuredids)).repeat(10, 1)
import click_models import data_utils import json import os import sys if __name__ == "__main__": data_dir = '../' click_model_file = sys.argv[1] click_model_fname = os.path.basename(click_model_file) click_model_fname = os.path.splitext(click_model_fname)[0] print(click_model_file, click_model_fname) click_model = click_models.loadModelFromJson(click_model_file) target = '../test_data/' train_set = data_utils.parse_data(click_model=click_model, data_dir=data_dir + 'generate_dataset/', task='eval', ti='train', tp=click_model_fname + '_train', rank_cut=100000, target=target) test_set = data_utils.parse_data(click_model=click_model, data_dir=data_dir + 'generate_dataset/', task='eval', ti='test', tp=click_model_fname + '_test', rank_cut=100000,
# NOTE(review): the five statements below are the tail of a parsing function
# whose `def` line lies outside this chunk (it evidently opens four output
# streams and builds `train_set`); their enclosing indentation is reconstructed.
fout1.close()
fout2.close()
fout3.close()
fout4.close()
return train_set


if __name__ == "__main__":
    # Base directory containing the `generate_dataset/` input folder.
    data_dir = '../'
    # CLI arg 1: path to a click-model JSON description.
    fi = sys.argv[1]
    # Short model name = file basename truncated at the first dot.
    name = fi.split('/')[-1]
    name = name[:name.find('.')]
    print(fi, name)
    # NOTE(review): the handle from open(fi) is never explicitly closed.
    model_desc = json.load(open(fi))
    click_model = cm.loadModelFromJson(model_desc)
    target = './tmp/'
    # Parse train and test splits; rank_cut=100000 is effectively "no cut".
    train_set = parse_data(click_model=click_model,
                           data_dir=data_dir + 'generate_dataset/',
                           ti='train', tp=name + '_train',
                           rank_cut=100000, target=target)
    test_set = parse_data(click_model=click_model,
                          data_dir=data_dir + 'generate_dataset/',
                          ti='test', tp=name + '_test',
                          rank_cut=100000, target=target)
def loadEstimatorFromFile(self, file_name):
    """Restore this estimator's click model and IPW table from a JSON file.

    The file must contain a 'click_model' description and an 'IPW_list'.
    """
    with open(file_name) as fin:
        payload = json.load(fin)
    # Rebuild the click model from its serialized description.
    self.click_model = CM.loadModelFromJson(payload['click_model'])
    self.IPW_list = payload['IPW_list']
    return
def main():
    """Train a GANoN observation/click GAN on simulated clicks.

    CLI arguments: 1) click-model JSON (as exported by click_models.py),
    2) input data folder, 3) output folder, 4) rank cut (int).
    Click logs are generated (or loaded from pickles) for the train and valid
    splits, then a GAN is trained via the nested `run` helper.
    """
    # the click model in json format as exported when creating a model with the click_models.py
    CLICK_MODEL_JSON = sys.argv[1]
    MODEL_NAME = 'GANoN'
    DATASET_NAME = 'set1'
    # the folder where the input data can be found
    INPUT_DATA_PATH = sys.argv[2]
    # the folder where output should be stored
    OUTPUT_PATH = sys.argv[3]
    # how many results to show in the results page of the ranker
    # this should be equal or smaller than the rank cut when creating the data
    RANK_CUT = int(sys.argv[4])
    SET_NAME = ['train', 'test', 'valid']
    BATCH_SIZE = 1024
    EMBED_SIZE = 700
    with open(CLICK_MODEL_JSON) as fin:
        model_desc = json.load(fin)
        click_model = cm.loadModelFromJson(model_desc)
    # Ensure one output subfolder per data split exists.
    for set_name in SET_NAME:
        if not os.path.exists(OUTPUT_PATH + set_name + '/'):
            os.makedirs(OUTPUT_PATH + set_name + '/')
    # Determine if a gpu is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # process dataset from file
    train_set = data_utils.read_data(INPUT_DATA_PATH, 'train', RANK_CUT)
    valid_set = data_utils.read_data(INPUT_DATA_PATH, 'valid', RANK_CUT)
    # Open clicks pickle if it exists and generate is set to False
    # otherwise generate new clicks
    GENERATE = False
    # NOTE(review): bare `except` below deliberately falls back to regenerating
    # clicks on ANY failure (missing pickle, GENERATE=True via the assert, or
    # a corrupt file) — but it also hides unrelated errors.
    try:
        assert not GENERATE
        pickled_clicks = pickle.load(open("train_clicks.p", "rb"))
        click_logs, rankings, features = zip(*pickled_clicks)
        print("Opened the train pickle!")
        pickled_clicks = pickle.load(open("valid_clicks.p", "rb"))
        v_click_logs, v_rankings, v_features = zip(*pickled_clicks)
        print("Opened the validation pickle!")
    except:
        # Simulate 1,000,000 click sessions per split and cache them.
        # NOTE(review): the file handles passed to pickle.load/dump above and
        # below are never explicitly closed, and `zipped_all` is pickled as a
        # zip iterator rather than a materialized list — confirm intended.
        click_logs, rankings, features = generate_clicks(
            1000000, click_model, train_set.gold_weights,
            train_set.featuredids)
        print("Train clicks generated!")
        zipped_all = zip(click_logs, rankings, features)
        pickle.dump(zipped_all, open("train_clicks.p", "wb"))
        print("Saved train clicks in a pickle!")
        v_click_logs, v_rankings, v_features = generate_clicks(
            1000000, click_model, valid_set.gold_weights,
            valid_set.featuredids)
        print("Valid clicks generated!")
        zipped_all = zip(v_click_logs, v_rankings, v_features)
        pickle.dump(zipped_all, open("valid_clicks.p", "wb"))
        print("Saved valid clicks in a pickle!")

    # Three GAN configurations: generator/discriminator hyperparameters plus a
    # "feature" switch selecting whether the GAN consumes document features
    # (True) or raw rankings (False).
    PBM_settings_Hardcoded = {
        "model_filename": "gan_hardcoded.p",
        "g": {
            'generator': Generator1,
            'input_size': 2,
            'hidden_size': 2,
            'output_size': 10,
            'fn': nn.Sigmoid
        },
        "d": {
            'hidden_size': 50,
            'output_size': 1,
            'fn': nn.Sigmoid,
            'feature_size': 1,
            'embed_size': 1,
        },
        "feature": False
    }
    PBM_settings_Learned_Relevance = {
        "model_filename": "gan_learned_relevance.p",
        "g": {
            'generator': Generator2,
            'input_size': 1,
            'hidden_size': 20,
            'output_size': 2,
            'fn': nn.Sigmoid,
        },
        "d": {
            'hidden_size': 50,
            'output_size': 1,
            'fn': nn.Sigmoid,
            'feature_size': 1,
            'embed_size': 1,
        },
        "feature": False
    }
    PBM_settings_Learned_Features = {
        "model_filename": "gan_learned_features.p",
        "g": {
            'generator': Generator2,
            'input_size': 700,
            'hidden_size': 32,
            'output_size': 2,
            'fn': nn.Sigmoid
        },
        "d": {
            'hidden_size': 32,
            'output_size': 1,
            'fn': nn.Sigmoid,
            'feature_size': 700,
            'embed_size': 4,
        },
        "feature": True
    }

    def run(model_settings, load_from_file=False, BATCH_SIZE=BATCH_SIZE,
            EMBED_SIZE=EMBED_SIZE, RANK_CUT=RANK_CUT, click_logs=click_logs,
            rankings=rankings, features=features, v_click_logs=v_click_logs,
            v_rankings=v_rankings, v_features=v_features):
        """Train one GAN configuration; returns (real, fake, generator) loss
        histories. Default arguments capture the click data prepared above.
        Checkpoints the best model (by validation loss) to model_filename."""
        current_best = np.inf
        model_filename = model_settings["model_filename"]
        g_optimizer = optim.Adam
        d_optimizer = optim.Adam
        gan = GAN(click_model, 10, BATCH_SIZE, model_settings, g_optimizer,
                  d_optimizer)
        if load_from_file:
            # Resume from a previous checkpoint; G and D are stored separately.
            ckpt = torch.load(model_filename)
            gan.G.load_state_dict(ckpt["g_state_dict"])
            gan.D.load_state_dict(ckpt["d_state_dict"])
            current_best = ckpt["best_eval"]
        gan.to(device)
        # Reference values: ideal 1/rank observation probabilities for 10 ranks.
        print(
            'perfect opbservations: tensor([1, 0.5, 0.333, 0.25, 0.2, 0.167, 0.1429, 0.125, 0.1111, 0.1])'
        )
        num_epochs = 100
        real_errors, fake_errors, g_errors, eval_errors = [], [], [], []
        for epoch in range(num_epochs):
            # Train the model
            for mini_batch in get_minibatch(
                    BATCH_SIZE, EMBED_SIZE, RANK_CUT,
                    list(zip(click_logs, rankings, features))):
                click_logs_T, rankings_T, features_T = mini_batch
                if not model_settings['feature']:
                    real_error, fake_error, g_error = gan.train(
                        click_logs_T, rankings_T)
                else:
                    real_error, fake_error, g_error = gan.train(
                        click_logs_T, features_T)
            # Evaluate the model and store it if it performs better than previous models
            eval_error = 0
            nr_of_batches = 0
            for mini_batch in get_minibatch(
                    BATCH_SIZE, EMBED_SIZE, RANK_CUT,
                    list(zip(v_click_logs, v_rankings, v_features))):
                nr_of_batches += 1
                click_logs_T, rankings_T, features_T = mini_batch
                if not model_settings['feature']:
                    eval_error += gan.evaluate(click_logs_T, rankings_T)
                else:
                    eval_error += gan.evaluate(click_logs_T, features_T)
            # Negate so that lower eval_error means a better model below.
            eval_error = -eval_error / nr_of_batches
            # Save the model parameters.
            # Important! Saves parameters for G and D separately
            # When loading the parameters, also do this for G and D separately
            if eval_error < current_best:
                current_best = eval_error
                ckpt = {
                    "g_state_dict": gan.G.state_dict(),
                    "d_state_dict": gan.D.state_dict(),
                    "best_eval": current_best,
                    "best_epoch": epoch
                }
                torch.save(ckpt, model_filename)
            # Get the current observations model from the gan
            observation_alphas = gan.G.binary_approximator.alpha.data
            observation_betas = gan.G.binary_approximator.beta.data
            # Observation probability = tail of the Kumaraswamy CDF at 0.5.
            observations = 1 - CDFKuma(
                observation_alphas, observation_betas, threshold=0.5)
            # OLD METHOD: frequentist approach
            # with torch.no_grad():
            #     if model_settings['feature']:
            #         rankings_T = features_T
            #     observations, clicks = gan.G(rankings_T)
            #     observations=torch.mean(observations,dim=0)
            print('observations:', observations)
            print(
                f"[{epoch + 1}/{num_epochs}] | Loss D: {(real_error + fake_error)/2} | Loss G: {g_error} | Eval Loss: {eval_error}"
            )
            real_errors.append(real_error)
            fake_errors.append(fake_error)
            g_errors.append(g_error)
            eval_errors.append(eval_error)
        print('real_errors', real_errors)
        print('fake_errors', fake_errors)
        print('g_errors', g_errors)
        # NOTE(review): eval_errors is collected but not returned — confirm
        # whether callers need it.
        return real_errors, fake_errors, g_errors

    # Train only the learned-features configuration.
    real, fake, g = run(PBM_settings_Learned_Features)
    print(real, fake, g)