class Experiment03(object):
    """
    Experiment03 class
    """

    def __init__(self, config_):
        """
        Constructor
        :param config_:
        :return:
        """
        self.config = config_
        self.data_exporter = DataExporter(self.config)
        self.logger = logging.getLogger("cuda_logger")
        self.expt_name = "expt_03"
        self.config['RL_parameters']['experiment'] = self.expt_name

    @staticmethod
    def run_rl_training(config):
        rl_trainer = RunRLTrainingJob(config)
        data = rl_trainer.run()
        return data

    def run(self):
        """
        Run experiment
        """
        num_drivers = np.arange(1000, 6500, 500)
        objectives = ['pickups', 'revenue']
        combinations = list(itertools.product(num_drivers, objectives))

        # Create a pool of processes
        num_processes = mp.cpu_count()
        pool = ProcessPool(nodes=num_processes)

        configs = []
        count = 0
        for comb in combinations:
            self.config['RL_parameters']['experiment'] = (
                self.expt_name + "_" + str(count))
            self.config['RL_parameters']['city_states_filename'] = "city_states.dill"
            self.config['RL_parameters']['num_drivers'] = comb[0]
            self.config['RL_parameters']['num_strategic_drivers'] = comb[0]
            self.config['RL_parameters']['objective'] = comb[1]
            configs.append(deepcopy(self.config))
            count += 1

        self.logger.info("Starting expt_03")
        results = pool.amap(self.run_rl_training, configs).get()
        pool.close()
        pool.join()
        pool.clear()
        self.logger.info("Finished expt_03")

        # Export best episode
        self.data_exporter.export_episode(results, self.expt_name + ".dill")
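# Every experiment class in this module mutates one shared config dict inside a
# loop and snapshots it per run. A minimal self-contained sketch of that
# fan-out pattern (the dict keys below are illustrative, not the project's
# actual schema):
from copy import deepcopy
import itertools

base_config = {'RL_parameters': {'experiment': None, 'num_drivers': None,
                                 'objective': None}}
configs = []
for i, (drivers, objective) in enumerate(
        itertools.product([1000, 1500], ['pickups', 'revenue'])):
    base_config['RL_parameters']['experiment'] = "expt_{}".format(i)
    base_config['RL_parameters']['num_drivers'] = drivers
    base_config['RL_parameters']['objective'] = objective
    # Without deepcopy every entry would alias the same dict and all runs
    # would see the final iteration's values.
    configs.append(deepcopy(base_config))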
class Experiment05(object):
    """
    Experiment05 class
    """

    def __init__(self, config_):
        """
        Constructor
        :param config_:
        :return:
        """
        self.config = config_
        self.data_exporter = DataExporter(self.config)
        self.logger = logging.getLogger("cuda_logger")
        self.expt_name = "expt_05"
        self.config['RL_parameters']['experiment'] = self.expt_name

    @staticmethod
    def run_rl_training(config):
        rl_trainer = RunRLTrainingJob(config)
        data = rl_trainer.run()
        return data

    def run(self):
        """
        Run experiment
        """
        num_drivers = self.config['RL_parameters']['num_drivers']
        percent_strategic_drivers = np.arange(0, 1.1, 0.1)
        num_strategic_drivers = [int(x * num_drivers)
                                 for x in percent_strategic_drivers]

        # Create a pool of processes
        num_processes = mp.cpu_count()
        pool = ProcessPool(nodes=num_processes)

        configs = []
        count = 0
        for drivers in num_strategic_drivers:
            self.config['RL_parameters']['experiment'] = (
                self.expt_name + "_" + str(count))
            self.config['RL_parameters']['num_strategic_drivers'] = drivers
            configs.append(deepcopy(self.config))
            count += 1

        self.logger.info("Starting expt_05")
        results = pool.amap(self.run_rl_training, configs).get()
        pool.close()
        pool.join()
        pool.clear()
        self.logger.info("Finished expt_05")

        # Export best episode
        self.data_exporter.export_episode(results, self.expt_name + ".dill")
def run(self):
    """
    This method executes the job
    :param:
    :return:
    """
    self.logger.info("Starting job: CreateCityStateJob\n")
    city_state_creator = CityStateCreator(self.config)
    city_state = city_state_creator.get_city_states()

    self.logger.info("Exporting city states\n")
    filename = self.config['city_state_creator'].get('filename',
                                                     'city_states.dill')
    data_exporter = DataExporter(self.config)
    data_exporter.export_city_state(city_state, filename)
    self.logger.info("Finished job: CreateCityStateJob")
def run(self):
    """
    This method executes the job
    :param:
    :return:
    """
    self.logger.info("Starting job: SparseMatrixFillerJob\n")
    sparse_matrix_filler = SparseMatrixFiller(self.config)
    city_state = sparse_matrix_filler.fill_matrices()

    self.logger.info("Exporting city states\n")
    filename = self.config['city_state_creator'].get('filename',
                                                     'city_states.dill')
    data_exporter = DataExporter(self.config)
    data_exporter.export_city_state(city_state, filename)
    self.logger.info("Finished job: SparseMatrixFillerJob")
def run(self):
    """
    Execute the job
    :param:
    :return:
    """
    self.logger.info("Starting job: GuruDataProcessor\n")
    data_provider = DataProvider(self.config)
    data_exporter = DataExporter(self.config)

    # Read guru data
    df = data_provider.read_guru_user_data()
    df = df[[3, 4]]  # Salary and skills columns
    df.columns = ['cost', 'skills']
    df = df[(df.cost != "$0") & (df.skills != "UNKNOWN")]
    df = df.reset_index(drop=True)
    df = df.assign(user_id=df.index.values)
    df = df.assign(skills=df.apply(
        lambda x: x['skills'][:-1].split(','), axis=1))

    # Convert cost to integers
    user_df = df.assign(cost=df.apply(lambda x: int(x['cost'][1:]), axis=1))

    # Read skills data
    df = data_provider.read_guru_skill_data()
    df = df[[1]]
    df.columns = ['skill']
    skill_df = df.assign(skill_id=df.index.values)

    # Create multilabel binarizer
    mlb = MultiLabelBinarizer(classes=skill_df.skill.values)

    # One hot encoding of user skills
    skills = mlb.fit_transform(user_df['skills'])

    # Create dataset
    users = user_df.to_dict('records')
    for i in range(len(users)):
        users[i]['skills_array'] = skills[i]

    # Export csv files
    data_exporter.export_csv_file(user_df, "guru/guru_user_df.csv")
    data_exporter.export_csv_file(skill_df, "guru/guru_skill_df.csv")

    # Scaling factor for submodular function
    scaling_factor = 1

    # Create and export data object to be used in experiments
    # containing all methods related to guru data
    guru = GuruData(self.config, user_df, skill_df, users, scaling_factor)
    data_exporter.export_dill_file(guru, "guru/guru_data.dill")
    self.logger.info("Finished job: GuruDataProcessor")
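# A self-contained illustration of the MultiLabelBinarizer step above: each
# user's skill list becomes a fixed-width 0/1 vector indexed by the known
# skill vocabulary (toy data, not the guru dataset):
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(classes=['python', 'sql', 'design'])
encoded = mlb.fit_transform([['python', 'sql'], ['design']])
# encoded -> array([[1, 1, 0],
#                   [0, 0, 1]])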
def run(self):
    """
    This method executes the job
    :param:
    :return:
    """
    self.logger.info("Starting job: BuildRegressionModelsJob\n")

    # Export the regression models data
    for model in self.config['regression_models']:
        year = model['year'][-2:]
        month = model['month'].lower()
        weekday = list(calendar.day_name).index(model['weekday'])

        self.logger.info(
            "Creating regression model data {}-{}-{}s".format(
                year, month, model['weekday']))
        rmb = RegressionModelBuilder(self.config, year, month, weekday)
        dist_df = rmb.get_bins_distance_dataframe()
        trips_df = rmb.create_trips_data_with_distance()

        self.logger.info(
            "Exporting regression model data {}-{}-{}s\n".format(
                year, month, model['weekday']))
        data_exporter = DataExporter(self.config)
        data_exporter.export_bin_distances(dist_df)
        data_exporter.export_rmb_data(
            trips_df, model['weekday'] + '_' + month + '_' + year)

    self.logger.info("Finished job: BuildRegressionModelsJob")
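# The weekday lookup above relies on calendar.day_name being ordered
# Monday-first; a quick standalone check:
import calendar

weekday = list(calendar.day_name).index('Wednesday')
print(weekday)  # 2, since Monday is index 0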
class RunRLTrainingJob(object):
    """
    This class implements a job to run the RL training
    """

    def __init__(self, config_):
        """
        Constructor
        :param config_:
        :return:
        """
        self.config = config_
        self.logger = logging.getLogger("cuda_logger")
        self.data_exporter = DataExporter(self.config)

    def run(self):
        """
        This method executes the job
        :param:
        :return:
        """
        self.logger.info("Starting job: RunRLTrainingJob\n")
        self.logger.info("RL training parameters:")
        pp = pprint.PrettyPrinter(indent=4)
        # pformat returns the formatted string (pprint would write to stdout
        # and return None, so nothing useful would be logged)
        self.logger.info(pp.pformat(self.config['RL_parameters']))

        # Create RL trainer
        rl_trainer = RLTrainer(self.config)

        # Run RL episodes
        result = rl_trainer.run()
        best_episode = result[0]
        best_model = result[1]
        training_tracker = result[2]

        # Export best model, best episode and training tracker, each only if
        # a filename is configured
        best_model_filename = self.config['RL_parameters'].get(
            'best_model_filename', False)
        best_episode_filename = self.config['RL_parameters'].get(
            'best_episode_filename', False)
        training_tracker_filename = self.config['RL_parameters'].get(
            'training_tracker_filename', False)

        if best_model_filename:
            self.data_exporter.export_model(best_model, best_model_filename)
        if best_episode_filename:
            self.data_exporter.export_episode(best_episode,
                                              best_episode_filename)
        if training_tracker_filename:
            self.data_exporter.export_training_tracker(
                training_tracker, training_tracker_filename)

        self.logger.info("Finished job: RunRLTrainingJob")
        return best_episode
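# Why the job logs pp.pformat(...) rather than pp.pprint(...): pprint writes
# to stdout and returns None, so logger.info(pp.pprint(x)) would log "None".
# A runnable check of the pformat-based pattern:
import logging
import pprint

logging.basicConfig(level=logging.INFO)
pp = pprint.PrettyPrinter(indent=4)
logging.getLogger("demo").info(
    pp.pformat({'num_drivers': 5000, 'objective': 'pickups'}))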
def run(self):
    """
    This method executes the job
    :param:
    :return:
    """
    self.logger.info("Starting job: NeighborhoodDataExportJob\n")
    data_provider = DataProvider(self.config)
    data_exporter = DataExporter(self.config)
    hex_attr_df = data_provider.read_hex_bin_attributes()
    hex_bins = hex_attr_df['hex_id'].values

    data = {}
    for r in range(self.radius + 1):
        data[r] = {}
        for hex_bin in hex_bins:
            neighbors = hex_neighborhood(hex_bin, hex_attr_df, r)
            # One-hot style membership vector over all hex bins
            one_hot_encoding_vector = np.zeros(len(hex_bins))
            np.put(one_hot_encoding_vector, neighbors, 1)
            data[r][hex_bin] = one_hot_encoding_vector

    data_exporter.export_neighborhood_data(data)
    self.logger.info("Finished job: NeighborhoodDataExportJob")
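# A toy version of the np.put membership encoding used above (bin indices are
# illustrative):
import numpy as np

num_bins = 6
neighbors = [0, 2, 3]              # hex bins inside the current radius
one_hot = np.zeros(num_bins)
np.put(one_hot, neighbors, 1)      # in place: set those positions to 1
# one_hot -> array([1., 0., 1., 1., 0., 0.])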
class Experiment01(object):
    """
    Experiment01 class
    """

    def __init__(self, config_):
        """
        Constructor
        :param config_:
        :return:
        """
        self.config = config_
        self.data_exporter = DataExporter(self.config)
        self.logger = logging.getLogger("cuda_logger")
        self.expt_name = "expt_01"
        self.config['RL_parameters']['experiment'] = self.expt_name

    @staticmethod
    def run_rl_training(config):
        rl_trainer = RunRLTrainingJob(config)
        data = rl_trainer.run()
        return data

    def run(self):
        """
        Run experiment
        """
        ind_percent = np.arange(0., 1.1, 0.1)
        reb_percent = np.arange(0., 1.1, 0.1)
        ind_percent[0] = 0.01
        reb_percent[0] = 0.01
        combinations = list(itertools.product(ind_percent, reb_percent))
        num_episodes = self.config['RL_parameters']['num_episodes']

        # Create a pool of processes
        num_processes = mp.cpu_count()
        pool = ProcessPool(nodes=num_processes)

        configs = []
        count = 0
        for comb in combinations:
            self.config['RL_parameters']['experiment'] = (
                self.expt_name + "_" + str(count))
            ind_episodes = int(comb[0] * num_episodes)
            reb_episodes = int(comb[1] * num_episodes)
            if (ind_episodes + reb_episodes) < num_episodes:
                self.config['RL_parameters']['ind_episodes'] = ind_episodes
                self.config['RL_parameters']['reb_episodes'] = reb_episodes
                configs.append(deepcopy(self.config))
                count += 1

        self.logger.info("Starting expt_01")
        results = pool.amap(self.run_rl_training, configs).get()
        pool.close()
        pool.join()
        pool.clear()
        self.logger.info("Finished expt_01")

        # Export best episode
        self.data_exporter.export_episode(results, self.expt_name + ".dill")
class Experiment02(object):
    """
    Experiment02 class
    """

    def __init__(self, config):
        """
        Constructor
        :param config:
        :return:
        """
        self.config = config
        self.logger = logging.getLogger("so_logger")
        self.data_provider = DataProvider(self.config)
        self.data_exporter = DataExporter(self.config)

    @staticmethod
    def run_algorithm(args):
        # Run algorithm
        alg = AlgorithmDriver()
        data = alg.run(*args)
        return data

    def set_scaling_factor(self, data):
        # Find appropriate scaling factor
        alg = SetCoverGreedy(self.config, data.submodular_func, data.E)
        sol = alg.run()
        submodular_val = data.submodular_func(sol)
        cost = data.cost_func(sol)
        scaling_factor = data.scaling_func(submodular_val, cost)

        # Update scaling factor
        data.scaling_factor = scaling_factor

    def run(self):
        """
        Run experiment
        :param:
        :return:
        """
        self.logger.info("Starting experiment 02")
        self.expt_config = self.config['experiment_configs']['experiment_02']
        popular_threshold = self.expt_config['popular_threshold']
        rare_threshold = self.expt_config['rare_threshold']

        user_sample_ratios = [0.4, 1]
        seeds = [i for i in range(6, 10)]
        ks = [1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
        sampling_epsilon_values_stochastic = [0.1, 0.05, 0.01, 0.005]
        error_epsilon_values_scaled_threshold = [0.2, 0.15, 0.1, 0.05]
        num_sampled_skills = 50
        rare_sample_fraction = 0.1
        popular_sample_fraction = 0.1
        scaling_factor = 800

        alg = AlgorithmDriver()
        results = []
        for seed in seeds:
            for user_sample_ratio in user_sample_ratios:
                self.logger.info(
                    "Experiment for user sample ratio: {} and scaling factor: {} and seed: {}"
                    .format(user_sample_ratio, scaling_factor, seed))

                # Load dataset
                data = self.data_provider.read_freelancer_data_obj()
                config = self.config.copy()
                alg.create_sample(config, data, num_sampled_skills,
                                  rare_sample_fraction,
                                  popular_sample_fraction, rare_threshold,
                                  popular_threshold, user_sample_ratio, seed)
                self.logger.info(
                    "Scaling factor for submodular function is: {}".format(
                        scaling_factor))

                # Distorted Greedy
                for k in ks:
                    # Run algorithm
                    start = timer()
                    result = alg.run(self.config, data, "distorted_greedy",
                                     None, None, scaling_factor,
                                     num_sampled_skills, rare_sample_fraction,
                                     popular_sample_fraction, rare_threshold,
                                     popular_threshold, user_sample_ratio,
                                     seed, k)
                    end = timer()
                    result['runtime'] = end - start
                    results.append(result)
                    self.logger.info(
                        "Algorithm: {} and k: {} and runtime: {}".format(
                            "distorted_greedy", k, end - start))
                self.logger.info("\n")

                # Stochastic Distorted Greedy
                for k in ks:
                    for sample_epsilon in sampling_epsilon_values_stochastic:
                        # Run algorithm
                        start = timer()
                        result = alg.run(config, data,
                                         "stochastic_distorted_greedy",
                                         sample_epsilon, None, scaling_factor,
                                         num_sampled_skills,
                                         rare_sample_fraction,
                                         popular_sample_fraction,
                                         rare_threshold, popular_threshold,
                                         user_sample_ratio, seed, k)
                        end = timer()
                        result['runtime'] = end - start
                        results.append(result)
                        self.logger.info(
                            "Algorithm: {} and epsilon: {} and k: {} and runtime: {}"
                            .format("stochastic_distorted_greedy",
                                    sample_epsilon, k, end - start))
                self.logger.info("\n")

                # Cost Scaled Greedy
                for k in ks:
                    # Run algorithm
                    start = timer()
                    result = alg.run(self.config, data, "cost_scaled_greedy",
                                     None, None, scaling_factor,
                                     num_sampled_skills, rare_sample_fraction,
                                     popular_sample_fraction, rare_threshold,
                                     popular_threshold, user_sample_ratio,
                                     seed, k)
                    end = timer()
                    result['runtime'] = end - start
                    results.append(result)
                    self.logger.info(
                        "Algorithm: {} and k: {} and runtime: {}".format(
                            "cost_scaled_greedy", k, end - start))
                self.logger.info("\n")

                # Cost scaled lazy exact greedy
                for k in ks:
                    # Run algorithm
                    start = timer()
                    result = alg.run(self.config, data,
                                     "cost_scaled_lazy_greedy", None, None,
                                     scaling_factor, num_sampled_skills,
                                     rare_sample_fraction,
                                     popular_sample_fraction, rare_threshold,
                                     popular_threshold, user_sample_ratio,
                                     seed, k)
                    end = timer()
                    result['runtime'] = end - start
                    results.append(result)
                    self.logger.info(
                        "Algorithm: {} and k: {} and runtime: {}".format(
                            "cost_scaled_lazy_greedy", k, end - start))
                self.logger.info("\n")

                # Greedy
                for k in ks:
                    # Run algorithm
                    start = timer()
                    result = alg.run(self.config, data, "greedy", None, None,
                                     scaling_factor, num_sampled_skills,
                                     rare_sample_fraction,
                                     popular_sample_fraction, rare_threshold,
                                     popular_threshold, user_sample_ratio,
                                     seed, k)
                    end = timer()
                    result['runtime'] = end - start
                    results.append(result)
                    self.logger.info(
                        "Algorithm: {} and k: {} and runtime: {}".format(
                            "greedy", k, end - start))
                self.logger.info("\n")

                # Scaled Single Threshold Greedy
                for k in ks:
                    for error_epsilon in error_epsilon_values_scaled_threshold:
                        # Run algorithm
                        start = timer()
                        result = alg.run(self.config, data,
                                         "scaled_single_threshold_greedy",
                                         None, error_epsilon, scaling_factor,
                                         num_sampled_skills,
                                         rare_sample_fraction,
                                         popular_sample_fraction,
                                         rare_threshold, popular_threshold,
                                         user_sample_ratio, seed, k)
                        end = timer()
                        result['runtime'] = end - start
                        results.append(result)
                        self.logger.info(
                            "Algorithm: {} and epsilon: {} and k: {} and runtime: {}"
                            .format("scaled_single_threshold_greedy",
                                    error_epsilon, k, end - start))
                self.logger.info("\n")

                # Baseline Top k
                for k in ks:
                    # Run algorithm
                    start = timer()
                    result = alg.run(self.config, data, "baseline_topk", None,
                                     None, scaling_factor, num_sampled_skills,
                                     rare_sample_fraction,
                                     popular_sample_fraction, rare_threshold,
                                     popular_threshold, user_sample_ratio,
                                     seed, k)
                    end = timer()
                    result['runtime'] = end - start
                    results.append(result)
                    self.logger.info(
                        "Algorithm: {} and k: {} and runtime: {}".format(
                            "baseline_topk", k, end - start))
                self.logger.info("\n")

        self.logger.info("Finished experiment 02")

        # Export results
        df = pd.DataFrame(results)
        self.data_exporter.export_csv_file(
            df, "experiment_02_freelancer_pop01_rare01_greedy.csv")
        self.logger.info("Exported experiment_02 results")
class Experiment00(object):
    """
    Experiment00 class
    """

    def __init__(self, config):
        """
        Constructor
        :param config:
        :return:
        """
        self.config = config
        self.logger = logging.getLogger("so_logger")
        self.data_provider = DataProvider(self.config)
        self.data_exporter = DataExporter(self.config)

    @staticmethod
    def run_algorithm(args):
        # Run algorithm
        alg = AlgorithmDriver()
        data = alg.run(*args)
        return data

    def set_scaling_factor(self, data):
        # Find appropriate scaling factor
        alg = SetCoverGreedy(self.config, data.submodular_func, data.E)
        sol = alg.run()
        submodular_val = data.submodular_func(sol)
        cost = data.cost_func(sol)
        scaling_factor = data.scaling_func(submodular_val, cost)

        # Update scaling factor
        data.scaling_factor = scaling_factor

    def run(self):
        """
        Run experiment
        :param:
        :return:
        """
        self.logger.info("Starting experiment 00")
        self.expt_config = self.config['experiment_configs']['experiment_00']
        popular_threshold = self.expt_config['popular_threshold']
        rare_threshold = self.expt_config['rare_threshold']

        user_sample_ratios = [0.001, 0.1, 0.2, 0.3, 0.4, 0.5,
                              0.6, 0.7, 0.8, 0.9, 1]
        seeds = [i for i in range(6, 11)]
        sampling_epsilon_values = [0.1, 0.05, 0.01, 0.005]
        num_sampled_skills = 50
        rare_sample_fraction = 0.1
        popular_sample_fraction = 0.1
        scaling_factor = 800

        alg = AlgorithmDriver()
        results = []
        for seed in seeds:
            for user_sample_ratio in user_sample_ratios:
                self.logger.info(
                    "Experiment for user sample ratio: {} and scaling factor: {} and seed: {}"
                    .format(user_sample_ratio, scaling_factor, seed))

                # Load dataset
                data = self.data_provider.read_freelancer_data_obj()
                config = self.config.copy()
                alg.create_sample(config, data, num_sampled_skills,
                                  rare_sample_fraction,
                                  popular_sample_fraction, rare_threshold,
                                  popular_threshold, user_sample_ratio, seed)

                # # Create controlled samples dataset
                # data.sample_skills_to_be_covered_controlled(
                #     num_sampled_skills, rare_sample_fraction,
                #     popular_sample_fraction, rare_threshold,
                #     popular_threshold, user_sample_ratio)
                # # Setting scaling factor of coverage as coverage(S)/cost(S)
                # # for set cover solution S
                # self.set_scaling_factor(data)

                self.logger.info(
                    "Scaling factor for submodular function is: {}".format(
                        scaling_factor))

                # Distorted greedy - ICML
                start = timer()
                result = alg.run(config, data, "distorted_greedy", None, None,
                                 scaling_factor, num_sampled_skills,
                                 rare_sample_fraction,
                                 popular_sample_fraction, rare_threshold,
                                 popular_threshold, user_sample_ratio, seed,
                                 None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info(
                    "Algorithm: {} and k: {} and runtime: {}".format(
                        "distorted_greedy", None, end - start))
                results.append(result)
                self.logger.info("\n")

                # Cost scaled greedy
                start = timer()
                result = alg.run(config, data, "cost_scaled_greedy", None,
                                 None, scaling_factor, num_sampled_skills,
                                 rare_sample_fraction,
                                 popular_sample_fraction, rare_threshold,
                                 popular_threshold, user_sample_ratio, seed,
                                 None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info(
                    "Algorithm: {} and k: {} and runtime: {}".format(
                        "cost_scaled_greedy", None, end - start))
                results.append(result)
                self.logger.info("\n")

                # Cost scaled lazy exact greedy
                start = timer()
                result = alg.run(config, data, "cost_scaled_lazy_greedy",
                                 None, None, scaling_factor,
                                 num_sampled_skills, rare_sample_fraction,
                                 popular_sample_fraction, rare_threshold,
                                 popular_threshold, user_sample_ratio, seed,
                                 None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info(
                    "Algorithm: {} and k: {} and runtime: {}".format(
                        "cost_scaled_lazy_greedy", None, end - start))
                results.append(result)
                self.logger.info("\n")

                # Greedy
                start = timer()
                result = alg.run(config, data, "greedy", None, None,
                                 scaling_factor, num_sampled_skills,
                                 rare_sample_fraction,
                                 popular_sample_fraction, rare_threshold,
                                 popular_threshold, user_sample_ratio, seed,
                                 None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info(
                    "Algorithm: {} and k: {} and runtime: {}".format(
                        "greedy", None, end - start))
                results.append(result)
                self.logger.info("\n")

                # Unconstrained linear
                start = timer()
                result = alg.run(config, data, "unconstrained_linear", None,
                                 None, scaling_factor, num_sampled_skills,
                                 rare_sample_fraction,
                                 popular_sample_fraction, rare_threshold,
                                 popular_threshold, user_sample_ratio, seed,
                                 None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info(
                    "Algorithm: {} and k: {} and runtime: {}".format(
                        "unconstrained_linear", None, end - start))
                results.append(result)
                self.logger.info("\n")

                # Unconstrained distorted greedy
                start = timer()
                result = alg.run(config, data,
                                 "unconstrained_distorted_greedy", None, None,
                                 scaling_factor, num_sampled_skills,
                                 rare_sample_fraction,
                                 popular_sample_fraction, rare_threshold,
                                 popular_threshold, user_sample_ratio, seed,
                                 None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info(
                    "Algorithm: {} and k: {} and runtime: {}".format(
                        "unconstrained_distorted_greedy", None, end - start))
                results.append(result)
                self.logger.info("\n")

                # Stochastic distorted greedy
                for sample_epsilon in sampling_epsilon_values:
                    start = timer()
                    config['algorithms']['stochastic_distorted_greedy_config'][
                        'epsilon'] = sample_epsilon
                    result = alg.run(config, data,
                                     "stochastic_distorted_greedy",
                                     sample_epsilon, None, scaling_factor,
                                     num_sampled_skills, rare_sample_fraction,
                                     popular_sample_fraction, rare_threshold,
                                     popular_threshold, user_sample_ratio,
                                     seed, None)
                    end = timer()
                    result['runtime'] = end - start
                    self.logger.info(
                        "Algorithm: {} and epsilon: {} and k: {} and runtime: {}"
                        .format("stochastic_distorted_greedy", sample_epsilon,
                                None, end - start))
                    results.append(result)
                    self.logger.info("\n")

                # Baseline top k
                start = timer()
                result = alg.run(config, data, "baseline_topk", None, None,
                                 scaling_factor, num_sampled_skills,
                                 rare_sample_fraction,
                                 popular_sample_fraction, rare_threshold,
                                 popular_threshold, user_sample_ratio, seed,
                                 None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info(
                    "Algorithm: {} and k: {} and runtime: {}".format(
                        "baseline_topk", None, end - start))
                results.append(result)

        self.logger.info("Finished experiment 00")

        # Export results
        df = pd.DataFrame(results)
        self.data_exporter.export_csv_file(
            df, "experiment_00_freelancer_pop01_rare01_greedy.csv")
        self.logger.info("Exported experiment_00 results")
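# The experiments above time each algorithm with timer(); assuming timer is
# timeit.default_timer (the usual import for this pattern, though the import
# itself sits outside this excerpt), a standalone sketch:
from timeit import default_timer as timer

start = timer()
total = sum(i * i for i in range(10 ** 5))   # stand-in for alg.run(...)
runtime = timer() - start
print("runtime: {:.6f}s, result: {}".format(runtime, total))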
class Experiment04(object):
    """
    Experiment04 class
    """

    def __init__(self, config_):
        """
        Constructor
        :param config_:
        :return:
        """
        self.config = config_
        self.data_exporter = DataExporter(self.config)
        self.logger = logging.getLogger("cuda_logger")
        self.expt_name = "expt_04"
        self.config['RL_parameters']['experiment'] = self.expt_name

    @staticmethod
    def run_rl_training(config):
        rl_trainer = RunRLTrainingJob(config)
        try:
            data = rl_trainer.run()
        except BaseException:
            # Dump the offending config so the failing run can be reproduced
            print(config)
            raise ValueError
        return data

    def run(self):
        """
        Run experiment
        """
        num_drivers = np.arange(1000, 6500, 500)
        thresholds = np.arange(5, 55, 5)
        thresholds = np.insert(thresholds, 0, 2)
        combinations = list(itertools.product(num_drivers, thresholds))

        # Create a pool of processes
        num_processes = mp.cpu_count()
        self.logger.info("Processes: {}".format(num_processes))
        pool = ProcessPool(nodes=num_processes)

        configs = []
        count = 0
        for comb in combinations:
            self.config['RL_parameters']['experiment'] = (
                self.expt_name + "_" + str(count))
            self.config['RL_parameters']['num_drivers'] = comb[0]
            self.config['RL_parameters']['imbalance_threshold'] = comb[1]
            configs.append(deepcopy(self.config))
            count += 1

        self.logger.info("Starting expt_04")
        results = pool.amap(self.run_rl_training, configs).get()
        pool.close()
        pool.join()
        pool.clear()
        self.logger.info("Finished expt_04")

        # Export best episode
        self.data_exporter.export_episode(results, self.expt_name + ".dill")
class Experiment06(object):
    """
    Experiment06 class
    """

    def __init__(self, config_):
        """
        Constructor
        :param config_:
        :return:
        """
        self.config = config_
        self.data_exporter = DataExporter(self.config)
        self.logger = logging.getLogger("cuda_logger")
        self.expt_name = "expt_06"
        self.config['RL_parameters']['experiment'] = self.expt_name

    @staticmethod
    def run_rl_training(config):
        rl_trainer = RunRLTrainingJob(config)
        data = rl_trainer.run()
        return data

    def run(self):
        """
        Run experiment
        """
        days = [
            'Sunday_00_', 'Monday_00_', 'Tuesday_00_', 'Wednesday_00_',
            'Thursday_00_', 'Friday_00_', 'Saturday_00_',
            'Sunday_01_', 'Monday_01_', 'Tuesday_01_', 'Wednesday_01_',
            'Thursday_01_', 'Friday_01_', 'Saturday_01_',
            'Sunday_02_', 'Monday_02_', 'Tuesday_02_', 'Wednesday_02_',
            'Thursday_02_', 'Friday_02_', 'Saturday_02_',
            'Sunday_03_', 'Monday_03_', 'Tuesday_03_', 'Wednesday_03_',
            'Thursday_03_', 'Friday_03_', 'Saturday_03_'
        ]

        # Create a pool of processes
        num_processes = mp.cpu_count()
        pool = ProcessPool(nodes=num_processes)

        configs = []
        count = 0
        for day in days:
            self.config['RL_parameters']['experiment'] = (
                self.expt_name + "_" + str(count))
            self.config['RL_parameters']['city_states_filename'] = (
                day + 'city_states.dill')
            self.config['RL_parameters']['best_model_filename'] = (
                day + 'model.dill')
            configs.append(deepcopy(self.config))
            count += 1

        self.logger.info("Starting expt_06")
        results = pool.amap(self.run_rl_training, configs).get()
        pool.close()
        pool.join()
        pool.clear()
        self.logger.info("Finished expt_06")

        # Export best episode
        self.data_exporter.export_episode(results, self.expt_name + ".dill")
def run(self):
    """
    Execute the job
    :param:
    :return:
    """
    self.logger.info("Starting job: FreelancerDataProcessor\n")
    data_provider = DataProvider(self.config)
    data_exporter = DataExporter(self.config)

    # Read freelancer data
    df = data_provider.read_freelancer_user_data()
    df_cost = df[[1]]  # Salary/Hour
    df_skills = df[df.columns[4::2]]
    df_skills.replace(to_replace=["Other Skills"], value="", inplace=True)
    df_skills = (df_skills.iloc[:, 0].map(str) + ',' +
                 df_skills.iloc[:, 1].map(str) + ',' +
                 df_skills.iloc[:, 2].map(str) + ',' +
                 df_skills.iloc[:, 3].map(str) + ',' +
                 df_skills.iloc[:, 4].map(str) + ',' +
                 df_skills.iloc[:, 5].map(str))  # Skills

    user_df = pd.DataFrame()
    user_df['cost'] = df_cost.iloc[:, 0].tolist()
    # Converting all strings to lower case
    user_df['skills'] = df_skills.str.lower().tolist()
    user_df = user_df.reset_index(drop=True)
    user_df = user_df.assign(user_id=user_df.index.values)
    user_df = user_df.assign(skills=user_df.apply(
        lambda x: x['skills'][:-1].split(','), axis=1))

    # Convert cost to integers
    user_df.cost = user_df.cost.astype(int)

    # Read skills data
    df = data_provider.read_freelancer_skill_data()
    df = df[[1]]
    df.columns = ['skill']
    skill_df = df.assign(skill_id=df.index.values)

    # Create multilabel binarizer
    mlb = MultiLabelBinarizer(classes=skill_df.skill.values)

    # One hot encoding of user skills
    skills = mlb.fit_transform(user_df['skills'])

    # Create dataset
    users = user_df.to_dict('records')
    for i in range(len(users)):
        users[i]['skills_array'] = skills[i]

    # Export csv files
    data_exporter.export_csv_file(user_df,
                                  "freelancer/freelancer_user_df.csv")
    data_exporter.export_csv_file(skill_df,
                                  "freelancer/freelancer_skill_df.csv")

    # Scaling factor for submodular function
    scaling_factor = 1

    # Create and export data object to be used in experiments
    # containing all methods related to freelancer data
    freelancer = FreelancerData(self.config, user_df, skill_df, users,
                                scaling_factor)
    data_exporter.export_dill_file(freelancer,
                                   "freelancer/freelancer_data.dill")
    self.logger.info("Finished job: FreelancerDataProcessor")
class Experiment08(object):
    """
    Experiment08 class
    """

    def __init__(self, config_):
        """
        Constructor
        :param config_:
        :return:
        """
        self.config = config_
        self.data_exporter = DataExporter(self.config)
        self.logger = logging.getLogger("cuda_logger")
        self.expt_name = "expt_08"
        self.config['Model_testing']['experiment'] = self.expt_name

    @staticmethod
    def run_rl_testing(config):
        rl_tester = RunRLTestingJob(config)
        data = rl_tester.run()
        return data

    def run(self):
        """
        Run experiment
        """
        days = [
            'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday',
            'Friday', 'Saturday'
        ]
        weeks_of_month = ['00', '01', '02', '03', '04']
        imbalance_thresholds = [2]
        model_num_drivers = [4000, 5000, 6000, 7000, 8000, 9000, 10000]

        # Enumerate every (model, test) combination for which both the
        # trained model and the test city states exist on disk
        test_combinations = []
        for model_day in days:
            for model_wom in weeks_of_month:
                for model_threshold in imbalance_thresholds:
                    for model_drivers in model_num_drivers:
                        model_args = [
                            model_day, model_wom, str(model_drivers),
                            str(model_threshold)
                        ]
                        model_filename = "_".join(model_args) + "_model.dill"
                        if os.path.isfile(self.config['app']['DATA_DIR'] +
                                          'models/' + model_filename):
                            for test_wom in weeks_of_month:
                                for test_drivers in range(
                                        model_drivers - 3000,
                                        model_drivers + 4000, 1000):
                                    test_file = (model_day + '_' + test_wom +
                                                 '_city_states.dill')
                                    if os.path.isfile(
                                            self.config['app']['DATA_DIR'] +
                                            'city_states/' + test_file):
                                        test_combinations.append({
                                            'model': model_filename,
                                            'test_dow': model_day,
                                            'test_wom': test_wom,
                                            'test_drivers': test_drivers
                                        })

        self.logger.info("Total test combinations: {}".format(
            len(test_combinations)))

        # Create a pool of processes
        num_processes = mp.cpu_count()
        pool = ProcessPool(nodes=num_processes)

        configs = []
        count = 0
        for comb in test_combinations:
            self.config['Model_testing']['experiment'] = (
                self.expt_name + "_" + str(count))
            self.config['Model_testing']['city_states_filename'] = (
                comb['test_dow'] + '_' + comb['test_wom'] +
                '_city_states.dill')
            self.config['Model_testing']['model_filename'] = comb['model']
            self.config['RL_parameters']['num_drivers'] = comb['test_drivers']
            self.config['RL_parameters']['num_strategic_drivers'] = (
                comb['test_drivers'])
            configs.append(deepcopy(self.config))
            count += 1

        self.logger.info("Starting expt_08")
        results = pool.amap(self.run_rl_testing, configs).get()
        pool.close()
        pool.join()
        pool.clear()
        self.logger.info("Finished expt_08")

        # Export best episode
        self.data_exporter.export_episode(results, self.expt_name + ".dill")
class Experiment04(object):
    """
    Experiment04 class
    """

    def __init__(self, config):
        """
        Constructor
        :param config:
        :return:
        """
        self.config = config
        self.logger = logging.getLogger("so_logger")
        self.data_provider = DataProvider(self.config)
        self.data_exporter = DataExporter(self.config)

    @staticmethod
    def run_algorithm(args):
        # Run algorithm
        alg = AlgorithmDriver()
        data = alg.run(*args)
        return data

    def set_scaling_factor(self, data):
        # Find appropriate scaling factor
        alg = SetCoverGreedy(self.config, data.submodular_func, data.E)
        sol = alg.run()
        submodular_val = data.submodular_func(sol)
        cost = data.cost_func(sol)
        scaling_factor = data.scaling_func(submodular_val, cost)

        # Update scaling factor
        data.scaling_factor = scaling_factor

    def run(self):
        """
        Run experiment
        :param:
        :return:
        """
        self.logger.info("Starting experiment 04")
        self.expt_config = self.config['experiment_configs']['experiment_04']
        popular_threshold = self.expt_config['popular_threshold']
        rare_threshold = self.expt_config['rare_threshold']
        num_of_partitions = self.expt_config['num_of_partitions']
        partition_type = self.expt_config['partition_type']
        cardinality_constraint = self.expt_config['cardinality_constraint']

        user_sample_ratios = [1]
        seeds = [i for i in range(6, 10)]
        # Note: these sweep lists shadow the single values read from
        # expt_config above
        cardinality_constraints = [i for i in range(1, 11)]
        num_of_partitions = [i for i in range(1, 6)]
        num_sampled_skills = 50
        rare_sample_fraction = 0.1
        popular_sample_fraction = 0.8
        scaling_factor = 800

        alg = AlgorithmDriver()
        results = []
        for seed in seeds:
            for user_sample_ratio in user_sample_ratios:
                for cardinality_constraint in cardinality_constraints:
                    for num_of_partition in num_of_partitions:
                        self.logger.info(
                            "Experiment for user sample ratio: {} and scaling factor: {} "
                            "and seed: {} and cardinality constraint: {} "
                            "and num of partitions: {}".format(
                                user_sample_ratio, scaling_factor, seed,
                                cardinality_constraint, num_of_partition))

                        # Load dataset
                        data = self.data_provider.read_guru_data_obj()
                        config = self.config.copy()

                        # Creating the ground set of users
                        alg.create_sample(config, data, num_sampled_skills,
                                          rare_sample_fraction,
                                          popular_sample_fraction,
                                          rare_threshold, popular_threshold,
                                          user_sample_ratio, seed)

                        # Assigning users to partitions uniformly at random
                        alg.create_partitions(data, num_of_partition,
                                              partition_type,
                                              cardinality_constraint)

                        self.logger.info(
                            "Scaling factor for submodular function is: {}"
                            .format(scaling_factor))

                        # Partition matroid greedy
                        start = timer()
                        result = alg.run(config, data,
                                         "partition_matroid_greedy", None,
                                         None, scaling_factor,
                                         num_sampled_skills,
                                         rare_sample_fraction,
                                         popular_sample_fraction,
                                         rare_threshold, popular_threshold,
                                         user_sample_ratio, seed, None)
                        end = timer()
                        result['runtime'] = end - start
                        result['cardinality_constraint'] = cardinality_constraint
                        result['num_of_partitions'] = num_of_partition
                        self.logger.info(
                            "Algorithm: {} and k: {} and runtime: {}".format(
                                "partition_matroid_greedy", None,
                                end - start))
                        results.append(result)
                        self.logger.info("\n")

                        # Cost scaled partition matroid greedy
                        start = timer()
                        result = alg.run(
                            config, data,
                            "cost_scaled_partition_matroid_greedy", None,
                            None, scaling_factor, num_sampled_skills,
                            rare_sample_fraction, popular_sample_fraction,
                            rare_threshold, popular_threshold,
                            user_sample_ratio, seed, None)
                        end = timer()
                        result['runtime'] = end - start
                        result['cardinality_constraint'] = cardinality_constraint
                        result['num_of_partitions'] = num_of_partition
                        self.logger.info(
                            "Algorithm: {} and k: {} and runtime: {}".format(
                                "cost_scaled_partition_matroid_greedy", None,
                                end - start))
                        results.append(result)
                        self.logger.info("\n")

                        # Cost scaled partition matroid lazy exact greedy
                        start = timer()
                        result = alg.run(
                            config, data,
                            "cost_scaled_partition_matroid_lazy_greedy", None,
                            None, scaling_factor, num_sampled_skills,
                            rare_sample_fraction, popular_sample_fraction,
                            rare_threshold, popular_threshold,
                            user_sample_ratio, seed, None)
                        end = timer()
                        result['runtime'] = end - start
                        result['cardinality_constraint'] = cardinality_constraint
                        result['num_of_partitions'] = num_of_partition
                        self.logger.info(
                            "Algorithm: {} and k: {} and runtime: {}".format(
                                "cost_scaled_partition_matroid_lazy_greedy",
                                None, end - start))
                        results.append(result)
                        self.logger.info("\n")

                        # Baseline Top k
                        start = timer()
                        result = alg.run(config, data,
                                         "baseline_topk_matroid", None, None,
                                         scaling_factor, num_sampled_skills,
                                         rare_sample_fraction,
                                         popular_sample_fraction,
                                         rare_threshold, popular_threshold,
                                         user_sample_ratio, seed, None)
                        end = timer()
                        result['runtime'] = end - start
                        result['cardinality_constraint'] = cardinality_constraint
                        result['num_of_partitions'] = num_of_partition
                        self.logger.info(
                            "Algorithm: {} and k: {} and runtime: {}".format(
                                "baseline_topk_matroid", None, end - start))
                        results.append(result)
                        self.logger.info("\n")

        self.logger.info("Finished experiment 04")

        # Export results
        df = pd.DataFrame(results)
        self.data_exporter.export_csv_file(
            df, "experiment_04_guru_salary_pop08_rare01.csv")
        self.logger.info("Exported experiment 04 results")
class BaselineDriver(object):
    def __init__(self, config):
        self.config = config
        self.logger = logging.getLogger("baseline_logger")
        self.data_provider = DataProvider(self.config)
        self.data_exporter = DataExporter(self.config)

    @staticmethod
    def run_baseline(baseline_config):
        baseline_name = baseline_config['name']
        baseline_count = baseline_config['count']
        config = baseline_config['config']
        city_states = baseline_config['city_states']

        # Map baseline names to their agent classes
        baseline_classes = {'cDQN': cDQN, 'cA2C': cA2C, 'A2C': A2C}

        episode_rewards = []
        if baseline_name in baseline_classes:
            baseline = baseline_classes[baseline_name](config)
            rewards = baseline.run(city_states)
            for episode, reward in enumerate(rewards):
                episode_rewards.append({
                    'agent': baseline_name,
                    'episode': episode,
                    'run': baseline_count,
                    'earnings': reward
                })
        return episode_rewards

    def run(self):
        self.logger.info("Starting baselines")
        city_states = self.data_provider.read_city_states()
        baseline_list = self.config['baselines']['baseline_list']

        # Create a pool of processes
        num_processes = mp.cpu_count()
        self.logger.info("Processes: {}".format(num_processes))
        pool = ProcessPool(nodes=num_processes)

        configs = []
        for count in range(10):
            for name in baseline_list:
                configs.append({
                    'name': name,
                    'count': count,
                    'config': self.config,
                    'city_states': city_states
                })

        results = pool.amap(self.run_baseline, configs).get()
        pool.close()
        pool.join()
        pool.clear()

        episode_rewards = []
        for result in results:
            episode_rewards += result
        self.data_exporter.export_baseline_data(episode_rewards)
        self.logger.info("Finished baselines")
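# The parallel fan-out used throughout this module: pathos' ProcessPool.amap
# submits work asynchronously, .get() blocks for the results, and clear()
# releases the cached pool so a fresh one can be created later. A
# self-contained sketch (assuming pathos is installed):
import multiprocessing as mp
from pathos.multiprocessing import ProcessPool


def square(x):
    return x * x


if __name__ == "__main__":
    pool = ProcessPool(nodes=mp.cpu_count())
    results = pool.amap(square, [1, 2, 3, 4]).get()
    print(results)  # [1, 4, 9, 16]
    pool.close()
    pool.join()
    pool.clear()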