import json
import logging
import os

import numpy as np
import pandas as pd

# NOTE: the import paths below assume the usual trumania package layout;
# adjust them to wherever `db`, `Circus` and `operations` live in this
# scenario. `noisified`, `to_csv` and the module-level `circus_name` are
# expected to be defined elsewhere in this script.
from trumania.components import db
from trumania.core import operations
from trumania.core.circus import Circus


def load_from_db(circus_name):
    logging.info("loading circus {}".format(circus_name))

    namespace_folder = db.namespace_folder(namespace=circus_name)
    config_file = os.path.join(namespace_folder, "circus_config.json")

    with open(config_file, "r") as config_h:
        config = json.load(config_h)

    clock_config = {
        "start": pd.Timestamp(config["clock_config"]["start"]),
        "step_duration": pd.Timedelta(
            str(config["clock_config"]["step_duration"]))
    }

    circus = Circus(name=circus_name, master_seed=config["master_seed"],
                    **clock_config)

    for population_id in db.list_populations(namespace=circus_name):
        circus.load_population(population_id)

    for gen_type, gen_id in db.list_generators(namespace=circus_name):
        circus.load_generator(gen_type=gen_type, gen_id=gen_id)

    return circus

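# Hypothetical usage sketch: reload a circus persisted by a previous run and
# hand it to the export helpers defined below. The namespace name "snd_v1" is
# made up; any namespace previously saved through `db` would work.
#
#   circus = load_from_db("snd_v1")
#   save_pos_as_mobile_sync_csv(circus)
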
def save_pos_as_partial_ids_csv(circus, params):
    target_file = os.path.join(db.namespace_folder(circus.name),
                               "pos_id_msisdn.csv")

    # Right now, all POS sell all products, so they will all appear for all
    # products in the reference table for partial_ids
    pos_df = circus.actors["pos"].to_dataframe().reset_index()
    pos_df = pos_df.rename(columns={
        "MONGO_ID": "id",
        "index": "partial_id",
        "CONTACT_PHONE": "msisdn",
    })[["id", "partial_id", "msisdn"]]

    partial_ids_dfs = []
    for product in params["products"].keys():
        # copy() so each product gets its own set of rows instead of every
        # iteration mutating the same underlying frame
        tmp_df = pos_df.copy()
        tmp_df["product_type_id"] = product
        partial_ids_dfs.append(tmp_df)

    # pd.concat replaces the removed DataFrame.append
    partial_ids_df = pd.concat(partial_ids_dfs, ignore_index=True)
    partial_ids_df = partial_ids_df.reindex(
        columns=["id", "product_type_id", "partial_id", "msisdn"])

    partial_ids_df.to_csv(target_file, index=False)

def save_providers_csv(circus, params):
    target_file = os.path.join(db.namespace_folder(circus.name),
                               "distributor_agent_product.csv")

    pos_df = circus.actors["pos"].to_dataframe().reset_index()

    provider_dfs = []
    for product in params["products"].keys():

        # links between POS and DIST_L1
        pos_dist_l2 = circus.actors["pos"] \
            .relationships["{}__provider".format(product)].get_relations()
        dist_l2_dist_l1 = circus.actors["dist_l2"] \
            .relationships["{}__provider".format(product)].get_relations()

        # join dist_l1 responsible for pos
        pos_dist_l1 = pd.merge(left=pos_dist_l2,
                               right=dist_l2_dist_l1[["from", "to"]],
                               left_on="to", right_on="from",
                               suffixes=('_pos', '_dist_l1'))

        # join pos mongo id
        pos_dist_l1 = pd.merge(left=pos_dist_l1,
                               right=pos_df[["index", "MONGO_ID"]],
                               left_on="from_pos", right_on="index")

        pos_dist_l1 = pos_dist_l1.rename(columns={
            "MONGO_ID": "agent_id",
            "to_dist_l1": "distributor_id"
        })
        pos_dist_l1["product_type_id"] = product
        pos_dist_l1 = pos_dist_l1.reindex(
            columns=["distributor_id", "agent_id", "product_type_id"])
        provider_dfs.append(pos_dist_l1)

        # links between DIST_L2 and DIST_L1
        dist_l2_dist_l1 = dist_l2_dist_l1.rename(columns={
            "from": "agent_id",
            "to": "distributor_id"
        })
        dist_l2_dist_l1["product_type_id"] = product
        dist_l2_dist_l1 = dist_l2_dist_l1.reindex(
            columns=["distributor_id", "agent_id", "product_type_id"])
        provider_dfs.append(dist_l2_dist_l1)

    providers_df = pd.concat(provider_dfs, ignore_index=True)
    providers_df.to_csv(target_file, index=False)

def save_pos_as_mobile_sync_csv(circus):
    target_file = os.path.join(db.namespace_folder(circus.name),
                               "points_of_interest.csv")

    logging.info(
        "generating a mobile-sync csv pos file in {}".format(target_file))

    pos_df = circus.actors["pos"].to_dataframe().reset_index()
    sites_df = circus.actors["sites"].to_dataframe()

    pos_df = pd.merge(left=pos_df,
                      right=sites_df[["GEO_LEVEL_1", "GEO_LEVEL_2"]],
                      left_on="SITE", right_index=True)

    pos_df = pos_df.rename(
        columns={
            "MONGO_ID": "id",
            "index": "agent_code",
            "AGENT_NAME": "name",
            "CONTACT_NAME": "contact_name",
            "CONTACT_PHONE": "contact_phone_number",
            "LONGITUDE": "longitude",
            "LATITUDE": "latitude",
            "GEO_LEVEL_1": "geo_level_1",
            "GEO_LEVEL_2": "geo_level_2"
        }) \
        .drop(["ATTRACT_BASE", "ATTRACTIVENESS", "ATTRACT_DELTA", "SITE"],
              axis=1)

    pos_df["pos_type"] = "grocery store"
    pos_df["pos_channel"] = "franchise"

    for col in ["picture_uri", "geo_level_3", "geo_level_4", "geo_level_5"]:
        pos_df[col] = "some_{}".format(col)

    for bcol in ["is_pos", "electronic_recharge_activity_flag",
                 "physical_recharge_activity_flag", "sim_activity_flag",
                 "handset_activity_flag", "mfs_activity_flag"]:
        pos_df[bcol] = True

    pos_df = pos_df.reindex(columns=[
        "id", "name", "latitude", "longitude", "agent_code",
        "geo_level_1", "geo_level_2", "geo_level_3", "geo_level_4",
        "geo_level_5", "is_pos", "contact_name", "contact_phone_number",
        "pos_type", "pos_channel", "picture_uri",
        "electronic_recharge_activity_flag",
        "physical_recharge_activity_flag", "sim_activity_flag",
        "handset_activity_flag", "mfs_activity_flag"
    ])

    pos_df.to_csv(target_file, index=False)

def save_params_to_db(self, params_type, params):
    """
    Saves the params object to the circus folder in the DB for future
    reference.

    :param params_type: "build", "run" or "target"
    :param params: the params object
    """
    target_file = os.path.join(db.namespace_folder(self.name),
                               "params_{}.json".format(params_type))

    with open(target_file, "w") as outfile:
        json.dump(params, outfile)

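# Hypothetical usage sketch: `save_params_to_db` takes `self`, so it is
# presumably bound as a method of the circus object. Assuming a `circus` and
# a `params` dict are in scope, persisting and re-reading the build params
# could look like:
#
#   circus.save_params_to_db("build", params)
#   with open(os.path.join(db.namespace_folder(circus.name),
#                          "params_build.json")) as f:
#       reloaded = json.load(f)
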
def create_distl1_daily_targets(product, write_mode):
    """
    Create some fake sellin and sellout targets per distributor l1,
    based on actual sales.
    """
    target_file = os.path.join(
        db.namespace_folder(circus_name),
        "distributor_product_sellin_sellout_target.csv")

    logging.info(" producing sellin sellout target for dist_l1s in {}".format(
        target_file))

    # contains info on dist_l1 bulk purchases (dist_l1 buys from telco)
    # TIME, BUYER_ID, SELLER_ID, OLD_BUYER_STOCK, NEW_BUYER_STOCK, BULK_SIZE
    input_file_name = "output/{}/dist_l1_{}_bulk_purchase_stock.csv".format(
        circus_name, product)

    bulk_purchases = pd.read_csv(input_file_name, parse_dates=[0])
    bulk_purchases["day"] = bulk_purchases["TIME"].apply(
        lambda s: s.strftime("%D"))

    # per buyer: median over days of (number of purchases, total bulk size);
    # named aggregation replaces the removed dict-of-funcs agg syntax
    mean_daily_sells = bulk_purchases \
        .groupby(["BUYER_ID", "day"])["BULK_SIZE"] \
        .agg(target_units="size", target_value="sum") \
        .groupby(level=0) \
        .median()

    for direction in ["sellin", "sellout"]:
        for metric in ["target_units", "target_value"]:
            col = "_".join([direction, metric])
            if metric == "target_units":
                # plain int replaces the removed np.int alias
                mean_daily_sells[col] = noisified(mean_daily_sells,
                                                  col=metric, lb=1,
                                                  col_type=int)
            else:
                mean_daily_sells[col] = noisified(mean_daily_sells,
                                                  col=metric, lb=100)

    mean_daily_sells.drop(["target_units", "target_value"], axis=1,
                          inplace=True)
    mean_daily_sells.reset_index(inplace=True)
    mean_daily_sells["product_type_id"] = product
    mean_daily_sells.rename(columns={"BUYER_ID": "distributor_id"},
                            inplace=True)

    to_csv(mean_daily_sells, target_file, write_mode)

def create_distl1_daily_geo_targets(product, write_mode, nrows=None):
    """
    Create some fake daily geo_l2 targets per product/distributor.
    """
    target_file = os.path.join(
        db.namespace_folder(circus_name),
        "distributor_product_geol2_sellout_target.csv")

    logging.info(" producing geo_l2 sellout target for dist_l1s in {}".format(
        target_file))

    # contains info on customer purchases
    # CUST_ID, SITE, POS, CELL_ID, geo_level2_id, distributor_l1, INSTANCE_ID,
    # PRODUCT_ID, FAILED_SALE_OUT_OF_STOCK, TX_ID, VALUE, TIME
    input_file_name = "output/{}/customer_{}_purchase.csv".format(
        circus_name, product)

    customer_purchases = pd.read_csv(input_file_name, parse_dates=[11],
                                     nrows=nrows)
    customer_purchases["day"] = customer_purchases["TIME"].apply(
        lambda s: s.strftime("%D"))
    customer_purchases["product_type_id"] = product

    # per (product, distributor, geo_level2): median over days of
    # (number of purchases, total value)
    mean_daily_sells = customer_purchases \
        .groupby(["product_type_id", "distributor_l1",
                  "geo_level2_id", "day"])["VALUE"] \
        .agg(sellout_target_units="size", sellout_target_value="sum") \
        .groupby(level=[0, 1, 2]) \
        .median() \
        .reset_index()

    mean_daily_sells = mean_daily_sells.rename(
        columns={"distributor_l1": "distributor_id"})

    mean_daily_sells["sellout_target_units"] = noisified(
        mean_daily_sells, col="sellout_target_units", lb=25, col_type=int)
    mean_daily_sells["sellout_target_value"] = noisified(
        mean_daily_sells, col="sellout_target_value", lb=100)

    to_csv(mean_daily_sells, target_file, write_mode)

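# `noisified` is defined elsewhere in this scenario. Judging from the call
# sites above, it takes a dataframe, a column name, a lower bound `lb` and an
# optional output dtype. A plausible sketch (the multiplicative gaussian
# noise model is an assumption, not the actual implementation):
#
#   def noisified(df, col, lb, col_type=np.float64):
#       noise = np.random.normal(loc=1.0, scale=0.1, size=df.shape[0])
#       return np.maximum(df[col] * noise, lb).astype(col_type)
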
def build_site_product_pos_target(circus, params):
    """
    Generates some random target of amount of pos per site, based on the
    actual number of pos per site.
    """
    target_file = os.path.join(db.namespace_folder(circus.name),
                               "site_product_pos_target.csv")

    sites = circus.actors["sites"]

    target_action = operations.Chain(
        sites.relationships["POS"].ops.get_neighbourhood_size(
            from_field="site_id",
            named_as="pos_count_target"),
        operations.FieldLogger(log_id="logs"))

    sites_df = pd.DataFrame({"site_id": sites.ids})
    _, logs = target_action(sites_df)

    target_df = logs["logs"]

    # cross join with the products: one target row per (site, product)
    target_df["cartesian_product"] = "cp"
    products = pd.DataFrame({
        "product_type_id": list(params["products"].keys()),
        "cartesian_product": "cp"
    })
    target_df = pd.merge(left=target_df, right=products,
                         on="cartesian_product")

    # add +/- 10% gaussian noise to the actual pos counts, floored at 10;
    # .loc replaces the removed .ix indexer
    fact = np.random.normal(1, .1, size=target_df.shape[0])
    target_df["pos_count_target"] = target_df["pos_count_target"] * fact
    target_df["pos_count_target"] = target_df["pos_count_target"].astype(int)
    target_df.loc[target_df["pos_count_target"] < 10, "pos_count_target"] = 10

    target_df.drop(["cartesian_product"], axis=1, inplace=True)

    target_df.to_csv(target_file, index=False)