def test_select_one_from_many_times_same_id_should_yield_different_results():
    op = four_to_plenty.ops.select_one(from_field="DEALER", named_as="SIM",
                                       one_to_one=True)

    # Several customers pick the same dealer and each wants a SIM from it.
    # Each of the 2 selected dealers is expected to hand out distinct SIMs.
    story_data = pd.DataFrame(
        {"DEALER": ["a", "a", "b", "a", "b", "a", "b", "a", "a", "a"]},
        index=build_ids(size=10, prefix="c", max_length=2))

    result, logs = op(story_data)
    logging.info("selected")

    assert {} == logs
    assert ["DEALER", "SIM"] == result.columns.tolist()

    # Collisions may shrink the size of the resulting index, but there
    # should not be only collisions, leading to just one row per dealer
    assert result.shape[0] > 3

    sims_per_dealer = result.groupby("DEALER")["SIM"]
    assert len(np.unique(sims_per_dealer.get_group("a").values)) > 1
    assert len(np.unique(sims_per_dealer.get_group("b").values)) > 1
def test_make_random_assign_shoud_assign_each_element_only_once():
    dealers = build_ids(size=10, prefix="DEALER_", max_length=2)
    sims = build_ids(size=1000, prefix="SIM_", max_length=4)

    assignment = make_random_assign(set1=sims, set2=dealers, seed=10)

    # every sim must have been assigned, exactly once
    assert assignment.shape == (1000, 2)

    # the set of assigned sims must be exactly the input set of sims
    assert set(sims) == set(assignment["set1"].unique().tolist())

    # every chosen owner must belong to the dealer population
    owners = set(assignment["chosen_from_set2"].unique().tolist())
    assert owners <= set(dealers)
def create_dealers_and_sims_stock(self):
    """
    Create the DEALER population, together with its initial SIM stock
    held as a "SIM" relationship from dealer to sim ids.
    """
    logging.info("Creating dealer and their SIM stock ")

    dealers = self.create_population(
        name="dealers",
        size=params["n_dealers"],
        ids_gen=SequencialGenerator(prefix="DEALER_", max_length=3))

    # SIM relationship to maintain some stock
    sim_stock = dealers.create_relationship(name="SIM")

    sim_ids = build_ids(size=params["n_init_sims_dealer"], prefix="SIM_")
    assignment = make_random_assign(set1=sim_ids,
                                    set2=dealers.ids,
                                    seed=next(self.seeder))
    sim_stock.add_relations(from_ids=assignment["chosen_from_set2"],
                            to_ids=assignment["set1"])

    # one more dealer with just 3 sims in stock => this one will trigger
    # lot's of failed sales
    broke_dealer_stock = pd.DataFrame({
        "DEALER": "broke_dealer",
        "SIM": ["SIM_OF_BROKE_DEALER_%d" % s for s in range(3)]
    })
    sim_stock.add_relations(from_ids=broke_dealer_stock["DEALER"],
                            to_ids=broke_dealer_stock["SIM"])

    return dealers
def generate(self, size):
    """
    Produce the next `size` sequential ids of this generator.

    :param size: number of ids to produce; cast to int to guard against
        floating point values, which can happen when the requested size
        results from some scaling.
    :return: the list of generated ids
    """
    size_i = int(size)
    values = build_ids(size_i, self.counter, self.prefix, self.max_length)
    # advance the sequence so the next call yields fresh ids
    self.counter += size_i
    return values
def test_select_many_operation_should_join_subsets_of_relationships():
    # same test as above, but going through the operation instead
    story_data = pd.DataFrame(
        {"let": ["a", "b", "c", "b", "a"],
         "how_many": [4, 5, 6, 7, 8]},
        index=build_ids(5, prefix="wh_", max_length=2))

    select_op = four_to_plenty.ops.select_many(
        from_field="let",
        named_as="found",
        pop=False,
        quantity_field="how_many",
        discard_missing=False,
    )

    selection, logs = select_op(story_data)

    # the index must be preserved so the result can be merged directly
    # back into the initial request
    assert selection.sort_index().index.equals(story_data.sort_index().index)
    assert selection.columns.tolist() == ["how_many", "let", "found"]

    # no capping should have occurred: four_to_plenty has largely enough
    assert selection["found"].apply(len).tolist() == [4, 5, 6, 7, 8]

    # every chosen element should be present at most once
    all_found = functools.reduce(lambda s1, s2: set(s1) | set(s2),
                                 selection["found"])
    assert len(all_found) == np.sum([4, 5, 6, 7, 8])

    # every selected element must come from the relationship of the
    # "let" value of its row
    a_tos = four_to_plenty.get_relations(["a"])["to"]
    b_tos = four_to_plenty.get_relations(["b"])["to"]
    c_tos = four_to_plenty.get_relations(["c"])["to"]
    expected_sources = {
        "wh_00": a_tos, "wh_04": a_tos,
        "wh_01": b_tos, "wh_03": b_tos,
        "wh_02": c_tos,
    }
    for row_id, tos in expected_sources.items():
        for found in selection.loc[row_id, "found"]:
            assert found in tos.values
def test_drop_should_remove_the_rows_where_condition_is_true_():
    cdrs = pd.DataFrame(np.random.rand(12, 3),
                        columns=["A", "B", "duration"])
    cdrs.index = build_ids(12, prefix="ix_", max_length=2)
    # True for rows 0-2 and 6-8, False elsewhere
    cdrs["cond"] = ([True] * 3 + [False] * 3) * 2

    rem = operations.DropRow(condition_field="cond")
    story_data, all_logs = rem(cdrs)

    # only the rows where cond is False should survive
    kept_index = ["ix_03", "ix_04", "ix_05", "ix_09", "ix_10", "ix_11"]

    # 6 rows should have been removed
    assert story_data.shape == (6, 4)
    assert story_data.columns.tolist() == ["A", "B", "duration", "cond"]
    for col in ["A", "B", "duration"]:
        assert story_data[col].equals(cdrs.loc[kept_index][col])
def test_select_many_should_return_subsets_of_relationships():
    """
    select_many on seeded relationships should honour the requested
    quantities (rounding non-integer ones) and be reproducible when
    remove_selected is False.
    """
    story_data_index = build_ids(5, prefix="cl_", max_length=1)

    # cheating with the seed for the second part of the test
    four_to_plenty.state = np.random.RandomState(18)
    selection = four_to_plenty.select_many(
        from_ids=pd.Series(["a", "b", "c", "b", "a"],
                           index=story_data_index),
        named_as="selected_sets",

        # On purpose requesting non-integer quantities => these should be
        # rounded to int. It's very common to have them in practice,
        # typically when generating "bulk size" out of a non-integer
        # distribution
        quantities=[4, 5, 6.5, 7.5, 8],
        remove_selected=False,
        discard_empty=False)

    # this index is expected among other things since it allows a direct
    # merge into the initial request
    assert sorted(selection.index.tolist()) == story_data_index
    assert selection.columns.tolist() == ["selected_sets"]

    # no capping should have occurred: four_to_plenty has largely enough
    assert sorted(
        selection["selected_sets"].apply(len).tolist()) == [4, 5, 6, 7, 8]

    # every chosen element should be present at most once
    s = functools.reduce(lambda s1, s2: set(s1) | set(s2),
                         selection["selected_sets"])
    assert len(s) == np.sum([4, 5, 6, 7, 8])

    # selecting the same thing => should return the same result since
    # remove_selected is False and the relationship is seeded
    four_to_plenty.state = np.random.RandomState(18)
    selection_again = four_to_plenty.select_many(
        from_ids=pd.Series(["a", "b", "c", "b", "a"],
                           index=story_data_index),
        named_as="selected_sets",
        quantities=[4, 5, 6, 7, 8],
        remove_selected=False,
        discard_empty=False)

    assert selection.sort_index().index.equals(
        selection_again.sort_index().index)
    for idx in selection.index:
        # .loc replaces the deprecated .ix accessor (removed in pandas 1.0)
        assert selection.loc[idx]["selected_sets"].tolist() == \
            selection_again.loc[idx]["selected_sets"].tolist()
def test_select_many_with_drop_should_remove_elements():
    """
    select_many with remove_selected=True must pop the selected "to" ids
    out of the relationship.
    """
    story_data_index = build_ids(5, prefix="cl_", max_length=1)

    # makes a fresh relationship since we're going to drop some elements
    # (must not mutate the shared four_to_plenty fixture)
    four_to_plenty_copy = Relationship(seed=1)
    for i in range(100):
        four_to_plenty_copy.add_relations(
            from_ids=["a", "b", "c", "d"],
            to_ids=["a_%d" % i, "b_%d" % i, "c_%d" % i, "d_%d" % i])

    # keep the mapping story id -> selected "from" id so we can check the
    # right relationship rows below
    from_ids = pd.Series(["a", "b", "c", "b", "a"], index=story_data_index)

    # BUGFIX: select from the copy, not from the shared four_to_plenty
    # fixture (the previous code mutated the fixture and then ran vacuous
    # assertions against the untouched copy)
    selection = four_to_plenty_copy.select_many(
        from_ids=from_ids,
        named_as="selected_sets",
        quantities=[4, 5, 6, 7, 8],
        remove_selected=True,
        discard_empty=False)

    # makes sure all selected values have been removed
    for story_id in selection.index:
        # get_relations expects the relationship's "from" ids ("a", "b",
        # ...), not the story index ids ("cl_0", ...)
        remaining = four_to_plenty_copy.get_relations(
            from_ids=[from_ids[story_id]])
        # .loc replaces the deprecated .ix accessor; membership is checked
        # against the Series values — `in` on a Series tests its index
        for to_id in selection.loc[story_id]["selected_sets"].tolist():
            assert to_id not in remaining["to"].values
def create_subs_and_sims(self):
    """
    Creates the subs and sims + a relationship between them + an agent
    relationship.

    We have at least one sim per subs: sims.size >= subs.size

    The sims population contains the "OPERATOR", "MAIN_ACCT" and "MSISDN"
    attributes.

    The subs population has a "SIMS" relationship that points to the sims
    owned by each subs.

    The sims population also has a relationship to the set of agents where
    this sim can be topped up.

    :return: (subs population, sims population, recharge amount generator)
    """
    npgen = RandomState(seed=next(self.seeder))

    # subs are empty here but will receive a "CELLS" and "EXCITABILITY"
    # attributes later on
    subs = self.create_population(
        name="subs",
        size=self.params["n_subscribers"],
        ids_gen=SequencialGenerator(prefix="SUBS_"))

    # each subs has between 1 and 4 operators
    number_of_operators = npgen.choice(a=range(1, 5), size=subs.size)
    operator_ids = build_ids(size=4, prefix="OPERATOR_", max_length=1)

    def pick_operators(qty):
        """
        randomly choose a set of unique operators of specified size
        """
        return npgen.choice(a=operator_ids,
                            p=[.8, .05, .1, .05],
                            size=qty,
                            replace=False).tolist()

    # set of operators of each subs
    subs_operators_list = map(pick_operators, number_of_operators)

    # Dataframe with 4 columns for the 1rst, 2nd,... operator of each subs.
    # Since subs_operators_list don't all have the same size, some entries
    # of this dataframe contain None, which are just discarded by the
    # stack() below
    subs_operators_df = pd.DataFrame(data=list(subs_operators_list),
                                     index=subs.ids)

    # same info, vertically: the index contains the sub id (with duplicates)
    # and "operator" one of the operators of this subs
    subs_ops_mapping = subs_operators_df.stack()
    subs_ops_mapping.index = subs_ops_mapping.index.droplevel(level=1)

    # SIM population, each with an OPERATOR and MAIN_ACCT attributes;
    # one sim per (subs, operator) pair, hence sims.size >= subs.size
    sims = self.create_population(
        name="sims",
        size=subs_ops_mapping.size,
        ids_gen=SequencialGenerator(prefix="SIMS_"))
    sims.create_attribute("OPERATOR", init_values=subs_ops_mapping.values)

    # every sim starts with the same main account balance
    recharge_gen = ConstantGenerator(value=1000.)
    sims.create_attribute(name="MAIN_ACCT", init_gen=recharge_gen)

    # keeping track of the link between population and sims as a
    # relationship: sims.ids is aligned with subs_ops_mapping row by row
    sims_of_subs = subs.create_relationship("SIMS")
    sims_of_subs.add_relations(from_ids=subs_ops_mapping.index,
                               to_ids=sims.ids)

    msisdn_gen = MSISDNGenerator(
        countrycode="0032",
        prefix_list=["472", "473", "475", "476", "477", "478", "479"],
        length=6,
        seed=next(self.seeder))
    sims.create_attribute(name="MSISDN", init_gen=msisdn_gen)

    # Finally, adding one more relationship that defines the set of
    # possible shops where we can topup each SIM.
    # TODO: to make this a bit more realistic, we should probably generate
    # such relationship first from the subs to their favourite shops, and
    # then copy that info to each SIM, maybe with some fluctuations to
    # account for the fact that not all shops provide topups of all
    # operators.
    agents = build_ids(self.params["n_agents"],
                       prefix="AGENT_",
                       max_length=3)

    # random bipartite sim <-> agent graph with connection probability 0.3
    agent_df = pd.DataFrame.from_records(
        make_random_bipartite_data(sims.ids, agents, 0.3,
                                   seed=next(self.seeder)),
        columns=["SIM_ID", "AGENT"])

    logging.info(" creating random sim/agent relationship ")
    sims_agents_rel = sims.create_relationship("POSSIBLE_AGENTS")

    # exponential weights => a few preferred agents per sim
    agent_weight_gen = NumpyRandomGenerator(method="exponential",
                                            scale=1.,
                                            seed=next(self.seeder))

    sims_agents_rel.add_relations(
        from_ids=agent_df["SIM_ID"],
        to_ids=agent_df["AGENT"],
        weights=agent_weight_gen.generate(agent_df.shape[0]))

    return subs, sims, recharge_gen