def _default_stats(agg):
    """
    Aggregate a list of values into an OrderedDict of summary statistics
    with keys ``'mean'`` and ``'std'``.

    ``None`` entries are discarded before aggregating. If ``agg`` is None,
    empty, or contains only ``None`` values, both statistics are ``None``.
    """
    # Filter None values *before* the emptiness test: some indicator
    # functions may return None, and an input made entirely of None values
    # must yield (None, None) instead of calling mean()/std() on an empty
    # list (the original checked len(agg) before filtering).
    if agg is not None:
        agg = [x for x in agg if x is not None]
    if not agg:
        return OrderedDict([('mean', None), ('std', None)])
    return OrderedDict([('mean', mean(agg)), ('std', std(agg))])
def flatten(d, parent_key='', separator='__'):
    """
    Flatten a nested dictionary.

    Parameters
    ----------
    d: dict_like
        Dictionary to flatten.
    parent_key: string, optional
        Concatenated names of the parent keys.
    separator: string, optional
        Separator between the names of the each key.
        The default separator is '__'.

    Examples
    --------

    >>> d = {'alpha': 1, 'beta': {'a': 10, 'b': 42}}
    >>> flatten(d) == {'alpha': 1, 'beta__a': 10, 'beta__b': 42}
    True
    >>> flatten(d, separator='.') == {'alpha': 1, 'beta.a': 10, 'beta.b': 42}
    True
    """
    items = []
    for k, v in d.items():
        new_key = parent_key + separator + k if parent_key else k
        if isinstance(v, dict):
            # Recurse into nested dictionaries, carrying the composed key
            items.extend(flatten(v, new_key, separator).items())
        else:
            items.append((new_key, v))
    return OrderedDict(items)
def _read_network(user, records_path, attributes_path, read_function, antennas_path=None): connections = {} correspondents = set([record.correspondent_id for record in user.records]) # Try to load all the possible correspondent files for c_id in sorted(correspondents): correspondent_file = os.path.join(records_path, c_id + ".csv") if os.path.exists(correspondent_file): correspondent_user = read_csv(c_id, records_path, antennas_path, attributes_path, describe=False, network=False) else: correspondent_user = None connections[c_id] = correspondent_user # Return the network dictionary sorted by key return OrderedDict(sorted(connections.items(), key=lambda t: t[0]))
def divide_parameters(split_week, split_day, interaction):
    """
    Build the OrderedDict of grouping parameters used to split indicators
    by part of week, part of day, and (optionally) interaction type.

    Parameters
    ----------
    split_week : bool
        If True, also include 'weekday' and 'weekend' splits.
    split_day : bool
        If True, also include 'day' and 'night' splits.
    interaction : str, list, or None
        Interaction type(s); a single string is wrapped in a list. Falsy
        values omit the 'interaction' key entirely.
    """
    if isinstance(interaction, str):
        interaction = [interaction]
    part_of_day = ['allday'] + (['day', 'night'] if split_day else [])
    part_of_week = ['allweek'] + (['weekday', 'weekend'] if split_week else [])
    # Build the common part once and append 'interaction' only when given,
    # instead of duplicating the OrderedDict construction in both branches.
    parameters = OrderedDict([('part_of_week', part_of_week),
                              ('part_of_day', part_of_day)])
    if interaction:
        parameters['interaction'] = interaction
    return parameters
def filter_record(records):
    """
    Filter records and remove items with missing or inconsistent fields.

    Parameters
    ----------
    records : list
        A list of Record objects

    Returns
    -------
    records, ignored : (Record list, dict)
        A tuple of filtered records, and a dictionary counting the
        missing fields
    """
    # One validation test per field; a record is kept only if all pass.
    scheme = {
        'interaction': lambda r: r.interaction in ['call', 'text'],
        'direction': lambda r: r.direction in ['in', 'out'],
        'correspondent_id': lambda r: r.correspondent_id is not None,
        'datetime': lambda r: isinstance(r.datetime, datetime),
        # call_duration is only required for calls
        'call_duration': lambda r: isinstance(r.call_duration, (int, float))
        if r.interaction == 'call' else True
    }
    ignored = OrderedDict([('all', 0),
                           ('interaction', 0),
                           ('direction', 0),
                           ('correspondent_id', 0),
                           ('datetime', 0),
                           ('call_duration', 0)])

    def _filter(records):
        # Fixed: `scheme.iteritems()` is Python 2 only (AttributeError on
        # Python 3); `.items()` works on both. Also removed a dead
        # `global removed` statement that referenced an undefined name.
        for r in records:
            valid = True
            for key, test in scheme.items():
                if not test(r):
                    ignored[key] += 1
                    valid = False
                    # Not breaking, we count all fields with errors
            if valid is True:
                yield r
            else:
                ignored['all'] += 1

    return list(_filter(records)), ignored
def all(user, groupby='week', summary='default', attributes=True, flatten=False):
    """
    Returns a dictionary containing all bandicoot indicators for the user,
    as well as reporting variables.

    Parameters
    ----------
    user : User
        The user to summarize.
    groupby : str, optional
        'week' to compute indicators per week, otherwise a single bin.
    summary : str, optional
        Summary type forwarded to each indicator function.
    attributes : bool, optional
        Whether to include the user's attributes in the returned dict.
    flatten : bool, optional
        Whether to flatten the nested result with the module-level
        ``flatten`` helper.
    """
    # Weekly grouping yields a list of per-week values, hence the
    # 'distribution_*' datatypes; otherwise a single value is computed.
    scalar_type = 'distribution_scalar' if groupby == 'week' else 'scalar'
    summary_type = 'distribution_summarystats' if groupby == 'week' else 'summarystats'
    # partial() objects have no __name__, which is needed below to key the
    # result dict, so set one explicitly.
    number_of_interactions_in = partial(bc.individual.number_of_interactions, direction='in')
    number_of_interactions_in.__name__ = 'number_of_interaction_in'
    number_of_interactions_out = partial(bc.individual.number_of_interactions, direction='out')
    number_of_interactions_out.__name__ = 'number_of_interaction_out'
    # (indicator function, expected datatype) pairs computed below.
    # NOTE(review): several names here (percent_initiated_conversation,
    # balance_contacts, interevents_time, number_of_contacts_xpercent_*,
    # bc.spatial.number_of_places/entropy_places/frequent_locations) differ
    # from the other `all` implementation in this file — presumably an
    # older API revision; confirm they still exist in bc.individual and
    # bc.spatial.
    functions = [
        (bc.individual.active_days, scalar_type),
        (bc.individual.number_of_contacts, scalar_type),
        (bc.individual.call_duration, summary_type),
        (bc.individual.percent_nocturnal, scalar_type),
        (bc.individual.percent_initiated_conversation, scalar_type),
        (bc.individual.percent_initiated_interactions, scalar_type),
        (bc.individual.response_delay_text, summary_type),
        (bc.individual.response_rate_text, scalar_type),
        (bc.individual.entropy_of_contacts, scalar_type),
        (bc.individual.balance_contacts, summary_type),
        (bc.individual.interactions_per_contact, summary_type),
        (bc.individual.interevents_time, summary_type),
        (bc.individual.number_of_contacts_xpercent_interactions, scalar_type),
        (bc.individual.number_of_contacts_xpercent_durations, scalar_type),
        (bc.individual.number_of_interactions, scalar_type),
        (number_of_interactions_in, scalar_type),
        (number_of_interactions_out, scalar_type),
        (bc.spatial.number_of_places, scalar_type),
        (bc.spatial.entropy_places, scalar_type),
        (bc.spatial.percent_at_home, scalar_type),
        (bc.spatial.radius_of_gyration, scalar_type),
        (bc.spatial.frequent_locations, scalar_type)
    ]
    # Materialize each group of records so they can be counted below.
    groups = [[r for r in g] for g in group_records(user, groupby=groupby)]
    reporting = OrderedDict([
        ('antennas_path', user.antennas_path),
        ('attributes_path', user.attributes_path),
        ('version', bc.__version__),
        ('groupby', groupby),
        # `x and str(x)` maps None to None instead of the string 'None'
        ('start_time', user.start_time and str(user.start_time)),
        ('end_time', user.end_time and str(user.end_time)),
        ('bins', len(groups)),
        ('has_call', user.has_call),
        ('has_text', user.has_text),
        ('has_home', user.has_home),
        ('percent_records_missing_location', bc.helper.tools.percent_records_missing_location(user)),
        ('antennas_missing_locations', bc.helper.tools.antennas_missing_locations(user)),
        ('percent_outofnetwork_calls', user.percent_outofnetwork_calls),
        ('percent_outofnetwork_texts', user.percent_outofnetwork_texts),
        ('percent_outofnetwork_contacts', user.percent_outofnetwork_contacts),
        ('percent_outofnetwork_call_durations', user.percent_outofnetwork_call_durations),
    ])
    if user.records is not None:
        reporting['number_of_records'] = sum(map(len, groups))
    else:
        reporting['number_of_records'] = 0.
    if user.ignored_records is not None:
        reporting['ignored_records'] = user.ignored_records
    returned = OrderedDict([('name', user.name), ('reporting', reporting)])
    for fun, datatype in functions:
        try:
            # Most indicators accept a summary kwarg ...
            metric = fun(user, groupby=groupby, summary=summary, datatype=datatype)
        except ValueError:
            # ... those that don't raise ValueError; retry without it.
            metric = fun(user, groupby=groupby, datatype=datatype)
        returned[fun.__name__] = metric
    if attributes and user.attributes != {}:
        returned['attributes'] = user.attributes
    if flatten is True:
        # The parameter shadows the module-level helper, hence globals().
        return globals()['flatten'](returned)
    return returned
def all(user, groupby='week', summary='default', network=False,
        split_week=False, split_day=False, attributes=True, flatten=False):
    """
    Returns a dictionary containing all bandicoot indicators for the user,
    as well as reporting variables.

    Relevant indicators are defined in the 'individual', and 'spatial'
    modules.

    =================================== =======================================================================
    Reporting variables                 Description
    =================================== =======================================================================
    antennas_path                       path of the CSV file containing antennas locations
    attributes_path                     directory where attributes were loaded
    version                             bandicoot version
    groupby                             grouping method ('week' or None)
    split_week                          whether or not indicators are also computed for weekday and weekend
    split_day                           whether or not indicators are also computed for day and night
    start_time                          time of the first record
    end_time                            time of the last record
    night_start, night_end              start and end time to define nights
    weekend                             days used to define the weekend (``[6, 7]`` by default, where 1 is Monday)
    bins                                number of weeks if the record are grouped
    has_call                            whether or not records include calls
    has_text                            whether or not records include texts
    has_home                            whether or not a :meth:`home location <bandicoot.core.User.recompute_home>` has been found
    has_network                         whether or not correspondents where loaded
    percent_records_missing_location    percentage of records without location
    antennas_missing_locations          number of antennas missing a location
    percent_outofnetwork_calls          percentage of calls, received or emitted, made with a correspondant not loaded in the network
    percent_outofnetwork_texts          percentage of texts with contacts not loaded in the network
    percent_outofnetwork_contacts       percentage of contacts not loaded in the network
    percent_outofnetwork_call_durations percentage of minutes of calls where the contact was not loaded in the network
    number_of_records                   total number of records
    =================================== =======================================================================

    We also include a last set of reporting variables, for the records
    ignored at load-time. Values can be ignored due to missing or
    inconsistent fields  (e.g., not including a valid 'datetime' value).

    .. code-block:: python

        {
            'all': 0,
            'interaction': 0,
            'direction': 0,
            'correspondent_id': 0,
            'datetime': 0,
            'call_duration': 0
        }

    with the total number of records ignored (key ``'all'``), as well as the
    number of records with faulty values for each columns.
    """
    # Warn the user if they are selecting weekly and there's only one week
    if groupby is not None:
        if len(set(DATE_GROUPERS[groupby](r.datetime) for r in user.records)) <= 1:
            # Fixed: `print warning_str(...)` is Python 2 print-statement
            # syntax and a SyntaxError on Python 3; the call form works on
            # both interpreters.
            print(warning_str('Grouping by week, but all data is from the same week!'))

    # Weekly grouping yields a list of per-week values, hence the
    # 'distribution_*' datatypes; otherwise a single value is computed.
    scalar_type = 'distribution_scalar' if groupby == 'week' else 'scalar'
    summary_type = 'distribution_summarystats' if groupby == 'week' else 'summarystats'

    # partial() objects have no __name__, which is needed below to key the
    # result dict, so set one explicitly.
    number_of_interactions_in = partial(bc.individual.number_of_interactions, direction='in')
    number_of_interactions_in.__name__ = 'number_of_interaction_in'
    number_of_interactions_out = partial(bc.individual.number_of_interactions, direction='out')
    number_of_interactions_out.__name__ = 'number_of_interaction_out'

    # (indicator function, expected datatype) pairs computed below.
    functions = [
        (bc.individual.active_days, scalar_type),
        (bc.individual.number_of_contacts, scalar_type),
        (bc.individual.call_duration, summary_type),
        (bc.individual.percent_nocturnal, scalar_type),
        (bc.individual.percent_initiated_conversations, scalar_type),
        (bc.individual.percent_initiated_interactions, scalar_type),
        (bc.individual.response_delay_text, summary_type),
        (bc.individual.response_rate_text, scalar_type),
        (bc.individual.entropy_of_contacts, scalar_type),
        (bc.individual.balance_of_contacts, summary_type),
        (bc.individual.interactions_per_contact, summary_type),
        (bc.individual.interevent_time, summary_type),
        (bc.individual.percent_pareto_interactions, scalar_type),
        (bc.individual.percent_pareto_durations, scalar_type),
        (bc.individual.number_of_interactions, scalar_type),
        (number_of_interactions_in, scalar_type),
        (number_of_interactions_out, scalar_type),
        (bc.spatial.number_of_antennas, scalar_type),
        (bc.spatial.entropy_of_antennas, scalar_type),
        (bc.spatial.percent_at_home, scalar_type),
        (bc.spatial.radius_of_gyration, scalar_type),
        (bc.spatial.frequent_antennas, scalar_type),
        (bc.spatial.churn_rate, scalar_type)
    ]

    # Network indicators take only the user and are computed separately.
    network_functions = [
        bc.network.clustering_coefficient_unweighted,
        bc.network.clustering_coefficient_weighted,
        bc.network.assortativity_attributes,
        bc.network.assortativity_indicators
    ]

    # Materialize each group of records so they can be counted below.
    groups = [[r for r in g] for g in group_records(user, groupby=groupby)]

    reporting = OrderedDict([
        ('antennas_path', user.antennas_path),
        ('attributes_path', user.attributes_path),
        ('version', bc.__version__),
        ('groupby', groupby),
        ('split_week', split_week),
        ('split_day', split_day),
        # `x and str(x)` maps None to None instead of the string 'None'
        ('start_time', user.start_time and str(user.start_time)),
        ('end_time', user.end_time and str(user.end_time)),
        ('night_start', str(user.night_start)),
        ('night_end', str(user.night_end)),
        ('weekend', user.weekend),
        ('bins', len(groups)),
        ('has_call', user.has_call),
        ('has_text', user.has_text),
        ('has_home', user.has_home),
        ('has_network', user.has_network),
        ('percent_records_missing_location', bc.helper.tools.percent_records_missing_location(user)),
        ('antennas_missing_locations', bc.helper.tools.antennas_missing_locations(user)),
        ('percent_outofnetwork_calls', user.percent_outofnetwork_calls),
        ('percent_outofnetwork_texts', user.percent_outofnetwork_texts),
        ('percent_outofnetwork_contacts', user.percent_outofnetwork_contacts),
        ('percent_outofnetwork_call_durations', user.percent_outofnetwork_call_durations),
    ])

    if user.records is not None:
        reporting['number_of_records'] = len(user.records)
    else:
        reporting['number_of_records'] = 0.

    if user.ignored_records is not None:
        reporting['ignored_records'] = user.ignored_records

    returned = OrderedDict([
        ('name', user.name),
        ('reporting', reporting)
    ])

    for fun, datatype in functions:
        try:
            # Most indicators accept a summary kwarg ...
            metric = fun(user, groupby=groupby, summary=summary,
                         datatype=datatype, split_week=split_week,
                         split_day=split_day)
        except ValueError:
            # ... those that don't raise ValueError; retry without it.
            metric = fun(user, groupby=groupby, datatype=datatype,
                         split_week=split_week, split_day=split_day)
        returned[fun.__name__] = metric

    if network and user.has_network:
        for fun in network_functions:
            returned[fun.__name__] = fun(user)

    if attributes and user.attributes != {}:
        returned['attributes'] = user.attributes

    if flatten is True:
        # The parameter shadows the module-level helper, hence globals().
        return globals()['flatten'](returned)

    return returned
def sample_user(number_records=1482, seed=42, pct_in_network=0.8):
    """
    Generate a fake user with a randomly generated network of
    correspondents, for testing and demos.

    Parameters
    ----------
    number_records : int, optional
        Number of records generated for the ego user.
    seed : int, optional
        Seed for the random generator, making the sample reproducible.
    pct_in_network : float, optional
        Fraction of correspondents whose own records are loaded into the
        network; the rest map to None.
    """
    # Save the global random state so this function does not perturb other
    # users of the random module; it is restored before returning.
    old_state = random.getstate()
    # version=1 pins the legacy seeding algorithm for reproducibility;
    # older interpreters without the keyword fall back to plain seed().
    try:
        random.seed(seed, version=1)
    except TypeError:
        random.seed(seed)
    # Fixed tower ids and (latitude, longitude) locations.
    towers = OrderedDict([
        (704, (42.361013, -71.097868)),
        (705, (42.370849, -71.114613)),
        (706, (42.3667427, -71.1069847)),
        (707, (42.367589, -71.076537)),
        (701, (42.3555, -71.099541)),
        (702, (42.359039, -71.094595)),
        (703, (42.360481, -71.087321))
    ])
    towers_position = [Position(antenna=k, location=v) for k, v in towers.items()]
    ego_records = [random_record(
        position=_choice(towers_position)) for _ in range(number_records)]
    user, _ = bc.io.load(
        "sample_user", ego_records, towers, None, describe=False)

    # create network
    correspondents = set([record.correspondent_id for record in ego_records])
    correspondents = sorted(list(correspondents))

    correspondent_records = {}
    connections = {}
    # Round the in-network count down to an even number so correspondents
    # can later be linked in pairs.
    n_in_network = int(len(correspondents) * pct_in_network)
    n_in_network = (n_in_network // 2) * 2
    in_network_correspondents = _sample(correspondents, n_in_network)

    def reverse_records(records, current_owner):
        # Mirror records to the other side of the conversation: flip the
        # direction and reassign ownership. Mutates `records` in place.
        for r in records:
            r.direction = 'out' if r.direction == 'in' else 'in'
            r.correspondent_id = current_owner
        return records

    # set records from ego
    for c_id in sorted(in_network_correspondents):
        reciprocal_records = [
            r for r in ego_records if r.correspondent_id == c_id]
        reciprocal_records = reverse_records(
            copy.deepcopy(reciprocal_records), "sample_user")
        correspondent_records[c_id] = reciprocal_records

    def generate_group_with_random_links(pct_users_in_group):
        # Draw a group from the not-yet-grouped correspondents (closure
        # over `non_grouped_correspondents`, assigned below) and create
        # random communication links inside it.
        n_in_group = int(len(correspondents) * pct_users_in_group)
        group = _sample(non_grouped_correspondents, n_in_group)

        networkusers_group = list()
        for user in group:
            if user in in_network_correspondents:
                networkusers_group.append(user)

        def create_pair(source):
            # Link `source` with a random member of the group, generate
            # extra records between them, and mirror those records if the
            # target is in-network.
            user_pair = [source, _choice(group)]
            if user_pair[0] in non_grouped_correspondents:
                non_grouped_correspondents.remove(user_pair[0])
            if user_pair[1] in non_grouped_correspondents:
                non_grouped_correspondents.remove(user_pair[1])
            # 'call' listed twice so calls are twice as likely as texts.
            extra_records = [
                random_record(position=_choice(towers_position),
                              interaction=_choice(['text', 'call', 'call']),
                              correspondent_id=user_pair[1])
                for _ in range(_randint(25, 150))]
            correspondent_records[user_pair[0]].extend(extra_records)

            if (user_pair[1] in in_network_correspondents):
                rr = reverse_records(copy.deepcopy(extra_records),
                                     user_pair[0])
                correspondent_records[user_pair[1]].extend(rr)

        # create pairs of users
        for i in range(len(networkusers_group)):
            create_pair(networkusers_group[i])
            # ~50% chance of a second link for the same user
            if random.random() > 0.5:
                create_pair(networkusers_group[i])

    non_grouped_correspondents = copy.deepcopy(correspondents)
    # Four groups of decreasing size: 40%, 30%, 20%, 10% of correspondents.
    for i in range(4):
        generate_group_with_random_links(pct_users_in_group=0.4 - i * 0.1)

    # create user object
    for c_id in sorted(correspondents):
        if c_id in in_network_correspondents:
            correspondent_user, _ = bc.io.load(
                c_id, correspondent_records[c_id], towers, None,
                describe=False)
        else:
            correspondent_user = None
        connections[c_id] = correspondent_user

    # return the network dictionary sorted by key
    user.network = OrderedDict(sorted(connections.items(), key=lambda t: t[0]))
    user.recompute_missing_neighbors()
    random.setstate(old_state)
    return user
def sample_user(number_records=1482, seed=42, pct_in_network=0.8):
    """
    Generate a fake user with a randomly generated network of
    correspondents, for testing and demos.

    Parameters
    ----------
    number_records : int, optional
        Number of records generated for the ego user.
    seed : int, optional
        Seed for the random generator, making the sample reproducible.
    pct_in_network : float, optional
        Fraction of correspondents whose own records are loaded into the
        network; the rest map to None.
    """
    # Save the global random state so this function does not perturb other
    # users of the random module; it is restored before returning.
    old_state = random.getstate()
    # version=1 pins the legacy seeding algorithm for reproducibility;
    # older interpreters without the keyword fall back to plain seed().
    try:
        random.seed(seed, version=1)
    except TypeError:
        random.seed(seed)
    # Fixed tower ids and (latitude, longitude) locations.
    towers = OrderedDict([(704, (42.361013, -71.097868)),
                          (705, (42.370849, -71.114613)),
                          (706, (42.3667427, -71.1069847)),
                          (707, (42.367589, -71.076537)),
                          (701, (42.3555, -71.099541)),
                          (702, (42.359039, -71.094595)),
                          (703, (42.360481, -71.087321))])
    towers_position = [
        Position(antenna=k, location=v) for k, v in towers.items()
    ]
    ego_records = [
        random_record(position=_choice(towers_position))
        for _ in range(number_records)
    ]
    user, _ = bc.io.load("sample_user", ego_records, towers, None,
                         describe=False)

    # create network
    correspondents = set([record.correspondent_id for record in ego_records])
    correspondents = sorted(list(correspondents))

    correspondent_records = {}
    connections = {}
    # Round the in-network count down to an even number so correspondents
    # can later be linked in pairs.
    n_in_network = int(len(correspondents) * pct_in_network)
    n_in_network = (n_in_network // 2) * 2
    in_network_correspondents = _sample(correspondents, n_in_network)

    def reverse_records(records, current_owner):
        # Mirror records to the other side of the conversation: flip the
        # direction and reassign ownership. Mutates `records` in place.
        for r in records:
            r.direction = 'out' if r.direction == 'in' else 'in'
            r.correspondent_id = current_owner
        return records

    # set records from ego
    for c_id in sorted(in_network_correspondents):
        reciprocal_records = [
            r for r in ego_records if r.correspondent_id == c_id
        ]
        reciprocal_records = reverse_records(copy.deepcopy(reciprocal_records),
                                             "sample_user")
        correspondent_records[c_id] = reciprocal_records

    def generate_group_with_random_links(pct_users_in_group):
        # Draw a group from the not-yet-grouped correspondents (closure
        # over `non_grouped_correspondents`, assigned below) and create
        # random communication links inside it.
        n_in_group = int(len(correspondents) * pct_users_in_group)
        group = _sample(non_grouped_correspondents, n_in_group)

        networkusers_group = list()
        for user in group:
            if user in in_network_correspondents:
                networkusers_group.append(user)

        def create_pair(source):
            # Link `source` with a random member of the group, generate
            # extra records between them, and mirror those records if the
            # target is in-network.
            user_pair = [source, _choice(group)]
            if user_pair[0] in non_grouped_correspondents:
                non_grouped_correspondents.remove(user_pair[0])
            if user_pair[1] in non_grouped_correspondents:
                non_grouped_correspondents.remove(user_pair[1])
            # 'call' listed twice so calls are twice as likely as texts.
            extra_records = [
                random_record(position=_choice(towers_position),
                              interaction=_choice(['text', 'call', 'call']),
                              correspondent_id=user_pair[1])
                for _ in range(_randint(25, 150))
            ]
            correspondent_records[user_pair[0]].extend(extra_records)

            if (user_pair[1] in in_network_correspondents):
                rr = reverse_records(copy.deepcopy(extra_records),
                                     user_pair[0])
                correspondent_records[user_pair[1]].extend(rr)

        # create pairs of users
        for i in range(len(networkusers_group)):
            create_pair(networkusers_group[i])
            # ~50% chance of a second link for the same user
            if random.random() > 0.5:
                create_pair(networkusers_group[i])

    non_grouped_correspondents = copy.deepcopy(correspondents)
    # Four groups of decreasing size: 40%, 30%, 20%, 10% of correspondents.
    for i in range(4):
        generate_group_with_random_links(pct_users_in_group=0.4 - i * 0.1)

    # create user object
    for c_id in sorted(correspondents):
        if c_id in in_network_correspondents:
            correspondent_user, _ = bc.io.load(c_id,
                                               correspondent_records[c_id],
                                               towers, None, describe=False)
        else:
            correspondent_user = None
        connections[c_id] = correspondent_user

    # return the network dictionary sorted by key
    user.network = OrderedDict(sorted(connections.items(), key=lambda t: t[0]))
    user.recompute_missing_neighbors()
    random.setstate(old_state)
    return user
def _ordereddict_product(dicts): return [ OrderedDict(zip(dicts, x)) for x in itertools.product(*dicts.values()) ]
def _stats_dict(v):
    """
    Build an OrderedDict mapping each key in ``v`` to the summary
    statistics (via ``_default_stats``) of that attribute collected across
    the items of ``data``.
    """
    # NOTE(review): `data` is a free variable here, not a parameter — this
    # looks like a closure lifted out of `statistics()` (which defines an
    # identical nested helper). As a top-level function it relies on a
    # module-global `data`; confirm this is intentional.
    rv = [(key, _default_stats([getattr(s, key, None) for s in data]))
          for key in v]
    return OrderedDict(rv)
def statistics(data, summary='default', datatype=None):
    """
    Return statistics (mean and standard deviation by default; median,
    skewness, kurtosis, min and max with the 'extended' summary) on data
    metrics.

    Examples
    --------
    Given a list of integers or floating point numbers, ``statistics``
    computes the mean and standard deviation:

    >>> statistics([0, 1, 2, 3])
    {'mean': 1.5, 'std': 1.2910}

    Given a list of ``SummaryStats`` tuples, the function will return the
    requested statistics for each attribute of the tuples.
    """
    def _default_stats(agg):
        # Filter None values *before* the emptiness test: an input made
        # entirely of None values must yield (None, None) instead of
        # calling mean()/std() on an empty list (the original checked
        # len(agg) before filtering).
        if agg is not None:
            agg = [x for x in agg if x is not None]
        if not agg:
            return OrderedDict([('mean', None), ('std', None)])
        return OrderedDict([('mean', mean(agg)), ('std', std(agg))])

    def _stats_dict(v):
        # For each requested key, aggregate that attribute across items.
        rv = [(key, _default_stats([getattr(s, key, None) for s in data]))
              for key in v]
        return OrderedDict(rv)

    summary_keys = {
        'default': ['mean', 'std'],
        'extended': ['mean', 'std', 'median', 'skewness', 'kurtosis',
                     'min', 'max']
    }

    if datatype is None:
        datatype = infer_type(data)

    if datatype == 'scalar':
        return data

    if datatype == 'summarystats':
        if summary is None:
            return data.distribution
        elif summary in ['default', 'extended']:
            rv = [(key, getattr(data, key, None))
                  for key in summary_keys[summary]]
            return OrderedDict(rv)
        else:
            raise ValueError("{} is not a valid summary type".format(summary))

    if datatype == 'distribution_scalar':
        if summary == 'default':
            return _default_stats(data)
        elif summary is None:
            return data
        else:
            raise ValueError("{} is not a valid summary type".format(summary))

    if datatype == 'distribution_summarystats':
        if summary is None:
            return [item.distribution for item in data]
        elif summary in ['extended', 'default']:
            return _stats_dict(summary_keys[summary])
        else:
            raise ValueError("{} is not a valid summary type".format(summary))

    raise ValueError("{} is not a valid data type.".format(datatype))
def user_data(user): """ Compute indicators and statistics used by the visualization and returns a dictionnary. """ # For the dasboard, indicators are computed on a daily basis # and by taking into account empty time windows _range = _group_range(user.records, 'day') export = OrderedDict([('name', 'me'), ('date_range', [x.strftime('%Y-%m-%d') for x in _range]), ('groupby', 'day'), ('indicators', OrderedDict()), ('agg', OrderedDict())]) class Indicator(object): def __init__(self, name, function, interaction=None, **args): self.name = name self.function = function self.interaction = interaction self.args = args I = Indicator import bandicoot.individual as iv indicators_list = [ I('nb_out_call', iv.number_of_interactions, 'call', direction='out'), I('nb_out_text', iv.number_of_interactions, 'text', direction='out'), I('nb_inc_call', iv.number_of_interactions, 'call', direction='in'), I('nb_inc_text', iv.number_of_interactions, 'text', direction='in'), I('nb_out_all', iv.number_of_interactions, 'callandtext', direction='out'), I('nb_inc_all', iv.number_of_interactions, 'callandtext', direction='in'), I('nb_all', iv.number_of_interactions, 'callandtext'), I('response_delay', iv.response_delay_text, 'callandtext'), I('response_rate', iv.response_rate_text, 'callandtext'), I('call_duration', iv.call_duration, 'call'), I('percent_initiated_interactions', iv.percent_initiated_interactions, 'call'), I('percent_initiated_conversations', iv.percent_initiated_interactions, 'callandtext'), I('active_day', iv.active_days, 'callandtext'), I('number_of_contacts', iv.number_of_contacts, 'callandtext'), I('percent_nocturnal', iv.percent_nocturnal, 'callandtext'), I('balance_of_contacts', iv.balance_of_contacts, 'callandtext', weighted=False), ] ARGS = {'groupby': 'day', 'summary': None, 'filter_empty': False} for i in indicators_list: arguments = i.args arguments.update(ARGS) rv = i.function(user, interaction=i.interaction, **arguments) export['indicators'][i.name] = 
rv['allweek']['allday'][i.interaction] # Format percentages from [0, 1] to [0, 100] with_percentage = [ 'percent_initiated_interactions', 'percent_nocturnal', 'response_rate', 'balance_of_contacts', 'percent_initiated_conversations' ] def apply_percent(d): if isinstance(d, list): return [apply_percent(dd) for dd in d] elif d is None: return d else: return 100. * d for i in with_percentage: export['indicators'][i] = apply_percent(export['indicators'][i]) # Day by day network def groupby_day_correspondent(r): return (r.datetime.strftime('%Y-%m-%d'), r.correspondent_id) it = itertools.groupby(user.records, groupby_day_correspondent) export['network'] = [list(key) + [len(list(value))] for key, value in it] return export