def _read_network(user, records_path, attributes_path, read_function,
                  antennas_path=None, extension=".csv"):
    """
    Load the correspondents of `user` and count non-reciprocated records.

    For every distinct ``correspondent_id`` found in ``user.records``, the
    file ``<records_path>/<correspondent_id><extension>`` is looked up. When
    it exists, the correspondent is loaded with `read_function` and each of
    the user's records with that correspondent is checked against the
    correspondent's records using ``_record_match``. A warning is printed
    when at least one record has no match.

    Parameters
    ----------
    user : object
        The current user. Only ``user.records`` is read here, and each
        record's ``correspondent_id``.
    records_path : str
        Directory searched for correspondent files; also forwarded to
        `read_function`.
    attributes_path : str
        Forwarded unchanged to `read_function`.
    read_function : callable
        Called as ``read_function(c_id, records_path, antennas_path,
        attributes_path, describe=False, network=False)``; its return value
        must expose a ``records`` attribute.
    antennas_path : str, optional
        Forwarded unchanged to `read_function`.
    extension : str, default ".csv"
        Suffix appended to a correspondent id to build its file name.

    Returns
    -------
    OrderedDict
        Maps every correspondent id to the loaded user, or to ``None`` when
        no file exists for that id. Keys are sorted.
    """
    # Group the user's records by correspondent once, instead of rescanning
    # the full record list for every correspondent.
    records_by_correspondent = {}
    for record in user.records:
        records_by_correspondent.setdefault(
            record.correspondent_id, []).append(record)

    connections = {}
    num_records_error = 0

    # Try to load all the possible correspondent files
    for c_id in sorted(records_by_correspondent):
        correspondent_file = os.path.join(records_path, c_id + extension)
        if not os.path.exists(correspondent_file):
            connections[c_id] = None
            continue

        correspondent_user = read_function(
            c_id, records_path, antennas_path, attributes_path,
            describe=False, network=False)
        connections[c_id] = correspondent_user

        # A record is an error when none of the correspondent's records
        # matches it.
        for our_record in records_by_correspondent[c_id]:
            if not any(_record_match(our_record, their_record)
                       for their_record in correspondent_user.records):
                num_records_error += 1

    # num_records_error > 0 implies user.records is not empty, so the
    # division below is safe.
    if num_records_error > 0:
        # float() keeps the ratio exact even where `/` is integer division
        # (Python 2 without `from __future__ import division`).
        percent_inconsistent = float(num_records_error) / len(user.records)
        print(warning_str('Warning: {} records of the current user are not reciprocated ({:.2%}).'.format(num_records_error, percent_inconsistent)))

    # Return the network dictionary sorted by key
    return OrderedDict(sorted(connections.items(), key=lambda t: t[0]))
def _read_network(user, records_path, attributes_path, read_function,
                  antennas_path=None, extension=".csv"):
    """
    Load the correspondents of `user` and drop non-reciprocated records.

    For every distinct ``correspondent_id`` found in ``user.records``, the
    file ``<records_path>/<correspondent_id><extension>`` is looked up and,
    when present, loaded with `read_function`. The records of `user` and of
    every loaded correspondent are then filtered in place: a record is kept
    when its correspondent is not loaded, or when
    ``record.has_match(correspondent.records)`` is true. A warning with the
    number of removed records is printed when any record was removed.

    Parameters
    ----------
    user : object
        The current user. ``user.name`` and ``user.records`` are read, and
        ``user.records`` is replaced by the filtered list.
    records_path : str
        Directory searched for correspondent files; also forwarded to
        `read_function`.
    attributes_path : str
        Forwarded unchanged to `read_function`.
    read_function : callable
        Called as ``read_function(c_id, records_path, antennas_path,
        attributes_path, describe=False, network=False, warnings=False)``;
        its return value must expose ``records``.
    antennas_path : str, optional
        Forwarded unchanged to `read_function`.
    extension : str, default ".csv"
        Suffix appended to a correspondent id to build its file name.

    Returns
    -------
    OrderedDict
        Maps every correspondent id to the loaded user, or to ``None`` when
        no file exists for that id. Keys are sorted.
    """
    connections = {}
    correspondent_ids = set(record.correspondent_id for record in user.records)

    # Try to load all the possible correspondent files
    for c_id in sorted(correspondent_ids):
        correspondent_file = os.path.join(records_path, c_id + extension)
        if os.path.exists(correspondent_file):
            connections[c_id] = read_function(
                c_id, records_path, antennas_path, attributes_path,
                describe=False, network=False, warnings=False)
        else:
            connections[c_id] = None

    def _is_consistent(record):
        if record.correspondent_id == user.name:
            correspondent = user
        else:
            # None both for ids outside the network and for ids whose file
            # was not found: such records are consistent by default.
            correspondent = connections.get(record.correspondent_id)
        if correspondent is None:
            return True
        return record.has_match(correspondent.records)

    def all_user_iter():
        # The current user is yielded separately only when it was not
        # loaded as one of its own correspondents.
        if user.name not in connections:
            yield user
        for u in connections.values():
            if u is not None:
                yield u

    # Filter records and count total number of records before/after
    num_total_records = sum(len(u.records) for u in all_user_iter())
    for u in all_user_iter():
        # A list comprehension (not filter()) so that `records` is a real
        # list on every Python version and len() below keeps working.
        u.records = [r for r in u.records if _is_consistent(r)]
    num_total_records_filtered = sum(len(u.records) for u in all_user_iter())

    # Report non reciprocated records
    num_inconsistent_records = num_total_records - num_total_records_filtered
    if num_inconsistent_records > 0:
        # float() keeps the ratio exact even where `/` is integer division
        # (Python 2 without `from __future__ import division`).
        percent_inconsistent = float(num_inconsistent_records) / num_total_records
        print(warning_str('Warning: {} records ({:.2%}) for all users in the network were not reciprocated. They have been removed.'.format(num_inconsistent_records, percent_inconsistent)))

    # Return the network dictionary sorted by key
    return OrderedDict(sorted(connections.items(), key=lambda t: t[0]))
def load(name, records, antennas, attributes=None, antennas_path=None,
         attributes_path=None, describe=True, warnings=False):
    """
    Creates a new user. This function is used by read_csv, read_orange,
    and read_telenor. If you want to implement your own reader function,
    we advise you to use the load() function.

    When `warnings` is true, `load` outputs warnings on the standard output
    if some records were ignored, or if some records or antennas are
    missing a position.

    Parameters
    ----------
    name : str
        The name of the user. It is stored in User.name and is useful when
        exporting metrics about multiple users.
    records : list
        A list or a generator of Record objects. It is passed through
        ``filter_record``; the kept records are stored in ``User.records``
        and the per-field counts of ignored records in
        ``User.ignored_records``.
    antennas : dict
        A dictionary of the position for each antenna. When it is None,
        ``User.antennas`` and ``User.has_antennas`` are left untouched.
    attributes : dict, optional
        A (key, value) dictionary of attributes for the current user.
    antennas_path : str, optional
        Stored as-is in ``User.antennas_path``.
    attributes_path : str, optional
        Stored as-is in ``User.attributes_path``.
    describe : boolean, default True
        If describe is True, it will print a description of the loaded user
        to the standard output.
    warnings : boolean, default False
        If warnings is false, the function will not output the warnings on
        the standard output.

    Returns
    -------
    User
        The newly created user.

    For instance:

    .. code-block:: python

       >>> records = [Record(...),...]
       >>> antennas = {'A51': (37.245265, 115.803418),...}
       >>> attributes = {'age': 60}
       >>> load("Frodo", records, antennas, attributes)

    will return a new User object.
    """
    user = User()
    user.name = name
    user.antennas_path = antennas_path
    user.attributes_path = attributes_path

    user.records, ignored = filter_record(records)

    if warnings and ignored['all'] != 0:
        print(warning_str("Warning: %d record(s) were removed due to missing or incomplete fields." % ignored['all']))
        # One detail line per field; the 9 spaces align the text under the
        # "Warning: " prefix of the line above.
        for k in ignored.keys():
            if k != 'all':
                print(warning_str(" " * 9 + "%s: %i record(s) with incomplete values" % (k, ignored[k])))

    user.ignored_records = dict(ignored)

    if antennas is not None:
        user.antennas = antennas
        user.has_antennas = True
    if attributes is not None:
        user.attributes = attributes

    percent_missing = percent_records_missing_location(user)
    if warnings and percent_missing > 0:
        print(warning_str("Warning: {0:.2%} of the records are missing a location.".format(percent_missing)))
        if antennas is None:
            print(warning_str(" No antennas file was given and records are using antennas for position"))

    # Computed once and reused for both the test and the message.
    num_antennas_missing = antennas_missing_locations(user)
    if warnings and num_antennas_missing > 0:
        print(warning_str("Warning: %d antenna(s) are missing a location." % num_antennas_missing))

    if describe is True:
        user.describe()

    return user
def all(user, groupby='week', summary='default', network=False,
        split_week=False, split_day=False, attributes=True, flatten=False):
    """
    Returns a dictionary containing all bandicoot indicators for the user,
    as well as reporting variables.

    Relevant indicators are defined in the 'individual', and 'spatial' modules.

    =================================== =======================================================================
    Reporting variables                 Description
    =================================== =======================================================================
    antennas_path                       path of the CSV file containing antennas locations
    attributes_path                     directory where attributes were loaded
    version                             bandicoot version
    groupby                             grouping method (a key of ``DATE_GROUPERS`` such as 'week', or None)
    split_week                          whether or not indicators are also computed for weekday and weekend
    split_day                           whether or not indicators are also computed for day and night
    start_time                          time of the first record
    end_time                            time of the last record
    night_start, night_end              start and end time to define nights
    weekend                             days used to define the weekend (``[6, 7]`` by default, where 1 is Monday)
    bins                                number of groups if the records are grouped
    has_call                            whether or not records include calls
    has_text                            whether or not records include texts
    has_home                            whether or not a :meth:`home location <bandicoot.core.User.recompute_home>` has been found
    has_network                         whether or not correspondents were loaded
    percent_records_missing_location    percentage of records without location
    antennas_missing_locations          number of antennas missing a location
    percent_outofnetwork_calls          percentage of calls, received or emitted, made with a correspondent not loaded in the network
    percent_outofnetwork_texts          percentage of texts with contacts not loaded in the network
    percent_outofnetwork_contacts       percentage of contacts not loaded in the network
    percent_outofnetwork_call_durations percentage of minutes of calls where the contact was not loaded in the network
    number_of_records                   total number of records
    =================================== =======================================================================

    We also include a last set of reporting variables, for the records
    ignored at load-time. Values can be ignored due to missing or
    inconsistent fields (e.g., not including a valid 'datetime' value).

    .. code-block:: python

        {
            'all': 0,
            'interaction': 0,
            'direction': 0,
            'correspondent_id': 0,
            'datetime': 0,
            'call_duration': 0
        }

    with the total number of records ignored (key ``'all'``), as well as the
    number of records with faulty values for each column.

    Parameters
    ----------
    user : User
        The user whose indicators are computed.
    groupby : str or None, default 'week'
        Grouping method; when not None it must be a key of
        ``DATE_GROUPERS``. It is forwarded to every indicator.
    summary : str, default 'default'
        Forwarded to every indicator that accepts it.
    network : boolean, default False
        If true and ``user.has_network`` is true, the network indicators
        are added to the result.
    split_week, split_day : boolean, default False
        Forwarded to every indicator.
    attributes : boolean, default True
        If true and ``user.attributes`` is not empty, the attributes are
        added under the key ``'attributes'``.
    flatten : boolean, default False
        If True, the result is passed through the module-level ``flatten``
        function before being returned.

    Returns
    -------
    OrderedDict
        ``'name'``, ``'reporting'``, then one entry per indicator keyed by
        the indicator function's ``__name__`` (or whatever ``flatten``
        returns when `flatten` is True).
    """
    # Warn the user if every record falls into a single group. The message
    # names the actual grouping instead of always saying "week".
    if groupby is not None:
        if len(set(DATE_GROUPERS[groupby](r.datetime) for r in user.records)) <= 1:
            print(warning_str('Grouping by {0}, but all data is from the same {0}!'.format(groupby)))

    # NOTE(review): the distribution datatypes are selected only for
    # groupby == 'week', although other DATE_GROUPERS keys are accepted
    # above - confirm that this is intended.
    scalar_type = 'distribution_scalar' if groupby == 'week' else 'scalar'
    summary_type = 'distribution_summarystats' if groupby == 'week' else 'summarystats'

    # partial objects have no __name__; one is set explicitly because the
    # result dictionary is keyed by fun.__name__.
    number_of_interactions_in = partial(bc.individual.number_of_interactions, direction='in')
    number_of_interactions_in.__name__ = 'number_of_interaction_in'
    number_of_interactions_out = partial(bc.individual.number_of_interactions, direction='out')
    number_of_interactions_out.__name__ = 'number_of_interaction_out'

    functions = [
        (bc.individual.active_days, scalar_type),
        (bc.individual.number_of_contacts, scalar_type),
        (bc.individual.call_duration, summary_type),
        (bc.individual.percent_nocturnal, scalar_type),
        (bc.individual.percent_initiated_conversations, scalar_type),
        (bc.individual.percent_initiated_interactions, scalar_type),
        (bc.individual.response_delay_text, summary_type),
        (bc.individual.response_rate_text, scalar_type),
        (bc.individual.entropy_of_contacts, scalar_type),
        (bc.individual.balance_of_contacts, summary_type),
        (bc.individual.interactions_per_contact, summary_type),
        (bc.individual.interevent_time, summary_type),
        (bc.individual.percent_pareto_interactions, scalar_type),
        (bc.individual.percent_pareto_durations, scalar_type),
        (bc.individual.number_of_interactions, scalar_type),
        (number_of_interactions_in, scalar_type),
        (number_of_interactions_out, scalar_type),
        (bc.spatial.number_of_antennas, scalar_type),
        (bc.spatial.entropy_of_antennas, scalar_type),
        (bc.spatial.percent_at_home, scalar_type),
        (bc.spatial.radius_of_gyration, scalar_type),
        (bc.spatial.frequent_antennas, scalar_type),
        (bc.spatial.churn_rate, scalar_type)
    ]

    network_functions = [
        bc.network.clustering_coefficient_unweighted,
        bc.network.clustering_coefficient_weighted,
        bc.network.assortativity_attributes,
        bc.network.assortativity_indicators
    ]

    # Only the number of groups is reported ('bins').
    groups = [[r for r in g] for g in group_records(user, groupby=groupby)]

    reporting = OrderedDict([
        ('antennas_path', user.antennas_path),
        ('attributes_path', user.attributes_path),
        ('version', bc.__version__),
        ('groupby', groupby),
        ('split_week', split_week),
        ('split_day', split_day),
        # `x and str(x)` keeps a falsy value (e.g. None) as-is
        ('start_time', user.start_time and str(user.start_time)),
        ('end_time', user.end_time and str(user.end_time)),
        ('night_start', str(user.night_start)),
        ('night_end', str(user.night_end)),
        ('weekend', user.weekend),
        ('bins', len(groups)),
        ('has_call', user.has_call),
        ('has_text', user.has_text),
        ('has_home', user.has_home),
        ('has_network', user.has_network),
        ('percent_records_missing_location', bc.helper.tools.percent_records_missing_location(user)),
        ('antennas_missing_locations', bc.helper.tools.antennas_missing_locations(user)),
        ('percent_outofnetwork_calls', user.percent_outofnetwork_calls),
        ('percent_outofnetwork_texts', user.percent_outofnetwork_texts),
        ('percent_outofnetwork_contacts', user.percent_outofnetwork_contacts),
        ('percent_outofnetwork_call_durations', user.percent_outofnetwork_call_durations),
    ])

    if user.records is not None:
        reporting['number_of_records'] = len(user.records)
    else:
        # Float zero kept as in the original output.
        reporting['number_of_records'] = 0.

    if user.ignored_records is not None:
        reporting['ignored_records'] = user.ignored_records

    returned = OrderedDict([
        ('name', user.name),
        ('reporting', reporting)
    ])

    for fun, datatype in functions:
        try:
            metric = fun(user, groupby=groupby, summary=summary, datatype=datatype, split_week=split_week, split_day=split_day)
        except ValueError:
            # Retry without `summary`; presumably some indicators reject
            # that argument - TODO confirm.
            metric = fun(user, groupby=groupby, datatype=datatype, split_week=split_week, split_day=split_day)

        returned[fun.__name__] = metric

    if network and user.has_network:
        for fun in network_functions:
            returned[fun.__name__] = fun(user)

    if attributes and user.attributes != {}:
        returned['attributes'] = user.attributes

    if flatten is True:
        # The `flatten` parameter shadows the module-level function of the
        # same name, hence the lookup through globals().
        return globals()['flatten'](returned)

    return returned
def read_telenor(incoming_cdr, outgoing_cdr, cell_towers, describe=True,
                 warnings=True):
    """
    Load user records from a CSV file in *telenor* format, which is only
    applicable for call records.

    .. note:: read_telenor has been deprecated in bandicoot 0.4.

    Arguments
    ---------
    incoming_cdr : str
        Path to the CSV file containing incoming records, using the
        following scheme: ::

             B_PARTY,A_PARTY,DURATION,B_CELL,CALL_DATE,CALL_TIME,CALL_TYPE

    outgoing_cdr : str
        Path to the CSV file containing outgoing records, using the
        following scheme: ::

             A_NUMBER,B_NUMBER,DURATION,B_CELL,CALL_DATE,CALL_TIME,CALL_TYPE

    cell_towers : str
        Path to the CSV file containing the positions of all cell towers.
        The columns ``CELLID_HEX``, ``LONGITUDE`` and ``LATITUDE`` are
        read; towers with an empty longitude or latitude are skipped.

    describe : boolean
        If describe is True, it will print a description of the loaded user
        to the standard output.

    warnings : boolean
        Forwarded to ``load``; if it is false, ``load`` does not output its
        warnings on the standard output.

    Returns
    -------
    User
        The user returned by ``load``, named after `incoming_cdr`.

    Raises
    ------
    NotImplementedError
        If a record has a ``CALL_TYPE`` other than ``'MOC'`` or ``'MTC'``.
    """
    print(warning_str("read_telenor has been deprecated in bandicoot 0.4."))

    import itertools
    import csv

    def parse_direction(code):
        if code == 'MOC':
            return 'out'
        elif code == 'MTC':
            return 'in'
        else:
            raise NotImplementedError

    cells = {}
    # Files are opened in binary mode, as the Python 2 csv module expects.
    with open(cell_towers, 'rb') as f:
        for line in csv.DictReader(f):
            if line['LONGITUDE'] != '' and line['LATITUDE'] != '':
                # NOTE(review): the tuple is (longitude, latitude), while
                # the example in load()'s docstring looks like
                # (latitude, longitude) - confirm the expected order.
                lon_lat = (float(line['LONGITUDE']), float(line['LATITUDE']))
                cells[line['CELLID_HEX']] = lon_lat

    def parse_record(raw):
        direction = parse_direction(raw['CALL_TYPE'].strip())

        if direction == 'in':
            contact = raw.get('A_PARTY', raw.get('A_NUMBER'))
            cell_id = raw['B_CELL']
        else:
            contact = raw.get('B_PARTY', raw.get('B_NUMBER'))
            cell_id = raw['A_CELL']

        # Unknown cells get a position without location (cells.get -> None)
        position = Position(antenna=cell_id, location=cells.get(cell_id))

        _date_str = raw.get('CDATE', raw.get('CALL_DATE'))
        _time_str = raw.get('CTIME', raw.get('CALL_TIME'))
        _datetime = datetime.strptime(_date_str + _time_str,
                                      "%Y%m%d%H:%M:%S")

        return Record(interaction='call',
                      direction=direction,
                      correspondent_id=contact,
                      call_duration=float(raw['DURATION'].strip()),
                      datetime=_datetime,
                      position=position)

    # Parse eagerly while each file is still open; a lazy map() would read
    # from an already-closed file.
    with open(incoming_cdr, 'rb') as f_in:
        incoming_ = [parse_record(raw) for raw in csv.DictReader(f_in)]

    with open(outgoing_cdr, 'rb') as f_out:
        outgoing_ = [parse_record(raw) for raw in csv.DictReader(f_out)]

    records = itertools.chain(incoming_, outgoing_)

    name = incoming_cdr

    # load() returns the user only (not a tuple). It is asked not to
    # describe so that the description is printed once, below.
    user = load(name, records, cells, warnings=warnings, describe=False)

    if describe:
        user.describe()

    return user
def all(user, groupby='week', summary='default', split_week=False,
        split_day=False, attributes=True, flatten=False):
    """
    Compute every bandicoot indicator for `user` and return them in one
    dictionary, together with a set of reporting variables.

    The ``'reporting'`` entry contains:

    * the paths of the antennas and attributes files,
    * the current version of bandicoot,
    * the *groupby* method (``'week'`` or ``None``) and the day/night,
      weekday/weekend filters,
    * the date and time of the first and last records,
    * the hours delimiting the night, and the weekend days,
    * the number of bins produced by the grouping,
    * the binary properties ``has_call``, ``has_text``, ``has_home``,
    * the percentage of records missing a location, and the number of
      antennas missing a location,
    * the out-of-network percentages (calls, texts, contacts and call
      durations),
    * the total number of records for the user.

    When available, the records ignored at load-time are reported too,
    under ``'ignored_records'``:

    .. code-block:: python

        {
            'all': 0,
            'interaction': 0,
            'direction': 0,
            'correspondent_id': 0,
            'datetime': 0,
            'call_duration': 0
        }

    where ``'all'`` is the total number of ignored records and the other
    keys count the records with a faulty value for that column.
    """
    weekly = groupby == 'week'

    # Warn the user when a weekly grouping would yield a single week
    if weekly:
        distinct_weeks = {r.datetime.isocalendar()[:2] for r in user.records}
        if len(distinct_weeks) <= 1:
            print(warning_str('Grouping by week, but all data is from the same week!'))

    scalar_type = 'distribution_scalar' if weekly else 'scalar'
    summary_type = 'distribution_summarystats' if weekly else 'summarystats'

    def _directed_interactions(direction):
        # partial objects carry no __name__; the result is keyed by it.
        counter = partial(bc.individual.number_of_interactions, direction=direction)
        counter.__name__ = 'number_of_interaction_' + direction
        return counter

    indicators = [
        (bc.individual.active_days, scalar_type),
        (bc.individual.number_of_contacts, scalar_type),
        (bc.individual.call_duration, summary_type),
        (bc.individual.percent_nocturnal, scalar_type),
        (bc.individual.percent_initiated_conversations, scalar_type),
        (bc.individual.percent_initiated_interactions, scalar_type),
        (bc.individual.response_delay_text, summary_type),
        (bc.individual.response_rate_text, scalar_type),
        (bc.individual.entropy_of_contacts, scalar_type),
        (bc.individual.balance_of_contacts, summary_type),
        (bc.individual.interactions_per_contact, summary_type),
        (bc.individual.interevent_time, summary_type),
        (bc.individual.percent_pareto_interactions, scalar_type),
        (bc.individual.percent_pareto_durations, scalar_type),
        (bc.individual.number_of_interactions, scalar_type),
        (_directed_interactions('in'), scalar_type),
        (_directed_interactions('out'), scalar_type),
        (bc.spatial.number_of_antennas, scalar_type),
        (bc.spatial.entropy_of_antennas, scalar_type),
        (bc.spatial.percent_at_home, scalar_type),
        (bc.spatial.radius_of_gyration, scalar_type),
        (bc.spatial.frequent_antennas, scalar_type),
    ]

    # Only the number of groups ends up in the report.
    bins = [list(group) for group in group_records(user, groupby=groupby)]

    reporting = OrderedDict()
    reporting['antennas_path'] = user.antennas_path
    reporting['attributes_path'] = user.attributes_path
    reporting['version'] = bc.__version__
    reporting['groupby'] = groupby
    reporting['split_week'] = split_week
    reporting['split_day'] = split_day
    # `x and str(x)` leaves a falsy value (e.g. None) untouched
    reporting['start_time'] = user.start_time and str(user.start_time)
    reporting['end_time'] = user.end_time and str(user.end_time)
    reporting['night_start'] = str(user.night_start)
    reporting['night_end'] = str(user.night_end)
    reporting['weekend'] = user.weekend
    reporting['bins'] = len(bins)
    reporting['has_call'] = user.has_call
    reporting['has_text'] = user.has_text
    reporting['has_home'] = user.has_home
    reporting['percent_records_missing_location'] = bc.helper.tools.percent_records_missing_location(user)
    reporting['antennas_missing_locations'] = bc.helper.tools.antennas_missing_locations(user)
    reporting['percent_outofnetwork_calls'] = user.percent_outofnetwork_calls
    reporting['percent_outofnetwork_texts'] = user.percent_outofnetwork_texts
    reporting['percent_outofnetwork_contacts'] = user.percent_outofnetwork_contacts
    reporting['percent_outofnetwork_call_durations'] = user.percent_outofnetwork_call_durations
    reporting['number_of_records'] = len(user.records) if user.records is not None else 0.

    if user.ignored_records is not None:
        reporting['ignored_records'] = user.ignored_records

    returned = OrderedDict()
    returned['name'] = user.name
    returned['reporting'] = reporting

    shared_kwargs = dict(groupby=groupby, split_week=split_week, split_day=split_day)
    for indicator, datatype in indicators:
        try:
            value = indicator(user, summary=summary, datatype=datatype, **shared_kwargs)
        except ValueError:
            # Same call again, this time without `summary`.
            value = indicator(user, datatype=datatype, **shared_kwargs)
        returned[indicator.__name__] = value

    if attributes and user.attributes != {}:
        returned['attributes'] = user.attributes

    if flatten is True:
        # The parameter hides the module-level flatten(); fetch the
        # function through globals().
        flatten_function = globals()['flatten']
        return flatten_function(returned)

    return returned
def load(name, records, antennas, attributes=None, antennas_path=None,
         attributes_path=None, describe=True, warnings=False):
    """
    Creates a new user. This function is used by read_csv, read_orange,
    and read_telenor. If you want to implement your own reader function,
    we advise you to use the load() function.

    When `warnings` is true, `load` outputs warnings on the standard output
    if some records were ignored, or if some records or antennas are
    missing a position.

    Parameters
    ----------
    name : str
        The name of the user. It is stored in User.name and is useful when
        exporting metrics about multiple users.
    records : list
        A list or a generator of Record objects. It is passed through
        ``filter_record``; the kept records are stored in ``User.records``
        and the per-field counts of ignored records in
        ``User.ignored_records``.
    antennas : dict
        A dictionary of the position for each antenna. When it is None,
        ``User.antennas`` and ``User.has_antennas`` are left untouched.
    attributes : dict, optional
        A (key, value) dictionary of attributes for the current user.
    antennas_path : str, optional
        Stored as-is in ``User.antennas_path``.
    attributes_path : str, optional
        Stored as-is in ``User.attributes_path``.
    describe : boolean, default True
        If describe is True, it will print a description of the loaded user
        to the standard output.
    warnings : boolean, default False
        If warnings is false, the function will not output the warnings on
        the standard output.

    Returns
    -------
    User
        The newly created user.

    For instance:

    .. code-block:: python

       >>> records = [Record(...),...]
       >>> antennas = {'A51': (37.245265, 115.803418),...}
       >>> attributes = {'age': 60}
       >>> load("Frodo", records, antennas, attributes)

    will return a new User object.
    """
    user = User()
    user.name = name
    user.antennas_path = antennas_path
    user.attributes_path = attributes_path

    user.records, ignored = filter_record(records)

    if warnings and ignored['all'] != 0:
        print(warning_str("Warning: %d record(s) were removed due to missing or incomplete fields." % ignored['all']))
        # One detail line per field; the 9 spaces align the text under the
        # "Warning: " prefix of the line above.
        for k in ignored.keys():
            if k != 'all':
                print(warning_str(" " * 9 + "%s: %i record(s) with incomplete values" % (k, ignored[k])))

    user.ignored_records = dict(ignored)

    if antennas is not None:
        user.antennas = antennas
        user.has_antennas = True
    if attributes is not None:
        user.attributes = attributes

    percent_missing = percent_records_missing_location(user)
    if warnings and percent_missing > 0:
        print(warning_str("Warning: {0:.2%} of the records are missing a location.".format(percent_missing)))
        if antennas is None:
            print(warning_str(" No antennas file was given and records are using antennas for position"))

    # Computed once and reused for both the test and the message.
    num_antennas_missing = antennas_missing_locations(user)
    if warnings and num_antennas_missing > 0:
        print(warning_str("Warning: %d antenna(s) are missing a location." % num_antennas_missing))

    if describe is True:
        user.describe()

    return user