Esempio n. 1
0
def _read_network(user, records_path, attributes_path, read_function, antennas_path=None, extension=".csv"):
    connections = {}
    correspondents = Counter([record.correspondent_id for record in user.records])

    num_records_error = 0

    # Try to load all the possible correspondent files
    for c_id, count in sorted(correspondents.items()):
        correspondent_file = os.path.join(records_path, c_id + extension)
        if os.path.exists(correspondent_file):
            correspondent_user = read_function(c_id, records_path, antennas_path, attributes_path, describe=False, network=False)

            # Look for matching record in the correspondent
            for our_record in filter(lambda x: x.correspondent_id == c_id, user.records):
                for their_record in correspondent_user.records:
                    # See if they match
                    if _record_match(our_record, their_record):
                        break
                else:
                    num_records_error += 1

        else:
            correspondent_user = None

        connections[c_id] = correspondent_user

    if len(user.records) > 0 and num_records_error > 0:
        percent_inconsistant = num_records_error / len(user.records)
        print warning_str('Warning: {} records of the current user are not reciprocated ({:.2%}).'.format(num_records_error, percent_inconsistant))

    # Return the network dictionary sorted by key
    return OrderedDict(sorted(connections.items(), key=lambda t: t[0]))
Esempio n. 2
0
def _read_network(user, records_path, attributes_path, read_function, antennas_path=None, extension=".csv"):
    connections = {}
    correspondents = Counter([record.correspondent_id for record in user.records])

    # Try to load all the possible correspondent files
    for c_id, count in sorted(correspondents.items()):
        correspondent_file = os.path.join(records_path, c_id + extension)
        if os.path.exists(correspondent_file):
            connections[c_id] = read_function(c_id, records_path, antennas_path, attributes_path, describe=False, network=False, warnings=False)
        else:
            connections[c_id] = None

    def _is_consistent(record):
        if record.correspondent_id == user.name:
            correspondent = user
        elif record.correspondent_id in connections:
            correspondent = connections[record.correspondent_id]
        else:
            return True  # consistent by default

        return True if correspondent is None else record.has_match(correspondent.records)

    def all_user_iter():
        if user.name not in connections:
            yield user

        for u in connections.values():
            if u is not None:
                yield u

    # Filter records and count total number of records before/after
    num_total_records = sum(len(u.records) for u in all_user_iter())
    for u in all_user_iter():
        u.records = filter(_is_consistent, u.records)
    num_total_records_filtered = sum(len(u.records) for u in all_user_iter())

    # Report non reciprocated records
    num_inconsistent_records = num_total_records - num_total_records_filtered
    if num_inconsistent_records > 0:
        percent_inconsistent = num_inconsistent_records / num_total_records
        print warning_str('Warning: {} records ({:.2%}) for all users in the network were not reciprocated. They have been removed.'.format(num_inconsistent_records, percent_inconsistent))

    # Return the network dictionary sorted by key
    return OrderedDict(sorted(connections.items(), key=lambda t: t[0]))
Esempio n. 3
0
def load(name, records, antennas, attributes=None, antennas_path=None,
         attributes_path=None, describe=True, warnings=False):
    """
    Creates a new user. This function is used by read_csv, read_orange,
    and read_telenor. If you want to implement your own reader function, we advise you to use the load() function

    `load` will output warnings on the standard output if some records or
    antennas are missing a position.

    Parameters
    ----------

    name : str
        The name of the user. It is stored in User.name and is useful when
        exporting metrics about multiple users.

    records: list
        A list or a generator of Record objects.

    antennas : dict
        A dictionary of the position for each antenna.

    attributes : dict
        A (key,value) dictionary of attributes for the current user

    describe : boolean
        If describe is True, it will print a description of the loaded user
        to the standard output.

    warnings : boolean, default True
        If warnings is equal to False, the function will not output the
        warnings on the standard output.


    For instance:

    .. code-block:: python

       >>> records = [Record(...),...]
       >>> antennas = {'A51': (37.245265, 115.803418),...}
       >>> attributes = {'age': 60}
       >>> load("Frodo", records, antennas, attributes)

    will returns a new User object.


    """

    user = User()
    user.name = name
    user.antennas_path = antennas_path
    user.attributes_path = attributes_path

    user.records, ignored = filter_record(records)

    if ignored['all'] != 0:
        if warnings:
            print warning_str("Warning: %d record(s) were removed due to missing or incomplete fields." % ignored['all'])
        for k in ignored.keys():
            if k != 'all' and warnings:
                print warning_str(" " * 9 + "%s: %i record(s) with incomplete values" % (k, ignored[k]))

    user.ignored_records = dict(ignored)

    if antennas is not None:
        user.antennas = antennas
        user.has_antennas = True
    if attributes is not None:
        user.attributes = attributes

    percent_missing = percent_records_missing_location(user)
    if percent_missing > 0:
        if warnings:
            print warning_str("Warning: {0:.2%} of the records are missing a location.".format(percent_missing))
        if antennas is None and warnings:
            print warning_str("         No antennas file was given and records are using antennas for position")

    if antennas_missing_locations(user) > 0:
        if warnings:
            print warning_str("Warning: %d antenna(s) are missing a location." % antennas_missing_locations(user))

    if describe is True:
        user.describe()

    return user
Esempio n. 4
0
def all(user, groupby='week', summary='default', network=False, split_week=False, split_day=False, attributes=True, flatten=False):
    """
    Returns a dictionary containing all bandicoot indicators for the user,
    as well as reporting variables.

    Relevant indicators are defined in the 'individual', and 'spatial' modules.

    =================================== =======================================================================
    Reporting variables                 Description
    =================================== =======================================================================
    antennas_path                       path of the CSV file containing antennas locations
    attributes_path                     directory where attributes were loaded
    version                             bandicoot version
    groupby                             grouping method ('week' or None)
    split_week                          whether or not indicators are also computed for weekday and weekend
    split_day                           whether or not indicators are also computed for day and night
    start_time                          time of the first record
    end_time                            time of the last record
    night_start, night_end              start and end time to define nights
    weekend                             days used to define the weekend (``[6, 7]`` by default, where 1 is Monday)
    bins                                number of weeks if the record are grouped
    has_call                            whether or not records include calls
    has_text                            whether or not records include texts
    has_home                            whether or not a :meth:`home location <bandicoot.core.User.recompute_home>` has been found
    has_network                         whether or not correspondents where loaded
    percent_records_missing_location    percentage of records without location
    antennas_missing_locations          number of antennas missing a location
    percent_outofnetwork_calls          percentage of calls, received or emitted, made with a correspondant not loaded in the network
    percent_outofnetwork_texts          percentage of texts with contacts not loaded in the network
    percent_outofnetwork_contacts       percentage of contacts not loaded in the network
    percent_outofnetwork_call_durations percentage of minutes of calls where the contact was not loaded in the network
    number_of_records                   total number of records
    =================================== =======================================================================

    We also include a last set of reporting variables, for the records ignored
    at load-time. Values can be ignored due to missing or inconsistent fields  
    (e.g., not including a valid 'datetime' value).  

    .. code-block:: python

        {
            'all': 0,
            'interaction': 0,
            'direction': 0,
            'correspondent_id': 0,
            'datetime': 0,
            'call_duration': 0
        }

    with the total number of records ignored (key ``'all'``), as well as the
    number of records with faulty values for each columns.
    """

    # Warn the user if they are selecting weekly and there's only one week
    if groupby is not None:
        if len(set(DATE_GROUPERS[groupby](r.datetime) for r in user.records)) <= 1:
            print warning_str('Grouping by week, but all data is from the same week!')
    scalar_type = 'distribution_scalar' if groupby == 'week' else 'scalar'
    summary_type = 'distribution_summarystats' if groupby == 'week' else 'summarystats'

    number_of_interactions_in = partial(bc.individual.number_of_interactions, direction='in')
    number_of_interactions_in.__name__ = 'number_of_interaction_in'
    number_of_interactions_out = partial(bc.individual.number_of_interactions, direction='out')
    number_of_interactions_out.__name__ = 'number_of_interaction_out'

    functions = [
        (bc.individual.active_days, scalar_type),
        (bc.individual.number_of_contacts, scalar_type),
        (bc.individual.call_duration, summary_type),
        (bc.individual.percent_nocturnal, scalar_type),
        (bc.individual.percent_initiated_conversations, scalar_type),
        (bc.individual.percent_initiated_interactions, scalar_type),
        (bc.individual.response_delay_text, summary_type),
        (bc.individual.response_rate_text, scalar_type),
        (bc.individual.entropy_of_contacts, scalar_type),
        (bc.individual.balance_of_contacts, summary_type),
        (bc.individual.interactions_per_contact, summary_type),
        (bc.individual.interevent_time, summary_type),
        (bc.individual.percent_pareto_interactions, scalar_type),
        (bc.individual.percent_pareto_durations, scalar_type),
        (bc.individual.number_of_interactions, scalar_type),
        (number_of_interactions_in, scalar_type),
        (number_of_interactions_out, scalar_type),
        (bc.spatial.number_of_antennas, scalar_type),
        (bc.spatial.entropy_of_antennas, scalar_type),
        (bc.spatial.percent_at_home, scalar_type),
        (bc.spatial.radius_of_gyration, scalar_type),
        (bc.spatial.frequent_antennas, scalar_type),
        (bc.spatial.churn_rate, scalar_type)
    ]

    network_functions = [
        bc.network.clustering_coefficient_unweighted,
        bc.network.clustering_coefficient_weighted,
        bc.network.assortativity_attributes,
        bc.network.assortativity_indicators
    ]

    groups = [[r for r in g] for g in group_records(user, groupby=groupby)]

    reporting = OrderedDict([
        ('antennas_path', user.antennas_path),
        ('attributes_path', user.attributes_path),
        ('version', bc.__version__),
        ('groupby', groupby),
        ('split_week', split_week),
        ('split_day', split_day),
        ('start_time', user.start_time and str(user.start_time)),
        ('end_time', user.end_time and str(user.end_time)),
        ('night_start', str(user.night_start)),
        ('night_end', str(user.night_end)),
        ('weekend', user.weekend),
        ('bins', len(groups)),
        ('has_call', user.has_call),
        ('has_text', user.has_text),
        ('has_home', user.has_home),
        ('has_network', user.has_network),
        ('percent_records_missing_location', bc.helper.tools.percent_records_missing_location(user)),
        ('antennas_missing_locations', bc.helper.tools.antennas_missing_locations(user)),
        ('percent_outofnetwork_calls', user.percent_outofnetwork_calls),
        ('percent_outofnetwork_texts', user.percent_outofnetwork_texts),
        ('percent_outofnetwork_contacts', user.percent_outofnetwork_contacts),
        ('percent_outofnetwork_call_durations', user.percent_outofnetwork_call_durations),
    ])

    if user.records is not None:
        reporting['number_of_records'] = len(user.records)
    else:
        reporting['number_of_records'] = 0.

    if user.ignored_records is not None:
        reporting['ignored_records'] = user.ignored_records

    returned = OrderedDict([
        ('name', user.name),
        ('reporting', reporting)
    ])

    for fun, datatype in functions:
        try:
            metric = fun(user, groupby=groupby, summary=summary, datatype=datatype, split_week=split_week, split_day=split_day)
        except ValueError:
            metric = fun(user, groupby=groupby, datatype=datatype, split_week=split_week, split_day=split_day)

        returned[fun.__name__] = metric

    if network and user.has_network:
        for fun in network_functions:
            returned[fun.__name__] = fun(user)

    if attributes and user.attributes != {}:
        returned['attributes'] = user.attributes

    if flatten is True:
        return globals()['flatten'](returned)

    return returned
Esempio n. 5
0
def read_telenor(incoming_cdr, outgoing_cdr, cell_towers, describe=True, warnings=True):
    """
    Load user records from a CSV file in *telenor* format, which is only applicable for call records.

        .. note:: read_telenor has been deprecated in bandicoot 0.4.


    Arguments
    ---------

    incoming_cdr: str
        Path to the CSV file containing incoming records, using the following
        scheme: ::

             B_PARTY,A_PARTY,DURATION,B_CELL,CALL_DATE,CALL_TIME,CALL_TYPE

    outgoing_cdr: str
        Path to the CSV file containing outgoing records, using the following
        scheme: ::

             A_NUMBER,B_NUMBER,DURATION,B_CELL,CALL_DATE,CALL_TIME,CALL_TYPE

    cell_towers: str
        Path to the CSV file containing the positions of all

    describe : boolean
        If describe is True, it will print a description of the loaded user to the standard output.

    """

    print warning_str("read_telenor has been deprecated in bandicoot 0.4.")

    import itertools
    import csv

    def parse_direction(code):
        if code == 'MOC':
            return 'out'
        elif code == 'MTC':
            return 'in'
        else:
            raise NotImplementedError

    cells = None
    with open(cell_towers, 'rb') as f:
        cell_towers_list = csv.DictReader(f)
        cells = {}
        for line in cell_towers_list:
            if line['LONGITUDE'] != '' and line['LATITUDE'] != '':
                latlon = (float(line['LONGITUDE']), float(line['LATITUDE']))
                cell_id = line['CELLID_HEX']
                cells[cell_id] = latlon

    def parse_record(raw):
        direction = parse_direction(raw['CALL_TYPE'].strip())

        if direction == 'in':
            contact = raw.get('A_PARTY', raw.get('A_NUMBER'))
            cell_id = raw['B_CELL']
        else:
            contact = raw.get('B_PARTY', raw.get('B_NUMBER'))
            cell_id = raw['A_CELL']

        position = Position(antenna=cell_id, location=cells.get(cell_id))

        _date_str = raw.get('CDATE', raw.get('CALL_DATE'))
        _time_str = raw.get('CTIME', raw.get('CALL_TIME'))
        _datetime = datetime.strptime(_date_str + _time_str,
                                      "%Y%m%d%H:%M:%S")

        r = Record(interaction='call',
                   direction=direction,
                   correspondent_id=contact,
                   call_duration=float(raw['DURATION'].strip()),
                   datetime=_datetime,
                   position=position)

        return r

    with open(incoming_cdr, 'rb') as f_in:
        incoming_ = map(parse_record, csv.DictReader(f_in))

        with open(outgoing_cdr, 'rb') as f:
            outgoing_ = map(parse_record, csv.DictReader(f))

            records = itertools.chain(incoming_, outgoing_)

    name = incoming_cdr

    user, errors = load(name, records, cells, warnings=None, describe=False)

    if describe:
        user.describe()

    return user
Esempio n. 6
0
def all(user, groupby='week', summary='default', split_week=False, split_day=False, attributes=True, flatten=False):
    """
    Returns a dictionary containing all bandicoot indicators for the user,
    as well as reporting variables.

    The reporting variables include:

    * the path of files containting the antennas and attributes,
    * the current version of bandicoot,
    * the *groupby* method (``'week'`` or ``None``) and the day/night, weekday/weekend filters,
    * the date and time for the first and last records,
    * the range of hours used to detect interactions at night, and the weekend range,
    * the number of bins if the records are grouped weekly,
    * the binary properties ``has_call``, ``has_text``, ``has_home``,
    * the percentage of records missing antennas, and antennas missing (lat, lon) locations,
    * the percentage of contacts not in the network, as well as interactions (for calls, texts, and call durations),
    * the total number of records for the user

    We also include a last set of reporting variables, for the records ignored
    at the loading, due to faulty or incorrect values:

    .. code-block:: python

        {
            'all': 0,
            'interaction': 0,
            'direction': 0,
            'correspondent_id': 0,
            'datetime': 0,
            'call_duration': 0
        }

    with the total number of records ignored (key ``'all'``), as well as the
    number of records with faulty values for each columns.
    """

    # Warn the user if they are selecting weekly and there's only one week
    if groupby == 'week':
        if len(set(r.datetime.isocalendar()[:2] for r in user.records)) <= 1:
            print warning_str('Grouping by week, but all data is from the same week!')

    scalar_type = 'distribution_scalar' if groupby == 'week' else 'scalar'
    summary_type = 'distribution_summarystats' if groupby == 'week' else 'summarystats'

    number_of_interactions_in = partial(bc.individual.number_of_interactions, direction='in')
    number_of_interactions_in.__name__ = 'number_of_interaction_in'
    number_of_interactions_out = partial(bc.individual.number_of_interactions, direction='out')
    number_of_interactions_out.__name__ = 'number_of_interaction_out'

    functions = [
        (bc.individual.active_days, scalar_type),
        (bc.individual.number_of_contacts, scalar_type),
        (bc.individual.call_duration, summary_type),
        (bc.individual.percent_nocturnal, scalar_type),
        (bc.individual.percent_initiated_conversations, scalar_type),
        (bc.individual.percent_initiated_interactions, scalar_type),
        (bc.individual.response_delay_text, summary_type),
        (bc.individual.response_rate_text, scalar_type),
        (bc.individual.entropy_of_contacts, scalar_type),
        (bc.individual.balance_of_contacts, summary_type),
        (bc.individual.interactions_per_contact, summary_type),
        (bc.individual.interevent_time, summary_type),
        (bc.individual.percent_pareto_interactions, scalar_type),
        (bc.individual.percent_pareto_durations, scalar_type),
        (bc.individual.number_of_interactions, scalar_type),
        (number_of_interactions_in, scalar_type),
        (number_of_interactions_out, scalar_type),
        (bc.spatial.number_of_antennas, scalar_type),
        (bc.spatial.entropy_of_antennas, scalar_type),
        (bc.spatial.percent_at_home, scalar_type),
        (bc.spatial.radius_of_gyration, scalar_type),
        (bc.spatial.frequent_antennas, scalar_type)
    ]

    groups = [[r for r in g] for g in group_records(user, groupby=groupby)]

    reporting = OrderedDict([
        ('antennas_path', user.antennas_path),
        ('attributes_path', user.attributes_path),
        ('version', bc.__version__),
        ('groupby', groupby),
        ('split_week', split_week),
        ('split_day', split_day),
        ('start_time', user.start_time and str(user.start_time)),
        ('end_time', user.end_time and str(user.end_time)),
        ('night_start', str(user.night_start)),
        ('night_end', str(user.night_end)),
        ('weekend', user.weekend),
        ('bins', len(groups)),
        ('has_call', user.has_call),
        ('has_text', user.has_text),
        ('has_home', user.has_home),
        ('percent_records_missing_location', bc.helper.tools.percent_records_missing_location(user)),
        ('antennas_missing_locations', bc.helper.tools.antennas_missing_locations(user)),
        ('percent_outofnetwork_calls', user.percent_outofnetwork_calls),
        ('percent_outofnetwork_texts', user.percent_outofnetwork_texts),
        ('percent_outofnetwork_contacts', user.percent_outofnetwork_contacts),
        ('percent_outofnetwork_call_durations', user.percent_outofnetwork_call_durations),
    ])

    if user.records is not None:
        reporting['number_of_records'] = len(user.records)
    else:
        reporting['number_of_records'] = 0.

    if user.ignored_records is not None:
        reporting['ignored_records'] = user.ignored_records

    returned = OrderedDict([
        ('name', user.name),
        ('reporting', reporting)
    ])

    for fun, datatype in functions:
        try:
            metric = fun(user, groupby=groupby, summary=summary, datatype=datatype, split_week=split_week, split_day=split_day)
        except ValueError:
            metric = fun(user, groupby=groupby, datatype=datatype, split_week=split_week, split_day=split_day)

        returned[fun.__name__] = metric

    if attributes and user.attributes != {}:
        returned['attributes'] = user.attributes

    if flatten is True:
        return globals()['flatten'](returned)

    return returned
Esempio n. 7
0
def load(name,
         records,
         antennas,
         attributes=None,
         antennas_path=None,
         attributes_path=None,
         describe=True,
         warnings=False):
    """
    Creates a new user. This function is used by read_csv, read_orange,
    and read_telenor. If you want to implement your own reader function, we advise you to use the load() function

    `load` will output warnings on the standard output if some records or
    antennas are missing a position.
    
    
    Parameters
    ----------

    name : str
        The name of the user. It is stored in User.name and is useful when
        exporting metrics about multiple users.

    records: list
        A list or a generator of Record objects.

    antennas : dict
        A dictionary of the position for each antenna.

    attributes : dict
        A (key,value) dictionary of attributes for the current user

    describe : boolean
        If describe is True, it will print a description of the loaded user to the
        standard output.

    warnings : boolean, default True
        If warnings is equal to False, the function will not output the warnings on
        the standard output.


    For instance:

    .. code-block:: python

       >>> records = [Record(...),...]
       >>> antennas = {'A51': (37.245265, 115.803418),...}
       >>> attributes = {'age': 60}
       >>> load("Frodo", records, antennas, attributes)

    will returns a new User object.


    """

    user = User()
    user.name = name
    user.antennas_path = antennas_path
    user.attributes_path = attributes_path

    user.records, ignored = filter_record(records)

    if ignored['all'] != 0:
        if warnings:
            print warning_str(
                "Warning: %d record(s) were removed due to missing or incomplete fields."
                % ignored['all'])
        for k in ignored.keys():
            if k != 'all' and warnings:
                print warning_str(" " * 9 +
                                  "%s: %i record(s) with incomplete values" %
                                  (k, ignored[k]))

    user.ignored_records = dict(ignored)

    if antennas is not None:
        user.antennas = antennas
        user.has_antennas = True
    if attributes is not None:
        user.attributes = attributes

    percent_missing = percent_records_missing_location(user)
    if percent_missing > 0:
        if warnings:
            print warning_str(
                "Warning: {0:.2%} of the records are missing a location.".
                format(percent_missing))
        if antennas is None and warnings:
            print warning_str(
                "         No antennas file was given and records are using antennas for position"
            )

    if antennas_missing_locations(user) > 0:
        if warnings:
            print warning_str(
                "Warning: %d antenna(s) are missing a location." %
                antennas_missing_locations(user))

    if describe is True:
        user.describe()

    return user