Example #1
0
def arrange_data(data, pop, periods=1):
    """Collect notification rates and derived series into a single dict.

       Works for yearly data as well as for data with a higher frequency.

       Input: data: dict with keys 'years' (time points, each indexed as
                    (year, period)) and 'abs_notif' (absolute notifications,
                    aligned with 'years')
              pop: population, looked up by the year of each time point
              periods: int, number of periods per year (1 for yearly data)

       Output: dict with keys
                 'abs_notif': data['abs_notif'], unchanged
                 'notif': notifications per 100,000 population
                 'rel_change': relative change of 'notif' between successive
                     time points; NaN for the first point and wherever the
                     previous point is not the immediately preceding period
                 'interpolated': result of _interpolate() over the gaps
                     reported by ut._missing()
                 'times', 'years': data['years'], unchanged

    """
    times = data['years']  # each entry is indexed as (year, period)

    # Notification rate per 100,000 inhabitants, using the population of the
    # year each time point belongs to.
    notif = [
        data['abs_notif'][idx] * 10.0**5 / pop[stamp[0]]
        for idx, stamp in enumerate(times)
    ]

    # The first point has no predecessor, hence no relative change.
    rel_change = [float('NaN')]
    for idx in range(1, len(times)):
        earlier, later = times[idx - 1], times[idx]
        # Two points are successive when they are consecutive periods of the
        # same year, or the last period of one year followed by the first
        # period of the next year.
        successive = (
            (earlier[0] == later[0] and earlier[1] == later[1] - 1)
            or (earlier[0] == later[0] - 1
                and earlier[1] == periods
                and later[1] == 1)
        )
        if successive:
            # 1 is added to the denominator to avoid a division by 0.
            change = (notif[idx] - notif[idx - 1]) / float(1 + notif[idx - 1])
        else:
            change = float('NaN')
        rel_change.append(change)

    # Locate the gaps in the series (presumably missing time points and the
    # intervals they form -- see ut._missing) and interpolate across them.
    gaps, gap_intervals = ut._missing(times, periods)
    interpolated = _interpolate(times, notif, gaps, gap_intervals, periods)

    return {
        'abs_notif': data['abs_notif'],
        'notif': notif,
        'rel_change': rel_change,
        'interpolated': interpolated,
        'times': times,
        'years': data['years'],
    }
Example #2
0
def _select_data(first, data, periods=1, outliers=None, threshold=5, plot=False):
    """Cuts the data to have only the data used for projection

       Input: first: int, starting year
              data: dict, the 'data' dictionary of extract_data()
              outliers: list of int, years considered outliers
              periods: int, data frequency: 1 for years, 4 for quarters...
              threshold: int, minimum number of points required for projection
                    (excluding outliers)
              plot: boolean, whether we want to produce a plot

       Output: enough_data: boolean, whether enough data selected
               new_data: dict, containing selected data
               figdata: string for web plotting
    
    """
    new_data = {}   

    # If there are outliers
    if outliers:
        notif_new = []
        abs_notif_new = []
        years_new = []
        changes_new = []
        for i in range(len(data['times'])):
            # rewrite lists excluding outliers
            if data['times'][i] not in outliers:
                years_new.append(data['times'][i])
                notif_new.append(data['notif'][i])
                abs_notif_new.append(data['abs_notif'][i])
                changes_new.append(data['rel_change'][i])
        # re-run interpolation to accomodate for outliers
        missing, missing_int = ut._missing(years_new, periods)
        new_data['interpolated'] = _interpolate(years_new, notif_new,
                missing, missing_int, periods)

    # If no outliers
    else:
        # copy original data
        notif_new = data['notif']
        abs_notif_new = data['abs_notif']
        years_new = data['times']
        changes_new = data['rel_change']
        new_data['interpolated'] = data['interpolated']
    
    # Find index for firs year
    for i in range(len(years_new)):
        if years_new[i] == first:
            first_i = i
            break

    # Cut data lists starting from first year
    new_data['notif'] = notif_new[first_i:]
    new_data['abs_notif'] = abs_notif_new[first_i:]
    new_data['times'] = years_new[first_i:]
    new_data['rel_change'] = changes_new[first_i:]

    # Check if enough data is available
    if len(new_data['notif']) < threshold: 
        enough_data = False
    else:
        enough_data = True
    
    # Plot
    if plot == True:
        plt.clf()
        figdata = _plot_time_series(new_data)
    else:
        figdata = None
        
    return enough_data, new_data, figdata