Example #1
def top_bgp_data(flows):
    "Return top AS of GVB flow recarray in the form of a dictonnary."
    resume = {}
    flows_down = flows.compress(flows.direction == INDEX_VALUES.DOWN)
    flows_down_as = aggregate.aggregate(flows_down, 'asBGP', 'l3Bytes', sum)
    flows_down_as.sort(order='aggregation')
    for i in range(11):
        (resume['name_as_down_%d' % i], resume['vol_as_down_%d' % i]) = flows_down_as[-(i+1)]
    resume['total_other_down_as'] = np.sum(flows_down_as[:-10][:].aggregation)

    flows_down_web = flows_down.compress(flows_down.dscp == INDEX_VALUES.DSCP_WEB)
    flows_down_as_web = aggregate.aggregate(flows_down_web, 'asBGP', 'l3Bytes', sum)
    flows_down_as_web.sort(order='aggregation') 
    for i in range(11):
        (resume['name_as_down_web_%d' % i], resume['vol_as_down_web_%d' % i]) = flows_down_as_web[-(i+1)]
    resume['total_other_as_down_web'] = np.sum(flows_down_as_web[:-10][:].aggregation)

    flows_down_other_stream = flows_down.compress(flows_down.dscp == INDEX_VALUES.DSCP_OTHER_STREAM)
    flows_down_as_other_stream = aggregate.aggregate(flows_down_other_stream, 'asBGP', 'l3Bytes', sum)
    flows_down_as_other_stream.sort(order='aggregation')
    for i in range(11):
        (resume['name_as_down_other_stream_%d' % i], resume['vol_as_down_other_stream_%d' % i]) \
            = flows_down_as_other_stream[-(i+1)]
    resume['total_other_as_down_other_stream'] = np.sum(flows_down_as_other_stream[:-10][:].aggregation)

    flows_down_http_stream = flows_down.compress(flows_down.dscp == INDEX_VALUES.DSCP_HTTP_STREAM)
    flows_down_as_http_stream = aggregate.aggregate(flows_down_http_stream, 'asBGP', 'l3Bytes', sum)
    flows_down_as_http_stream.sort(order='aggregation')
    for i in range(11):
        (resume['name_as_down_http_stream_%d' % i], resume['vol_as_down_http_stream_%d' % i]) \
            = flows_down_as_http_stream[-(i+1)]
    resume['total_other_as_down_http_stream'] = np.sum(flows_down_as_http_stream[:-10][:].aggregation)

    return resume 
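Example #1 (and several examples below) relies on a project-specific aggregate.aggregate(recarray, key_field, value_field, func) helper that is not shown on this page. A minimal sketch of what such a helper might look like with numpy, assuming it returns a recarray with the key field plus an 'aggregation' field:

# Hypothetical sketch of aggregate.aggregate(recarray, key_field, value_field, func);
# the real project code may differ.
import numpy as np

def aggregate_sketch(records, key_field, value_field, func=sum):
    """Group records by key_field, apply func to each group's value_field,
    and return a recarray with fields (key_field, 'aggregation')."""
    keys = np.unique(records[key_field])
    rows = [(k, func(records[value_field][records[key_field] == k])) for k in keys]
    return np.array(rows,
                    dtype=[(key_field, records[key_field].dtype),
                           ('aggregation', np.float64)]).view(np.recarray)

# Usage mirroring Example #1 (field names are taken from the example itself):
# flows_down_as = aggregate_sketch(flows_down, 'asBGP', 'l3Bytes', sum)
# flows_down_as.sort(order='aggregation')  # ascending sort, so top ASes sit at the end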
Example #2
def fetch_data(flows):
    "Return a resume of GVB flow recarray in the form of a dictonnary."
    resume = {}
    vol_dir = aggregate.aggregate(flows, 'direction', 'l3Bytes', sum)
    resume['vol_up'] = vol_dir[0][1]
    resume['vol_down'] = vol_dir[1][1]
    resume['vol_tot'] = resume['vol_down'] + resume['vol_up']

    vol_dscp = aggregate.aggregate(flows, 'dscp', 'l3Bytes', sum)
    resume['vol_down_web'] = extract_aggregated_field(vol_dscp, 'dscp', INDEX_VALUES.DSCP_WEB)
    resume['vol_down_http_stream'] = extract_aggregated_field(vol_dscp, 'dscp', INDEX_VALUES.DSCP_HTTP_STREAM)
    resume['vol_down_other_stream'] = extract_aggregated_field(vol_dscp, 'dscp', INDEX_VALUES.DSCP_OTHER_STREAM)

    #to check nb of flow value
#    nb_flows_dir = aggregate.aggregate(flows, 'direction', 'client_id', len)
#    nb_flows_up = list(nb_flows_dir[0])[1]
#    nb_flows_down = list(nb_flows_dir[1])[1]
    
    flows_down = flows.compress(flows.direction == INDEX_VALUES.DOWN)
    resume['nb_down_flows_tot'] = np.shape(flows_down)[0]

    flows_down_web = flows_down.compress(flows_down.dscp == INDEX_VALUES.DSCP_WEB )
    flows_down_other_stream = flows_down.compress(flows_down.dscp == INDEX_VALUES.DSCP_OTHER_STREAM )
    flows_down_http_stream = flows_down.compress(flows_down.dscp == INDEX_VALUES.DSCP_HTTP_STREAM )
    resume['nb_down_flows_web'] = np.shape(flows_down_web)[0]
    resume['nb_down_flows_http_stream'] = np.shape(flows_down_http_stream)[0]
    resume['nb_down_flows_other_stream'] = np.shape(flows_down_other_stream)[0]

    resume['nb_clients_tot'] = np.shape(np.unique(flows_down.client_id))[0]
    resume['nb_clients_web'] = np.shape(np.unique(flows_down_web.client_id))[0]
    resume['nb_clients_http_stream'] = np.shape(np.unique(flows_down_http_stream.client_id))[0]
    resume['nb_clients_other_stream'] = np.shape(np.unique(flows_down_other_stream.client_id))[0]

    flows_down_1MB = flows_down.compress(flows_down.l3Bytes > 10**6 )
    flows_down_1MB_dscp = aggregate.aggregate(flows_down_1MB, 'dscp', 'l3Bytes', len)

    flows_down_1MB_web = flows_down_1MB.compress(flows_down_1MB.dscp 
                                                 == INDEX_VALUES.DSCP_WEB )
    flows_down_1MB_http_stream = flows_down_1MB.compress(flows_down_1MB.dscp 
                                                         == INDEX_VALUES.DSCP_HTTP_STREAM )
    flows_down_1MB_other_stream = flows_down_1MB.compress(flows_down_1MB.dscp 
                                                          == INDEX_VALUES.DSCP_OTHER_STREAM )

    resume['nb_clients_1MB_tot'] = np.shape(np.unique(flows_down_1MB.client_id))[0]
    resume['nb_clients_1MB_web'] = np.shape(np.unique(flows_down_1MB_web.client_id))[0]
    resume['nb_clients_1MB_http_stream'] = np.shape(np.unique(flows_down_1MB_http_stream.client_id))[0]
    resume['nb_clients_1MB_other_stream'] = np.shape(np.unique(flows_down_1MB_other_stream.client_id))[0]

    resume['nb_down_flows_1MB_tot'] = np.shape(flows_down_1MB)[0]
    resume['nb_down_flows_1MB_web'] = extract_aggregated_field(flows_down_1MB_dscp, 
                                                               'dscp', INDEX_VALUES.DSCP_WEB)
    resume['nb_down_flows_1MB_http_stream'] = extract_aggregated_field(flows_down_1MB_dscp, 
                                                                       'dscp', INDEX_VALUES.DSCP_HTTP_STREAM)
    resume['nb_down_flows_1MB_other_stream'] = extract_aggregated_field(flows_down_1MB_dscp, 
                                                                        'dscp', INDEX_VALUES.DSCP_OTHER_STREAM)
    
    return resume 
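extract_aggregated_field() is called in Example #2 but not shown on this page. Based on how it is used, a plausible minimal version (an assumption, not the project's actual code):

def extract_aggregated_field(aggregated, key_field, key_value):
    # Return the 'aggregation' value for the row whose key_field equals key_value,
    # or 0 when that key is absent from the aggregated recarray.
    match = aggregated[aggregated[key_field] == key_value]
    return match['aggregation'][0] if len(match) else 0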
Example #3
def process(input_file, output_name, var_map, calc=None, agg_areas=True):

    def _add_pct(data_frame):

        var_list = data_frame.columns.tolist()
        for var in GEO_COLUMNS + ['area']:
            if var in var_list:
                var_list.remove(var)
        
        return pct.add_percentages(data_frame, var_list, var_list[0])

    def _export(data_frame, suffix, include_index=False):

        full_name = output_name + '_' + suffix + '.csv'
        data_frame.to_csv(full_name, index=include_index)
        print('Saved file: ' + full_name)

        return

    # Clean municipality data
    data = cd.clean_data(input_file)
    data_new = data[GEO_COLUMNS + sorted(var_map.keys())]
    data_new = data_new.rename(columns=var_map)

    # Perform any extra necessary calculations
    if calc:
        data_new = calc(data_new)

    # Aggregate
    if agg_areas:
        data_agg = agg.aggregate(data_new)
        data_ri = agg.aggregate(data_new, agg_var=(lambda x: True))

    # Calculate percentages
    data_new_w_pct = _add_pct(data_new)
    if agg_areas:
        data_agg_w_pct = _add_pct(data_agg)
        data_ri_w_pct = _add_pct(data_ri.drop('area', axis=1))

    # Export to CSV
    _export(data_new_w_pct, 'munis')
    if agg_areas:
        _export(data_agg_w_pct, 'areas', include_index=True)
        _export(data_ri_w_pct, 'state')
        return (data_new_w_pct, data_agg_w_pct, data_ri_w_pct)
    else:
        return (data_new_w_pct,)
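pct.add_percentages() is project-specific and not shown here. A minimal stand-in, assuming it appends a '<var>_pct' column for each variable expressed relative to the base column passed as the third argument:

def add_percentages_sketch(df, var_list, base_var):
    # Hypothetical behaviour: each variable as a percentage of base_var.
    out = df.copy()
    for var in var_list:
        out[var + '_pct'] = 100.0 * out[var] / out[base_var]
    return out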
Example #4
def evaluate():
    if args.evaluate == "slurm" and args.evaluate_checkpoint is None:
        ckpt = "./checkpoint-%d.p" % model.iteration
        save(ckpt)
        slurmcmd = "bash -c 'p=$(./slurmparams) sbatch $p --time=240 -J evaluate -o evaluate_" + str(
            model.iteration) + ".out om-run python " + " ".join(
                sys.argv) + " --evaluate-checkpoint=" + ckpt + "'"
        print(slurmcmd)
        os.system(slurmcmd)
    else:
        M.evaluate(model)
        evaluate_start = time.time()
        elbo, kl = getELBo()
        print("elbo:", elbo)
        print("KL:", kl)
        n_classification_samples = args.classification_samples
        classification_20_way, predictive = getClassification(
            20, n_classification_samples)
        print("20-way Accuracy:",
              "%5.2f" % (classification_20_way * 100) + "%",
              flush=True)
        precision_20_way = math.sqrt(classification_20_way *
                                     (1 - classification_20_way) /
                                     n_classification_samples)
        #classification_100_way, _ = getClassification(100, n_classification_samples)
        #print("100-way Accuracy:", "%5.2f" % (classification_100_way*100) + "%", flush=True)
        #precision_100_way = math.sqrt(classification_100_way * (1-classification_100_way) / n_classification_samples)
        print("Evaluate took a total of:",
              int((time.time() - evaluate_start) / 60), "minutes")
        ev = {
            '20-way': classification_20_way,
            #'100-way':classification_100_way,
            'precision-20-way': precision_20_way,
            #'precision-100-way':precision_100_way,
            #'predictive':predictive,
            'ELBo': elbo,
            'kl': kl,
            'time': model.wallclock,
            'iteration': model.iteration
        }
        with open("evaluate_%d.p" % model.iteration, "wb") as f:
            pickle.dump(ev, f)
        aggregate()
        #model.history.append(ev)
        return ev
Example #5
    def aggregate(self):
        '''
        Aggregate/merge individual sample GTF files
        '''
        r = self.results
        a = self.args
        samples = self.samples

        aggregate(samples,
                  ref_gtf_file=a.ref_gtf_file,
                  gtf_expr_attr=a.gtf_expr_attr,
                  tmp_dir=r.tmp_dir,
                  output_gtf_file=r.transfrags_gtf_file,
                  stats_file=r.aggregate_stats_file)

        # update status and write to file
        self.status.aggregate = True
        self.status.write(self.results.status_file)
Example #6
def calculate_statistics(results):
    """Calculates aggregate statistics for a set of NDT results.

    Calculates aggregate statistics (e.g. mean, median, std dev) for each
    relevant NDT metric (e.g. total test duration, s2c throughput).

    Args:
        results: A list of NdtResult instances for which to calculate aggregate
            statistics.

    Returns:
        A ResultStatistics instance that contains aggregate statistics for each
        NDT metric.
    """
    total_duration = aggregate.aggregate(map(result_metrics.total_duration,
                                             results))
    c2s_duration = aggregate.aggregate(map(result_metrics.c2s_duration,
                                           results))
    s2c_duration = aggregate.aggregate(map(result_metrics.s2c_duration,
                                           results))
    c2s_throughput = aggregate.aggregate(map(
        lambda result: result.c2s_result.throughput, results))
    s2c_throughput = aggregate.aggregate(map(
        lambda result: result.s2c_result.throughput, results))
    latency = aggregate.aggregate(map(lambda result: result.latency, results))
    return ResultStatistics(total_duration, c2s_duration, s2c_duration,
                            c2s_throughput, s2c_throughput, latency)
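aggregate.aggregate() here consumes an iterable of per-result metric values (note that map() is a lazy iterator in Python 3, so the helper has to materialize it). A sketch of such a function using only the standard library, with the return shape being an assumption rather than the project's real type:

import statistics

def aggregate_values_sketch(values):
    values = list(values)  # map() in Python 3 is a lazy, one-shot iterator
    return {
        'mean': statistics.mean(values),
        'median': statistics.median(values),
        'stdev': statistics.stdev(values) if len(values) > 1 else 0.0,
    }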
Example #7
def perform_aggregation(df, freq):
    log.info("Generating summary tables")

    # Limit to values during working hours:
    df = ag.limit_by_hours(df)

    # Perform multi-column aggregation and
    #  extract interesting stats from the aggregate table
    stats = ag.extract_stats(ag.aggregate(df, freq=freq))

    # Iterate each month and tabulate each stats set
    table_list = [ag.tabulate(stats[month]) for month in list(stats.keys())]

    return zip(list(stats.keys()), table_list)
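One detail worth noting when calling perform_aggregation(): the zip object it returns is a lazy, one-shot iterator in Python 3, so it can only be traversed once unless it is materialized first. A self-contained illustration (the month names and tables are stand-in data):

pairs = zip(['2020-01', '2020-02'], ['table A', 'table B'])  # stand-in data
pairs = list(pairs)  # materialize so the pairs can be reused
for month, table in pairs:
    print(month, table)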
Example #9
def fetch_data_http_stream_down(flow):
    "Return a resume of interesting HTTP streaming \
    down flows characteristics."
    flow = flow.view(np.recarray)
    resume = {}
    flows_1MB = flow.compress(flow.l3Bytes > 10**6 )
    vol_dir = aggregate.aggregate(flow, 'direction', 'l3Bytes', sum)
    #resume['vol_up'] = vol_dir[0][1]
    resume['vol_down'] = vol_dir[0][1]
    #resume['total_vol'] = (resume['vol_down'] +
            #resume['vol_up'])
    resume['nb_client'] = len(np.unique(flow.client_id))
    resume['nb_flow'] = len(flow)
    resume['nb_client_1MB'] = len(np.unique(flows_1MB.client_id))
    resume['nb_flow_1MB'] = len(flows_1MB)
    resume['mean_flow_size'] = np.mean(flow.l3Bytes)
    resume['median_flow_size'] = np.median(flow.l3Bytes)
    resume['max_flow_size'] = np.int64(np.max(flow.l3Bytes))
    resume['mean_flow_duration'] = np.mean(flow.duration)
    resume['median_flow_duration'] = np.median(flow.duration)
    resume['max_flow_duration'] = np.max(flow.duration)
    resume['mean_flow_peak_rate'] = np.mean(80.0 * flow.peakRate)
    resume['median_flow_peak_rate'] = np.median(80.0 * flow.peakRate)
    resume['max_flow_peak_rate'] = np.max(80.0 * flow.peakRate)
    mean_rate = [8*x['l3Bytes']/(1000.0*x['duration'])
            for x in flow if x['duration']>0]
    resume['mean_flow_mean_rate'] = np.mean(mean_rate)
    resume['median_flow_mean_rate'] = np.median(mean_rate)
    resume['max_flow_mean_rate'] = np.max(mean_rate)
    mean_rate_1MB = [8*x['l3Bytes']/(1000.0*x['duration'])
            for x in flow if x['duration']>0
            and x['l3Bytes'] > 10**6]
    resume['mean_flow_mean_rate_1MB'] = np.mean(mean_rate_1MB)
    resume['median_flow_mean_rate_1MB'] = np.median(mean_rate_1MB)
    resume['max_flow_mean_rate_1MB'] = np.max(mean_rate_1MB)
    resume['mean_flow_AR'] = \
            compute_AT.compute_AT(flow.initTime)[0]
    resume['mean_flow_100_AR_per_cl'] = \
            100 * resume['mean_flow_AR'] / resume['nb_client']
    return resume
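The per-flow mean rates above are built with a list comprehension; the same computation can be written with a vectorized mask, assuming the 'l3Bytes' and 'duration' recarray fields used in Example #9:

import numpy as np

def mean_rates_kbps(flow, min_bytes=0):
    # 8 * bytes / (1000 * seconds) gives the mean rate in kb/s, as above.
    mask = (flow['duration'] > 0) & (flow['l3Bytes'] > min_bytes)
    return 8.0 * flow['l3Bytes'][mask] / (1000.0 * flow['duration'][mask])

# mean_rate = mean_rates_kbps(flow)             # all flows with duration > 0
# mean_rate_1MB = mean_rates_kbps(flow, 10**6)  # only flows larger than 1 MB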
Example #10
def evaluateQuery(query, metadataDict):
    for stmnt_unformated in sqlparse.parse(query):
        statement = sqlparse.parse(sqlparse.format(str(stmnt_unformated)))[0]

    query_tokens = []
    for x in statement.tokens:
        if re.match('([\s]+)', str(x)):
            continue
        else:
            query_tokens.append(str(x))

    #print query_tokens

    distinct_flag = 0
    distinct_flag2 = 0
    if str(query_tokens[1]).lower() == "distinct":
        distinct_flag = 1
    elif "distinct(" in query:
        distinct_flag2 = 1
    #print distinct_flag2

    colNames = query_tokens[1 + distinct_flag].split(",")
    #print colNames
    tableNames = query_tokens[3 + distinct_flag].split(",")
    #print tableNames

    #Error Handling
    error_handling(query, colNames, tableNames)

    #Checking for aggregate function
    func = ["min", "max", "count", "sum", "avg"]
    if any(x in query for x in func):
        aggregate(colNames[0], tableNames[0])
        return

    #reading table data from file
    temp_table_data = []
    table_data = []
    cross = []
    for t in tableNames:
        f = open(t + ".csv", 'r')
        temp_table_data = [line.replace('"', '').strip() for line in f]

        if len(table_data) == 0:
            table_data = temp_table_data
        else:
            for y in temp_table_data:
                for z in table_data:
                    cross.append(z + "," + y)

            table_data = cross
            cross = []
    #print table_data

    #Checking for Where Condition
    index = 4 + distinct_flag
    if len(query_tokens) > index:
        whereCond = ""
        whereCond = query_tokens[index][6:]
        #print whereCond

        table_data = whereEvaluate(whereCond, tableNames, table_data)

    #Projection
    table_data = project(colNames, tableNames, table_data)

    if distinct_flag == 1 or distinct_flag2 == 1:
        table_data = [table_data[0], distinct(table_data[1])]

    # for x in table_data:
    # 	print table_data
    #Printing Output
    print "Output:"
    header = ""
    flag = 0
    for i in table_data[0]:
        if flag == 0:
            header += str(i)
            flag = 1
        else:
            header = header + "," + str(i)
    print header

    for x in table_data[1]:
        flag = 0
        valstr = ""
        if isinstance(x, list):
            for y in x:
                #print y
                if flag == 0:
                    valstr = valstr + str(y)
                    flag = 1
                else:
                    valstr = valstr + "," + str(y)
            #print valstr
        else:
            if flag == 0:
                valstr = valstr + str(x)
                flag = 1
            else:
                valstr = valstr + "," + str(x)
        print valstr
Example #11
def modify_and_fetch_data_named(resume, flows, name):
    "Modify a dictonnary resume, to extend it with a GVB array with a specifier name. "
    vol_dir = aggregate.aggregate(flows, 'direction', 'l3Bytes', sum)
    resume['vol_up_%s' % name] = vol_dir[0][1]
    resume['vol_down_%s' % name] = vol_dir[1][1]
    resume['vol_tot_%s' % name] = resume['vol_down_%s' % name] + resume['vol_up_%s' % name]

    vol_dscp = aggregate.aggregate(flows, 'dscp', 'l3Bytes', sum)
    resume['vol_down_web_%s' % name] = extract_aggregated_field(vol_dscp, 'dscp', INDEX_VALUES.DSCP_WEB)
    resume['vol_down_http_stream_%s' % name] = extract_aggregated_field(vol_dscp, 'dscp', INDEX_VALUES.DSCP_HTTP_STREAM)
    resume['vol_down_other_stream_%s' % name] = extract_aggregated_field(vol_dscp, 'dscp', INDEX_VALUES.DSCP_OTHER_STREAM)

    flows_down = flows.compress(flows.direction == INDEX_VALUES.DOWN)
    resume['nb_down_flows_tot_%s' % name] = np.shape(flows_down)[0]

    flows_down_web = flows_down.compress(flows_down.dscp == INDEX_VALUES.DSCP_WEB )
    flows_down_other_stream = flows_down.compress(flows_down.dscp == INDEX_VALUES.DSCP_OTHER_STREAM )
    flows_down_http_stream = flows_down.compress(flows_down.dscp == INDEX_VALUES.DSCP_HTTP_STREAM )
    resume['nb_down_flows_web_%s' % name] = np.shape(flows_down_web)[0]
    resume['nb_down_flows_http_stream_%s' % name] = np.shape(flows_down_http_stream)[0]
    resume['nb_down_flows_other_stream_%s' % name] = np.shape(flows_down_other_stream)[0]

    resume['vol_down_per_flow_tot_%s' % name] = resume['vol_down_%s' % name] / resume['nb_down_flows_tot_%s' % name]
    resume['vol_down_per_flow_web_%s' % name] = resume['vol_down_web_%s' % name] / resume['nb_down_flows_web_%s' % name]
    resume['vol_down_per_flow_http_stream_%s' % name] = resume['vol_down_http_stream_%s' % name] / resume['nb_down_flows_http_stream_%s' % name]
    resume['vol_down_per_flow_other_stream_%s' % name] = resume['vol_down_other_stream_%s' % name] / resume['nb_down_flows_other_stream_%s' % name]

    resume['nb_clients_tot_%s' % name] = np.shape(np.unique(flows_down.client_id))[0]
    resume['nb_clients_web_%s' % name] = np.shape(np.unique(flows_down_web.client_id))[0]
    resume['nb_clients_http_stream_%s' % name] = np.shape(np.unique(flows_down_http_stream.client_id))[0]
    resume['nb_clients_other_stream_%s' % name] = np.shape(np.unique(flows_down_other_stream.client_id))[0]

    resume['vol_down_per_client_tot_%s' % name] = resume['vol_down_%s' % name] / resume['nb_clients_tot_%s' % name]
    resume['vol_down_per_client_web_%s' % name] = resume['vol_down_web_%s' % name] / resume['nb_clients_web_%s' % name]
    resume['vol_down_per_client_http_stream_%s' % name] = resume['vol_down_http_stream_%s' % name] / resume['nb_clients_http_stream_%s' % name]
    resume['vol_down_per_client_other_stream_%s' % name] = resume['vol_down_other_stream_%s' % name] / resume['nb_clients_other_stream_%s' % name]


    flows_down_1MB = flows_down.compress(flows_down.l3Bytes > 10**6 )
    flows_down_1MB_dscp = aggregate.aggregate(flows_down_1MB, 'dscp', 'l3Bytes', len)

    flows_down_1MB_web = flows_down_1MB.compress(flows_down_1MB.dscp 
                                                 == INDEX_VALUES.DSCP_WEB )
    flows_down_1MB_http_stream = flows_down_1MB.compress(flows_down_1MB.dscp 
                                                         == INDEX_VALUES.DSCP_HTTP_STREAM )
    flows_down_1MB_other_stream = flows_down_1MB.compress(flows_down_1MB.dscp 
                                                          == INDEX_VALUES.DSCP_OTHER_STREAM )

    resume['nb_clients_1MB_tot_%s' % name] = np.shape(np.unique(flows_down_1MB.client_id))[0]
    resume['nb_clients_1MB_web_%s' % name] = np.shape(np.unique(flows_down_1MB_web.client_id))[0]
    resume['nb_clients_1MB_http_stream_%s' % name] = np.shape(np.unique(flows_down_1MB_http_stream.client_id))[0]
    resume['nb_clients_1MB_other_stream_%s' % name] = np.shape(np.unique(flows_down_1MB_other_stream.client_id))[0]

    resume['nb_down_flows_1MB_tot_%s' % name] = np.shape(flows_down_1MB)[0]
    resume['nb_down_flows_1MB_web_%s' % name] = extract_aggregated_field(flows_down_1MB_dscp, 
                                                               'dscp', INDEX_VALUES.DSCP_WEB)
    resume['nb_down_flows_1MB_http_stream_%s' % name] = extract_aggregated_field(flows_down_1MB_dscp, 
                                                                       'dscp', INDEX_VALUES.DSCP_HTTP_STREAM)
    resume['nb_down_flows_1MB_other_stream_%s' % name] = extract_aggregated_field(flows_down_1MB_dscp, 
                                                                        'dscp', INDEX_VALUES.DSCP_OTHER_STREAM)
    
    return resume 
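Unlike fetch_data_general in Example #13 below, the per-flow and per-client ratios in this example are computed without checking for empty categories, so a dscp class with zero flows or clients makes them ill-defined. A small guard helper in the same spirit as Example #13's checks (an illustration, not part of the original code):

def safe_ratio(numerator, denominator):
    # Return 0 for empty categories instead of dividing by zero.
    return numerator / denominator if denominator else 0

# resume['vol_down_per_flow_web_%s' % name] = safe_ratio(
#     resume['vol_down_web_%s' % name], resume['nb_down_flows_web_%s' % name])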
Example #12
assessor = pd.read_sql(
    'select * from aux.assessor_summary b join aux.addresses a using(address)',
    engine)
acs = pd.read_sql('select geo_id2 as census_tract_id, * from aux.acs', engine)
wt = pd.read_sql('select * from aux.ward_tracts', engine)

for level in ['tracts', 'wards']:
    if level == 'wards':
        acs_level = wt.merge(acs, on='census_tract_id', how='left')
        index = 'ward_id'
        weight = acs_level['area']
    else:
        acs_level = acs
        index = 'census_tract_id'
        weight = None

    acs_ag = a.aggregate(acs_level, columns.acs, weight, index)
    buildings_ag = a.aggregate(buildings, columns.building, index=index)
    assessor_ag = a.aggregate(assessor, columns.assessor, index=index)

    acs_ag.columns = ['acs_' + c for c in acs_ag.columns]
    assessor_ag.columns = ['assessor_' + c for c in assessor_ag.columns]
    buildings_ag.columns = ['buildings_' + c for c in buildings_ag.columns]

    ag = acs_ag.join(assessor_ag, how='outer')
    ag = ag.join(buildings_ag, how='outer')

    # to_sql using wrong datatype when writing index as such. can specify dtype with pandas .15.2
    ag.reset_index(inplace=True)
    ag.to_sql(level, engine, if_exists='replace', schema='output', index=False)
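a.aggregate() in this example takes a column specification, optional weights, and a grouping index. Its real behaviour is project-specific; purely as an illustration, a weighted per-group mean in pandas could be sketched as:

def weighted_mean_by_group(df, columns, weight=None, index='census_tract_id'):
    # `columns` is a list of column names; `weight` is a Series aligned with df.
    if weight is None:
        return df.groupby(index)[columns].mean()
    weighted = df[columns].multiply(weight, axis=0)
    sums = weighted.groupby(df[index]).sum()
    totals = weight.groupby(df[index]).sum()
    return sums.divide(totals, axis=0)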
Example #13
def fetch_data_general(in_flow, filtered=False):
    "Return a resume of interesting \
    flows characteristics."

#            ('nb_down_flows_1MB%s', 'Nb', '.4g'),
#            ('avg_vol_down_per_flow%s', 'Bytes', '.4g'),
#            ('avg_vol_down_per_client%s', 'Bytes', '.4g')]

    #new_flows = {}
    resume = {}
    for app in ('', '_WEB', '_HTTP_STREAM', '_OTHER_STREAM'):
        if app == '':
            new_flow = in_flow #['data%s' % app]
        else:
            dscp = get_dscp(app, in_flow, filtered=filtered)
            new_flow = in_flow.compress(in_flow['dscp'] == dscp)
            #['data%s' % app]
        resume['App: %s' % app] = ''
        new_flow = new_flow.view(np.recarray)
        new_flow_down = new_flow.compress(new_flow.direction
                == INDEX_VALUES.DOWN)
        new_flow_1MB = new_flow.compress(new_flow.l3Bytes > 10**6)
        new_flow_down_1MB = new_flow_down.compress(new_flow_down.l3Bytes
                > 10**6)
        vol_dir = aggregate.aggregate(new_flow, 'direction', 'l3Bytes', sum)
#        resume['vol_up%s' % app] = vol_dir[0][1]
#        resume['vol_down%s' % app] = vol_dir[1][1]
        try:
            resume['vol_up%s' % app] = vol_u = vol_dir[0][1]
        except IndexError:
            resume['vol_up%s' % app] = vol_u = float(0)
        try:
            resume['vol_down%s' % app] = vol_d = vol_dir[1][1]
        except IndexError:
            resume['vol_down%s' % app] = vol_d = float(0)
        resume['total_vol%s' % app] = vol_u + vol_d
        resume['nb_flows_down%s' % app] = nb_fl = len(new_flow_down)
        resume['nb_flows_down_1MB%s' % app] = nb_fl_1mb = len(new_flow_down_1MB)
        resume['ratio_nb_flows%s' % app] = int(100 * nb_fl_1mb / float(nb_fl)) \
                if (resume['nb_flows_down%s' % app] != 0) else 0
        resume['nb_client_down%s' % app] = nb_cl = len(np.unique(
            new_flow_down.client_id))
        resume['avg_vol_down_per_client%s' % app] = vol_d / nb_cl \
                if (nb_cl != 0) else 0
        resume['avg_vol_down_per_flow%s' % app] = vol_d / nb_fl \
                if (nb_fl != 0) else 0
        resume['avg_vol_up_per_client%s' % app] = vol_u / nb_cl \
                if (nb_cl != 0) else 0
        resume['avg_vol_up_per_flow%s' % app] = vol_u / nb_fl \
                if (nb_fl != 0) else 0
        resume['nb_client_down_1MB%s' % app] = len(np.unique(
            new_flow_down_1MB.client_id))
        resume['avg_nb_flows_per_client%s' % app] = nb_fl / float(nb_cl) \
                if nb_cl !=0 else 0
        resume['avg_nb_flows_1MB_per_client%s' % app] = nb_fl_1mb / float(nb_cl) \
                if nb_cl !=0 else 0
#    resume['nb_flow'] = len(flow)
#    resume['nb_client_1MB'] = len(np.unique(flows_1MB.client_id))
#    resume['nb_flow_1MB'] = len(flows_1MB)
#    resume['mean_flow_size'] = np.mean(flow.l3Bytes)
#    resume['median_flow_size'] = np.median(flow.l3Bytes)
#    resume['max_flow_size'] = np.int64(np.max(flow.l3Bytes))
#    resume['mean_flow_duration'] = np.mean(flow.duration)
#    resume['median_flow_duration'] = np.median(flow.duration)
#    resume['max_flow_duration'] = np.max(flow.duration)
#    resume['mean_flow_peak_rate'] = np.mean(80.0 * flow.peakRate)
#    resume['median_flow_peak_rate'] = np.median(80.0 * flow.peakRate)
#    resume['max_flow_peak_rate'] = np.max(80.0 * flow.peakRate)
#    mean_rate = [8*x['l3Bytes']/(1000.0*x['duration'])
#            for x in flow if x['duration']>0]
#    resume['mean_flow_mean_rate'] = np.mean(mean_rate)
#    resume['median_flow_mean_rate'] = np.median(mean_rate)
#    resume['max_flow_mean_rate'] = np.max(mean_rate)
#    mean_rate_1MB = [8*x['l3Bytes']/(1000.0*x['duration'])
#            for x in flow if x['duration']>0
#            and x['l3Bytes'] > 10**6]
#    resume['mean_flow_mean_rate_1MB'] = np.mean(mean_rate_1MB)
#    resume['median_flow_mean_rate_1MB'] = np.median(mean_rate_1MB)
#    resume['max_flow_mean_rate_1MB'] = np.max(mean_rate_1MB)
#    resume['mean_flow_AR'] = \
#            compute_AT.compute_AT(flow.initTime)[0]
#    resume['mean_flow_100_AR_per_cl'] = \
#            100 * resume['mean_flow_AR'] / resume['nb_client']
    return resume
Example #14
import sys
from aggregate import aggregate


bucket_size = 1
line_filter = None

if len(sys.argv) > 1:
    bucket_size = int(sys.argv[1])

if len(sys.argv) > 2:
    line_filter = sys.argv[2]


res = aggregate(sys.stdin, bucket_size, line_filter)

for datetime, count in res:
    print(datetime.strftime("%s"), count, sep="\t")
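The local aggregate module used in this script is not shown. Judging from how its result is consumed (one (datetime, count) pair per time bucket), a sketch under the assumption that every input line starts with a 'YYYY-MM-DD HH:MM' timestamp and that bucket_size is in minutes could look like the code below. Separately, strftime("%s") relies on a platform-specific directive; int(dt.timestamp()) is the portable way to print epoch seconds.

from datetime import datetime

def aggregate_sketch(stream, bucket_size=1, line_filter=None):
    counts = {}
    for line in stream:
        if line_filter and line_filter not in line:
            continue
        stamp = datetime.strptime(line[:16], "%Y-%m-%d %H:%M")
        bucket = stamp.replace(minute=(stamp.minute // bucket_size) * bucket_size)
        counts[bucket] = counts.get(bucket, 0) + 1
    return sorted(counts.items())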
Example #15
def strollr2d_imagedenoising(data, param):
    """
    This is the entrypoint function for the imagedenoising. The input parameters
    are defined as

    data: A python object containing the image data, the structure contains two fields
    - noisy: a*b size gray-scale image matrix for denoising.
    - oracle (optional): a*b size gray-scale matrix as the ground truth for calculation
    of PSNR.

    param: Structure containing parameters for the algorithm.
    """
    try:
        noisy = data['noisy']
        oracle = data.get('oracle', None)

        sig = param['sig']
        dim = param['dim']
        # Kronecker product
        # dct(np.eye(8), axis=0) is the cosine transform of order 8
        W = np.kron(dct(np.eye(dim), axis=0, norm='ortho'), dct(np.eye(dim), axis=0, norm='ortho'))
        threshold = param['TLthr0'] * sig
        param['threshold'] = threshold

        thr = param['thr0'] * sig
        param['thr'] = thr

        print('[+] Parameters loaded')
        noisy, param = image_enlarge_tl(noisy, param)
        print('[+] Image Enlarged for TL ONLY')

        patchNoisy = image_patch(noisy, dim)
        print('[+] Image patch done')
        patches = patchNoisy

        # patchNoisy is a 2D numpy array
        numTensorPatch = patchNoisy.shape[1]
        param['numTensorPatch'] = numTensorPatch

        W, sparseCode, nonZeroTable = tl_approximation(patches, W, param)
        print('[+] Module TL approx done')

        nonZeroTable[nonZeroTable == 0] = param['zeroWeight']
        TLsparsityWeight = np.divide(1, nonZeroTable)
        blk_arr, _, blk_pSize = bm_fix(patches, param)
        print('[+] Module BM fix done')
        blk_arr = np.asarray(blk_arr)
        blk_pSize = np.asarray(blk_pSize)

        LRpatch, LRweights, LRrankWeight = lr_approximation(patches, blk_arr, blk_pSize, param)
        print('[+] Module LRapprox done')
        nonZerosLR = LRweights > 0
        LRrankWeight[nonZerosLR] = np.divide(LRrankWeight[nonZerosLR], LRweights[nonZerosLR])

        patchRecon = f1_reconstruction(sparseCode, W, LRpatch, LRweights, patches, param, TLsparsityWeight, LRrankWeight)
        print('[+] Module F1 Reconstruction done')
        Xr = aggregate(patchRecon, TLsparsityWeight, param)

        plt.imshow(Xr, cmap='gray', vmin=0, vmax=255)
        plt.show()

        psnrXr = PSNR(Xr - oracle)
        print('[+] PSNR value is : {}'.format(psnrXr))
        return Xr, psnrXr

    except KeyError as e:
        print('The parameter provided to strollr2d are not valid: {}'.format(e))
        sys.exit(1)
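The dictionary W in this example is the Kronecker product of two 1-D DCT matrices, i.e. a separable 2-D DCT acting on vectorized dim x dim patches. A small self-contained check of its shape and orthonormality (scipy.fftpack.dct is assumed here; the example's own dct import is not shown):

import numpy as np
from scipy.fftpack import dct

dim = 8
D = dct(np.eye(dim), axis=0, norm='ortho')      # 1-D orthonormal DCT basis, dim x dim
W = np.kron(D, D)                               # separable 2-D DCT, dim^2 x dim^2
assert W.shape == (dim * dim, dim * dim)
assert np.allclose(W @ W.T, np.eye(dim * dim))  # orthonormal because norm='ortho'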
Example #16
import sys
import matplotlib.pyplot as plt
from aggregate import aggregate


bucket_size = 1

if len(sys.argv) > 1:
    bucket_size = int(sys.argv[1])


res = aggregate(sys.stdin, bucket_size)


plt.plot(*zip(*res))
plt.show()
Example #17
def execution(result):
    # Get the start and end dates
    result = result.split('~')
    date_begin = result[0]
    date_end = result[-1]
    date_end = datetime.datetime.strptime(date_end, "%Y-%m-%d")

    # Get all selected dates
    day_range = []
    date = datetime.datetime.strptime(date_begin, "%Y-%m-%d")
    while date <= date_end:
        day_range.append(date.strftime("%Y-%m-%d"))
        date = date + datetime.timedelta(days=1)
    # download the raw data
    date = datetime.datetime.strptime(date_begin, "%Y-%m-%d")
    web.update(date.strftime("%Y-%m-%d"))

    while (date + datetime.timedelta(days=7)) <= date_end:
        date = date + datetime.timedelta(days=7)
        web.update(date.strftime("%Y-%m-%d"))

    # Delete files outside the selected dates
    path = '../Download/'
    word = 'BusLocation'
    Bus_dir = web.search_dir(path, word)
    for i in Bus_dir:
        web.del_file(i, day_range)
    word = 'Session'
    Session_dir = web.search_dir(path, word)
    for i in Session_dir:
        web.del_file(i, day_range)

    # service recognition
    word = 'BusLocation'
    Bus_dir = web.search_dir(path, word)
    for filedir_bus in Bus_dir:
        #### Modify folderpath ####
        aggregate.aggregate(filedir_bus)

    # ridership analysis
    database = "ridership"
    user = "******"
    password = "******"
    host = "localhost"
    port = "5432"
    ridership.ridership(database, user, password, host, port)

    # round trip time
    folderpath = '../output'
    database = "RTT"
    extractMobilityInfo.run_RTT(folderpath)
    upload.RTT_upload(database, user, password, host, port)

    # interarrival
    database = "inter_arrival"
    mobilityInterval.run_interarrival(folderpath)
    upload.inter_arrival_upload(database, user, password, host, port)

    web.clean()
    print('COMPLETED!')
    return
Example #18
"""This module pulls data from each of the sources and generates the aggregate projections for the week."""
import sys

import aggregate
from scrapers import dailyfantasynerd, espn, nfl, numberfire, rotogrinders


if __name__ == "__main__":

    year = sys.argv[1]
    week = sys.argv[2]

    nfl.scrape(week, year)
    espn.scrape(week, year)
    numberfire.scrape(week, year)
    rotogrinders.scrape(week, year)
    dailyfantasynerd.scrape(week, year)

    aggregate.aggregate(week, year)
Example #19
def vol_per_client(data, as_list=None, as_excluded=None, on_list=False,
    field='l3Bytes', func=sum,
    output_path = 'rapport/client_ok', title='', prefix = ''#,
#    trace_list = ('ADSL_2008', 'FTTH_2008', 'ADSL_nov_2009', 'FTTH_nov_2009',
#        'ADSL_dec_2009', 'FTTH_dec_2009')
        ):
    """Plots volumes per clients according to AS match list:
    use * for all ASes.
    flag 'on_list' works only on AS_list (included AS) and AS_list elements are
    filters and names: see exemples
    Use as:
    data = tools.load_hdf5_data.load_h5_file('hdf5/lzf_data.h5')
    tools.plot_per_client.vol_per_client(data)
    tools.plot_per_client.vol_per_client(data,
        ('*', tools.INDEX_VALUES.AS_YOUTUBE))
    tools.plot_per_client.vol_per_client(data,
        as_excluded=tools.INDEX_VALUES.AS_YOUTUBE
        +tools.INDEX_VALUES.AS_YOUTUBE_EU,
        title='Other Streams', prefix='OTHER_')
    tools.plot_per_client.vol_per_client(data_streaming,
        as_list=((tools.INDEX_VALUES.AS_YOUTUBE, 'YOUTUBE'),
        (tools.INDEX_VALUES.AS_YOUTUBE_EU, 'YOUTUBE_EU')),
        title='YT and YT-EU Streams', prefix='YT_YT_EU_', on_list=True,
        output_path='rapport/client_ok')
    tools.plot_per_client.vol_per_client(data,
        as_list=((tools.INDEX_VALUES.AS_YOUTUBE, 'YOUTUBE'),
        (tools.INDEX_VALUES.AS_YOUTUBE_EU, 'YOUTUBE_EU'),
        (tools.INDEX_VALUES.AS_GOOGLE, 'GOOGLE')),
        title='YT and GOO Streams', prefix='YT_GOO', on_list=True,
        output_path='rapport/client_ok')
    """
    client_vol = {}
    # data collection
    args = []
    # TODO: AS list
    for trace in sorted([x for x in data.keys() if '_GVB' in x]):
        print 'process trace: ', trace
        filtered_data_dict = defaultdict(dict)
        if on_list:
            filtered_data_dict[trace] = filter_array_list(data, trace,
                    as_list, as_excluded)
        else:
            filtered_data_dict[trace][trace] = filter_array(data[trace],
                    'asBGP', as_list, as_excluded)
        for name in sorted(filtered_data_dict[trace]):
            filtered_data = filtered_data_dict[trace][name]
            # at least MIN_NB_FLOWS flows per data to plot
            if len(filtered_data) < MIN_NB_FLOWS:
                continue
            client_vol[name] = aggregate.aggregate(filtered_data,
                    'client_id', field, func)
            # construct plot args
            if as_list:
                title_name = format_as_title(name)
            else:
                title_name = format_title(name).rstrip(' GVB')
            args.append((title_name, client_vol[name]['aggregation']))
            # plot individual repartitions
            pylab.clf()
            cdfplot.repartplotdata(client_vol[name]['aggregation'],
                _title='%s Volume per Client for %s' % (title, trace),
                _ylabel='Percentage of Downstream Volume', _loc=0)
            cdfplot.setgraph_loglog()
            pylab.savefig(output_path
                + '/%s%s_repart_volume_per_client.pdf' % (prefix, trace))
    # plot CDF
    pylab.clf()
    cdfplot.cdfplotdataN(args, _title='%s Volume per Client' % title,
                         _xlabel='Downstream Volume in Bytes', _loc=0)
    pylab.savefig(output_path + '/%sCDF_volume_per_client.pdf' % prefix)

    # plot global repartition
    pylab.clf()
    cdfplot.repartplotdataN(args, _title='%s Volume per Client' % title,
            _ylabel='Percentage of Downstream Volume', _loc=0)
    pylab.savefig(output_path + '/%srepart_volume_per_client.pdf' % prefix)
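cdfplot is a project-specific plotting helper. For reference, a minimal empirical-CDF sketch with numpy and matplotlib for one (label, volumes) pair as built in args (an illustration, not the project's cdfplot API):

import numpy as np
import matplotlib.pyplot as plt

def plot_cdf(volumes, label):
    x = np.sort(np.asarray(volumes))
    y = np.arange(1, len(x) + 1) / float(len(x))
    plt.semilogx(x, y, label=label)  # log-scaled volumes, as in the report plots

# for title_name, volumes in args:
#     plot_cdf(volumes, title_name)
# plt.xlabel('Downstream Volume in Bytes')
# plt.ylabel('CDF')
# plt.legend(loc=0)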