def top_bgp_data(flows):
    "Return the top ASes of a GVB flow recarray in the form of a dictionary."
    resume = {}
    flows_down = flows.compress(flows.direction == INDEX_VALUES.DOWN)
    # Top ASes over all downstream traffic
    flows_down_as = aggregate.aggregate(flows_down, 'asBGP', 'l3Bytes', sum)
    flows_down_as.sort(order='aggregation')
    for i in range(11):
        (resume['name_as_down_%d' % i],
         resume['vol_as_down_%d' % i]) = flows_down_as[-(i + 1)]
    resume['total_other_down_as'] = np.sum(flows_down_as[:-10].aggregation)
    # Top ASes for web traffic
    flows_down_web = flows_down.compress(flows_down.dscp == INDEX_VALUES.DSCP_WEB)
    flows_down_as_web = aggregate.aggregate(flows_down_web, 'asBGP', 'l3Bytes', sum)
    flows_down_as_web.sort(order='aggregation')
    for i in range(11):
        (resume['name_as_down_web_%d' % i],
         resume['vol_as_down_web_%d' % i]) = flows_down_as_web[-(i + 1)]
    resume['total_other_as_down_web'] = np.sum(flows_down_as_web[:-10].aggregation)
    # Top ASes for other (non-HTTP) streaming traffic
    flows_down_other_stream = flows_down.compress(
        flows_down.dscp == INDEX_VALUES.DSCP_OTHER_STREAM)
    flows_down_as_other_stream = aggregate.aggregate(
        flows_down_other_stream, 'asBGP', 'l3Bytes', sum)
    flows_down_as_other_stream.sort(order='aggregation')
    for i in range(11):
        (resume['name_as_down_other_stream_%d' % i],
         resume['vol_as_down_other_stream_%d' % i]) \
            = flows_down_as_other_stream[-(i + 1)]
    resume['total_other_as_down_other_stream'] = np.sum(
        flows_down_as_other_stream[:-10].aggregation)
    # Top ASes for HTTP streaming traffic
    flows_down_http_stream = flows_down.compress(
        flows_down.dscp == INDEX_VALUES.DSCP_HTTP_STREAM)
    flows_down_as_http_stream = aggregate.aggregate(
        flows_down_http_stream, 'asBGP', 'l3Bytes', sum)
    flows_down_as_http_stream.sort(order='aggregation')
    for i in range(11):
        (resume['name_as_down_http_stream_%d' % i],
         resume['vol_as_down_http_stream_%d' % i]) \
            = flows_down_as_http_stream[-(i + 1)]
    resume['total_other_as_down_http_stream'] = np.sum(
        flows_down_as_http_stream[:-10].aggregation)
    return resume
def fetch_data(flows):
    "Return a summary of a GVB flow recarray in the form of a dictionary."
    resume = {}
    vol_dir = aggregate.aggregate(flows, 'direction', 'l3Bytes', sum)
    resume['vol_up'] = vol_dir[0][1]
    resume['vol_down'] = vol_dir[1][1]
    resume['vol_tot'] = resume['vol_down'] + resume['vol_up']
    vol_dscp = aggregate.aggregate(flows, 'dscp', 'l3Bytes', sum)
    resume['vol_down_web'] = extract_aggregated_field(
        vol_dscp, 'dscp', INDEX_VALUES.DSCP_WEB)
    resume['vol_down_http_stream'] = extract_aggregated_field(
        vol_dscp, 'dscp', INDEX_VALUES.DSCP_HTTP_STREAM)
    resume['vol_down_other_stream'] = extract_aggregated_field(
        vol_dscp, 'dscp', INDEX_VALUES.DSCP_OTHER_STREAM)
    # To check the number-of-flows values:
    # nb_flows_dir = aggregate.aggregate(flows, 'direction', 'client_id', len)
    # nb_flows_up = list(nb_flows_dir[0])[1]
    # nb_flows_down = list(nb_flows_dir[1])[1]
    flows_down = flows.compress(flows.direction == INDEX_VALUES.DOWN)
    resume['nb_down_flows_tot'] = np.shape(flows_down)[0]
    flows_down_web = flows_down.compress(flows_down.dscp == INDEX_VALUES.DSCP_WEB)
    flows_down_other_stream = flows_down.compress(
        flows_down.dscp == INDEX_VALUES.DSCP_OTHER_STREAM)
    flows_down_http_stream = flows_down.compress(
        flows_down.dscp == INDEX_VALUES.DSCP_HTTP_STREAM)
    resume['nb_down_flows_web'] = np.shape(flows_down_web)[0]
    resume['nb_down_flows_http_stream'] = np.shape(flows_down_http_stream)[0]
    resume['nb_down_flows_other_stream'] = np.shape(flows_down_other_stream)[0]
    resume['nb_clients_tot'] = np.shape(np.unique(flows_down.client_id))[0]
    resume['nb_clients_web'] = np.shape(np.unique(flows_down_web.client_id))[0]
    resume['nb_clients_http_stream'] = np.shape(
        np.unique(flows_down_http_stream.client_id))[0]
    resume['nb_clients_other_stream'] = np.shape(
        np.unique(flows_down_other_stream.client_id))[0]
    # Flows larger than 1 MB
    flows_down_1MB = flows_down.compress(flows_down.l3Bytes > 10**6)
    flows_down_1MB_dscp = aggregate.aggregate(flows_down_1MB, 'dscp', 'l3Bytes', len)
    flows_down_1MB_web = flows_down_1MB.compress(
        flows_down_1MB.dscp == INDEX_VALUES.DSCP_WEB)
    flows_down_1MB_http_stream = flows_down_1MB.compress(
        flows_down_1MB.dscp == INDEX_VALUES.DSCP_HTTP_STREAM)
    flows_down_1MB_other_stream = flows_down_1MB.compress(
        flows_down_1MB.dscp == INDEX_VALUES.DSCP_OTHER_STREAM)
    resume['nb_clients_1MB_tot'] = np.shape(np.unique(flows_down_1MB.client_id))[0]
    resume['nb_clients_1MB_web'] = np.shape(
        np.unique(flows_down_1MB_web.client_id))[0]
    resume['nb_clients_1MB_http_stream'] = np.shape(
        np.unique(flows_down_1MB_http_stream.client_id))[0]
    resume['nb_clients_1MB_other_stream'] = np.shape(
        np.unique(flows_down_1MB_other_stream.client_id))[0]
    resume['nb_down_flows_1MB_tot'] = np.shape(flows_down_1MB)[0]
    resume['nb_down_flows_1MB_web'] = extract_aggregated_field(
        flows_down_1MB_dscp, 'dscp', INDEX_VALUES.DSCP_WEB)
    resume['nb_down_flows_1MB_http_stream'] = extract_aggregated_field(
        flows_down_1MB_dscp, 'dscp', INDEX_VALUES.DSCP_HTTP_STREAM)
    resume['nb_down_flows_1MB_other_stream'] = extract_aggregated_field(
        flows_down_1MB_dscp, 'dscp', INDEX_VALUES.DSCP_OTHER_STREAM)
    return resume
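# The helper extract_aggregated_field used above is not part of this excerpt.
# Below is a minimal sketch of what it might look like, assuming the
# aggregated recarray carries the key column plus an 'aggregation' column
# (as used in top_bgp_data) and that a missing key should count as zero.
import numpy as np


def extract_aggregated_field(aggregated, key_field, key_value):
    "Return the aggregated value matching key_value, or 0 if the key is absent."
    matches = aggregated[aggregated[key_field] == key_value]
    return matches['aggregation'][0] if len(matches) > 0 else 0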
def process(input_file, output_name, var_map, calc=None, agg_areas=True):

    def _add_pct(data_frame):
        var_list = data_frame.columns.tolist()
        for var in GEO_COLUMNS + ['area']:
            if var in var_list:
                var_list.remove(var)
        return pct.add_percentages(data_frame, var_list, var_list[0])

    def _export(data_frame, suffix, include_index=False):
        full_name = output_name + '_' + suffix + '.csv'
        data_frame.to_csv(full_name, index=include_index)
        print('Saved file: ' + full_name)
        return

    # Clean municipality data
    data = cd.clean_data(input_file)
    data_new = data[GEO_COLUMNS + sorted(var_map.keys())]
    data_new = data_new.rename(columns=var_map)

    # Perform any extra necessary calculations
    if calc:
        data_new = calc(data_new)

    # Aggregate
    if agg_areas:
        data_agg = agg.aggregate(data_new)
        data_ri = agg.aggregate(data_new, agg_var=(lambda x: True))

    # Calculate percentages
    data_new_w_pct = _add_pct(data_new)
    if agg_areas:
        data_agg_w_pct = _add_pct(data_agg)
        data_ri_w_pct = _add_pct(data_ri.drop('area', axis=1))

    # Export to CSV
    _export(data_new_w_pct, 'munis')
    if agg_areas:
        _export(data_agg_w_pct, 'areas', include_index=True)
        _export(data_ri_w_pct, 'state')
        return (data_new_w_pct, data_agg_w_pct, data_ri_w_pct)
    else:
        return (data_new_w_pct,)
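# A hypothetical call to process(). The input path, output prefix, and the
# raw-to-friendly column mapping in var_map are illustrative only; the real
# column names depend on the extract cleaned by cd.clean_data and on
# GEO_COLUMNS. With agg_areas left at its default, three data frames come back
# and three CSV files are written alongside them.
var_map = {'B01001_001E': 'total_population'}  # assumed raw ACS column name
munis, areas, state = process('raw_municipal_data.csv', 'population', var_map)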
def evaluate():
    if args.evaluate == "slurm" and args.evaluate_checkpoint is None:
        # Save a checkpoint and resubmit this script as a SLURM evaluation job
        ckpt = "./checkpoint-%d.p" % model.iteration
        save(ckpt)
        slurmcmd = "bash -c 'p=$(./slurmparams) sbatch $p --time=240 -J evaluate -o evaluate_" + str(
            model.iteration) + ".out om-run python " + " ".join(
                sys.argv) + " --evaluate-checkpoint=" + ckpt + "'"
        print(slurmcmd)
        os.system(slurmcmd)
    else:
        M.evaluate(model)
        evaluate_start = time.time()
        elbo, kl = getELBo()
        print("elbo:", elbo)
        print("KL:", kl)
        n_classification_samples = args.classification_samples
        classification_20_way, predictive = getClassification(
            20, n_classification_samples)
        print("20-way Accuracy:",
              "%5.2f" % (classification_20_way * 100) + "%", flush=True)
        precision_20_way = math.sqrt(classification_20_way *
                                     (1 - classification_20_way) /
                                     n_classification_samples)
        #classification_100_way, _ = getClassification(100, n_classification_samples)
        #print("100-way Accuracy:", "%5.2f" % (classification_100_way*100) + "%", flush=True)
        #precision_100_way = math.sqrt(classification_100_way * (1-classification_100_way) / n_classification_samples)
        print("Evaluate took a total of:",
              int((time.time() - evaluate_start) / 60), "minutes")
        ev = {
            '20-way': classification_20_way,
            #'100-way': classification_100_way,
            'precision-20-way': precision_20_way,
            #'precision-100-way': precision_100_way,
            #'predictive': predictive,
            'ELBo': elbo,
            'kl': kl,
            'time': model.wallclock,
            'iteration': model.iteration
        }
        with open("evaluate_%d.p" % model.iteration, "wb") as f:
            pickle.dump(ev, f)
        aggregate()
        #model.history.append(ev)
        return ev
def aggregate(self):
    '''
    Aggregate/merge individual sample GTF files
    '''
    r = self.results
    a = self.args
    samples = self.samples
    aggregate(samples,
              ref_gtf_file=a.ref_gtf_file,
              gtf_expr_attr=a.gtf_expr_attr,
              tmp_dir=r.tmp_dir,
              output_gtf_file=r.transfrags_gtf_file,
              stats_file=r.aggregate_stats_file)
    # update status and write to file
    self.status.aggregate = True
    self.status.write(self.results.status_file)
def calculate_statistics(results):
    """Calculates aggregate statistics for a set of NDT results.

    Calculates aggregate statistics (e.g. mean, median, std dev) for each
    relevant NDT metric (e.g. total test duration, s2c throughput).

    Args:
        results: A list of NdtResult instances for which to calculate
            aggregate statistics.

    Returns:
        A ResultStatistics instance that contains aggregate statistics for
        each NDT metric.
    """
    total_duration = aggregate.aggregate(
        map(result_metrics.total_duration, results))
    c2s_duration = aggregate.aggregate(
        map(result_metrics.c2s_duration, results))
    s2c_duration = aggregate.aggregate(
        map(result_metrics.s2c_duration, results))
    c2s_throughput = aggregate.aggregate(
        map(lambda result: result.c2s_result.throughput, results))
    s2c_throughput = aggregate.aggregate(
        map(lambda result: result.s2c_result.throughput, results))
    latency = aggregate.aggregate(map(lambda result: result.latency, results))
    return ResultStatistics(total_duration, c2s_duration, s2c_duration,
                            c2s_throughput, s2c_throughput, latency)
def perform_aggregation(df, freq):
    log.info("Generating summary tables")
    # Limit to values during working hours
    df = ag.limit_by_hours(df)
    # Perform multi-column aggregation and extract interesting stats
    # from the aggregate table
    stats = ag.extract_stats(ag.aggregate(df, freq=freq))
    # Iterate over each month and tabulate each stats set
    table_list = [ag.tabulate(stats[month]) for month in list(stats.keys())]
    return zip(list(stats.keys()), table_list)
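# A hypothetical use of perform_aggregation. The CSV path and its columns are
# illustrative only, and freq is assumed to follow the pandas offset-alias
# convention ('M' for month-end) since the stats dict appears to be keyed by
# month; the ag helper module itself is not shown in this excerpt.
import pandas as pd

df = pd.read_csv('usage_log.csv', parse_dates=['timestamp'])  # assumed input file
for month, table in perform_aggregation(df, freq='M'):
    print(month)
    print(table)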
def fetch_data_http_stream_down(flow):
    "Return a summary of interesting HTTP streaming down flow characteristics."
    flow = flow.view(np.recarray)
    resume = {}
    flows_1MB = flow.compress(flow.l3Bytes > 10**6)
    vol_dir = aggregate.aggregate(flow, 'direction', 'l3Bytes', sum)
    #resume['vol_up'] = vol_dir[0][1]
    resume['vol_down'] = vol_dir[0][1]
    #resume['total_vol'] = resume['vol_down'] + resume['vol_up']
    resume['nb_client'] = len(np.unique(flow.client_id))
    resume['nb_flow'] = len(flow)
    resume['nb_client_1MB'] = len(np.unique(flows_1MB.client_id))
    resume['nb_flow_1MB'] = len(flows_1MB)
    resume['mean_flow_size'] = np.mean(flow.l3Bytes)
    resume['median_flow_size'] = np.median(flow.l3Bytes)
    resume['max_flow_size'] = np.int64(np.max(flow.l3Bytes))
    resume['mean_flow_duration'] = np.mean(flow.duration)
    resume['median_flow_duration'] = np.median(flow.duration)
    resume['max_flow_duration'] = np.max(flow.duration)
    resume['mean_flow_peak_rate'] = np.mean(80.0 * flow.peakRate)
    resume['median_flow_peak_rate'] = np.median(80.0 * flow.peakRate)
    resume['max_flow_peak_rate'] = np.max(80.0 * flow.peakRate)
    mean_rate = [8 * x['l3Bytes'] / (1000.0 * x['duration'])
                 for x in flow if x['duration'] > 0]
    resume['mean_flow_mean_rate'] = np.mean(mean_rate)
    resume['median_flow_mean_rate'] = np.median(mean_rate)
    resume['max_flow_mean_rate'] = np.max(mean_rate)
    mean_rate_1MB = [8 * x['l3Bytes'] / (1000.0 * x['duration'])
                     for x in flow
                     if x['duration'] > 0 and x['l3Bytes'] > 10**6]
    resume['mean_flow_mean_rate_1MB'] = np.mean(mean_rate_1MB)
    resume['median_flow_mean_rate_1MB'] = np.median(mean_rate_1MB)
    resume['max_flow_mean_rate_1MB'] = np.max(mean_rate_1MB)
    resume['mean_flow_AR'] = compute_AT.compute_AT(flow.initTime)[0]
    resume['mean_flow_100_AR_per_cl'] = (
        100 * resume['mean_flow_AR'] / resume['nb_client'])
    return resume
def evaluateQuery(query, metadataDict):
    for stmnt_unformated in sqlparse.parse(query):
        statement = sqlparse.parse(sqlparse.format(str(stmnt_unformated)))[0]
        query_tokens = []
        for x in statement.tokens:
            if re.match('([\s]+)', str(x)):
                continue
            else:
                query_tokens.append(str(x))
        #print query_tokens

        distinct_flag = 0
        distinct_flag2 = 0
        if str(query_tokens[1]).lower() == "distinct":
            distinct_flag = 1
        elif "distinct(" in query:
            distinct_flag2 = 1
        #print distinct_flag2
        colNames = query_tokens[1 + distinct_flag].split(",")
        #print colNames
        tableNames = query_tokens[3 + distinct_flag].split(",")
        #print tableNames

        # Error handling
        error_handling(query, colNames, tableNames)

        # Check for an aggregate function
        func = ["min", "max", "count", "sum", "avg"]
        if any(x in query for x in func):
            aggregate(colNames[0], tableNames[0])
            return

        # Read table data from file (cross product of all listed tables)
        temp_table_data = []
        table_data = []
        cross = []
        for t in tableNames:
            f = open(t + ".csv", 'r')
            temp_table_data = [line.replace('"', '').strip() for line in f]
            if len(table_data) == 0:
                table_data = temp_table_data
            else:
                for y in temp_table_data:
                    for z in table_data:
                        cross.append(z + "," + y)
                table_data = cross
                cross = []
        #print table_data

        # Check for a WHERE condition
        index = 4 + distinct_flag
        if len(query_tokens) > index:
            whereCond = query_tokens[index][6:]
            #print whereCond
            table_data = whereEvaluate(whereCond, tableNames, table_data)

        # Projection
        table_data = project(colNames, tableNames, table_data)
        if distinct_flag == 1 or distinct_flag2 == 1:
            table_data = [table_data[0], distinct(table_data[1])]
        # for x in table_data:
        #     print table_data

        # Print output
        print "Output:"
        header = ""
        flag = 0
        for i in table_data[0]:
            if flag == 0:
                header += str(i)
                flag = 1
            else:
                header = header + "," + str(i)
        print header
        for x in table_data[1]:
            flag = 0
            valstr = ""
            if isinstance(x, list):
                for y in x:
                    #print y
                    if flag == 0:
                        valstr = valstr + str(y)
                        flag = 1
                    else:
                        valstr = valstr + "," + str(y)
                #print valstr
            else:
                if flag == 0:
                    valstr = valstr + str(x)
                    flag = 1
                else:
                    valstr = valstr + "," + str(x)
            print valstr
def modify_and_fetch_data_named(resume, flows, name):
    "Extend the summary dictionary 'resume' with data from a GVB array under the given name."
    vol_dir = aggregate.aggregate(flows, 'direction', 'l3Bytes', sum)
    resume['vol_up_%s' % name] = vol_dir[0][1]
    resume['vol_down_%s' % name] = vol_dir[1][1]
    resume['vol_tot_%s' % name] = (resume['vol_down_%s' % name]
                                   + resume['vol_up_%s' % name])
    vol_dscp = aggregate.aggregate(flows, 'dscp', 'l3Bytes', sum)
    resume['vol_down_web_%s' % name] = extract_aggregated_field(
        vol_dscp, 'dscp', INDEX_VALUES.DSCP_WEB)
    resume['vol_down_http_stream_%s' % name] = extract_aggregated_field(
        vol_dscp, 'dscp', INDEX_VALUES.DSCP_HTTP_STREAM)
    resume['vol_down_other_stream_%s' % name] = extract_aggregated_field(
        vol_dscp, 'dscp', INDEX_VALUES.DSCP_OTHER_STREAM)
    flows_down = flows.compress(flows.direction == INDEX_VALUES.DOWN)
    resume['nb_down_flows_tot_%s' % name] = np.shape(flows_down)[0]
    flows_down_web = flows_down.compress(flows_down.dscp == INDEX_VALUES.DSCP_WEB)
    flows_down_other_stream = flows_down.compress(
        flows_down.dscp == INDEX_VALUES.DSCP_OTHER_STREAM)
    flows_down_http_stream = flows_down.compress(
        flows_down.dscp == INDEX_VALUES.DSCP_HTTP_STREAM)
    resume['nb_down_flows_web_%s' % name] = np.shape(flows_down_web)[0]
    resume['nb_down_flows_http_stream_%s' % name] = np.shape(flows_down_http_stream)[0]
    resume['nb_down_flows_other_stream_%s' % name] = np.shape(flows_down_other_stream)[0]
    resume['vol_down_per_flow_tot_%s' % name] = (
        resume['vol_down_%s' % name] / resume['nb_down_flows_tot_%s' % name])
    resume['vol_down_per_flow_web_%s' % name] = (
        resume['vol_down_web_%s' % name] / resume['nb_down_flows_web_%s' % name])
    resume['vol_down_per_flow_http_stream_%s' % name] = (
        resume['vol_down_http_stream_%s' % name]
        / resume['nb_down_flows_http_stream_%s' % name])
    resume['vol_down_per_flow_other_stream_%s' % name] = (
        resume['vol_down_other_stream_%s' % name]
        / resume['nb_down_flows_other_stream_%s' % name])
    resume['nb_clients_tot_%s' % name] = np.shape(np.unique(flows_down.client_id))[0]
    resume['nb_clients_web_%s' % name] = np.shape(np.unique(flows_down_web.client_id))[0]
    resume['nb_clients_http_stream_%s' % name] = np.shape(
        np.unique(flows_down_http_stream.client_id))[0]
    resume['nb_clients_other_stream_%s' % name] = np.shape(
        np.unique(flows_down_other_stream.client_id))[0]
    resume['vol_down_per_client_tot_%s' % name] = (
        resume['vol_down_%s' % name] / resume['nb_clients_tot_%s' % name])
    resume['vol_down_per_client_web_%s' % name] = (
        resume['vol_down_web_%s' % name] / resume['nb_clients_web_%s' % name])
    resume['vol_down_per_client_http_stream_%s' % name] = (
        resume['vol_down_http_stream_%s' % name]
        / resume['nb_clients_http_stream_%s' % name])
    resume['vol_down_per_client_other_stream_%s' % name] = (
        resume['vol_down_other_stream_%s' % name]
        / resume['nb_clients_other_stream_%s' % name])
    # Flows larger than 1 MB
    flows_down_1MB = flows_down.compress(flows_down.l3Bytes > 10**6)
    flows_down_1MB_dscp = aggregate.aggregate(flows_down_1MB, 'dscp', 'l3Bytes', len)
    flows_down_1MB_web = flows_down_1MB.compress(
        flows_down_1MB.dscp == INDEX_VALUES.DSCP_WEB)
    flows_down_1MB_http_stream = flows_down_1MB.compress(
        flows_down_1MB.dscp == INDEX_VALUES.DSCP_HTTP_STREAM)
    flows_down_1MB_other_stream = flows_down_1MB.compress(
        flows_down_1MB.dscp == INDEX_VALUES.DSCP_OTHER_STREAM)
    resume['nb_clients_1MB_tot_%s' % name] = np.shape(
        np.unique(flows_down_1MB.client_id))[0]
    resume['nb_clients_1MB_web_%s' % name] = np.shape(
        np.unique(flows_down_1MB_web.client_id))[0]
    resume['nb_clients_1MB_http_stream_%s' % name] = np.shape(
        np.unique(flows_down_1MB_http_stream.client_id))[0]
    resume['nb_clients_1MB_other_stream_%s' % name] = np.shape(
        np.unique(flows_down_1MB_other_stream.client_id))[0]
    resume['nb_down_flows_1MB_tot_%s' % name] = np.shape(flows_down_1MB)[0]
    resume['nb_down_flows_1MB_web_%s' % name] = extract_aggregated_field(
        flows_down_1MB_dscp, 'dscp', INDEX_VALUES.DSCP_WEB)
    resume['nb_down_flows_1MB_http_stream_%s' % name] = extract_aggregated_field(
        flows_down_1MB_dscp, 'dscp', INDEX_VALUES.DSCP_HTTP_STREAM)
    resume['nb_down_flows_1MB_other_stream_%s' % name] = extract_aggregated_field(
        flows_down_1MB_dscp, 'dscp', INDEX_VALUES.DSCP_OTHER_STREAM)
    return resume
assessor = pd.read_sql(
    'select * from aux.assessor_summary b join aux.addresses a using(address)',
    engine)
acs = pd.read_sql('select geo_id2 as census_tract_id, * from aux.acs', engine)
wt = pd.read_sql('select * from aux.ward_tracts', engine)

for level in ['tracts', 'wards']:
    if level == 'wards':
        acs_level = wt.merge(acs, on='census_tract_id', how='left')
        index = 'ward_id'
        weight = acs_level['area']
    else:
        acs_level = acs
        index = 'census_tract_id'
        weight = None

    acs_ag = a.aggregate(acs_level, columns.acs, weight, index)
    buildings_ag = a.aggregate(buildings, columns.building, index=index)
    assessor_ag = a.aggregate(assessor, columns.assessor, index=index)

    acs_ag.columns = ['acs_' + c for c in acs_ag.columns]
    assessor_ag.columns = ['assessor_' + c for c in assessor_ag.columns]
    buildings_ag.columns = ['buildings_' + c for c in buildings_ag.columns]

    ag = acs_ag.join(assessor_ag, how='outer')
    ag = ag.join(buildings_ag, how='outer')

    # to_sql uses the wrong datatype when writing the index as such;
    # a dtype can be specified with pandas 0.15.2
    ag.reset_index(inplace=True)
    ag.to_sql(level, engine, if_exists='replace', schema='output', index=False)
def fetch_data_general(in_flow, filtered=False):
    "Return a summary of interesting flow characteristics."
    # ('nb_down_flows_1MB%s', 'Nb', '.4g'),
    # ('avg_vol_down_per_flow%s', 'Bytes', '.4g'),
    # ('avg_vol_down_per_client%s', 'Bytes', '.4g')]
    #new_flows = {}
    resume = {}
    for app in ('', '_WEB', '_HTTP_STREAM', '_OTHER_STREAM'):
        if app == '':
            new_flow = in_flow  #['data%s' % app]
        else:
            dscp = get_dscp(app, in_flow, filtered=filtered)
            new_flow = in_flow.compress(in_flow['dscp'] == dscp)  #['data%s' % app]
        resume['App: %s' % app] = ''
        new_flow = new_flow.view(np.recarray)
        new_flow_down = new_flow.compress(new_flow.direction == INDEX_VALUES.DOWN)
        new_flow_1MB = new_flow.compress(new_flow.l3Bytes > 10**6)
        new_flow_down_1MB = new_flow_down.compress(new_flow_down.l3Bytes > 10**6)
        vol_dir = aggregate.aggregate(new_flow, 'direction', 'l3Bytes', sum)
        # resume['vol_up%s' % app] = vol_dir[0][1]
        # resume['vol_down%s' % app] = vol_dir[1][1]
        try:
            resume['vol_up%s' % app] = vol_u = vol_dir[0][1]
        except IndexError:
            resume['vol_up%s' % app] = vol_u = float(0)
        try:
            resume['vol_down%s' % app] = vol_d = vol_dir[1][1]
        except IndexError:
            resume['vol_down%s' % app] = vol_d = float(0)
        resume['total_vol%s' % app] = vol_u + vol_d
        resume['nb_flows_down%s' % app] = nb_fl = len(new_flow_down)
        resume['nb_flows_down_1MB%s' % app] = nb_fl_1mb = len(new_flow_down_1MB)
        resume['ratio_nb_flows%s' % app] = (
            int(100 * nb_fl_1mb / float(nb_fl))
            if (resume['nb_flows_down%s' % app] != 0) else 0)
        resume['nb_client_down%s' % app] = nb_cl = len(np.unique(
            new_flow_down.client_id))
        resume['avg_vol_down_per_client%s' % app] = (vol_d / nb_cl
                                                     if (nb_cl != 0) else 0)
        resume['avg_vol_down_per_flow%s' % app] = (vol_d / nb_fl
                                                   if (nb_fl != 0) else 0)
        resume['avg_vol_up_per_client%s' % app] = (vol_u / nb_cl
                                                   if (nb_cl != 0) else 0)
        resume['avg_vol_up_per_flow%s' % app] = (vol_u / nb_fl
                                                 if (nb_fl != 0) else 0)
        resume['nb_client_down_1MB%s' % app] = len(np.unique(
            new_flow_down_1MB.client_id))
        resume['avg_nb_flows_per_client%s' % app] = (nb_fl / float(nb_cl)
                                                     if nb_cl != 0 else 0)
        resume['avg_nb_flows_1MB_per_client%s' % app] = (nb_fl_1mb / float(nb_cl)
                                                         if nb_cl != 0 else 0)
    # Further per-flow statistics kept for reference:
    # resume['nb_flow'] = len(flow)
    # resume['nb_client_1MB'] = len(np.unique(flows_1MB.client_id))
    # resume['nb_flow_1MB'] = len(flows_1MB)
    # resume['mean_flow_size'] = np.mean(flow.l3Bytes)
    # resume['median_flow_size'] = np.median(flow.l3Bytes)
    # resume['max_flow_size'] = np.int64(np.max(flow.l3Bytes))
    # resume['mean_flow_duration'] = np.mean(flow.duration)
    # resume['median_flow_duration'] = np.median(flow.duration)
    # resume['max_flow_duration'] = np.max(flow.duration)
    # resume['mean_flow_peak_rate'] = np.mean(80.0 * flow.peakRate)
    # resume['median_flow_peak_rate'] = np.median(80.0 * flow.peakRate)
    # resume['max_flow_peak_rate'] = np.max(80.0 * flow.peakRate)
    # mean_rate = [8*x['l3Bytes']/(1000.0*x['duration'])
    #              for x in flow if x['duration'] > 0]
    # resume['mean_flow_mean_rate'] = np.mean(mean_rate)
    # resume['median_flow_mean_rate'] = np.median(mean_rate)
    # resume['max_flow_mean_rate'] = np.max(mean_rate)
    # mean_rate_1MB = [8*x['l3Bytes']/(1000.0*x['duration'])
    #                  for x in flow if x['duration'] > 0
    #                  and x['l3Bytes'] > 10**6]
    # resume['mean_flow_mean_rate_1MB'] = np.mean(mean_rate_1MB)
    # resume['median_flow_mean_rate_1MB'] = np.median(mean_rate_1MB)
    # resume['max_flow_mean_rate_1MB'] = np.max(mean_rate_1MB)
    # resume['mean_flow_AR'] = \
    #     compute_AT.compute_AT(flow.initTime)[0]
    # resume['mean_flow_100_AR_per_cl'] = \
    #     100 * resume['mean_flow_AR'] / resume['nb_client']
    return resume
import sys

from aggregate import aggregate

bucket_size = 1
line_filter = None
if len(sys.argv) > 1:
    bucket_size = int(sys.argv[1])
if len(sys.argv) > 2:
    line_filter = sys.argv[2]

res = aggregate(sys.stdin, bucket_size, line_filter)
for datetime, count in res:
    print(datetime.strftime("%s"), count, sep="\t")
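# A hypothetical way to drive the script above from another Python process.
# The script filename, the log file, and the filter string are assumptions for
# illustration; the line format expected by aggregate() is not shown in this
# excerpt.
import subprocess

with open('events.log') as logfile:  # assumed newline-delimited timestamped log
    out = subprocess.run(['python', 'aggregate_counts.py', '60', 'ERROR'],
                         stdin=logfile, capture_output=True, text=True)
print(out.stdout)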
def strollr2d_imagedenoising(data, param):
    """
    Entry point for STROLLR-2D image denoising.

    Args:
        data: A dict containing the image data with two fields:
            - noisy: an (a, b) gray-scale image matrix to denoise.
            - oracle (optional): an (a, b) gray-scale matrix used as ground
              truth for PSNR computation.
        param: A dict of parameters for the algorithm.
    """
    try:
        noisy = data['noisy']
        oracle = data.get('oracle', None)
        sig = param['sig']
        dim = param['dim']
        # Kronecker product: dct(np.eye(dim), axis=0) is the DCT of order dim,
        # so W is the 2D separable DCT acting on vectorized dim x dim patches.
        W = np.kron(dct(np.eye(dim), axis=0, norm='ortho'),
                    dct(np.eye(dim), axis=0, norm='ortho'))
        threshold = param['TLthr0'] * sig
        param['threshold'] = threshold
        thr = param['thr0'] * sig
        param['thr'] = thr
        print('[+] Parameters loaded')

        noisy, param = image_enlarge_tl(noisy, param)
        print('[+] Image enlarged for TL only')

        patchNoisy = image_patch(noisy, dim)
        print('[+] Image patch done')

        patches = patchNoisy  # patchNoisy is a 2D numpy array
        numTensorPatch = patchNoisy.shape[1]
        param['numTensorPatch'] = numTensorPatch

        W, sparseCode, nonZeroTable = tl_approximation(patches, W, param)
        print('[+] Module TL approx done')

        nonZeroTable[nonZeroTable == 0] = param['zeroWeight']
        TLsparsityWeight = np.divide(1, nonZeroTable)

        blk_arr, _, blk_pSize = bm_fix(patches, param)
        print('[+] Module BM fix done')
        blk_arr = np.asarray(blk_arr)
        blk_pSize = np.asarray(blk_pSize)

        LRpatch, LRweights, LRrankWeight = lr_approximation(patches, blk_arr,
                                                            blk_pSize, param)
        print('[+] Module LR approx done')

        nonZerosLR = LRweights > 0
        LRrankWeight[nonZerosLR] = np.divide(LRrankWeight[nonZerosLR],
                                             LRweights[nonZerosLR])

        patchRecon = f1_reconstruction(sparseCode, W, LRpatch, LRweights,
                                       patches, param, TLsparsityWeight,
                                       LRrankWeight)
        print('[+] Module F1 reconstruction done')

        Xr = aggregate(patchRecon, TLsparsityWeight, param)
        plt.imshow(Xr, cmap='gray', vmin=0, vmax=255)
        plt.show()

        psnrXr = PSNR(Xr - oracle)
        print('[+] PSNR value is: {}'.format(psnrXr))
        return Xr, psnrXr
    except KeyError as e:
        print('The parameters provided to strollr2d are not valid: {}'.format(e))
        sys.exit(1)
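# A hypothetical invocation of strollr2d_imagedenoising. The parameter keys
# below match the ones read inside the function, but the numeric values are
# illustrative only, and the helper modules (tl_approximation, bm_fix,
# lr_approximation, ...) may expect additional keys not visible here.
import numpy as np

clean = np.tile(np.linspace(0, 255, 256), (256, 1))   # synthetic gradient test image
noisy = clean + 20.0 * np.random.randn(*clean.shape)  # additive Gaussian noise, sigma = 20
param = {'sig': 20.0, 'dim': 8, 'TLthr0': 1.9, 'thr0': 2.7, 'zeroWeight': 1e-4}
denoised, psnr = strollr2d_imagedenoising({'noisy': noisy, 'oracle': clean}, param)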
import sys

import matplotlib.pyplot as plt

from aggregate import aggregate

bucket_size = 1
if len(sys.argv) > 1:
    bucket_size = int(sys.argv[1])

res = aggregate(sys.stdin, bucket_size)
plt.plot(*zip(*res))
plt.show()
def execution(result):
    # Parse the start and end dates
    result = result.split('~')
    date_begin = result[0]
    date_end = result[-1]
    date_end = datetime.datetime.strptime(date_end, "%Y-%m-%d")

    # Build the list of all selected dates
    day_range = []
    date = datetime.datetime.strptime(date_begin, "%Y-%m-%d")
    while date <= date_end:
        day_range.append(date.strftime("%Y-%m-%d"))
        date = date + datetime.timedelta(days=1)

    # Download the raw data, one request per week
    date = datetime.datetime.strptime(date_begin, "%Y-%m-%d")
    web.update(date.strftime("%Y-%m-%d"))
    while (date + datetime.timedelta(days=7)) <= date_end:
        date = date + datetime.timedelta(days=7)
        web.update(date.strftime("%Y-%m-%d"))

    # Delete files outside the selected date range
    path = '../Download/'
    word = 'BusLocation'
    Bus_dir = web.search_dir(path, word)
    for i in Bus_dir:
        web.del_file(i, day_range)
    word = 'Session'
    Session_dir = web.search_dir(path, word)
    for i in Session_dir:
        web.del_file(i, day_range)

    # Service recognition
    word = 'BusLocation'
    Bus_dir = web.search_dir(path, word)
    for filedir_bus in Bus_dir:
        #### Modify folderpath ####
        aggregate.aggregate(filedir_bus)

    # Ridership analysis
    database = "ridership"
    user = "******"
    password = "******"
    host = "localhost"
    port = "5432"
    ridership.ridership(database, user, password, host, port)

    # Round-trip time
    folderpath = '../output'
    database = "RTT"
    extractMobilityInfo.run_RTT(folderpath)
    upload.RTT_upload(database, user, password, host, port)

    # Inter-arrival times
    database = "inter_arrival"
    mobilityInterval.run_interarrival(folderpath)
    upload.inter_arrival_upload(database, user, password, host, port)

    web.clean()
    print('COMPLETED!')
    return
"""This module pulls data from each of the sources and generates the aggregate projections for the week.""" import sys import aggregate from scrapers import dailyfantasynerd, espn, nfl, numberfire, rotogrinders if __name__ == "__main__": year = sys.argv[1] week = sys.argv[2] nfl.scrape(week, year) espn.scrape(week, year) numberfire.scrape(week, year) rotogrinders.scrape(week, year) dailyfantasynerd.scrape(week, year) aggregate.aggregate(week, year)
def vol_per_client(data, as_list=None, as_excluded=None, on_list=False,
                   field='l3Bytes', func=sum, output_path='rapport/client_ok',
                   title='', prefix=''  # ,
                   # trace_list = ('ADSL_2008', 'FTTH_2008', 'ADSL_nov_2009',
                   #               'FTTH_nov_2009', 'ADSL_dec_2009',
                   #               'FTTH_dec_2009')
                   ):
    """Plot volumes per client according to an AS match list: use * for all ASes.

    The 'on_list' flag works only on as_list (included ASes); as_list elements
    are filters and names: see the examples.

    Use as:
    data = tools.load_hdf5_data.load_h5_file('hdf5/lzf_data.h5')
    tools.plot_per_client.vol_per_client(data)
    tools.plot_per_client.vol_per_client(data,
        ('*', tools.INDEX_VALUES.AS_YOUTUBE))
    tools.plot_per_client.vol_per_client(data,
        as_excluded=tools.INDEX_VALUES.AS_YOUTUBE
        + tools.INDEX_VALUES.AS_YOUTUBE_EU,
        title='Other Streams', prefix='OTHER_')
    tools.plot_per_client.vol_per_client(data_streaming,
        as_list=((tools.INDEX_VALUES.AS_YOUTUBE, 'YOUTUBE'),
                 (tools.INDEX_VALUES.AS_YOUTUBE_EU, 'YOUTUBE_EU')),
        title='YT and YT-EU Streams', prefix='YT_YT_EU_', on_list=True,
        output_path='rapport/client_ok')
    tools.plot_per_client.vol_per_client(data,
        as_list=((tools.INDEX_VALUES.AS_YOUTUBE, 'YOUTUBE'),
                 (tools.INDEX_VALUES.AS_YOUTUBE_EU, 'YOUTUBE_EU'),
                 (tools.INDEX_VALUES.AS_GOOGLE, 'GOOGLE')),
        title='YT and GOO Streams', prefix='YT_GOO', on_list=True,
        output_path='rapport/client_ok')
    """
    client_vol = {}
    # data collection
    args = []
    # TODO: AS list
    for trace in sorted([x for x in data.keys() if '_GVB' in x]):
        print 'process trace: ', trace
        filtered_data_dict = defaultdict(dict)
        if on_list:
            filtered_data_dict[trace] = filter_array_list(data, trace,
                                                          as_list, as_excluded)
        else:
            filtered_data_dict[trace][trace] = filter_array(data[trace], 'asBGP',
                                                            as_list, as_excluded)
        for name in sorted(filtered_data_dict[trace]):
            filtered_data = filtered_data_dict[trace][name]
            # require at least MIN_NB_FLOWS flows per data set to plot
            if len(filtered_data) < MIN_NB_FLOWS:
                continue
            client_vol[name] = aggregate.aggregate(filtered_data, 'client_id',
                                                   field, func)
            # construct plot args
            if as_list:
                title_name = format_as_title(name)
            else:
                title_name = format_title(name).rstrip(' GVB')
            args.append((title_name, client_vol[name]['aggregation']))
            # plot individual repartitions
            pylab.clf()
            cdfplot.repartplotdata(client_vol[name]['aggregation'],
                                   _title='%s Volume per Client for %s'
                                   % (title, trace),
                                   _ylabel='Percentage of Downstream Volume',
                                   _loc=0)
            cdfplot.setgraph_loglog()
            pylab.savefig(output_path + '/%s%s_repart_volume_per_client.pdf'
                          % (prefix, trace))
    # plot CDF
    pylab.clf()
    cdfplot.cdfplotdataN(args, _title='%s Volume per Client' % title,
                         _xlabel='Downstream Volume in Bytes', _loc=0)
    pylab.savefig(output_path + '/%sCDF_volume_per_client.pdf' % prefix)
    # plot global repartition
    pylab.clf()
    cdfplot.repartplotdataN(args, _title='%s Volume per Client' % title,
                            _ylabel='Percentage of Downstream Volume', _loc=0)
    pylab.savefig(output_path + '/%srepart_volume_per_client.pdf' % prefix)