Example #1
def prep_data_tests(data_tests,
                    values,
                    index,
                    columns,
                    reset_index=None,
                    prefix='Unknown_',
                    impute_grouping=['Wafer_ID'],
                    cont_func=['median'],
                    missing=True,
                    Threshold=20):
    data_pivot_tests = pivot_data(data_tests, values, index, columns,
                                  reset_index, prefix)
    data_tests_pivot_imputed, columns_drop, wafers_drop = drop_missing_values(
        data_pivot_tests, Threshold)
    data_tests_pivot_imputed = replace_missing_values(
        data_tests_pivot_imputed, impute_grouping,
        data_tests_pivot_imputed.columns, missing)
    data_tests_wafer_medians = stats(data_tests_pivot_imputed, ['Wafer_ID'],
                                     cont_func, '_wafer_')
    data_tests_fablot_medians = stats(data_tests_pivot_imputed,
                                      ['FabLot', 'Wafer_ID'], cont_func,
                                      '_fablot_')
    data_tests_stats = pd.merge(data_tests_wafer_medians.reset_index(),
                                data_tests_fablot_medians.reset_index(),
                                on=['Wafer_ID'],
                                how='inner').set_index(['FabLot', 'Wafer_ID'])
    data_tests.to_csv('data_model_tests.csv', index=True)
    data_tests_stats.to_csv('data_model_tests_stats.csv', index=True)
    return data_tests_pivot_imputed, columns_drop, wafers_drop, data_tests_stats
Example #2
def main():

    #initializing

    #stop loss positions: the maximum number of long positions we can hold
    #without this constraint, we would go long an indefinite number of times
    #as long as the market condition keeps triggering the signal
    #in a whipsaw market, that is suicidal
    stls = 3
    ticker = 'NVDA'
    stdate = '2015-04-01'
    eddate = '2018-02-15'

    #slicer is used for plotting
    #a three year dataset with 750 data points would be too much
    slicer = 700

    #downloading data
    df = yf.download(ticker, start=stdate, end=eddate)

    trading_signals = signal_generation(df, heikin_ashi, stls)

    viz = trading_signals[slicer:]
    plot(viz, ticker)

    portfolio_details = portfolio(viz)
    profit(portfolio_details)

    stats(portfolio_details, trading_signals, stdate, eddate)
Example #3
def analyzeTweets(keyword, option):
    # once in analyze mode, determine which analysis / visualization to do
    df = readSQL(keyword)
    df['datetime'] = pd.to_datetime(df['datetime'])
    n = len(df)

    if option == 'stats':
        stats(df, stripHash(keyword))
        return
    elif option == 'interval':
        interval(df, keyword, n)
        return
    elif option == 'line':
        line(df, keyword, n)
        return
    elif option == 'dist':
        dist(df, keyword, n)
        return
    elif option == 'scatter':
        scatter(df, keyword, n)
        return
    elif option == 'pie':
        pie(keyword)
        return
    elif option == 'map':
        sentMap(df, keyword)
        return
Example #4
def eval_bias_variance_learner_cont(env, agent, sup, T, num_samples=1):
    s = env.reset()
    biases = []
    variances = []

    for i in range(num_samples):
        bias, variance, t = 0, 0, 0
        while t < T:
            a = agent.sample_action(s)  # \E_D(\pi^D_\theta(s))
            a_sup = sup.intended_action(s)  # \pi^*(s)
            # For variance, at each state sample actions from a random model and compare
            # to that from expected model
            a_ensemble_list = agent.intended_actions(s)
            ensemble_idx = np.random.randint(len(a_ensemble_list))
            a_ensemble = a_ensemble_list[ensemble_idx]  # \pi^D_\theta(s)

            # Need to evaluate bias/variance on learner's dist
            next_s, r, done, _ = env.step(a)
            s = next_s
            bias += np.sum((a - a_sup)**2)
            variance += np.sum((a - a_ensemble)**2)
            t += 1

            if done == True:
                break

        bias /= float(t)
        variance /= float(t)
        biases.append(bias)
        variances.append(variance)

    return stats(biases), stats(variances)
Example #5
def strstats(v):
    if v.dtype == float:
        return "mean %f std %f min %f 10th %f 50th %f 90th %f max %f" % stats(v)
    elif v.dtype == int:
        return "mean %d std %d min %d 10th %d 50th %d 90th %d max %d" % stats(v)
    else:
        raise Exception("weird dtype %s"% v.dtype)
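
A minimal sketch of a stats() helper compatible with the format strings above, assuming it yields seven values (mean, std, min, 10th/50th/90th percentile, max); an illustration, not the project's own implementation:

import numpy as np

def stats(v):
    # seven values matching the "mean ... std ... min ... 10th ... 50th ... 90th ... max" slots
    return (v.mean(), v.std(), v.min(),
            np.percentile(v, 10), np.percentile(v, 50), np.percentile(v, 90),
            v.max())

print(strstats(np.random.rand(1000)))  # exercises the float branch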
Example #6
def main():

    #    parse_ftp()

    pop = 'uganda'
    coverage = '4x'
    dn = 'DNAse-seq'
    dn = sys.argv[-1]
    print(
        pop,
        coverage,
        dn,
    )

    l_vcfs = glob.glob('../pipeline/%s%s/out_UnifiedGenotyper/*.vcf' % (
        pop,
        coverage,
    ))
    l_vcfs_sorted = sort_nicely(l_vcfs)
    l_vcfs_sorted = [
        '../pipeline/uganda4x/out_UnifiedGenotyper/UnifiedGenotyper.2.2.vcf'
    ]
    ####    l_vcfs_sorted = l_vcfs_sorted[:10]
    sep = '/'
    #    l_vcfs_exome = glob.glob('/nfs/t149_1kg/phase1_v3/ALL.chr*.exome.1000GApr12.vcf')
    #    l_vcfs_exome_sorted = sort_nicely(l_vcfs_exome)
    #    l_vcfs = glob.glob('/nfs/t149_1kg/phase1_v3/ALL.chr*.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz')
    #    l_vcfs_sorted = sort_nicely(l_vcfs)
    #    sep = '|'

    ##    gnuplot.contour_plot(
    ##        path_dat='INDEL_%s.dat' %(dn),
    ##        bool_remove=False,
    ##        xlabel='dist_m_i_n (1000bp)',ylabel='INDEL length',
    ##        bool_log=False,
    ##        x1=0,x2=50,y1=1,y2=20,
    ##        )
    ##    stop

    d_lengths = loop_UG_out(l_vcfs_sorted, dn, sep)

    plot_contour(dn, d_lengths)

    s = ''
    for k1 in d_lengths.keys():
        for k2, v in d_lengths[k1].items():
            s += '%s %s %s\n' % (k1, k2, v)
    fd = open('%s.dict' % dn, 'w')
    fd.write(s)
    fd.close()

    stats(
        d_lengths,
        dn,
    )

    ########    plot_length_distribution(pop,coverage,d_lengths,)

    return
Example #7
    def print_entry(key, value):
        def stats(s, units=""):
            conf = "{:0.1f}({:0.2f}%)".format(s['ci']['abs'], s['ci']['perc'])
            return "{:8.1f}{} +/- {:15s}".format(s['average'], units, conf)

        print "{:>50s}  {}  {}".format(key,
                                       stats(value['time_stat'], units="ms"),
                                       stats(value['count_stat']))
Example #8
 def print_entry(key, value):
   def stats(s, units=""):
     conf = "{:0.1f}({:0.2f}%)".format(s['ci']['abs'], s['ci']['perc'])
     return "{:8.1f}{} +/- {:15s}".format(s['average'], units, conf)
   print "{:>50s}  {}  {}".format(
     key,
     stats(value['time_stat'], units="ms"),
     stats(value['count_stat'])
   )
Example #9
def printrestart(restart):
    ''' print quick summary info about the current state of the sampler. '''
    print "restart info: "
    print " current shape of chain: (nwalkers x niterations x ndim) ", np.shape(
        restart['chain'])
    print " autocorrelation lengths for each parameter: ", restart['acor']
    stats(restart['acor'])
    print " acceptance rate for each walker: ", restart['accept']
    stats(restart['accept'])
Example #10
def probsplots(allprobs, fn, chain, burnin=0):
    import matplotlib
    matplotlib.use('agg')
    import matplotlib.pyplot as plt
    nwalker = np.shape(allprobs)[0]
    iters = np.shape(allprobs)[1]
    ndim = np.shape(chain)[2]

    # plot the trace of the probabilities for every walker.
    fig, ax = plt.subplots()
    for walker in range(nwalker):
        ax.plot(allprobs[walker, burnin:], alpha=.3, ls='--')
    plt.savefig(fn + '_probs.png')
    plt.close(fig)
    print "saved " + fn + '_probs.png'

    fig, ax = plt.subplots()
    kss = []
    kssthetas = np.zeros((ndim, iters - 1))
    # chain ~ walker x iter x dim
    for iter in range(iters - 1):
        kss.append(ksprob(allprobs[:, iter], allprobs[:, iters - 1]))
        for k in range(ndim):
            kssthetas[k, iter] = ksprob(chain[:, iter, k], chain[:, iters - 1,
                                                                 k])
    ax.scatter(range(iters - 1)[-25:], kss[-25:], s=30, marker='+')
    colors = [
        'k', 'r', 'green', 'orange', 'lightblue', 'grey', 'purple', 'pink',
        'yellow', 'blue', 'lightgreen', 'darkgreen'
    ] * 5
    for k in range(ndim):
        ax.plot(range(iters - 1)[-25:], kssthetas[k, -25:], color=colors[k])
    ax.set_xlabel('iter')
    ax.set_ylabel('ks prob vs last iteration')
    ax.set_ylim(-0.01, 1.02)
    plt.savefig(fn + '_ks.png')
    plt.close(fig)

    changes = np.zeros(nwalker)
    for walker in range(nwalker):
        for iter in range(iters - 1):
            if allprobs[walker, iter] != allprobs[walker, iter + 1]:
                changes[walker] += 1.0
    changes = changes / float(iters - 1.0)
    print "long-term acceptance fraction stats: "
    stats(changes)

    acor = np.zeros(nwalker)
    for walker in range(nwalker):
        acor[walker] = emcee.autocorr.integrated_time(
            allprobs[walker, burnin:], window=min([50, iters / 2]))
    print "acor stats: "
    stats(acor)
Example #11
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = ArgumentParser('Report mean and standard deviation for the stream of numbers read from stdin',
                            formatter_class=ArgumentDefaultsHelpFormatter)
    args = parser.parse_args(argv[1:])

    stats = Stats()
    for line in sys.stdin:
        stats(float(line))
    print(stats)
Example #12
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = ArgumentParser(
        'Report mean and standard deviation for the stream of numbers read from stdin',
        formatter_class=ArgumentDefaultsHelpFormatter)
    args = parser.parse_args(argv[1:])

    stats = Stats()
    for line in sys.stdin:
        stats(float(line))
    print(stats)
Example #13
def analyze_results(config, test_method, answer_path, list_of_req,
                    quality_bonus):

    full_data, accepted_sessions = data_cleaning(answer_path, test_method)
    n_workers = number_of_uniqe_workers(full_data)
    print(f"{n_workers} workers participated in this batch.")
    stats(answer_path)
    # votes_per_file, votes_per_condition = transform(accepted_sessions)
    if len(accepted_sessions) > 1:
        print(
            "Transforming data (the ones with 'accepted_and_use' == 1) --> group per clip"
        )
        use_condition_level = config.has_option('general', 'condition_pattern')
        votes_per_file, vote_per_condition = transform(
            test_method, accepted_sessions,
            config.has_option('general', 'condition_pattern'))
        votes_per_file_path = os.path.splitext(
            answer_path)[0] + '_votes_per_clip.csv'
        votes_per_cond_path = os.path.splitext(
            answer_path)[0] + '_votes_per_cond.csv'

        condition_keys = []
        if config.has_option('general', 'condition_pattern'):
            condition_keys = config['general']['condition_keys'].split(',')
            condition_keys.append('Unknown')
        headers = create_headers_for_per_file_report(test_method,
                                                     condition_keys)
        write_dict_as_csv(votes_per_file, votes_per_file_path, headers=headers)
        print(f'   Votes per file are saved in: {votes_per_file_path}')
        if use_condition_level:
            write_dict_as_csv(vote_per_condition, votes_per_cond_path)
            print(f'   Votes per condition are saved in: {votes_per_cond_path}')

        bonus_file = os.path.splitext(
            answer_path)[0] + '_quantity_bonus_report.csv'
        quantity_bonus_df = calc_quantity_bonuses(full_data, list_of_req,
                                                  bonus_file)

        if quality_bonus:
            quality_bonus_path = os.path.splitext(
                answer_path)[0] + '_quality_bonus_report.csv'
            if 'all' not in list_of_req:
                quantity_bonus_df = calc_quantity_bonuses(
                    full_data, ['all'], None)
            if use_condition_level:
                votes_to_use = vote_per_condition
            else:
                votes_to_use = votes_per_file
            calc_quality_bonuses(quantity_bonus_df, accepted_sessions,
                                 votes_to_use, config, quality_bonus_path,
                                 n_workers, test_method, use_condition_level)
Example #14
def writer(information, outfile, features, end_structure):

    dist = stats([int(item[1]) for item in information])
    length_stem = stats([item[2] for item in information])
    terminal_structure = stats([item[3] for item in information])

    ordered_ends = sorted(end_structure.items(),
                          key=operator.itemgetter(1),
                          reverse=True)

    with open(outfile, 'w') as outp:
        outp.write(
            "distance of nearest Structure (discounting base pairing in 5' terminal 50 nt, due to dubious nature of this interaction)\n"
        )
        outp.write("mean,min,max,median,std_dev\n")
        for item in dist[0:-2]:
            outp.write(str(item) + ",")
        outp.write(str(dist[-1]))
        outp.write("\n")
        outp.write("\n")

        outp.write(
            "length of base-pairing for nearest structure (ignores one nucleotide mismatch)\n"
        )
        outp.write("mean,min,max,median,std_dev\n")
        for item in length_stem[0:-2]:
            outp.write(str(item) + ",")
        outp.write(str(length_stem[-1]))
        outp.write("\n")
        outp.write("\n")

        outp.write(
            "how structured is the 3' terminal 50 nt (percent base paired,will discount base pairing in 5' terminal 50 nt, due to dubious nature of this interaction)\n"
        )
        outp.write("mean,min,max,median,std_dev\n")
        for item in terminal_structure[0:-2]:
            outp.write(str(item) + ",")
        outp.write(str(terminal_structure[-1]))
        outp.write("\n")
        outp.write("\n")

        q = 1
        outp.write(
            "3' terminal 50 nt ordered as a function of structure (top = most structured, bottom = least structured )\n"
        )
        outp.write("rank,percent_stranded,coordinate,gene_feature\n")
        for item in ordered_ends:
            outp.write(
                str(q) + ',' + str(item[1]) + ',' + str(item[0]) + ',' +
                str(features[item[0]]) + "\n")
            q += 1
Example #15
    def get_lda_individual(self, topic_num, alpha, eta, text):
        """get individual lda model"""

        s = stats(360)
        post = s.get_data_by_day()

        nonOrg = post.loc[post['label'] == 1]
        lyrics = post.loc[post['label'] == 2]

        topic = LDATopicModel()
        c = Count_Vect()
        
        # change here to define lyrics or quote
        if text == 'lyrics':
            text = lyrics[['text', 'userid']]
            text['text'] = text['text'].apply(lambda x: c.remove_noise(str(x)))
            text = c.get_precocessed_text(text)
            text['text'] = text['text'].apply(lambda x: x.split())

            dictionary = gensim.corpora.Dictionary(text['text'])# generate dictionary
            bow_corpus = [dictionary.doc2bow(doc) for doc in text['text']]
            model, coherence = topic.get_lda_score_eval(dictionary, bow_corpus, topic_num, alpha, eta)

        elif text == 'quotes':
            text = nonOrg[['text', 'userid']]
            text['text'] = text['text'].apply(lambda x: c.remove_noise(str(x)))
            text = c.get_precocessed_text(text)
            text['text'] = text['text'].apply(lambda x: x.split())

            dictionary = gensim.corpora.Dictionary(text['text'])# generate dictionary
            bow_corpus = [dictionary.doc2bow(doc) for doc in text['text']]
            model, coherence = topic.get_lda_score_eval(dictionary, bow_corpus, topic_num, alpha, eta)
        
        # model evaluation 
        return model, coherence
Example #16
File: ENCODE.py Project: grbot/tc9
def main():

#    parse_ftp()

    pop = 'uganda'
    coverage = '4x'
    dn = 'DNAse-seq'
    dn = sys.argv[-1]
    print(pop,coverage,dn,)

    l_vcfs = glob.glob('../pipeline/%s%s/out_UnifiedGenotyper/*.vcf' %(pop,coverage,))
    l_vcfs_sorted = sort_nicely(l_vcfs)
    l_vcfs_sorted = ['../pipeline/uganda4x/out_UnifiedGenotyper/UnifiedGenotyper.2.2.vcf']
####    l_vcfs_sorted = l_vcfs_sorted[:10]
    sep = '/'
#    l_vcfs_exome = glob.glob('/nfs/t149_1kg/phase1_v3/ALL.chr*.exome.1000GApr12.vcf')
#    l_vcfs_exome_sorted = sort_nicely(l_vcfs_exome)
#    l_vcfs = glob.glob('/nfs/t149_1kg/phase1_v3/ALL.chr*.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz')
#    l_vcfs_sorted = sort_nicely(l_vcfs)
#    sep = '|'

##    gnuplot.contour_plot(
##        path_dat='INDEL_%s.dat' %(dn),
##        bool_remove=False,
##        xlabel='dist_m_i_n (1000bp)',ylabel='INDEL length',
##        bool_log=False,
##        x1=0,x2=50,y1=1,y2=20,
##        )
##    stop

    d_lengths = loop_UG_out(l_vcfs_sorted,dn,sep)

    plot_contour(dn,d_lengths)

    s = ''
    for k1 in d_lengths.keys():
        for k2,v in d_lengths[k1].items():
            s += '%s %s %s\n' %(k1,k2,v)
    fd = open('%s.dict' %dn,'w')
    fd.write(s)
    fd.close()

    stats(d_lengths,dn,)

########    plot_length_distribution(pop,coverage,d_lengths,)

    return
Example #17
  def regression(self, pre_var, data):
      s = stats(self.days) 
      #data = s.get_count_quote()
 
      X = data[pre_var]
      y = data["cesd_sum"]
      model = sm.OLS(y, X).fit()
      print(model.summary())
Example #18
def answer(series, field, p, q, r=0, file="test.png"):
    df1 = series[field]
    plot_autocorrelations(df1)

    # %%
    df1_model = create_model(df1, p, q, r)
    stats(df1_model)
    # %%
    analise_model(df1_model)
    # %%

    plot_autocorrelations(df1_model.resid, "Residuals")
    # %%
    extract_resid_stats(df1_model)
    # %%
    plot_predictions(df1, df1_model, file)

    return df1_model
Example #19
def eval_sim_err_statistics_cont(env, sup, T, num_samples=1):
    losses = []
    for i in range(num_samples):
        tmp_states, int_actions, taken_actions, _ = collect_traj(env, sup, T)
        int_actions = np.array(int_actions)
        taken_actions = np.array(taken_actions)
        errors = (int_actions - taken_actions)**2.0
        errors = np.sum(errors, axis=1)
        losses.append(np.mean(errors))
    return stats(losses)
Example #20
def mmm():

    mmmArray = (re.split(',', modmedian.get()))
    mmmdesired_array = [int(numeric_string) for numeric_string in mmmArray]

    df = pd.DataFrame({
        " rating1": mmmdesired_array,
        "dummy": range(len(mmmdesired_array))
    })

    f, (ax_box,
        ax_hist) = plt.subplots(2,
                                sharex=True,
                                gridspec_kw={"height_ratios": (0.2, 1)})
    mean = df[' rating1'].mean()
    median = df[' rating1'].median()
    mode = df[' rating1'].mode().get_values()[0]

    sns.boxplot(df[" rating1"], ax=ax_box)
    ax_box.axvline(mean, color='r', linestyle='--')
    ax_box.axvline(median, color='g', linestyle='-')
    ax_box.axvline(mode, color='b', linestyle='-')

    sns.distplot(df[" rating1"], ax=ax_hist)
    ax_hist.axvline(mean, color='r', linestyle='--')
    ax_hist.axvline(median, color='g', linestyle='-')
    ax_hist.axvline(mode, color='b', linestyle='-')

    plt.legend({'Mean': mean, 'Median': median, 'Mode': mode})
    label = 'Mode=' + str(mode) + ', Median=' + str(median) + ', Mean=' + str(
        mean)

    ax_box.set(xlabel=label)
    if (stats(mmmdesired_array) == 0.0):
        ax_hist.set(xlabel="Symmetric (Zero Skewness)")
    elif (stats(mmmdesired_array) > 1):
        ax_hist.set(xlabel="Skewed to the Right (Positive Skewness)")

    else:
        ax_hist.set(xlabel="Skewed to the left (Negative Skewness)")

    plt.show()
Example #21
def F(env, pi1, pi2, sup, T, num_samples=1):
    losses = []
    for i in range(num_samples):
        # collect trajectory with states visited and actions taken by agent
        tmp_states, _, _, _ = collect_traj(env, pi1, T)
        tmp_actions = np.array([pi2.intended_action(s) for s in tmp_states])
        sup_actions = np.array([sup.intended_action(s) for s in tmp_states])
        errors = 1.0 - np.mean(sup_actions == tmp_actions)
        # compute the mean error on that trajectory (may not be T samples since game ends early on failures)
        losses.append(np.mean(errors))

    # compute the mean and sem on averaged losses.
    return stats(losses)
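
The evaluators in these examples all funnel per-trajectory losses through stats(); a plausible sketch, assuming it reports the mean and the standard error of the mean over the samples:

import numpy as np

def stats(losses):
    # mean and SEM of the averaged per-trajectory losses (assumed behaviour)
    losses = np.asarray(losses, dtype=float)
    sem = losses.std(ddof=1) / np.sqrt(len(losses)) if len(losses) > 1 else 0.0
    return losses.mean(), sem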
Example #22
def main():
    file = sys.argv[1]
    id = sys.argv[2]

    df = pd.read_csv(file)

    if id == 'average':
        df = df.drop(['student'], 1)
        df = df.reindex(sorted(df.columns), axis=1)
        dates = list(df)
        means = df[dates].mean()
        df = pd.DataFrame(means).transpose()
        stats(df)

    else:
        id = int(id)
        frame = df.loc[df['student'] == id]
        if frame.empty:
            print("Unknown student ID")
            return
        frame = frame.drop(['student'], 1)
        stats(frame)
Example #23
def eval_agent_statistics_disc(env, agent, sup, T, num_samples=1):
    """
        evaluate in the given environment along the agent's distribution
        for T timesteps on num_samples
    """
    losses = []
    for i in range(num_samples):
        tmp_states, _, tmp_actions, _ = collect_traj(env, agent, T)
        sup_actions = np.array([sup.intended_action(s) for s in tmp_states])
        errors = (-(sup_actions == tmp_actions)).astype(int)
        losses.append(np.mean(errors))

    return stats(losses)
Example #24
def eval_sup_statistics_disc(env, agent, sup, T, num_samples=1):
    """
        Evaluate on the supervisor's trajectory in the given env
        for T timesteps
    """
    losses = []
    for i in range(num_samples):
        tmp_states, _, _, _ = collect_traj(env, sup, T)
        tmp_actions = np.array([agent.intended_action(s) for s in tmp_states])
        sup_actions = np.array([sup.intended_action(s) for s in tmp_states])
        errors = (-(sup_actions == tmp_actions)).astype(int)
        losses.append(np.mean(errors))

    return stats(losses)
Example #25
    def get_lda(self):
        """get lda topics"""

        s = stats(180)
        post = s.get_data_by_day()
        nonOrg = post.loc[post['label'] == 1]
        lyrics = nonOrg.loc[nonOrg['tag'] == 2]

        topic = LDATopicModel()
        c = Count_Vect()

        text = lyrics[['text', 'userid']]
        text['text'] = text['text'].apply(lambda x: c.remove_noise(str(x)))
        text = c.get_precocessed_text(text)
        topics, model = topic.get_lda_score(text, 10)
        return topics, model
Example #26
def eval_agent_statistics_discrete(env, lnr, sup, T, num_samples=1):
    """
        evaluate loss in the given environment along the agent's distribution
        for T timesteps on num_samples
    """
    losses = []
    for i in range(num_samples):
        # collect trajectory with states visited and actions taken by agent
        tmp_states, _, tmp_actions, _ = collect_traj(env, lnr, T)
        sup_actions = np.array([sup.intended_action(s) for s in tmp_states])
        errors = 1.0 - np.mean(sup_actions == tmp_actions)
        # compute the mean error on that trajectory (may not be T samples since game ends early on failures)
        losses.append(np.mean(errors))

    # compute the mean and sem on averaged losses.
    return stats(losses)
Example #27
def hierarchical(pause=False, **kwargs):
    """Binary networks with overlapping nodes and hierarchies

    This program is an implementation of the algorithm described in
    the paper 'Directed, weighted and overlapping benchmark graphs for
    community detection algorithms', written by Andrea Lancichinetti
    and Santo Fortunato. In particular, this program is to produce
    binary networks with overlapping nodes and hierarchies.

    -N              [number of nodes]
    -k              [average degree]
    -maxk           [maximum degree]
    -t1             [minus exponent for the degree sequence]
    -t2             [minus exponent for the community size distribution]
    -minc           [minimum for the micro community sizes]
    -maxc           [maximum for the micro community sizes]
    -on             [number of overlapping nodes]
    -om             [number of memberships of the overlapping nodes]
    -minC           [minimum for the macro community size]
    -maxC           [maximum for the macro community size]
    -mu1            [mixing parameter for the macro communities (see Readme file)]
    -mu2            [mixing parameter for the micro communities (see Readme file)]

    Example2:
    ./hbenchmark -f flags.dat
    ./hbenchmark -N 10000 -k 20 -maxk 50 -mu2 0.3 -minc 20 -maxc 50 -minC 100 -maxC 1000 -mu1 0.1
    """
    prog = _get_file('lfr_benchmarks/new/hierarchical_bench2_2/hbenchmark')
    args = [prog] + makeargs(kwargs)
    print "Arguments are: ", " ".join(args)

    with tmpdir_context(chdir=True, prefix="tmp-lfrbenchmark", dir=tmpbase):
        retcode = subprocess.call(args)
        assert retcode == 0

        g = networkx.Graph()
        for n, c in read_file('community_first_level.dat'):
            g.add_node(n - 1, microC=c - 1)
        for n, c in read_file('community_second_level.dat'):
            g.add_node(n - 1, macroC=c - 1)
        for n1, n2 in read_file('network.dat'):
            g.add_edge(n1 - 1, n2 - 1)
        g.graph['stats'] = stats(g)
        if pause:
            import fitz.interact
            fitz.interact.interact()
    return g
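
A hedged usage sketch; the keyword names simply mirror the command-line flags listed in the docstring, on the assumption that makeargs() turns them into "-N 1000 -k 10 ..." arguments:

g = hierarchical(N=1000, k=10, maxk=30, mu1=0.1, mu2=0.3,
                 minc=20, maxc=50, minC=100, maxC=500)
print(g.number_of_nodes(), g.graph['stats'])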
Example #28
File: lfr.py Project: rkdarst/pcd
def hierarchical(pause=False, **kwargs):
    """Binary networks with overlapping nodes and hierarchies

    This program is an implementation of the algorithm described in
    the paper 'Directed, weighted and overlapping benchmark graphs for
    community detection algorithms', written by Andrea Lancichinetti
    and Santo Fortunato. In particular, this program is to produce
    binary networks with overlapping nodes and hierarchies.

    -N              [number of nodes]
    -k              [average degree]
    -maxk           [maximum degree]
    -t1             [minus exponent for the degree sequence]
    -t2             [minus exponent for the community size distribution]
    -minc           [minimum for the micro community sizes]
    -maxc           [maximum for the micro community sizes]
    -on             [number of overlapping nodes]
    -om             [number of memberships of the overlapping nodes]
    -minC           [minimum for the macro community size]
    -maxC           [maximum for the macro community size]
    -mu1            [mixing parameter for the macro communities (see Readme file)]
    -mu2            [mixing parameter for the micro communities (see Readme file)]

    Example2:
    ./hbenchmark -f flags.dat
    ./hbenchmark -N 10000 -k 20 -maxk 50 -mu2 0.3 -minc 20 -maxc 50 -minC 100 -maxC 1000 -mu1 0.1
    """
    prog = _get_file('lfr_benchmarks/new/hierarchical_bench2_2/hbenchmark')
    args = [ prog ] + makeargs(kwargs)
    print "Arguments are: ", " ".join(args)

    with tmpdir_context(chdir=True, prefix="tmp-lfrbenchmark", dir=tmpbase):
        retcode = subprocess.call(args)
        assert retcode == 0

        g = networkx.Graph()
        for n, c in read_file('community_first_level.dat'):
            g.add_node(n-1, microC=c-1)
        for n, c in read_file('community_second_level.dat'):
            g.add_node(n-1, macroC=c-1)
        for n1, n2 in read_file('network.dat'):
            g.add_edge(n1-1, n2-1)
        g.graph['stats'] = stats(g)
        if pause:
            import fitz.interact ; fitz.interact.interact()
    return g
Example #29
def toHTML(logDirName):
    logDir = os.getcwd() + '/' + 'table' + '-' + logDirName
    out = open(logDirName + '.html', 'w')
    out.write('<html><body>')

    prefs = []

    for root, dirs, files in os.walk(logDir):
        for name in files:
            if re.compile(".*-.*.out$").match(name):
                prefs.append(name.split('-')[0])
    prefs = set(prefs)

    out.write('<table border=1>')
    out.write(
        "<tr><td>Experiment</td><td>xml</td><td>log</td><td>status</td><td>errors</td><td>warnings</td><td>yikes</td><td>previous</td></tr>\n"
    )
    for p in prefs:

        for root, dirs, files in os.walk(logDir):
            for name in files:
                if re.compile(p + "-.*.out$").match(name):
                    print name
                    out.write('<tr>')
                    out.write('<td><b>%s - %s</b></td>' %
                              (p, name.split('-')[1]))
                    outFile = ("%s/%s" % (root, name))
                    out.write('<td><a href=\"%s\">%s</a></td>' %
                              (outFile, outFile.split('/')[-1]))
                    out.write('<td><a href=\"%s/%s\">log</a></td>' %
                              (root, name.replace('.out', '.log')))
                    out.write(' %s ' % stats(root + '/' + name))
                    i = 0
                    out.write('<td>')

                    while '%s.%d' % (name, i) in files:
                        out.write(' :: <a href=\"%s/%s.%d\">%d</a>' %
                                  (root, name, i, i))
                        i = i + 1
                    out.write('</td>')
                    out.write('</tr>\n')

    out.write('</table>')
    out.write('<br>\n')
    out.write('</body></html>\n\n\n')
    out.close()
Example #30
def get_elo(results):
    """
    "results" is an array of length 2*n+1 with aggregated frequencies
    for n games."""
    results = LLRcalc.regularize(results)
    games, mu, var = stats(results)
    stdev = math.sqrt(var)

    # 95% confidence interval for mu
    mu_min = mu + Phi_inv(0.025) * stdev / math.sqrt(games)
    mu_max = mu + Phi_inv(0.975) * stdev / math.sqrt(games)

    el = elo(mu)
    elo95 = (elo(mu_max) - elo(mu_min)) / 2.0
    los = Phi((mu - 0.5) / (stdev / math.sqrt(games)))

    return el, elo95, los
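
A self-contained sketch of the confidence-interval arithmetic above, with made-up numbers standing in for stats(results) and scipy's normal quantile/CDF standing in for Phi_inv/Phi:

import math
from scipy.stats import norm

games, mu, var = 2000, 0.52, 0.24  # hypothetical output of stats(results)
stdev = math.sqrt(var)

mu_min = mu + norm.ppf(0.025) * stdev / math.sqrt(games)
mu_max = mu + norm.ppf(0.975) * stdev / math.sqrt(games)
los = norm.cdf((mu - 0.5) / (stdev / math.sqrt(games)))  # likelihood of superiority
print(mu_min, mu_max, los)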
Example #31
def computeClippedImageStats(im, low=3, high=3, ignore=None):
    import collections

    im = im[~(np.isnan(im) | np.isinf(im))]
    if ignore is not None:
        for i in ignore:
            im = im[im != i]
    tmp = im
    if low != 0 and high != 0 and tmp.min() != tmp.max():
        _, low, upp = scipy.stats.sigmaclip(tmp, low=low, high=high)
        if not np.isnan(low) and not np.isnan(upp) and low != upp:
            tmp = im[(im > low) & (im < upp)]
    mean1 = np.nanmean(tmp)
    sig1 = np.nanstd(tmp)

    stats = collections.namedtuple('stats', 'mean stdev min max')
    return stats(mean=mean1, stdev=sig1, min=np.nanmin(im), max=np.nanmax(im))
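
A small usage sketch (it assumes numpy and scipy are imported at module level, as the function body requires):

import numpy as np
import scipy.stats

im = np.random.normal(100.0, 5.0, size=(64, 64))
im[0, 0] = np.nan  # a bad pixel, removed by the isnan/isinf mask
im[1, 1] = 1e6     # an outlier for the sigma clip to reject
st = computeClippedImageStats(im, low=3, high=3, ignore=[0])
print(st.mean, st.stdev, st.min, st.max)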
Example #32
def F2(env, pi1, pi2, sup, T, num_samples=1):
    losses = []
    for i in range(num_samples):
        # collect trajectory with states visited and actions taken by agent
        tmp_states, _, _, _ = collect_traj(env, pi1, T)
        tmp_actions = np.array([pi2.intended_action(s) for s in tmp_states])
        tmp_scores = np.array([pi2.decision_function(s) for s in tmp_states])
        sup_actions = np.array([sup.intended_action(s) for s in tmp_states])
        n = len(sup_actions)

        hinge = hinge_loss(sup_actions, tmp_scores)
        penalty = pi2.est.alpha * .5 * np.square(np.linalg.norm(pi2.est.coef_))
        print("hinge: " + str(hinge))
        print("penalty: " + str(penalty))
        errors = hinge / n + penalty

        # compute the mean error on that trajectory (may not be T samples since game ends early on failures)
        losses.append(np.mean(errors))

    # compute the mean and sem on averaged losses.
    return stats(losses)
Example #33
def eval_sup_statistics_cont(env, agent, sup, T, num_samples=1):
    """
        Evaluate loss on the supervisor's trajectory in the given env
        for T timesteps
    """
    losses = []
    for i in range(num_samples):
        # collect states made by the supervisor (actions are sampled so not collected)
        tmp_states, _, _, _ = collect_traj(env, sup, T)

        # get intended actions from the agent and supervisor
        tmp_actions = np.array([agent.intended_action(s) for s in tmp_states])
        sup_actions = np.array([sup.intended_action(s) for s in tmp_states])
        errors = (sup_actions - tmp_actions)**2.0

        # compute the mean error on that traj
        errors = np.sum(errors, axis=1)
        losses.append(np.mean(errors))

    # generate statistics, same as above
    return stats(losses)
Example #34
def rows(infile, outfile, minstep, maxstep):
  out = open(outfile,'w')
  data = np.loadtxt(infile,skiprows=1)
  for x in range(1,data.shape[1]):
    out.write(stats(infile,x,minstep,maxstep) + ' ' + repr(x) + '-' + repr(x+1) + '\n')
Example #35
p5 = portfolio_return(data, marketcapweights)[pd.datetime(1994, 1, 1):]

drag1 = p3 - p1
drag2 = p4 - p5


def stats(x):
    ann_mean = x.mean() * 12
    ann_std = x.std() * (12**.5)
    geo_mean = ann_mean - (ann_std**2) / 2.0
    sharpe = geo_mean / ann_std

    return (ann_mean, ann_std, geo_mean, sharpe)


print stats(p1)
print stats(p3)
print stats(p4)
print stats(p5)

toplot = pd.concat([p1, p3, p4, p5], axis=1)
toplot.columns = ["Optimised", "Handcraft", "Equal", "Market Cap"]
toplot.cumsum().plot()

show()

p1.cumsum().plot(color="black", ls="solid")
p3.cumsum().plot(color="gray", ls="solid")
p4.cumsum().plot(color="black", ls="dashed")
p5.cumsum().plot(color="gray", ls="dashed")
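
For a quick check of the annualisation in stats() above, a sketch with synthetic monthly returns (the 12 and sqrt(12) factors assume monthly data, as the portfolio series appear to be):

import numpy as np
import pandas as pd

monthly = pd.Series(np.random.normal(0.005, 0.02, 120))  # ten hypothetical years
ann_mean, ann_std, geo_mean, sharpe = stats(monthly)
print("mean %.3f  vol %.3f  geo %.3f  Sharpe %.2f" % (ann_mean, ann_std, geo_mean, sharpe))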
Example #36
    meso_array = scipy.array(meso_list)
    thermo_array = scipy.array(thermo_list)
    meso_mean = scipy.mean(meso_array)
    meso_std = scipy.std(meso_array)
    thermo_mean = scipy.mean(thermo_array)
    thermo_std = scipy.std(thermo_array)
    p_val = scipy.stats.ttest_ind(thermo_array, meso_array)[1]
    s = '\nMm: ' + str(meso_mean) + '\nSTDm: ' + str(
        meso_std) + '\nMt: ' + str(thermo_mean) + '\nSTDt: ' + str(
            thermo_std) + '\nP: ' + str(p_val)
    return s


######get the stats and write to outfile
ofile = open('results5A-10k-P5050-thesisnorm-wnq.txt', 'w')
nip_stats = stats(meso_nips, thermo_nips)
s1 = 'Normalized N_ip: ' + nip_stats + '\n'
ofile.write(s1)
ratio_stats = stats(meso_ratios, thermo_ratios)
s2 = '\nRatio rep atr: ' + ratio_stats + '\n'
ofile.write(s2)
iso_stats = stats(meso_isos, thermo_isos)
s3 = '\nFraction isolated: ' + iso_stats + '\n'
ofile.write(s3)
branched_stats = stats(meso_branches, thermo_branches)
s4 = '\nFraction branched: ' + branched_stats + '\n'
ofile.write(s4)
meso_nip_data = str(meso_nips)
thermo_nip_data = str(thermo_nips)
meso_ratio_data = str(meso_ratios)
thermo_ratio_data = str(thermo_ratios)
Example #37
        sum = s,
        lines = l,
        mean = m,
        variance = v,
        std_dev = std_dev,
        min = min_value,
        max = max_value,
        median = median_value,
        confidence = confidence,
        low_limit = ci[0],
        high_limit = ci[1]);

    return result    

if  __name__ == "__main__":
    parse_args()

    result = stats(sys.stdin, field, delimiter, skip, confidence, navalue)

    print_st("Field", result.field)
    print_st("Lines", result.lines)
    print_st("Mean", result.mean)
    print_st("Variance", result.variance)
    print_st("StdDev", result.std_dev)
    print_st("Sum", result.sum)
    print_st("Min", result.min)
    print_st("Max", result.max)
    print_st("Median", result.median)
    print_st("Confidence", result.confidence)
    print_st("Cnf.Itv.L", result.low_limit)
    print_st("Cnf.Itv.U", result.high_limit)
Example #38
    #plt.show()



def stats(binding_data, proximity_data):
    n_bins = 50
    hist_b, bins_b = np.histogram(binding_data, bins=np.linspace(np.min(binding_data), np.max(binding_data), n_bins))
    hist_p, bins_p = np.histogram(proximity_data, bins=np.linspace(np.min(binding_data), np.max(binding_data), n_bins))
    #print hist_b, hist_p
    #print "Binding data> mean:%s, median:%s, std:%s" %(np.mean(binding_data), np.median(binding_data), np.std(binding_data))
    #print "Proximity data> mean:%s, median:%s, std:%s" %(np.mean(proximity_data), np.median(proximity_data), np.std(proximity_data))
    #print scipy.stats.spearmanr(hist_b, hist_p)
    #print bins_b, bins_p
    print ks_2samp(hist_b, hist_b)
    print ks_2samp(hist_b, hist_p)



mob_name = 'crystals_mob.txt'
aln_name = 'crystals_aln.csv'
#mob_name = 'models_mob.txt'
#aln_name = 'models_aln.csv'


mob_dict, aln_dict = get_data(mob_name, aln_name)
data = combine_data(mob_dict, aln_dict)
binding_data, proximity_data = mobility_stats(data)
print len(binding_data), len(proximity_data)
#plots(binding_data, proximity_data)
stats(binding_data, proximity_data)
Example #39
    L = L.ravel()

    # obtain the indices to sort and unsort the flattened array
    i_sort = np.argsort(L)[::-1]
    i_unsort = np.argsort(i_sort)

    L_cumsum = L[i_sort].cumsum()
    L_cumsum /= L_cumsum[-1]
    
    xbins = 0.5 * (xbins[1:] + xbins[:-1])
    ybins = 0.5 * (ybins[1:] + ybins[:-1])

    return xbins, ybins, L_cumsum[i_unsort].reshape(shape)
    

Msin_median,Msin_std,Msin_range =stats(Msin_samples)
print('Msini med,std,range',Msin_median/(1.898*10**(30)),Msin_std,Msin_range/(1.898*10**(30)))
Period_median,Period_std,Period_range=stats(Period_samples)
print('Period med,std,range',Period_median,Period_std,Period_range)
Tzero_median,Tzero_std,Period_range=stats(Tzero_samples)
print('Tzero med,std,range',Tzero_median,Tzero_std,Period_range)

#Msini_pdf=mlab.normpdf(Msin_samples,Msin_median,Msin_std)
#Period_pdf=mlab.normpdf(Period_samples,Period_median,Period_std)
#Tzero_pdf=mlab.normpdf(Tzero_samples,Tzero_median,Tzero_std)

Msin,Period,sig_MP=compute_sigma_level(Msin_samples, Period_samples)
Msin,Tzero,sig_MT=compute_sigma_level(Msin_samples, Tzero_samples)
Tzero,Period,sig_TP=compute_sigma_level(Tzero_samples, Period_samples)

Abest=77.47156475290217
Example #40
        data.append(float(item))

    s = sum(data)
    l = len(data)
    m = mean(data, s)
    v = variance(data, m)
    min_value = 0 if len(data) == 0 else min(data)
    max_value = 0 if len(data) == 0 else max(data)
    std_dev = v ** 0.5
    median_value = median(data)
    ci = mean_confidence_interval(data, confidence)

    return (field, l, m, v, std_dev, s, min_value, max_value, median_value, ci)

if  __name__ == "__main__":
    parse_args()

    field, l, m, v, std_dev, s, min_value, max_value, median_value, ci = stats(sys.stdin, field, delimiter, skip, confidence, navalue)

    print_st("Field", field)
    print_st("Lines", l)
    print_st("Mean", m)
    print_st("Variance", v)
    print_st("StdDev", std_dev)
    print_st("Sum", s)
    print_st("Min", min_value)
    print_st("Max", max_value)
    print_st("Median", median_value)
    print_st("Confidence", confidence)
    print_st("Cnf.Itv.L", ci[0])
    print_st("Cnf.Itv.U", ci[1])
Example #41
File: MassRich.py Project: nkern/C4
	root = '/nfs/christoq_ls/nkern/C4/MassRich/TRUTH'

	# Load Data
	truth = fits.open(data_root+'/c4_cluster_truth_revH100_rev5.fits')[1].data
	truth['m200mean']*=1e10
	truth['m200crit']*=1e10

	Rich = truth['n10virm19']
	Mass = truth['m200crit']

	rich_low,rich_high,rich_step = 3.3,4.8,0.3
	richbins = np.arange(rich_low,rich_high,rich_step)
	mass_low,mass_high,mass_step = 14.25,15.15,0.2
	massbins = np.arange(mass_low,mass_high,mass_step)

	d = stats(Rich,Mass,rich_low,rich_high,rich_step,log10=False,fitline=True)
	globals().update(d)

	plot = True
	if plot == True:
		fig,ax = mp.subplots(1,2,figsize=(13,7))
		## ax[0] is MassRich
		p1, = ax[0].plot(X,Y,'ko',alpha=.2)
		ax[0].set_xlim(X.min()-.1,X.max()+.1)
		ax[0].set_ylim(Y.min()-.1,Y.max()+.5)
		ax[0].set_xlabel('log( N200 )',fontsize=15)
		ax[0].set_ylabel('log( Mass )',fontsize=15)
		ax[0].fill_between(richbins,Y.min()-.1,Y.max()+.5,alpha=.1)
		ax[0].plot(X,X*model.coef_+model.intercept_,'r')

		## ax[1] is Scatter
Example #42
def summaryStats(df, filterColumns, returnColumns, regress=None, debug=False):
    if regress == None:
        regressionColumns = [None]
    else:
        regressionColumns = [None] + regress
    groupby={}
    data = df
    key = 'ALL'
    val = 'ALL'

    statColumns = ['return',
                    'total', 'win_ct', 'lose_ct', 'win_ratio', 'lose_ratio',
                    'return_med', 'return_avg', 'return_stddev', 'return_min', 'return_max']

    if regress != None:
        statColumns.reverse()
        statColumns.append('variable')
        statColumns.reverse()
        statColumns.append('slope')
        statColumns.append('intercept')
        statColumns.append('r')
        statColumns.append('r_low')
        statColumns.append('r_high')
        statColumns.append('2_tail_prob')
        statColumns.append('std_err')

    statColumns.reverse()
    statColumns.append('columnValue')
    statColumns.append('columnKey')
    statColumns.reverse()

    rows = []
    
    for returns in returnColumns:
        returnsData = data.dropna(subset=[returns])
        for var in regressionColumns:
            if var != None and len(returnsData)>0:
                if debug:
                    print key, val, var, returns
                regressionData = returnsData.dropna(subset=[var])
                tmpFilters = regressionData[var]
            else:
                regressionData = returnsData
                tmpFilters = []
                    
            tmpReturns = regressionData[returns]
            results = stats(var, tmpFilters, tmpReturns)
                   
            row = {'columnKey': key, 'columnValue': val, 'variable': var, 'return': returns}
            row.update(results)
                      
            rows.append(row)

    for col in filterColumns:
        g = data[col].unique()
        groupby[col] = filter( None, [v if pd.notnull(v) else None for v in g]) 
    
    keys = groupby.keys() 
    
    for key in keys:
        for val in groupby[key]:
            filteredData = data[data[key] == val]
            for returns in returnColumns:
                returnsData = filteredData.dropna(subset=[returns])
                for var in regressionColumns:
                    if var != None:
                        regressionData = returnsData.dropna(subset=[var])
                        tmpFilters = regressionData[var]
                    else:
                        regressionData = returnsData
                        tmpFilters = []
    
                    tmpReturns = regressionData[returns]
                    results = stats(var, tmpFilters, tmpReturns)
    
                    row = {'columnKey': key, 'columnValue': val, 'variable': var, 'return': returns}
                    row.update(results)
    
                    rows.append(row)

    data = pd.DataFrame(rows, columns=statColumns)
    try:
        data['r'] = data['r'].real
        data['r_low'] = data['r_low'].real
        data['r_high'] = data['r_high'].real
    except:
        pass
    return data