Example #1
import numpy as np
from pathlib import Path
from scipy import stats
# `p` (plotting helpers) and `cl` (the calling classifier) are project-local modules.

def anova_function(x_train, x_test, y_train, y_test, alpha, r_state, remove_nan):
    path = Path('./data/anova_{}_{}{}.npy'.format(r_state, alpha, "_no_nan" if remove_nan else ""))
    if path.is_file():
        result = np.load(path)
    else:
        result = []

        dfs_ad = x_train[y_train == 0, :]
        dfs_sq = x_train[y_train == 1, :]
        dfs_sane = x_train[y_train == 2, :]

        for column in range(x_train.shape[1]):
            F_statistic, p_value = stats.f_oneway(dfs_ad[:, column], dfs_sq[:, column], dfs_sane[:, column])
            if p_value >= alpha:  # the CpG does not differentiate between the two tumors
                result.append(0)
            else:
                result.append(1)  # the CpG differentiates between the two tumors
        result = np.asarray(result)
        np.save(path, result)

        if x_train[:, result == 1].shape[1] != 0 and x_train[:, result == 0].shape[1] != 0:
            p.plot_histogram(x_train[:, result == 1][:, 0], y_train,
                             "Anova Significant Feature Distribution", "{}_distribution_signif".format(cl.name), 3)
            p.plot_histogram(x_train[:, result == 0][:, 0], y_train,
                             "Anova Non-significant Feature Distribution", "{}_distribution_nonsign".format(cl.name),
                             3)

    x_train = x_train[:, result == 1]
    x_test = x_test[:, result == 1]

    return x_train, x_test
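
The pattern in Example #1 — one f_oneway test per column, then a boolean keep/drop mask — reduces to a few self-contained lines. A minimal sketch with synthetic data (the shapes, class counts, and alpha value are illustrative assumptions, not the original's):

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
x_train = rng.normal(size=(90, 20))   # 90 samples, 20 features (assumed shapes)
y_train = np.repeat([0, 1, 2], 30)    # three classes, as in the example
x_train[y_train == 0, :5] += 2.0      # make the first 5 features informative

alpha = 0.05
groups = [x_train[y_train == c] for c in (0, 1, 2)]
mask = np.array([
    stats.f_oneway(*(g[:, j] for g in groups)).pvalue < alpha
    for j in range(x_train.shape[1])
])
print(mask.sum(), "features kept out of", mask.size)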
Example #2
def save_histogram_plots(self, serializer, cart_coords):
    config = serializer.read_config("config.yaml", path="stats")
    for file in glob.glob(os.path.join("stats", "hist*")):
        os.remove(file)
    for k in range(len(cart_coords)):
        X = np.array([cart_coords[k][i][0] for i in range(len(cart_coords[0]))])
        Y = np.array([cart_coords[k][i][1] for i in range(len(cart_coords[0]))])
        histogram_range = [[-3.1, 3.1], [-3.1, 3.1]]
        H, xedges, yedges = get_2d_histogram(X, Y, histogram_range, bins=config['num_bins'])
        Plot.plot_histogram(H, xedges, yedges, save=self.save, path="stats", filename="hist" + str(k) + ".png")
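
get_2d_histogram above is project-local; numpy's histogram2d has the same output shape and is a plausible basis for it (an assumption, not confirmed by the source):

import numpy as np

rng = np.random.default_rng(0)
X, Y = rng.normal(size=500), rng.normal(size=500)
# H holds the bin counts; xedges/yedges are the bin boundaries
H, xedges, yedges = np.histogram2d(X, Y, bins=20, range=[[-3.1, 3.1], [-3.1, 3.1]])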
Example #3
def main():
    pregs = survey.Pregnancies()
    pregs.ReadRecords()
    lengths = pregnancy_length_list(pregs)
    hist = Hist()
    for l in lengths:
        hist.Incr(l)

    print('The skewness of the pregnancy lengths is:', Skewness(lengths))
    print('The Pearson skewness of the pregnancy lengths is:',
          PearsonSkewness(lengths))
    plot_histogram(hist, xlabel='weeks', ylabel='number of births')
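
Skewness and PearsonSkewness here come from the project's ThinkStats-style helpers; the standard formulas they correspond to are easy to state. A hedged stand-in (sample moments, population-style std):

import numpy as np

def skewness(xs):
    # third central moment over std**3
    xs = np.asarray(xs, dtype=float)
    return np.mean((xs - xs.mean()) ** 3) / xs.std() ** 3

def pearson_skewness(xs):
    # Pearson's second coefficient: 3 * (mean - median) / std
    xs = np.asarray(xs, dtype=float)
    return 3.0 * (xs.mean() - np.median(xs)) / xs.std()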
Example #4
    def analyze(self, dataset, dataset_dir):
        data = self.data_class.load_dataset(dataset, train_size=100)

        (X, Y) = (data['x_train'], data['y_train'])
        print_score.print_breakdown(X, Y)

        if self.shall_plot:
            plot_scatter(X, Y, "%s-orig" % dataset, filename=os.path.join(dataset_dir, "%s-orig.png" % dataset))
            plot_histogram(X, Y, "%s-hist" % dataset, filename=os.path.join(dataset_dir, "%s-hist.png" % dataset))

            pca = PCA()
            pca.fit(X)
            plot_PCA_variance(pca.explained_variance_ratio_ * 100, "%s-pca-#feature-vs-variance" % dataset,
                              filename=os.path.join(dataset_dir, "%s-pca-variance-ratio" % dataset))
Example #5
def visualize(out, height, width):
    outX = [i.X for i in out["samples"]]
    outY = [height - i.Y for i in out["samples"]]

    plot.plot_histogram(data=outX,
                        bins=width,
                        title="X_histogram of posterior samples")
    plot.plot_histogram(data=outY,
                        bins=height,
                        title="Y_histogram of posterior samples")
    plot.show_scatterplot(outX,
                          outY,
                          title="Scatter plot of posterior samples",
                          height=height,
                          width=width)

    outsrcX = [src.X for src in out["src"]]
    outsrcY = [height - src.Y for src in out["src"]]
    plot.plot_histogram(data=outsrcX, bins=width, title="Xsrc")
    plot.plot_histogram(data=outsrcY, bins=height, title="Ysrc")
    plot.show_scatterplot(outsrcX,
                          outsrcY,
                          title="Scatter plot of sources",
                          height=height,
                          width=width)
Example #6
import numpy as np
from pathlib import Path
# `p` (plotting helpers) and `cl` (the calling classifier) are project-local modules.

def fisher_function(x_train, x_test, y_train, y_test, r_state, best, n,
                    remove_nan):
    n = int(n)
    path = Path('./data/fisher_{}{}.npy'.format(
        r_state, "_no_nan" if remove_nan else ""))
    if path.is_file():
        result = np.load(path)
    else:
        result = []
        #numero_sq = int(sum(y_train))
        #numero_ad = y_train.shape[0] - numero_sq

        dfs_ad = x_train[y_train == 0, :]
        dfs_sq = x_train[y_train == 1, :]

        for column in range(x_train.shape[1]):
            value = (np.mean(dfs_ad[:, column]) -
                     np.mean(dfs_sq[:, column]))**2 / (
                         np.var(dfs_ad[:, column]) + np.var(dfs_sq[:, column]))
            result.append(value)
        best_feat = result.index(max(result))
        worste_feat = result.index(min(result))
        print(best_feat)
        print(worste_feat)

        result = np.asarray(result)
        np.save(path, result)

        p.plot_histogram(x_train[:, best_feat], y_train,
                         "Fisher Best Feature Distribution",
                         "{}_distribution_best".format(cl.name), 2)
        p.plot_histogram(x_train[:, worste_feat], y_train,
                         "Fisher Worst Feature Distribution",
                         "{}_distribution_worst".format(cl.name), 2)

    if best:
        result_max = result[np.argsort(result)[-n:]]
        indice = np.where(result >= min(result_max))[0]
    else:
        result_max = result[np.argsort(result)[:n]]
        indice = np.where(result <= max(result_max))[0]

    x_train = x_train[:, indice]
    x_test = x_test[:, indice]

    return x_train, x_test
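
The Fisher criterion in the loop above vectorizes to one line for a two-class problem. A standalone sketch on synthetic data (shapes and offsets are illustrative assumptions):

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=(60, 8))
y = np.repeat([0, 1], 30)
x[y == 1, 0] += 3.0  # feature 0 separates the classes

a, b = x[y == 0], x[y == 1]
fisher = (a.mean(axis=0) - b.mean(axis=0)) ** 2 / (a.var(axis=0) + b.var(axis=0))
print("best feature:", fisher.argmax(), "worst feature:", fisher.argmin())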
Example #7
def main():

    # step 1: load data
    from data_generator import load_data
    xy_train_df, forecast_dict = load_data()

    # step 2: obtain visual and statistical reports
    from plot import plot_df, plot_histogram
    plot_df(df=xy_train_df, title='history monthly data', subplots=True)
    plot_histogram(df=xy_train_df, title='history monthly histogram')

    # step 3: train model and make the forecast
    from xgboost_model import model_predict
    y_label = ['Actuals']
    x_label = list(set(xy_train_df.columns) - set(y_label))
    forecast_result = pd.DataFrame(columns=['Forecast', 'Predictive Power', 'MSE Lower', 'MSE Upper', 'Running Time'])

    for forecast_date in forecast_dict.keys():
        forecast_date_str = forecast_date.strftime('%d.%m.%Y')
        print('start forecasting on ' + forecast_date_str)
        forecast_start_time = datetime.now()
        saving_path = 'reports/model_' + forecast_date_str
        x_forecast = forecast_dict[forecast_date].iloc[[-1]][x_label]
        # default bootstrap n_iterations=1000, confidence=95
        n_iterations = 500
        forecast_value, evaluation = model_predict(data_train=xy_train_df, x_forecast=x_forecast,
                                                   saving_path=saving_path, n_iterations=n_iterations)
        time_delta = str(datetime.now() - forecast_start_time)
        forecast_result.loc[forecast_date] = [forecast_value, evaluation['Predictive Power'],
                                              evaluation['MSE CI Lower Bound'], evaluation['MSE CI Upper Bound'],
                                              time_delta]

        print('forecasting for {} is finished, took {} for {} iterations'.format(forecast_date_str,
                                                                                 time_delta, n_iterations))

    # step 4: save and visualize forecast result
    print(forecast_result)
    forecast_result.to_csv('reports/forecast_result.csv', index=True)

    plot_forecast = True
    if plot_forecast:
        from plot import vertical_plot
        vertical_plot()
Example #8
import h5py
import numpy as np
from pathlib import Path
from tqdm import tqdm
# `p` (plotting helpers) and `cl` (the calling classifier) are project-local modules.

def removeFeatures(x_train, x_test, cpg_r, chrms_pos, only_chrms_t, remove_nan,
                   y_train):
    features = None
    if remove_nan:
        path_h5 = Path("data/cpgs_nan.h5")
        if not path_h5.is_file():
            print("FILE ERROR")
            exit(0)
        with h5py.File(path_h5, 'r') as hf:
            features = np.array(hf["cpgs_nan"])

    with tqdm(total=len(cpg_r.keys())) as pbar:
        d = np.ones(0)
        for chrm in chrms_pos.keys():
            start = chrms_pos[chrm][0]
            end = chrms_pos[chrm][1]
            length = end - start
            if remove_nan:
                length -= len(features[start:end][features[start:end]])
                # end = start + length
            if len(cpg_r[chrm]) == 0:
                d = np.concatenate((d, np.zeros(length)))
            else:
                if only_chrms_t:
                    #### fix
                    d = np.concatenate((d, np.ones(length)))
                else:
                    a = np.asarray(cpg_r[chrm])
                    d = np.concatenate((d, a))
            pbar.update()

        if x_train[:, d == 1].shape[1] != 0 and x_train[:, d == 0].shape[1] != 0:
            p.plot_histogram(x_train[:, d == 1][:, 0], y_train,
                             "T-test Significant Feature Distribution",
                             "{}_distribution_signif".format(cl.name), 2)
            p.plot_histogram(x_train[:, d == 0][:, 0], y_train,
                             "T-test Non-significant Feature Distribution",
                             "{}_distribution_nonsign".format(cl.name), 2)

        x_train = x_train[:, d == 1]
        x_test = x_test[:, d == 1]
    return x_train, x_test
Example #9
def save_histogram_plots(self, serializer, cart_coords):
    config = serializer.read_config("config.yaml", path="stats")
    for file in glob.glob(os.path.join("stats", "hist*")):
        os.remove(file)
    for k in range(len(cart_coords)):
        X = np.array(
            [cart_coords[k][i][0] for i in range(len(cart_coords[0]))])
        Y = np.array(
            [cart_coords[k][i][1] for i in range(len(cart_coords[0]))])
        histogram_range = [[-3.1, 3.1], [-3.1, 3.1]]
        H, xedges, yedges = get_2d_histogram(X,
                                             Y,
                                             histogram_range,
                                             bins=config['num_bins'])
        Plot.plot_histogram(H,
                            xedges,
                            yedges,
                            save=self.save,
                            path="stats",
                            filename="hist" + str(k) + ".png")
Example #10
def visualize(out, height, width):
    outX = [i.X for i in out["samples"]]
    outY = [height - i.Y for i in out["samples"]]

    plot.plot_histogram(data=outX, bins=width, title="X_histogram of posterior samples")
    plot.plot_histogram(data=outY, bins=height, title="Y_histogram of posterior samples")
    plot.show_scatterplot(outX, outY, title="Scatter plot of posterior samples", height=height, width=width)

    outsrcX = [src.X for src in out["src"]]
    outsrcY = [height - src.Y for src in out["src"]]
    plot.plot_histogram(data=outsrcX, bins=width, title="Xsrc")
    plot.plot_histogram(data=outsrcY, bins=height, title="Ysrc")
    plot.show_scatterplot(outsrcX, outsrcY, title="Scatter plot of sources", height=height, width=width)
Example #11
def sys_error(data, pvals, d, lattice, par=0, path="./plots/", absolute=False):
    """Calculates the statistical and systematic error of an np-array of
    fit results on bootstrap samples of a quantity and the corresponding
    p-values.

    Args:
        data: A numpy array with three axes. The first axis is the bootstrap
              sample number, the second axis is the number of correlators, the
              third axis is the fit range index.
        pvals: The p-values indicating the quality of the fits.
        d: The total momentum of the reaction.
        lattice: The name of the lattice, used for the output file.
        par: which parameter to plot (second index of the data arrays)
        path: path where the plots are saved
        absolute: calculate with the absolute values of data

    Returns:
        res: The weighted median value on the original data.
        res_std: The standard deviation derived from the deviation of
              medians on the bootstrapped data.
        res_sys: The 1-sigma systematic uncertainty, given as the differences
              res - 16%-quantile and 84%-quantile - res respectively.
        data_weight: numpy array of the calculated weights for every bootstrap
              sample and fit range.
    """
    # check the depth of the list structure
    depth = lambda L: isinstance(L, list) and max(map(depth, L)) + 1
    deep = depth(data)
    if deep == 1:
        # initialize empty arrays
        data_weight = []
        res, res_std, res_sys = [], [], []
        # loop over principal correlators
        for i, p in enumerate(data):
            # append the necessary data arrays
            data_weight.append(np.zeros((p.shape[-1])))
            res.append(np.zeros(p.shape[0]))
            res_std.append(np.zeros((1, )))
            res_sys.append(np.zeros((2, )))

            # calculate the weight for the fit ranges using the standard
            # deviation and the p-values of the fit
            if absolute:
                data_std = np.std(np.fabs(p[:, par]))
            else:
                data_std = np.std(p[:, par])
            data_weight[i] = (1. - 2. * np.fabs(pvals[i][0] - 0.5) *
                              np.amin(data_std) / data_std)**2
            # draw data in histogram
            plotlabel = 'hist_%d' % i
            label = ["", "", "principal correlator"]
            if absolute:
                plt.plot_histogram(np.fabs(p[0, par]), data_weight[i], lattice,
                                   d, label, path, plotlabel)
            else:
                plt.plot_histogram(p[0, par], data_weight[i], lattice, d,
                                   label, path, plotlabel)
            # using the weights, calculate the median over all fit intervals
            # for every bootstrap sample.
            if absolute:
                for b in range(p.shape[0]):
                    res[i][b] = qlt.weighted_quantile(np.fabs(p[b, par]),
                                                      data_weight[i], 0.5)
            else:
                for b in range(p.shape[0]):
                    res[i][b] = qlt.weighted_quantile(p[b, par],
                                                      data_weight[i], 0.5)
            # the statistical error is the standard deviation of the medians
            # over the bootstrap samples.
            res_std[i] = np.std(res[i])
            # the systematic error is given by difference between the median
            # on the original data and the 16%- or 84%-quantile respectively
            if absolute:
                res_sys[i][0] = res[i][0] - qlt.weighted_quantile(
                    np.fabs(p[0, par]), data_weight[i], 0.16)
                res_sys[i][1] = qlt.weighted_quantile(np.fabs(
                    p[0, par]), data_weight[i], 0.84) - res[i][0]
            else:
                res_sys[i][0] = res[i][0] - qlt.weighted_quantile(
                    p[0, par], data_weight[i], 0.16)
                res_sys[i][1] = qlt.weighted_quantile(
                    p[0, par], data_weight[i], 0.84) - res[i][0]
            # keep only the median of the original data
            res[i] = res[i][0]
    elif deep == 2:
        # initialize empty arrays
        data_weight = []
        res, res_std, res_sys = [], [], []
        # loop over principal correlators
        for i, p in enumerate(data):
            data_weight.append([])
            res.append([])
            res_std.append([])
            res_sys.append([])
            for j, q in enumerate(p):
                # append the necessary data arrays
                data_weight[i].append(np.zeros(q.shape[-2:]))
                res[i].append(np.zeros(q.shape[0]))
                res_std[i].append(np.zeros((1, )))
                res_sys[i].append(np.zeros((2, )))

                # calculate the weight for the fit ranges using the standard
                # deviation and the p-values of the fit
                if absolute:
                    data_std = np.std(np.fabs(q[:, par]), axis=0)
                else:
                    data_std = np.std(q[:, par], axis=0)
                data_weight[i][j] = (1. - 2. * np.fabs(pvals[i][j][0] - 0.5) *
                                     np.amin(data_std) / data_std)**2
                # draw data in histogram
                plotlabel = 'hist_%d_%d' % (i, j)
                label = ["", "", "principal correlator"]
                if absolute:
                    plt.plot_histogram(
                        np.fabs(q[0, par]).ravel(), data_weight[i][j].ravel(),
                        lattice, d, label, path, plotlabel)
                else:
                    plt.plot_histogram(
                        q[0, par].ravel(), data_weight[i][j].ravel(),
                        lattice, d, label, path, plotlabel)
                # using the weights, calculate the median over all fit intervals
                # for every bootstrap sample.
                if absolute:
                    for b in range(q.shape[0]):
                        res[i][j][b] = qlt.weighted_quantile(
                            np.fabs(q[b, par]).ravel(),
                            data_weight[i][j].ravel(), 0.5)
                else:
                    for b in range(q.shape[0]):
                        res[i][j][b] = qlt.weighted_quantile(
                            q[b, par].ravel(), data_weight[i][j].ravel(), 0.5)
                # the statistical error is the standard deviation of the medians
                # over the bootstrap samples.
                res_std[i][j] = np.std(res[i][j])
                # the systematic error is given by difference between the median
                # on the original data and the 16%- or 84%-quantile respectively
                if absolute:
                    res_sys[i][j][0] = res[i][j][0] - qlt.weighted_quantile(
                        np.fabs(q[0, par]).ravel(), data_weight[i][j].ravel(),
                        0.16)
                    res_sys[i][j][1] = qlt.weighted_quantile(
                        np.fabs(q[0, par]).ravel(), data_weight[i][j].ravel(),
                        0.84) - res[i][j][0]
                else:
                    res_sys[i][j][0] = res[i][j][0] - qlt.weighted_quantile(
                        q[0, par].ravel(), data_weight[i][j].ravel(), 0.16)
                    res_sys[i][j][1] = qlt.weighted_quantile(
                        q[0, par].ravel(), data_weight[i][j].ravel(),
                        0.84) - res[i][j][0]
                # keep only the median of the original data
                res[i][j] = res[i][j][0]
    else:
        print("made for lists of depth < 3")
        os.sys.exit(-10)
    return res, res_std, res_sys, data_weight
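
The weighting scheme in sys_error rewards fit ranges whose p-value is near 0.5 and whose bootstrap spread is small. The formula in isolation, on made-up numbers:

import numpy as np

pvals = np.array([0.45, 0.90, 0.05, 0.50])     # one p-value per fit range
stds = np.array([0.010, 0.012, 0.030, 0.011])  # std over bootstrap samples
weights = (1. - 2. * np.abs(pvals - 0.5) * stds.min() / stds) ** 2
print(weights / weights.sum())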
Example #12
    plot.plot_3d(ax, x_rng, y_rng, obj_func)
    plt.savefig(fig_dir + "/3d.png", dpi=200)

    plt.figure()
    plot.plot_obj_func(x_rng, y_rng, obj_func)
    plot.plot_trajs(x_rng, y_rng, trajs, [0])
    plt.title("Trajs")
    plt.savefig(fig_dir + "/trajs.png", dpi=200)

    plt.figure()
    plot.plot_obj_func(x_rng, y_rng, obj_func)
    # plot.plot_quiver(x_rng, y_rng, trajs)
    plot.plot_endpoint_counts(trajs)
    plt.title("Endpoints")
    plt.savefig(fig_dir + "/endpoints.png", dpi=200)

    plt.figure()
    plot.plot_histogram(x_rng, y_rng, trajs, 50)
    plot.plot_endpoint_counts(trajs)
    plt.title("Endpoints")
    plt.savefig(fig_dir + "/histogram.png", dpi=200)

    # plt.show()

    plt.close('all')

    # Line search
    plt.figure()
    plot.plot_line_search(line_search_factors, traj, obj_func)
    plt.title("Line search")
    plt.savefig(fig_dir + "/line_search.png", dpi=200)
Example #13
def makemetadataplot(folderdict, totreads, readsize):
    '''
    In paired-end (PE) data, readsize is twice the single-read length.
    '''
    numtypes,replicates,datadir=folderdict['data']
    samples=max(1,numtypes)
    metadatadir=folderdict['metadata']
    genealldatadict={}
    for i in range(1,samples+1):
        exp_file=open('%s/expression_T%02dS01.txt'%(metadatadir,i))
        genedict={}
        for lntxt in exp_file:
            ln=lntxt.rstrip('\n').split('\t')
            gene=ln[0].split(':')[0]
            txsize=int(ln[3])
            txexp=float(ln[4])
            numreads=int(ln[5])
            if gene in genedict:
                genedict[gene][0].append(txsize)
                genedict[gene][1].append(numreads)
                genedict[gene][2].append(txexp)
            else:
                genedict[gene]=[[txsize],[numreads],[txexp]]
        histbins = len(genedict) // 5  # integer bin count
        genedatadict={}
        for gene in genedict:
            numtx=len(genedict[gene][0])
            entropy=common.shannon_entropy(genedict[gene][2])
            rpkm=1000000.0/totreads*sum([genedict[gene][1][k]*1.0/genedict[gene][0][k] for k in range(numtx)])
            coverage=sum([genedict[gene][1][k]*1.0/genedict[gene][0][k] for k in range(numtx)])*readsize
            genedatadict[gene]=[numtx,entropy,rpkm,coverage]
            genealldatadict[gene]=genealldatadict.get(gene,[])+[coverage]
        plot.plot_histogram([genedatadict[gene][0] for gene in genedatadict],histbins,
                            'transcripts count','# genes','Number of Transcripts','%s/plt_T%02d_transcript_count.jpg'%(metadatadir,i))
        plot.plot_histogram([genedatadict[gene][1] for gene in genedatadict if genedatadict[gene][1]>1],histbins,
                            'entropy','# genes','Entropy Distribution','%s/plt_T%02d_entropy_dist.jpg'%(metadatadir,i))
        plot.plot_histogram([genedatadict[gene][2] for gene in genedatadict],histbins,
                            'RPKM','# genes','RPKM Distribution','%s/plt_T%02d_rpkm_dist.jpg'%(metadatadir,i),1)            
        plot.plot_histogram([genedatadict[gene][3] for gene in genedatadict],histbins,
                            'coverage','# genes','Coverage distribution','%s/plt_T%02d_coverage_dist.jpg'%(metadatadir,i),1)         
    if samples==2:
        jsd_file=open('%s/expression_jsd.txt'%metadatadir)
        for lntxt in jsd_file:
            ln=lntxt.rstrip('\n').split('\t')
            gene=ln[0]; jsd=float(ln[5])
            #print gene, jsd
            genealldatadict[gene].append(jsd)
        
        #print genealldatadict
        todellist=[]
        covlist1=[];covlist2=[];jsdlist=[]; mincovlist=[]
        for gene in genealldatadict:
            #print genealldatadict[gene]
            if min(genealldatadict[gene][0],genealldatadict[gene][1])>0.5 and genealldatadict[gene][2]>0.02:
                covlist1.append(genealldatadict[gene][0])
                covlist2.append(genealldatadict[gene][1])
                mincovlist.append(min(genealldatadict[gene][0],genealldatadict[gene][1]))
                jsdlist.append(genealldatadict[gene][2])
            else:
                todellist.append(gene)
        numofgenestr='%d of %d'%(len(genedatadict)-len(todellist),len(genedatadict))
        
        imagefilename='%s/plt_jsd_coverage1.pdf'%(metadatadir)
        plot.plotscatterwithhistogram(jsdlist,covlist1,'jsd','coverage1','cov1-jsd\n%s'%numofgenestr,imagefilename,markertup=('.',5),logscaleyflg=1,logscalexflg=0)
        imagefilename='%s/plt_jsd_coverage2.pdf'%(metadatadir)
        plot.plotscatterwithhistogram(jsdlist,covlist2,'jsd','coverage2','cov2-jsd\n%s'%numofgenestr,imagefilename,markertup=('.',5),logscaleyflg=1,logscalexflg=0)
        imagefilename='%s/plt_coverage1_coverage2.pdf'%(metadatadir)
        plot.plotscatterwithhistogram(covlist1,covlist2,'coverage1','coverage2','cov1-cov2\n%s'%numofgenestr,imagefilename,markertup=('.',5),logscaleyflg=1,logscalexflg=1)
        imagefilename='%s/plt_jsd_mincoverage.pdf'%(metadatadir)
        plot.plotscatterwithhistogram(jsdlist,mincovlist,'jsd','mincoverage','mincov-jsd\n%s'%numofgenestr,imagefilename,markertup=('.',5),logscaleyflg=1,logscalexflg=0)
Example #14
def _worker_lif_stats(uf):
    u = uf[0]
    rate = uf[1]

    fig = plt.figure(figsize=(16, 6))
    ax = [fig.add_subplot(4, 2, 1)]
    for i in range(1, 4):
        ax.append(fig.add_subplot(4, 2, i*2+1))

    th_f = th_lif_fi(u, tau_m, tref, xt)
    T = 1.
    if th_f > 0:
        T = max(T, tgt_outspikes/th_f)

    nspikes = max(10, 2.*T*rate)
    spks_in = make_poisson_spikes(rate, nspikes, rng)
    plot_spike_raster(spks_in, ax=ax[0], yticks=[])

    t, u_in = filter_spikes(dt, T, spks_in, tau_syn)
    u_in *= alpha
    mean_uin = np.mean(u_in[t > 5*tau_syn])
    plot_continuous(t, u_in, ax=ax[1],
                    axhline=mean_uin, axhlinep={'color': 'r'},
                    ylabel='syn', ylabelp={'fontsize': 16})

    spk_t, state = run_lifsoma(dt, u_in, tau_m, tref, xt, ret_state=True)
    plot_continuous(t, state, ax=ax[2],
                    ylabel='soma', ylabelp={'fontsize': 16})

    t, rate_out = filter_spikes(dt, xlim_T, spk_t, tau_syn)
    ss_idx = t > 5*tau_syn
    mean_rate_out = np.mean(rate_out[ss_idx])
    var_rate_out = np.var(rate_out[ss_idx])
    plot_continuous(t, rate_out, ax=ax[3],
                    axhline=mean_rate_out, axhlinep={'color': 'r'},
                    xlabel=r'$t$ (s)', xlabelp={'fontsize': 20},
                    ylabel=r'filtered out', ylabelp={'fontsize': 16})
    ax[3].axhline(th_f, color='r', linestyle=':')

    for a in ax[:-1]:
        plt.setp(a.get_xticklabels(), visible=False)
    match_xlims(ax, (0, xlim_T))
    ax[0].set_title(r'$\alpha=%.1e$,  $E[u]=%.2f$,  $f_{in}=%.1f$ Hz' %
                    (alpha, u, rate), fontsize=20)

    if len(spk_t) > 5:
        ax = fig.add_subplot(1, 2, 2)
        isi = np.diff(spk_t)
        histp = dict(bins=int(tgt_outspikes/10), density=True, histtype='step')  # 'density' replaces matplotlib's removed 'normed' kwarg
        plot_histogram(
            isi, histp=histp, ax=ax,
            axvline=1./mean_rate_out,
            axvlinep={'color': 'r', 'label': 'observed mean'},
            xlabel='output isi (s)', xlabelp={'fontsize': 20},
            xlim=(min(isi), max(isi)),
            title=r'$%d$ spikes, $E[a(u)]=%.1f$, $Var(a(u))=%.1f$' %
            (len(spk_t), mean_rate_out, var_rate_out),
            titlep={'fontsize': 20})
        if th_f > 0:
            ax.axvline(1./th_f, color='r', linestyle=':',
                       label='theoretical mean')
        ax.legend(loc='upper right')
    sub_fname = fname
    if fname is not None:
        sub_fname = fname + '_alpha%.1e_u%.2f_f%.1f_.png' % (alpha, u, rate)
    save_close_fig(fig, sub_fname, close)
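
make_poisson_spikes above generates the input train; for a Poisson process the inter-spike intervals are exponential with mean 1/rate, which is the kind of baseline the plotted vertical lines mark. A quick synthetic check of that relationship (the rate value is arbitrary):

import numpy as np

rng = np.random.default_rng(0)
rate = 40.0  # Hz, arbitrary
isi = rng.exponential(1.0 / rate, size=1000)
print("mean ISI:", isi.mean(), "expected:", 1.0 / rate)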
Example #15
def model_predict(data_train,
                  x_forecast,
                  saving_path,
                  n_iterations=1000,
                  confidence=95):

    if not os.path.exists(saving_path):
        os.mkdir(saving_path)
    '''
        with open(saving_path + '/model.sav', 'rb') as file1:
            best_model = pickle.load(file1)
        # there is problem to predict by reading saved model: feature names are mismatched
        prediction_result = pd.read_csv(saving_path+'/prediction_results.csv')
        predictive_power = prediction_result['Predictive Power'].iloc[0]
    '''

    x_label = x_forecast.columns
    y_label = ['Actuals']

    data_train.sort_index(inplace=True)
    fitness = pd.DataFrame()
    prediction_result = pd.DataFrame(index=data_train.index)
    evaluation = {}
    model_ls = []

    # bootstrap: resample 80% of the data for training and test on the unused rows (out-of-bag) to estimate MSE
    # this loop fills two dataframes: one with predictions and one with fitness metrics
    for i in range(n_iterations):

        sample_size = int(data_train.shape[0] * 0.8)
        # resample shuffles data
        train_sample = resample(data_train,
                                n_samples=sample_size,
                                replace=True)
        test_sample = data_train[~data_train.index.isin(train_sample.index)]
        x_train = train_sample[x_label]
        y_train = train_sample[y_label]
        label_mean = y_train.mean().values[0]
        x_test = test_sample[x_label]
        y_test = test_sample[y_label]

        training_time_start = time.time()
        model = build_model(label_mean)
        model.fit(x_train, y_train)
        training_time = time.time() - training_time_start
        model_ls.append(model)

        predict = model.predict(x_test)
        iterate_col_str = 'iterate_%d' % i
        prediction = pd.DataFrame(predict,
                                  index=y_test.index,
                                  columns=[iterate_col_str])
        prediction_result = pd.concat([prediction_result, prediction], axis=1)
        fitness.loc[iterate_col_str,
                    'mse'] = mean_squared_error(y_test, predict)
        fitness.loc[iterate_col_str, 'training_time'] = training_time
        fitness.loc[iterate_col_str, 'estimators'] = model.best_estimator_

    ## evaluations
    prediction_result.sort_index(inplace=True)
    prediction_result_col = prediction_result.columns

    # confidence interval of prediction results
    for index in prediction_result.index:
        prediction_row = prediction_result.loc[index].dropna()
        if prediction_row.empty:
            continue
        CI_low = np.percentile(prediction_row, [(100 - confidence) / 2.])
        CI_up = np.percentile(prediction_row, [100 - (100 - confidence) / 2.])
        prediction_result.loc[index, 'CI_up'] = CI_up
        prediction_result.loc[index, 'CI_low'] = CI_low

    # fill nan with mean value in CI_up and CI_low
    prediction_result[['CI_up', 'CI_low']] = prediction_result[[
        'CI_up', 'CI_low'
    ]].fillna(prediction_result[['CI_up', 'CI_low']].mean())
    prediction_result['In_CI'] = np.where(
        (data_train[y_label].values < prediction_result[['CI_up']].values) &
        (data_train[y_label].values > prediction_result[['CI_low']].values), 1,
        0)

    predictive_power = prediction_result['In_CI'].sum(
    ) / prediction_result.shape[0]
    prediction_result.to_csv(saving_path + '/prediction_results.csv')
    evaluation['Predictive Power'] = predictive_power
    print('At %d confidence, the prediction score is ' % confidence,
          predictive_power)

    # plot predictions
    prediction_result['val_mean'] = prediction_result[
        prediction_result_col].mean(axis=1)
    lineplotCI(
        prediction_result.index.strftime('%Y-%m').values,
        prediction_result['val_mean'].values, data_train[y_label].values,
        prediction_result['CI_low'], prediction_result['CI_up'], 'Month',
        'Actuals', saving_path, 'Predicted Actuals')

    # confidence interval of mse
    plot_histogram(fitness[['mse']], folder=saving_path, title='mse_hist')
    mse_CI_low = np.percentile(fitness['mse'].values,
                               [(100 - confidence) / 2.])
    mse_CI_up = np.percentile(fitness['mse'].values,
                              [100 - (100 - confidence) / 2.])
    evaluation['MSE CI Lower Bound'] = mse_CI_low
    evaluation['MSE CI Upper Bound'] = mse_CI_up
    print('{}% confidence of mse is {} and {}'.format(confidence, mse_CI_low,
                                                      mse_CI_up))

    with open(saving_path + '/evaluation_dictionary.pkl', 'wb') as dict_file:
        pickle.dump(evaluation, dict_file, protocol=pickle.HIGHEST_PROTOCOL)

    # find smallest mse and use according model to forecast
    fitness_min = fitness[['mse', 'training_time']].min()
    fitness_idxmin = fitness[['mse', 'training_time']].idxmin()
    print('the smallest mse is', fitness_min['mse'], ', training took',
          fitness.loc[fitness_idxmin['mse'], 'training_time'], 's.')
    fitness.to_csv(saving_path + '/fitness.csv')

    # print('we choose the estimator with smallest validation mse.')
    best_model = model_ls[fitness.index.get_loc(fitness_idxmin['mse'])]
    with open(saving_path + '/model.sav', 'wb') as file:
        pickle.dump(best_model, file, protocol=pickle.HIGHEST_PROTOCOL)
    '''
    # only applies to Booster models
    plot_importance(best_model)
    plt.savefig(saving_path+'/feature_importance.png')
    plt.show()
    '''

    forecast = best_model.predict(x_forecast)

    return forecast[0], evaluation
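
The bootstrap percentile interval used twice above (for predictions and for MSE) is a one-liner in isolation. A sketch on synthetic per-iteration MSEs (the gamma draw is a stand-in; 95 matches the function's default confidence):

import numpy as np

rng = np.random.default_rng(0)
mse_samples = rng.gamma(shape=2.0, scale=1.5, size=500)  # one MSE per bootstrap iteration
confidence = 95
lo, hi = np.percentile(mse_samples, [(100 - confidence) / 2., 100 - (100 - confidence) / 2.])
print('{}% CI for MSE: [{:.2f}, {:.2f}]'.format(confidence, lo, hi))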
Example #16
def makemetadataplot(folderdict, totreads, readsize):
    '''
    In paired-end (PE) data, readsize is twice the single-read length.
    '''
    numtypes, replicates, datadir = folderdict['data']
    samples = max(1, numtypes)
    metadatadir = folderdict['metadata']
    genealldatadict = {}
    for i in range(1, samples + 1):
        exp_file = open('%s/expression_T%02dS01.txt' % (metadatadir, i))
        genedict = {}
        for lntxt in exp_file:
            ln = lntxt.rstrip('\n').split('\t')
            gene = ln[0].split(':')[0]
            txsize = int(ln[3])
            txexp = float(ln[4])
            numreads = int(ln[5])
            if gene in genedict:
                genedict[gene][0].append(txsize)
                genedict[gene][1].append(numreads)
                genedict[gene][2].append(txexp)
            else:
                genedict[gene] = [[txsize], [numreads], [txexp]]
        histbins = len(genedict) // 5  # integer bin count
        genedatadict = {}
        for gene in genedict:
            numtx = len(genedict[gene][0])
            entropy = common.shannon_entropy(genedict[gene][2])
            rpkm = 1000000.0 / totreads * sum([
                genedict[gene][1][k] * 1.0 / genedict[gene][0][k]
                for k in range(numtx)
            ])
            coverage = sum([
                genedict[gene][1][k] * 1.0 / genedict[gene][0][k]
                for k in range(numtx)
            ]) * readsize
            genedatadict[gene] = [numtx, entropy, rpkm, coverage]
            genealldatadict[gene] = genealldatadict.get(gene, []) + [coverage]
        plot.plot_histogram(
            [genedatadict[gene][0] for gene in genedatadict], histbins,
            'transcripts count', '# genes', 'Number of Transcripts',
            '%s/plt_T%02d_transcript_count.jpg' % (metadatadir, i))
        plot.plot_histogram([
            genedatadict[gene][1]
            for gene in genedatadict if genedatadict[gene][1] > 1
        ], histbins, 'entropy', '# genes', 'Entropy Distribution',
                            '%s/plt_T%02d_entropy_dist.jpg' % (metadatadir, i))
        plot.plot_histogram([genedatadict[gene][2] for gene in genedatadict],
                            histbins, 'RPKM', '# genes', 'RPKM Distribution',
                            '%s/plt_T%02d_rpkm_dist.jpg' % (metadatadir, i), 1)
        plot.plot_histogram(
            [genedatadict[gene][3] for gene in genedatadict], histbins,
            'coverage', '# genes', 'Coverage distribution',
            '%s/plt_T%02d_coverage_dist.jpg' % (metadatadir, i), 1)
    if samples == 2:
        jsd_file = open('%s/expression_jsd.txt' % metadatadir)
        for lntxt in jsd_file:
            ln = lntxt.rstrip('\n').split('\t')
            gene = ln[0]
            jsd = float(ln[5])
            #print gene, jsd
            genealldatadict[gene].append(jsd)

        #print genealldatadict
        todellist = []
        covlist1 = []
        covlist2 = []
        jsdlist = []
        mincovlist = []
        for gene in genealldatadict:
            #print genealldatadict[gene]
            if min(genealldatadict[gene][0], genealldatadict[gene]
                   [1]) > 0.5 and genealldatadict[gene][2] > 0.02:
                covlist1.append(genealldatadict[gene][0])
                covlist2.append(genealldatadict[gene][1])
                mincovlist.append(
                    min(genealldatadict[gene][0], genealldatadict[gene][1]))
                jsdlist.append(genealldatadict[gene][2])
            else:
                todellist.append(gene)
        numofgenestr = '%d of %d' % (len(genedatadict) - len(todellist),
                                     len(genedatadict))

        imagefilename = '%s/plt_jsd_coverage1.pdf' % (metadatadir)
        plot.plotscatterwithhistogram(jsdlist,
                                      covlist1,
                                      'jsd',
                                      'coverage1',
                                      'cov1-jsd\n%s' % numofgenestr,
                                      imagefilename,
                                      markertup=('.', 5),
                                      logscaleyflg=1,
                                      logscalexflg=0)
        imagefilename = '%s/plt_jsd_coverage2.pdf' % (metadatadir)
        plot.plotscatterwithhistogram(jsdlist,
                                      covlist2,
                                      'jsd',
                                      'coverage2',
                                      'cov2-jsd\n%s' % numofgenestr,
                                      imagefilename,
                                      markertup=('.', 5),
                                      logscaleyflg=1,
                                      logscalexflg=0)
        imagefilename = '%s/plt_coverage1_coverage2.pdf' % (metadatadir)
        plot.plotscatterwithhistogram(covlist1,
                                      covlist2,
                                      'coverage1',
                                      'coverage2',
                                      'cov1-cov2\n%s' % numofgenestr,
                                      imagefilename,
                                      markertup=('.', 5),
                                      logscaleyflg=1,
                                      logscalexflg=1)
        imagefilename = '%s/plt_jsd_mincoverage.pdf' % (metadatadir)
        plot.plotscatterwithhistogram(jsdlist,
                                      mincovlist,
                                      'jsd',
                                      'mincoverage',
                                      'mincov-jsd\n%s' % numofgenestr,
                                      imagefilename,
                                      markertup=('.', 5),
                                      logscaleyflg=1,
                                      logscalexflg=0)
Example #17
import plot
import numpy as np
X = np.random.randn(400)
plot.plot_histogram(X, 10, "Histogram demo")
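
The plot module in this demo is project-local. A hypothetical matplotlib stand-in matching the (data, bins, title) call shape seen here (the real module's signature is inferred from the call site, not confirmed by the source):

import matplotlib.pyplot as plt

def plot_histogram(data, bins, title):
    # hypothetical stand-in; mirrors the call plot.plot_histogram(X, 10, "Histogram demo")
    fig, ax = plt.subplots()
    ax.hist(data, bins=bins)
    ax.set_title(title)
    plt.show()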
Example #18
first_estimations(distributions, thetas)
strat_pairs = build_strategies_pairs()
strat_comparisons = init_strat_comparisons()
p = 0
while p < len(strat_pairs):
    if len(strat_pairs) > 0:
        cp = strat_pairs[p]
        d1 = distributions[str(cp[0])]
        d2 = distributions[str(cp[1])]
        conclusion = fe.formation_evaluator(d1.prior, d2.prior)
        if conclusion == 0:
            var_d1 = d1.get_variance()
            var_d2 = d2.get_variance()
            if var_d1 < var_d2:
                conclusion = 1
            else:
                conclusion = -1
        strat_comparisons[cp[0]][cp[1]] = conclusion
        if conclusion is None:
            simulate_pairs(distributions, str(cp[0]), str(cp[1]), settings.N_ADDITIONAL_RUNS)
            p += 1
        else:
            del strat_pairs[p]
            if p != 0:
                p -= 1
plot.plot_distributions_validation(distributions, settings.NB_THETA, i)
fullfill_stratcomp(strat_comparisons)
get_classification(distributions, thetas, strat_comparisons, wellclas, misclas)
distances, ratios = prepare_histogram(wellclas, misclas)
plot.plot_histogram(distances, ratios)