Example #1
def apply_backtesting(bettor, param_grid, risk_factors, X, scores, odds, cv, random_state, n_runs, n_jobs):
    """Apply backtesting to evaluate bettor."""
    
    # Check random states
    random_states = check_random_states(random_state, n_runs)

    # Check arrays
    X = check_array(X, dtype=None, force_all_finite=False)
    normalized_scores = []
    for score in scores:
        normalized_scores.append(check_array(score, dtype=None, ensure_2d=False))
    odds = check_array(odds, dtype=None)

    # Extract parameters
    parameters = ParameterGrid(param_grid)

    # Run backtesting
    data = Parallel(n_jobs=n_jobs)(delayed(fit_bet)(bettor, params, risk_factors, random_state, X, normalized_scores, odds, train_indices, test_indices) 
           for params, random_state, (train_indices, test_indices) in tqdm(list(product(parameters, random_states, cv.split(X))), desc='Tasks'))
    
    # Combine data
    data = pd.concat(data, ignore_index=True)
    data = data.groupby(['parameters', 'risk_factor', 'experiment']).apply(lambda df: np.concatenate(df.yields.values)).reset_index()
    data[['coverage', 'mean_yield', 'std_yield']] = pd.DataFrame(data[0].apply(lambda yields: extract_yields_stats(yields)).values.tolist())
    
    # Calculate results
    results = data.drop(columns=['experiment', 0]).groupby(['parameters', 'risk_factor']).mean().reset_index()
    results['std_mean_yield'] = data.groupby(['parameters', 'risk_factor'])['mean_yield'].std().values
    results = results.sort_values('mean_yield', ascending=False).reset_index(drop=True)

    return results
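
The example above expands a parameter grid against cross-validation splits and dispatches every combination through joblib. A minimal, self-contained sketch of that pattern follows; evaluate() and its output are placeholders standing in for fit_bet, not the original code.

# Minimal sketch of the Parallel-over-grid pattern from Example #1.
from itertools import product

import pandas as pd
from joblib import Parallel, delayed
from sklearn.model_selection import KFold, ParameterGrid


def evaluate(params, train_idx, test_idx):
    # placeholder task: one result row per (parameters, split) combination
    return pd.DataFrame([{"params": str(params), "n_test": len(test_idx)}])


X = pd.DataFrame({"x": range(20)})
parameters = ParameterGrid({"alpha": [0.1, 1.0]})
cv = KFold(n_splits=4)

data = Parallel(n_jobs=-1)(
    delayed(evaluate)(params, train_idx, test_idx)
    for params, (train_idx, test_idx) in product(parameters, cv.split(X)))
data = pd.concat(data, ignore_index=True)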
Example #2
def get_stock_returns(stocks, start_date, end_date, freq):
    close_price = Parallel(n_jobs=10, backend='threading', verbose=5)(
        delayed(csf.get_stock_hist_bar)(code, freq,
                                        start_date=start_date,
                                        end_date=end_date,
                                        field=['date', 'close'])
        for code in stocks)
    for code, p in zip(stocks, close_price):
        p['tick'] = code
    close_price = pd.concat(close_price)
    close_price = close_price.dropna()
    # index.name was originally empty
    close_price.index.name = 'dt'
    # reshape into a wide frame: index = dt, columns = tick
    # (DataFrame.to_panel() was removed from pandas, so unstack instead)
    close_price = (close_price.set_index('tick', append=True)['close']
                   .unstack('tick')
                   .sort_index()
                   .ffill()
                   )
    # keep the last observation of each period
    group_key = {'M': [close_price.index.year, close_price.index.month],
                 'W': [close_price.index.year,
                       close_price.index.isocalendar().week],
                 'Q': [close_price.index.year, close_price.index.quarter]
                 }
    close_price = close_price.groupby(group_key[freq]).tail(1)
    returns = close_price.pct_change().shift(-1).dropna(axis=1, how='all')
    returns.index = returns.index.map(lambda dt: str(dt.date()))
    returns.index.name = 'date'
    returns = returns.unstack().to_frame()
    returns.columns = ['ret']
    returns = returns.swaplevel(0, 1).sort_index()
    returns.index.names = ['date', 'code']
    return returns
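
For reference, a hedged alternative to the year/month/week group keys above is pandas' resample; the sketch below assumes close_price is a wide frame indexed by a DatetimeIndex, and it only approximates the original logic (the result is indexed by calendar period ends rather than by the last trading date in each period).

import pandas as pd

def period_end_prices(close_price: pd.DataFrame, freq: str) -> pd.DataFrame:
    # freq is a pandas offset alias such as 'W', 'M' or 'Q'
    return close_price.resample(freq).last().dropna(how='all')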
Example #3
    def mergeLinks(self, _inbed):
        GRPA = ['LINKS']
        GRPE = ['LINKS', 'SID']

        COL1 = [
            '#chrom', 'start_n', 'end_n', 'length_n', 'Order', 'fflag',
            'HTSites', 'query_name'
        ]
        COL2 = [
            '#chrom', 'start_n', 'end_n', 'Type', 'length_n', 'forword_n',
            'LINKS', 'Order'
        ]

        #Support = _inbed.loc[(_inbed.fflag.str.contains(';HTBREAKP')), COL1 + GRPE]\
        #            .groupby(by=GRPE, sort=False)\
        #            .apply(lambda x:self.statCircle(x)).reset_index()
        # reduce time
        Support = _inbed.loc[(_inbed.fflag.str.contains(';HTBREAKP')),
                             COL1 + GRPE].groupby(by=GRPE, sort=False)
        Support = Parallel(n_jobs=-1,
                           backend='loky')(delayed(self.statCircle)(_g)
                                           for _l, _g in Support)
        Support = pd.concat(Support, axis=1).T.infer_objects()

        Supgrpb = Support.groupby(by=['LINKS'], sort=True)
        Suplist = [
            Supgrpb['support_ID_num'].sum().to_frame('support_num'),
            Supgrpb['SID'].size().to_frame('support_ID_num'),
            Supgrpb['Cover'].mean().to_frame('Mean_Cover'),
            Supgrpb['Depth'].mean().to_frame('Mean_Depth'),
            Supgrpb['BPHTNum'].mean().to_frame('Mean_BPHTNum'),
            Supgrpb['SID'].apply(lambda x: x.str.cat(sep=';')).to_frame(
                'support_IDs'),
            Supgrpb['support_ID_num'].apply(lambda x: x.astype(str).str.cat(
                sep=';')).to_frame('support_read_num'),
            Supgrpb['Cover'].apply(
                lambda x: x.astype(str).str.cat(sep=';')).to_frame('Covers'),
            Supgrpb['Depth'].apply(
                lambda x: x.astype(str).str.cat(sep=';')).to_frame('Depths'),
            Supgrpb['BPHTNum'].apply(
                lambda x: x.astype(str).str.cat(sep=';')).to_frame('BPHTNums')
        ]
        Suplist = pd.concat(Suplist,
                            ignore_index=False,
                            join='outer',
                            sort=False,
                            axis=1).reset_index()
        del Supgrpb

        inbed = _inbed[COL2].drop_duplicates(keep='first').copy()
        inbed.rename(columns={
            'start_n': 'start',
            'end_n': 'end',
            'length_n': 'length',
            'forword_n': 'forword'
        },
                     inplace=True)
        inbed = inbed.merge(Suplist, on='LINKS', how='outer')
        return inbed, Support
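
The commented-out groupby().apply() above is replaced by dispatching each group to a joblib worker and concatenating the per-group results. A self-contained sketch of that pattern, with summarise() as a placeholder statistic:

import pandas as pd
from joblib import Parallel, delayed


def summarise(key, group):
    # placeholder per-group statistic
    return pd.Series({"LINKS": key, "n_rows": len(group)})


df = pd.DataFrame({"LINKS": ["a", "a", "b"], "val": [1, 2, 3]})
parts = Parallel(n_jobs=-1, backend="loky")(
    delayed(summarise)(key, grp) for key, grp in df.groupby("LINKS", sort=False))
out = pd.concat(parts, axis=1).T.infer_objects()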
Example #4
    def mapanytwo1(self,
                   indf,
                   maxdistance=500,
                   maxreg=True,
                   maxline=3000000,
                   oriant=False):
        def _splitmap(_inmap):
            inmap = _inmap.copy()
            for _n, _l in inmap.iterrows():
                S = inmap.start_n.between(_l.start - maxdistance,
                                          _l.start + maxdistance,
                                          inclusive='both')
                E = inmap.end_n.between(_l.end - maxdistance,
                                        _l.end + maxdistance,
                                        inclusive='both')
                inmap.loc[(S & E), 'start_n'] = inmap[(S & E)]['start'].min()
                inmap.loc[(S & E), 'end_n'] = inmap[(S & E)]['end'].max()
            return inmap

        sortN = ['#chrom', 'start', 'end', 'forword']
        mapsN = ['#chrom', 'start', 'end', 'forword', 'start_n', 'end_n']
        grpby = ['#chrom', 'forword'] if oriant else ['#chrom']

        indf = indf.copy().sort_values(by=sortN)
        indf[['start_n', 'end_n']] = indf[['start', 'end']]

        if indf.shape[0] > maxline:
            inmap = indf[mapsN].drop_duplicates(keep='first')
            inmap = Parallel(n_jobs=-1, backend='loky')(
                delayed(_splitmap)(_g)
                for _, _g in inmap.groupby(by=grpby, sort=False))
            inmap = pd.concat(inmap, axis=0)
            indf = indf.merge(inmap, on=sortN, how='left')
        else:
            indf = Parallel(n_jobs=-1, backend='loky')(
                delayed(_splitmap)(_g)
                for _, _g in indf.groupby(by=grpby, sort=False))
            indf = pd.concat(indf, axis=0)

        indf[['start_n', 'end_n']] = indf[['start_n', 'end_n']].astype(int)
        indf['length_n'] = indf['end_n'] - indf['start_n'] + 1
        return indf
Example #5
    def run(self):

        try:

            # Get path to all LAS files in directory
            files = os.listdir(self.dataDir)
            files = [
                os.path.join(self.dataDir, i) for i in files
                if i.endswith('.las')
            ]

            # Get LAS data for specified files
            lasData = Parallel(n_jobs=self.cpuCount)(
                delayed(self.readLasFiles)(f) for f in files)
            lasData = [i for i in lasData if i is not None]
            lasData = pd.concat(lasData, ignore_index=True)

            # Scale data
            if self.clipLog:
                p1 = np.percentile(lasData[self.logName], 1)
                p99 = np.percentile(lasData[self.logName], 99)
                lasData[self.logName] = np.clip(lasData[self.logName], p1, p99)
            stats = lasData[self.logName].describe()
            lasData[self.logName] -= stats.loc['mean']
            lasData[self.logName] /= stats.loc['std']
            stats = lasData[self.logName].describe()
            lasData[self.logName] -= stats.loc['min']
            lasData[self.logName] /= (stats.loc['max'] - stats.loc['min'])

            # Extract patches and save to disk
            wellGrps = lasData.groupby('Well')
            Parallel(n_jobs=self.cpuCount)(
                delayed(self.saveLasPatches)(wellGrps.get_group(i), i)
                for i in lasData['Well'].unique())

        except Exception as e:
            print('Error while processing LAS files:', e)
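
The scaling block above clips to the 1st/99th percentiles, z-scores, and then rescales to [0, 1]. A standalone sketch of that normalisation (the function name and signature are illustrative, not part of the original class):

import numpy as np
import pandas as pd


def normalise_log(values: pd.Series, clip: bool = True) -> pd.Series:
    # optional percentile clipping, then z-score, then min-max to [0, 1]
    if clip:
        p1, p99 = np.percentile(values, [1, 99])
        values = values.clip(p1, p99)
    values = (values - values.mean()) / values.std()
    return (values - values.min()) / (values.max() - values.min())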
Example #6
def coloc_sim(data, radius=3, min_count=5, n_cores=1, copy=False):
    """Calculate pairwise gene colocalization similarity with the cross L function.

    Parameters
    ----------
    data : AnnData
        AnnData formatted spatial data.
    radius : int
        Max radius to search for neighboring points, by default 3.
    min_count : int
        Minimum number of points a gene needs in a cell to be eligible for analysis, by default 5.
    n_cores : int
        Number of parallel jobs, by default 1.
    copy : bool
        If True, return a modified copy of `data`; otherwise modify `data` in place and return None.

    Returns
    -------
    adata : AnnData
        `.uns['coloc_sim']`: pairwise gene colocalization similarity within each cell as a long dataframe.
        `.uns['coloc_sim_agg']`: colocalization similarity aggregated across cells.
    """
    adata = data.copy() if copy else data

    # Filter points and counts by min_count
    counts = adata.to_df()

    # Helper function to apply per cell
    def cell_coloc_sim(p, g_density, name):

        # Get xy coordinates
        xy = p[["x", "y"]].values

        # Get neighbors within fixed outer_radius for every point
        nn = NearestNeighbors(radius=radius).fit(xy)
        distances, point_index = nn.radius_neighbors(xy, return_distance=True)

        # Enumerate point-wise gene labels
        gene_index = p["gene"].reset_index(
            drop=True).cat.remove_unused_categories()

        # Convert to adjacency list of points, no double counting
        neighbor_pairs = []
        for g1, neighbors, n_dists in zip(gene_index.values, point_index,
                                          distances):
            for g2, d in zip(neighbors, n_dists):
                neighbor_pairs.append([g1, g2, d])

        # Calculate pair-wise gene similarity
        neighbor_pairs = pd.DataFrame(neighbor_pairs,
                                      columns=["g1", "g2", "p_dist"])

        # Keep minimum distance to g2 point
        neighbor_pairs = neighbor_pairs.groupby(["g1", "g2"
                                                 ]).agg("min").reset_index()
        neighbor_pairs.columns = ["g1", "g2", "point_dist"]

        # Map to gene index
        neighbor_pairs["g2"] = neighbor_pairs["g2"].map(gene_index)

        # Count number of points within distance of increasing radius
        r_step = 0.5
        expected_counts = [
            lambda dists: (dists <= r).sum()
            for r in np.arange(r_step, radius + r_step, r_step)
        ]
        metrics = (neighbor_pairs.groupby(["g1", "g2"]).agg({
            "point_dist":
            expected_counts
        }).reset_index())

        # Colocalization metric: max of L_ij(r) for r <= radius
        g2_density = g_density.loc[metrics["g2"].tolist()].values
        metrics["sim"] = ((metrics["point_dist"].divide(
            g2_density * np.pi, axis=0)).pow(0.5).max(axis=1))
        metrics["cell"] = name

        # Ignore self colocalization
        # metrics = metrics.loc[metrics["g1"] != metrics["g2"]]

        return metrics[["cell", "g1", "g2", "sim"]]

    # Only keep genes >= min_count in each cell
    gene_densities = []
    counts.apply(lambda row: gene_densities.append(row[row >= min_count]),
                 axis=1)
    # Calculate point density per gene per cell
    gene_densities /= adata.obs["cell_area"]
    gene_densities = gene_densities.values

    # TODO dask
    cell_metrics = Parallel(n_jobs=n_cores)(delayed(cell_coloc_sim)(
        get_points(adata,
                   cells=g_density.name,
                   genes=g_density.index.tolist(),
                   asgeo=True),
        g_density,
        g_density.name,
    ) for g_density in tqdm(gene_densities))

    cell_metrics = pd.concat(cell_metrics)
    cell_metrics.columns = cell_metrics.columns.get_level_values(0)

    # Make symmetric (Lij = Lji)
    cell_metrics["pair"] = cell_metrics.apply(
        lambda row: "-".join(sorted([row["g1"], row["g2"]])), axis=1)
    cell_symmetric = cell_metrics.groupby(["cell", "pair"]).mean()

    # Retain gene pair names
    cell_symmetric = (cell_metrics.set_index(["cell", "pair"]).drop(
        "sim", axis=1).join(cell_symmetric).reset_index())

    # Aggregate across cells
    coloc_agg = cell_symmetric.groupby(["pair"])["sim"].mean().to_frame()
    coloc_agg = (coloc_agg.join(
        cell_symmetric.set_index("pair").drop(
            ["sim", "cell"], axis=1)).reset_index().drop_duplicates())

    # Save coloc similarity
    cell_metrics[["cell", "g1", "g2", "pair"]] = cell_metrics[
        ["cell", "g1", "g2", "pair"]].astype("category")
    coloc_agg[["g1", "g2", "pair"]] = coloc_agg[["g1", "g2", "pair"]].astype("category")
    adata.uns["coloc_sim"] = cell_metrics
    adata.uns["coloc_sim_agg"] = coloc_agg

    return adata if copy else None
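
At the core of cell_coloc_sim is a fixed-radius neighbour query. A minimal sketch of that step with random placeholder coordinates:

import numpy as np
from sklearn.neighbors import NearestNeighbors

xy = np.random.default_rng(0).uniform(size=(100, 2))
nn = NearestNeighbors(radius=3).fit(xy)
distances, point_index = nn.radius_neighbors(xy, return_distance=True)
# distances[i] and point_index[i] hold the neighbours of point i within the radius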
Example #7
    def typeCat(self,
                indf,
                dropcigarover=True,
                dropneighbdup=True,
                minalignlenght=100):
        dropcigarover = self.dropcigarover  #True
        dropneighbdup = self.dropneighbdup  #True
        GRPBY = ['SID', 'query_name']

        self.log.CI('start dropping overlap of mapping region: ' + self.inid)
        # drop alignments shorter than the minimum length
        indf.loc[(np.abs(indf.cigarreg.str[1] -
                         indf.cigarreg.str[0]) < self.minalignlenght - 1),
                 'fflag'] = 'LOWALIGN'
        LOWA = indf[(indf.fflag == 'LOWALIGN')]
        indf = indf[(indf.fflag != 'LOWALIGN')]

        # dropcigarover
        if dropcigarover:
            indf = Parallel(n_jobs=-1, backend='threading')(
                delayed(self.dropCigarOver)(_g)
                for _, _g in indf.groupby(by=GRPBY, sort=False))
            indf = pd.concat(indf, axis=0, sort=False)
        OVER = indf[(indf.fflag == 'OVER')]
        indf = indf[(indf.fflag != 'OVER')]

        # maxbeddistance
        self.log.CI('start computing maximal distance of mapping region: ' +
                    self.inid)
        indf = Parallel(n_jobs=-1, backend='threading')(
            delayed(self.maxBedDistance)(_g)
            for _, _g in indf.groupby(by=GRPBY, sort=False))
        indf = pd.concat(indf, axis=0, sort=False)
        DIST = indf[(indf.fflag != 'HTDIST')]
        indf = indf[(indf.fflag == 'HTDIST')]

        # mergeNeighb
        self.log.CI('start merging neighbour duplications of mapping region: ' +
                    self.inid)
        indf = Parallel(n_jobs=-1, backend='threading')(
            delayed(self.mergeNeighb)(_g)
            for _, _g in indf.groupby(by=GRPBY, sort=False))
        indf = pd.concat(indf, axis=0, sort=False)
        DUPL = indf[(indf.fflag.str.contains('DUPLIC', regex=False))]
        indf = indf[~(indf.fflag.str.contains('DUPLIC', regex=False))]

        # markEcDNA
        self.log.CI('start marking and merging head-to-tail mapping region: ' +
                    self.inid)
        indf = Parallel(n_jobs=-1, backend='threading')(
            delayed(self.markKeep)(_g)
            for _, _g in indf.groupby(by=GRPBY, sort=False))
        indf = pd.concat(indf, axis=0, sort=False)
        LINE = indf[~((indf.fflag.str.contains('EcDNA'))
                      & ~(indf.fflag.str.contains('MISS')))]
        indf = indf[((indf.fflag.str.contains('EcDNA'))
                     & ~(indf.fflag.str.contains('MISS')))]

        # mergeHeadTail
        self.log.CI('start merging head-to-tail mapping region: ' + self.inid)
        indf = Parallel(n_jobs=-1, backend='threading')(
            delayed(self.mergeHeadTail)(_g)
            for _, _g in indf.groupby(by=GRPBY, sort=False))
        indf = pd.concat(indf, axis=0, sort=False)

        # concat
        MARK = pd.concat([LOWA, OVER, DIST, DUPL, LINE, indf],
                         axis=0,
                         sort=False)
        del (
            LOWA,
            OVER,
            DIST,
            DUPL,
            LINE,
        )

        # headtailregion
        self.log.CI('start adding head/tail site to a new column: ' +
                    self.inid)
        KEEP = indf.merge(indf.groupby(by=GRPBY, sort=False)\
                            .apply(lambda x:
                                x.loc[(x.fflag.str.contains(';HEAD|;TAIL')), ['start','end']].values.tolist())\
                            .to_frame(name='HTSites').reset_index(), on=GRPBY)
        #KEEP.loc[~KEEP.fflag.str.contains('HTBREAKP'), 'HTSites'] = ''
        KEEP = KEEP[~(KEEP.fflag.str.contains('HEAD|TAIL', regex=True))]

        MARK.to_csv(self.arg.outpre + '.Mark', sep='\t', index=False)
        KEEP.to_csv(self.arg.outpre + '.Keep', sep='\t', index=False)
        del (MARK, KEEP, indf)
Example #8
	plt.plot(data.mjd_short, data.flux_5);
	ax.set_title('{} - {}'.format(s_ind, len(data)))
	
fig.suptitle(obj_id);
fig.tight_layout()

# %%

# passband_diff histogram
plt.hist([dataset_proc.passband_diff], bins='auto');


# %%

# Number of observations histogram
lens = [ max(g.n_obs)+1 for name,g in dataset_proc.groupby('object_id')]
plt.hist(lens, bins='auto');

for l in set(lens):
	l_lens = len([ li for li in lens if li==l])
	print('{}: {:.2%} - {}'.format(l, l_lens/len(lens), l_lens))
	
	
# %%
	
# Length of each observation
lens = [ len(g) for name,g in dataset_proc.groupby(['object_id', 'n_obs'])]
plt.hist(lens, bins='auto');


Example #9
    filename = Path("C:/Users/Dustin/Desktop/datafile.din")
    if not filename.exists():
        with open(str(filename), "wb") as handle:
            for data in tqdm(response.iter_content()):
                handle.write(data)

    # Parse file
    cc_file = Parallel(n_jobs=NUM_CORES)(delayed(validate_line)(line) for line in open(str(filename), 'r').readlines())
    cc_file = [x for x in cc_file if x is not None]

    # Convert to dataframe
    cc_file = pd.DataFrame(cc_file, columns=['op_type', 'register'])

    # Bar plot of frequency by register
    print('Plotting bar plot...')
    plot_data = cc_file.groupby('register').size().plot()
    plt.show(block=True)

    print('Frequency by op_type:')
    print(cc_file.groupby('op_type').size())

    # B
    #######
    A_int = generate_matrix_int(348, 200)
    A_dbl = generate_matrix_dbl(348, 200)

    B_int = generate_matrix_int(200, 140)
    B_dbl = generate_matrix_dbl(200, 140)

    row_int = timeit.repeat('mult_row(A_int, B_int)', 'from __main__ import mult_row, A_int, B_int', number=1, repeat=10)
    row_dbl = timeit.repeat('mult_row(A_dbl, B_dbl)', 'from __main__ import mult_row, A_dbl, B_dbl', number=1, repeat=10)
Example #10
def run_seir_varying_control_simulations(var, R0):

    # convert to two type:
    index_map = [np.arange(0, 12), np.arange(12, 15)]
    # todo:  may be computationally smart to move this outside
    par = h.get_two_type_params(index_map=index_map, R0=R0)
    #par["stay_duration"] = 6
    par["tmax"] = 365*10 # todo: check this is long enough
    par["hosp_cap"] = 17800
    # par["hosp_rate"] = 0.0368

    hc_range = np.arange(2200, 60000, 1200)
    hc_range[28] = 35600 # add exactly double 17800 to list
    sd_range = np.arange(1, 25, 1)

    if var == "hosp_cap":
        var_range = hc_range

    if var == "stay_duration":
        var_range = sd_range



    start_time = time.time()
    # tt = pd.concat([get_two_type_df(hosp_cap=h) for h in
    #                 np.arange(0.1, 1.1, 0.1)*1e5])
    sim_df = Parallel(n_jobs=n_cores)(delayed(two_type_df_wrapper)(par, var, v)
                                      for v in var_range)
    sim_df = pd.concat(sim_df)
    sim_df = sim_df.sort_values([var, "time"])
    print("--- %s seconds ---" % (time.time() - start_time))

    def get_ctl_dur(df):
        df_f = df[df["control"] > 0]
        return df_f.index.max() - df_f.index.min()

    def get_final_full_R0(df):
        return df["Reff_full"].iloc[-1]

    def get_final_sgl_R0(df):
        return df["Reff_single"].iloc[-1]

    def get_S0_start(df):
        df_f = df[df["control"] > 0]
        return df_f.loc[df_f.index.min(), "S0"]

    out_g = sim_df.groupby(var)
    out_stats = pd.DataFrame({"control_duration": out_g.apply(get_ctl_dur),
                              "final_full_R0": out_g.apply(get_final_full_R0),
                              "final_single_R0": out_g.apply(get_final_sgl_R0)})

    out_stats["full_hi"] = out_stats["final_full_R0"] < 1
    out_stats["S0"] = out_g.apply(get_S0_start)

    def mt_wrapper(x):
        par_x = {**par, **{var: x.name, "S0": x["S0"]}}
        return get_min_time_to_herd_immunity_approx(par_x)

    out_stats["approx_time_to_hi"] = out_stats.apply(mt_wrapper, axis=1)

    return sim_df, out_stats
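
The summary step above reduces each swept parameter value to scalar statistics with groupby().apply(). A small sketch of that pattern with placeholder data (column names mirror the example):

import pandas as pd

sim_df = pd.DataFrame({"hosp_cap": [1, 1, 1, 2, 2, 2],
                       "control": [0, 1, 1, 0, 1, 0],
                       "Reff_full": [1.2, 1.0, 0.9, 1.1, 0.95, 0.8]})

out_g = sim_df.groupby("hosp_cap")
out_stats = pd.DataFrame({
    "control_duration": out_g.apply(lambda df: df[df["control"] > 0].index.max()
                                    - df[df["control"] > 0].index.min()),
    "final_full_R0": out_g.apply(lambda df: df["Reff_full"].iloc[-1]),
})
out_stats["full_hi"] = out_stats["final_full_R0"] < 1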
Example #11
def ocsvm_rules_experiments_pipeline(df_mat, numerical_cols, categorical_cols,
                                     cluster_algorithm, method, rules_used,
                                     dct_params, path_folder, file_template,
                                     store_intermediate=False,
                                     plot_fig=False):
    """
    
    Parameters
    ----------
    df_mat : TYPE
        DESCRIPTION.
    numerical_cols : TYPE
        DESCRIPTION.
    categorical_cols : TYPE
        DESCRIPTION.
    cluster_algorithm : TYPE
        DESCRIPTION.
    method : TYPE
        DESCRIPTION.
    rules_used : TYPE
        DESCRIPTION.
    dct_params : TYPE
        DESCRIPTION.
    path_folder : TYPE
        DESCRIPTION.
    file_template : TYPE
        DESCRIPTION.
    plot_fig : TYPE, optional
        DESCRIPTION. The default is False.

    Returns
    -------
    None.

    """
    
    print("Beginning process...")
    if rules_used == "all" or rules_used == "inliers":
        print("\n\n")
        print("*"*100)
        print("Obtaining Rules for Inliers...")
        print("*"*100)
        use_inverse = False
        file_name = file_naming_ocsvm(file_template=file_template,
                                      cluster_algorithm=cluster_algorithm,
                                      method=method,
                                      use_inverse=use_inverse)
    
        #### Obtain Rules [Inliers]
        if not store_intermediate:
            # Rules
            print("Fitting OCSVM model...")
            clf, sc, df_result, df_anomalies = ocsvm_rule_extractor(dataset_mat=df_mat,
                                                                    numerical_cols=numerical_cols,
                                                                    categorical_cols=categorical_cols,
                                                                    clustering_algorithm=cluster_algorithm,
                                                                    method=method,
                                                                    use_inverse=use_inverse,
                                                                    dct_params=dct_params,
                                                                    store_intermediate=store_intermediate,
                                                                    path_save_model=path_folder)
            df_all = df_result
            
            df_no = df_anomalies[df_anomalies['predictions'] == 1]
            df_no = df_no.drop_duplicates()
            print(
                "Max different values (inliers) : {0} | Rules extracted {1}".format(
                    len(df_no), len(df_all)))
            print("Saving rules...")
            df_all.to_csv(path_folder + '/df_rules_' + file_name + '.csv', index=False)
            df_anomalies.to_csv(path_folder + '/df_anomalies_' + file_name + '.csv', index=False)
            
        else:
            try:
                df_all = pd.read_csv(path_folder + '/df_rules_' + file_name + '.csv')
                df_anomalies = pd.read_csv(path_folder + '/df_anomalies_' + file_name + '.csv')
                clf = pickle.load(open("{0}/backup.p".format(path_folder), "rb"))
                sc = pickle.load(open("{0}/sc.p".format(path_folder), "rb"))
            except:
                print("File not found! Fitting OCSVM model...")
                clf, sc, df_result, df_anomalies = ocsvm_rule_extractor(dataset_mat=df_mat,
                                                                        numerical_cols=numerical_cols,
                                                                        categorical_cols=categorical_cols,
                                                                        clustering_algorithm=cluster_algorithm,
                                                                        method=method,
                                                                        use_inverse=use_inverse,
                                                                        dct_params=dct_params,
                                                                        store_intermediate=store_intermediate,
                                                                        path_save_model=path_folder)
                df_all = df_result
                
                df_no = df_anomalies[df_anomalies['predictions'] == 1]
                df_no = df_no.drop_duplicates()
                print(
                    "Max different values (inliers) : {0} | Rules extracted {1}".format(
                        len(df_no), len(df_all)))
                print("Saving rules...")
                df_all.to_csv(path_folder + '/df_rules_' + file_name + '.csv', index=False)
                df_anomalies.to_csv(path_folder + '/df_anomalies_' + file_name + '.csv', index=False)
            
        
        # If kprototypes, do not consider "categorical cols" for the purpose of the rest of the code
        if cluster_algorithm == "kprototypes":
            feature_cols = list(set(numerical_cols + categorical_cols))
            cat_additional = []
        else:
            feature_cols = numerical_cols
            cat_additional = categorical_cols
                
        df_rules = df_all
        inliers_used = True
        path = path_folder
        
        df_rules['n_inliers_included'] = 0
        df_rules['n_outliers_included'] = 0
        n_inliers = len(df_anomalies[df_anomalies['predictions']==1])
        n_outliers = len(df_anomalies[df_anomalies['predictions']==-1])
        n_vertex = (len(cat_additional) + 1)*2**(len(feature_cols))
        
        print("Checking inliers inside rules...")
        df_check = Parallel(n_jobs=N_JOBS)(
            delayed(check_datapoint_inside_only)(data_point, df_rules,
                                                 feature_cols, cat_additional)
            for i, data_point in df_anomalies[df_anomalies['predictions']==1].iterrows())
        df_check = pd.concat([x[x['check']>0] for x in df_check])
        df_check = pd.DataFrame(df_check.groupby(df_check.index).sum()).reset_index()
        df_temp = df_rules[['n_inliers_included']].reset_index()
        df_check = df_temp.merge(df_check, how="outer")[['check']].fillna(0)
        df_rules['n_inliers_included'] = df_check
        
        print("Checking outliers inside rules...")
        df_check = Parallel(n_jobs=N_JOBS)(
            delayed(check_datapoint_inside_only)(data_point, df_rules,
                                                 feature_cols, cat_additional)
            for i, data_point in df_anomalies[df_anomalies['predictions']==-1].iterrows())
        df_check = pd.concat([x[x['check']>0] for x in df_check])
        df_check = pd.DataFrame(df_check.groupby(df_check.index).sum()).reset_index()
        df_temp = df_rules[['n_inliers_included']].reset_index()
        df_check = df_temp.merge(df_check, how="outer")[['check']].fillna(0)
        df_rules['n_outliers_included'] = df_check
        
        # Check how many datapoints are included with the rules with Precision=1
        print("Checking inliers/outliers inside hypercubes with Precision=1...")
        n_inliers_p1 = 0
        n_inliers_p0 = 0
        n_outliers_p1 = 0
        n_outliers_p0 = 0
        n_inliers = len(df_anomalies[df_anomalies['predictions']==1])
        n_outliers = len(df_anomalies[df_anomalies['predictions']==-1])
        
        def wrapper_precision_check(data_point):
            df_rules['check'] = check_datapoint_inside(data_point,
                                                       df_rules,
                                                       feature_cols,
                                                       cat_additional)['check']
            n_inliers_p1 = 0
            n_inliers_p0 = 0
            n_outliers_p1 = 0
            n_outliers_p0 = 0
        
            if inliers_used:
                # If inlier
                if data_point['predictions']==1:
                    # Rules with any P and that include this datapoint
                    df_aux = df_rules[(df_rules['check']==1)] 
                    if len(df_aux) > 0:
                        n_inliers_p0 += 1
                    
                    # Rules with P=1 and that include this datapoint
                    df_aux = df_rules[(df_rules['n_outliers_included']==0)
                                      & (df_rules['check']==1)] 
                    if len(df_aux) > 0:
                        n_inliers_p1 += 1
            else:
                # If outlier
                if data_point['predictions']==-1:
                    # Rules with any P and that include this datapoint
                    df_aux = df_rules[(df_rules['check']==1)] 
                    if len(df_aux) > 0:
                        n_outliers_p0 += 1
                    
                    # Rules with P=1 and that include this datapoint
                    df_aux = df_rules[(df_rules['n_inliers_included']==0)
                                      & (df_rules['check']==1)] 
                    if len(df_aux) > 0:
                        n_outliers_p1 += 1
                        
            return {'n_inliers_p0':n_inliers_p0,
                    'n_inliers_p1':n_inliers_p1,
                    'n_outliers_p0':n_outliers_p0,
                    'n_outliers_p1':n_outliers_p1}
                        
                        
        dct_out = Parallel(n_jobs=N_JOBS)(delayed(wrapper_precision_check)(data_point) for i, data_point in df_anomalies.iterrows())
        df_out = pd.DataFrame(dct_out).sum()
        
        for i, data_point in df_anomalies.iterrows():
            df_rules['check'] = check_datapoint_inside(data_point,
                                                       df_rules,
                                                       feature_cols,
                                                       cat_additional)['check']
            if inliers_used:
                # If inlier
                if data_point['predictions']==1:
                    # Rules with any P and that include this datapoint
                    df_aux = df_rules[(df_rules['check']==1)] 
                    if len(df_aux) > 0:
                        n_inliers_p0 += 1
                    
                    # Rules with P=1 and that include this datapoint
                    df_aux = df_rules[(df_rules['n_outliers_included']==0)
                                      & (df_rules['check']==1)] 
                    if len(df_aux) > 0:
                        n_inliers_p1 += 1
            else:
                # If outlier
                if data_point['predictions']==-1:
                    # Rules with any P and that include this datapoint
                    df_aux = df_rules[(df_rules['check']==1)] 
                    if len(df_aux) > 0:
                        n_outliers_p0 += 1
                    
                    # Rules with P=1 and that include this datapoint
                    df_aux = df_rules[(df_rules['n_inliers_included']==0)
                                      & (df_rules['check']==1)] 
                    if len(df_aux) > 0:
                        n_outliers_p1 += 1
        
        if inliers_used:
            df_rules['n_inliers'] = n_inliers
            df_rules['n_inliers_p0'] = df_out['n_inliers_p0']
            df_rules['n_inliers_p1'] = df_out['n_inliers_p1']
            try:
                del df_rules['check']
            except:
                pass
            path_aux = "inliers"
        else:
            df_rules['n_outliers_p1'] = df_out['n_outliers_p1']
            df_rules['n_outliers_p0'] = df_out['n_outliers_p0']
            df_rules['n_outliers'] = n_outliers
            try:
                del df_rules['check']
            except:
                pass
            path_aux = "outliers"
        
        # Save to CSV
        df_rules.to_csv("{path}/{file_name}_rules_{type_r}_pruned_ocsvm.csv".format(path=path,
                                                                                    file_name=file_name,
                                                                                    type_r = path_aux),
                                    index=False)
        
        # Use only pure rules
        df_rules = df_rules[df_rules["n_outliers_included"]==0]
        
        print("Obtaining metrics...")
        df_rules = rule_overlapping_score(df_rules, df_anomalies,
                                          feature_cols, cat_additional)
        
        df_rules = check_stability(df_anomalies, df_rules, clf,
                                    feature_cols, cat_additional,
                                    using_inliers=True)
        
        # Saving rules obtained
        print("Saving rules...")
        df_rules.to_csv(path_folder + '/df_rules_complete_' + file_name + '.csv', index=False)
    
        if plot_fig:
            #### Plot Rules [Inliers]
            print("Plotting rules for inliers...")
            df_rules = df_rules.copy()
            df_rules = df_rules.drop_duplicates().reset_index(drop=True)
            plot_2D(df_rules,
                    df_anomalies,
                    folder = path_folder,
                    path_name=file_name)
    
    
    if rules_used == "all" or rules_used == "outliers":
        print("\n\n")
        print("*"*100)
        print("Obtaining Rules for Outliers...")
        print("*"*100)
        
        #### Obtain Rules [Outliers]
        use_inverse = True
        file_name = file_naming_ocsvm(file_template=file_template,
                                      cluster_algorithm=cluster_algorithm,
                                      method=method,
                                      use_inverse=use_inverse) 
        
        if not store_intermediate:
            # Rules
            print("Fitting OCSVM model...")
            clf, sc, df_result, df_anomalies = ocsvm_rule_extractor(dataset_mat=df_mat,
                                                                    numerical_cols=numerical_cols,
                                                                    categorical_cols=categorical_cols,
                                                                    clustering_algorithm=cluster_algorithm,
                                                                    method=method,
                                                                    use_inverse=use_inverse,
                                                                    dct_params=dct_params,
                                                                    store_intermediate=False,
                                                                    path_save_model=path_folder)
            df_all = df_result
            
            df_no = df_anomalies[df_anomalies['predictions'] == 1]
            df_no = df_no.drop_duplicates()
            print(
                "Max different values (outliers) : {0} | Rules extracted {1}".format(
                    len(df_no), len(df_all)))
            print("Saving rules...")
            df_all.to_csv(path_folder + '/df_rules_' + file_name + '.csv', index=False)
            df_anomalies.to_csv(path_folder + '/df_anomalies_' + file_name + '.csv', index=False)
            
        else:
            try:
                df_all = pd.read_csv(path_folder + '/df_rules_' + file_name + '.csv')
                df_anomalies = pd.read_csv(path_folder + '/df_anomalies_' + file_name + '.csv')
                clf = pickle.load(open("{0}/backup.p".format(path_folder), "rb"))
                sc = pickle.load(open("{0}/sc.p".format(path_folder), "rb"))
            except:
                print("File not found! Fitting OCSVM model...")
                clf, sc, df_result, df_anomalies = ocsvm_rule_extractor(dataset_mat=df_mat,
                                                                        numerical_cols=numerical_cols,
                                                                        categorical_cols=categorical_cols,
                                                                        clustering_algorithm=cluster_algorithm,
                                                                        method=method,
                                                                        use_inverse=use_inverse,
                                                                        dct_params=dct_params,
                                                                        store_intermediate=store_intermediate,
                                                                        path_save_model=path_folder)
                df_all = df_result
                
                df_no = df_anomalies[df_anomalies['predictions'] == 1]
                df_no = df_no.drop_duplicates()
                print(
                    "Max different values (outliers) : {0} | Rules extracted {1}".format(
                        len(df_no), len(df_all)))
                print("Saving rules...")
                df_all.to_csv(path_folder + '/df_rules_' + file_name + '.csv', index=False)
                df_anomalies.to_csv(path_folder + '/df_anomalies_' + file_name + '.csv', index=False)
        
        # If kprototypes, do not consider "categorical cols" for the purpose of the rest of the code
        if cluster_algorithm == "kprototypes":
            feature_cols = list(set(numerical_cols + categorical_cols))
            cat_additional = []
        else:
            feature_cols = numerical_cols
            cat_additional = categorical_cols
        
        # Complete Rules
        print("Checking outliers inside hypercubes...") 
        df_anomalies['predictions'] = df_anomalies['predictions']*-1
        df_anomalies['distances'] = df_anomalies['distances']*-1
        
        df_rules = df_all
        inliers_used = False
        path = path_folder
        
        df_rules['n_inliers_included'] = 0
        df_rules['n_outliers_included'] = 0
        n_inliers = len(df_anomalies[df_anomalies['predictions']==1])
        n_outliers = len(df_anomalies[df_anomalies['predictions']==-1])
        n_vertex = (len(cat_additional) + 1)*2**(len(feature_cols))
        
        print("Checking inliers inside rules...")
        df_check = Parallel(n_jobs=N_JOBS)(
            delayed(check_datapoint_inside_only)(data_point, df_rules,
                                                 feature_cols, cat_additional)
            for i, data_point in df_anomalies[df_anomalies['predictions']==1].iterrows())
        df_check = pd.concat([x[x['check']>0] for x in df_check])
        df_check = pd.DataFrame(df_check.groupby(df_check.index).sum()).reset_index()
        df_temp = df_rules[['n_inliers_included']].reset_index()
        df_check = df_temp.merge(df_check, how="outer")[['check']].fillna(0)
        df_rules['n_inliers_included'] = df_check
        
        print("Checking outliers inside rules...")
        df_check = Parallel(n_jobs=N_JOBS)(
            delayed(check_datapoint_inside_only)(data_point, df_rules,
                                                 feature_cols, cat_additional)
            for i, data_point in df_anomalies[df_anomalies['predictions']==-1].iterrows())
        df_check = pd.concat([x[x['check']>0] for x in df_check])
        df_check = pd.DataFrame(df_check.groupby(df_check.index).sum()).reset_index()
        df_temp = df_rules[['n_inliers_included']].reset_index()
        df_check = df_temp.merge(df_check, how="outer")[['check']].fillna(0)
        df_rules['n_outliers_included'] = df_check
        
        # Check how many datapoints are included with the rules with Precision=1
        print("Checking inliers/outliers inside hypercubes with Precision=1...")
        n_inliers_p1 = 0
        n_inliers_p0 = 0
        n_outliers_p1 = 0
        n_outliers_p0 = 0
        n_inliers = len(df_anomalies[df_anomalies['predictions']==1])
        n_outliers = len(df_anomalies[df_anomalies['predictions']==-1])
        
        def wrapper_precision_check(data_point):
            df_rules['check'] = check_datapoint_inside(data_point,
                                                       df_rules,
                                                       feature_cols,
                                                       cat_additional)['check']
            n_inliers_p1 = 0
            n_inliers_p0 = 0
            n_outliers_p1 = 0
            n_outliers_p0 = 0
        
            if inliers_used:
                # If inlier
                if data_point['predictions']==1:
                    # Rules with any P and that include this datapoint
                    df_aux = df_rules[(df_rules['check']==1)] 
                    if len(df_aux) > 0:
                        n_inliers_p0 += 1
                    
                    # Rules with P=1 and that include this datapoint
                    df_aux = df_rules[(df_rules['n_outliers_included']==0)
                                      & (df_rules['check']==1)] 
                    if len(df_aux) > 0:
                        n_inliers_p1 += 1
            else:
                # If outlier
                if data_point['predictions']==-1:
                    # Rules with any P and that include this datapoint
                    df_aux = df_rules[(df_rules['check']==1)] 
                    if len(df_aux) > 0:
                        n_outliers_p0 += 1
                    
                    # Rules with P=1 and that include this datapoint
                    df_aux = df_rules[(df_rules['n_inliers_included']==0)
                                      & (df_rules['check']==1)] 
                    if len(df_aux) > 0:
                        n_outliers_p1 += 1
                        
            return {'n_inliers_p0':n_inliers_p0,
                    'n_inliers_p1':n_inliers_p1,
                    'n_outliers_p0':n_outliers_p0,
                    'n_outliers_p1':n_outliers_p1}
                        
                        
        dct_out = Parallel(n_jobs=N_JOBS)(delayed(wrapper_precision_check)(data_point) for i, data_point in df_anomalies.iterrows())
        df_out = pd.DataFrame(dct_out).sum()
        
        for i, data_point in df_anomalies.iterrows():
            df_rules['check'] = check_datapoint_inside(data_point,
                                                       df_rules,
                                                       feature_cols,
                                                       cat_additional)['check']
            if inliers_used:
                # If inlier
                if data_point['predictions']==1:
                    # Rules with any P and that include this datapoint
                    df_aux = df_rules[(df_rules['check']==1)] 
                    if len(df_aux) > 0:
                        n_inliers_p0 += 1
                    
                    # Rules with P=1 and that include this datapoint
                    df_aux = df_rules[(df_rules['n_outliers_included']==0)
                                      & (df_rules['check']==1)] 
                    if len(df_aux) > 0:
                        n_inliers_p1 += 1
            else:
                # If outlier
                if data_point['predictions']==-1:
                    # Rules with any P and that include this datapoint
                    df_aux = df_rules[(df_rules['check']==1)] 
                    if len(df_aux) > 0:
                        n_outliers_p0 += 1
                    
                    # Rules with P=1 and that include this datapoint
                    df_aux = df_rules[(df_rules['n_inliers_included']==0)
                                      & (df_rules['check']==1)] 
                    if len(df_aux) > 0:
                        n_outliers_p1 += 1
        
        if inliers_used:
            df_rules['n_inliers'] = n_inliers
            df_rules['n_inliers_p0'] = df_out['n_inliers_p0']
            df_rules['n_inliers_p1'] = df_out['n_inliers_p1']
            try:
                del df_rules['check']
            except:
                pass
            path_aux = "inliers"
        else:
            df_rules['n_outliers_p1'] = df_out['n_outliers_p1']
            df_rules['n_outliers_p0'] = df_out['n_outliers_p0']
            df_rules['n_outliers'] = n_outliers
            try:
                del df_rules['check']
            except:
                pass
            path_aux = "outliers"
        
        # Save to CSV
        df_rules.to_csv("{path}/{file_name}_rules_{type_r}_pruned_ocsvm.csv".format(path=path,
                                                                                    file_name=file_name,
                                                                                    type_r = path_aux),
                                    index=False)
        
            
        df_rules = df_rules[df_rules["n_inliers_included"]==0]
        
        print("Obtaining metrics...")
        df_rules = rule_overlapping_score(df_rules, df_anomalies,
                                          feature_cols, cat_additional)
        
        df_rules = check_stability(df_anomalies, df_rules, clf,
                                    feature_cols, cat_additional,
                                    using_inliers=False)
        
        # Saving rules obtained
        print("Saving rules...")
        df_rules.to_csv(path_folder + '/df_rules_complete_' + file_name + '.csv', index=False)
        
        if plot_fig:
            #### Plot Rules [Outliers]
            print("Plotting rules for outliers...")
            df_rules = df_rules.copy()
            df_rules = df_rules.drop_duplicates().reset_index(drop=True)
            plot_2D(df_rules,
                    df_anomalies,
                    folder = path_folder,
                    path_name = file_name)
            
    elif rules_used != "inliers":
        raise ValueError("Argument {0} not found -- use ['all', 'outliers' or 'inliers'] instead".format(rules_used))
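
Both branches above use the same row-wise joblib pattern: each worker returns a dict of counters for one datapoint and the dicts are summed into a single Series. A minimal sketch with a placeholder predicate instead of check_datapoint_inside:

import pandas as pd
from joblib import Parallel, delayed


def count_flags(data_point):
    # placeholder counters keyed like the example's output
    return {"n_inliers_p0": int(data_point["predictions"] == 1),
            "n_outliers_p0": int(data_point["predictions"] == -1)}


df_anomalies = pd.DataFrame({"predictions": [1, -1, 1, 1, -1]})
dct_out = Parallel(n_jobs=2)(delayed(count_flags)(data_point)
                             for _, data_point in df_anomalies.iterrows())
df_out = pd.DataFrame(dct_out).sum()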
Example #12
    if remove_stop:
        # remove stopwords
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    # store in dataframe
    df = pd.DataFrame(words, columns=['word'])
    df['author'] = row.author
    df['datetime'] = row.datetime
    df['text'] = text
    return df

convo_token = Parallel(n_jobs=12)(delayed(tokenize_row)(row)
                        for _, row in tqdm(convo_df.iterrows()))
# unpack text dataframes
convo_token = pd.concat(convo_token, ignore_index=False)
word_counts = convo_token.groupby('author').word.value_counts()
word_counts.name = 'counts'
word_counts = word_counts.reset_index()
plot_ecdf(word_counts.counts.values)
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# lower max_font_size, change the maximum number of words and lighten the background:
wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(' '.join(convo_token[convo_token.author=='Porfi'].word.values))
# Display the generated image:
plt.figure()
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")

# lower max_font_size, change the maximum number of words and lighten the background:
wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(' '.join(convo_token[convo_token.author=='Ellen'].word.values))
# Display the generated image:
plt.figure()
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
Example #13
def transferability_plink(args):
    """
    Execute transferability code
    """
    sumstats = pd.read_table(args.sumstats, delim_whitespace=True)
    sum_snps = sumstats.SNP.tolist()
    if not os.path.isfile(args.refld):
        compute_ld(args.reference, args.refld, args.plinkexe,
                   window=args.window)
    if not os.path.isfile(args.tarld):
        compute_ld(args.target, args.tarld, args.plinkexe, window=args.window)
    df1, snps1 = readLD(args.refld)
    df2, snps2 = readLD(args.tarld)
    available_snps = set(snps1).intersection(snps2).intersection(sum_snps)
    matfile = '%s_matrices.pickle' % args.prefix
    if not os.path.isfile(matfile):
        ld1 = get_blocks(df1, available_snps, args.refld, sliding=args.sliding,
                         cpus=args.threads)
        ld2 = get_blocks(df2, available_snps, args.tarld, sliding=args.sliding,
                         cpus=args.threads)
        pick = pickle.dumps((ld1, ld2))
        with gzip.open(matfile, 'w') as F:
            F.write(pick)
    else:
        print('Loading previously computed blocks')
        with gzip.open(matfile, 'r') as F:
            ld1, ld2 = pickle.loads(F.read())
    print('Setting the loci')
    # loci = Parallel(n_jobs=int(args.threads))(delayed(thelocus)(i, ld1, ld2,
    #                                                            sum_snps)
    #                                          for i in range(len(ld1)))
    loci = [thelocus(index, ld1, ld2, sum_snps) for index in range(len(ld1))]
    avh2 = args.h2 / len(sum_snps)
    with open('%s_loci.pickle' % args.prefix, 'wb') as L:
        pickle.dump(loci, L)
    N = map_count('%s.fam' % args.target)
    resfile = '%s_res.tsv' % args.prefix
    print('Compute expected beta square per locus...')
    if not os.path.isfile(resfile):
        res = Parallel(n_jobs=int(args.threads))(delayed(per_locus)(
            locus, sumstats, avh2, args.h2, N, ld1[i], ld2[i], len(loci)
        ) for i, locus in tqdm(enumerate(loci), total=len(loci)))
        res = pd.concat(res)
        res.to_csv(resfile, index=False, sep='\t')
    else:
        res = pd.read_csv(resfile, sep='\t')
    if args.sliding:
        res = res.groupby('SNP').mean()
        res['SNP'] = res.index.tolist()
    # product, _ = smartcotagsort(args.prefix, res, column='ese')
    product = res.sort_values('ese', ascending=False).reset_index(drop=True)
    product['Index'] = product.index.tolist()
    nsnps = product.shape[0]
    percentages = set_first_step(nsnps, 5, every=False)
    snps = np.around((percentages * nsnps) / 100).astype(int)
    qfile = '%s.qfile' % args.prefix
    if args.qrange is None:
        qrange = '%s.qrange' % args.prefix
        qr, qrange = gen_qrange(args.prefix, nsnps, 5, qrange, every=False)
    else:
        qrange = args.qrange
        order = ['label', 'Min', 'Max']
        qr = pd.read_csv(qrange, sep=' ', header=None, names=order)
    product.loc[:, ['SNP', 'Index']].to_csv(qfile, sep=' ', header=False,
                                            index=False)
    df = qrscore(args.plinkexe, args.target, args.sumstats, qrange, qfile,
                 args.allele_file, args.pheno, args.prefix, qr, args.maxmem,
                 args.threads, 'None', args.prefix)
    # get ppt results
    # ppts=[]
    # for i in glob('*.results'):
    # three_code = i[:4]
    # results = pd.read_table(i, sep='\t')
    # R2 = results.nlargest(1, 'R2').R2.iloc[0]
    # ppts.append((three_code, R2))
    # ppts = sorted(ppts, key=lambda x: x[1], reverse=True)
    # aest = [('0.5', '*'), ('k', '.')]
    if args.merged is not None:
        merged = pd.read_table(args.merged, sep='\t')
    merged = merged.merge(df, on='Number of SNPs')
    f, ax = plt.subplots()
    merged.plot.scatter(x='Number of SNPs', y='R2', alpha=0.5, c='purple', s=5,
                        ax=ax, label='Transferability', linestyle=':')
    merged.plot.scatter(x='Number of SNPs', y=r'$R^{2}$_cotag', label='Cotagging',
                        c='r', s=2, alpha=0.5, ax=ax)
    merged.plot.scatter(x='Number of SNPs', y='R2_hybrid', c='g', s=5, alpha=0.5,
                        ax=ax, label='Hybrid (COT & P+T)')
    merged.plot.scatter(x='Number of SNPs', y='$R^{2}$_clumEUR', c='0.5', s=5,
                        alpha=0.5, marker='*', ax=ax, label='EUR P+T')
    merged.plot.scatter(x='Number of SNPs', y='$R^{2}$_clumAFR', c='k', s=5,
                        alpha=0.5, marker='.', ax=ax, label='AFR P+T')
    # for i, item in enumerate(ppts):
    # pop, r2 = item
    # ax.axhline(r2, label='%s P + T Best' % pop, color=aest[i][0], ls='--',
    # marker=aest[i][1], markevery=10)
    plt.ylabel('$R^2$')
    plt.legend()
    plt.tight_layout()
    plt.savefig('%s_transferability.pdf' % args.prefix)
    plt.close()
    return res
# get apparent burst size
burstprops_app = Parallel(n_jobs=12)(
    delayed(quant.get_app_bs)(_model, _ctd, _run, _delta, tr)
    for (_model, _ctd, _run, _delta), tr in tqdm(traces.iterrows()))
burstprops_app = pd.concat((burstprops_app), ignore_index=True)

# Fraction of active cells
active_cells_frac_mes = samples[samples.time > 10].groupby(
    ['ctd', 'var_p_val',
     'time']).apply(lambda x: np.sum(x.pol_p > 0) / len(x)).reset_index(
         name='active_cells_frac')

# Get difference between True and apparent burst size
bs_med = burstprops.groupby(['var_p_val', 'ctd', 'run'
                             ])['burst_size'].apply(np.mean).reset_index()
appbs_med = burstprops_app.groupby(['var_p_val', 'ctd', 'run'
                                    ])['app_bs'].apply(np.mean).reset_index()
bs_med = pd.merge(bs_med, appbs_med, on=['var_p_val', 'ctd', 'run'])
bs_dev = bs_med.groupby(['var_p_val',
                         'ctd'])[['burst_size',
                                  'app_bs']].apply(np.mean).reset_index()
bs_dev['bs_dev'] = bs_dev.app_bs - bs_dev.burst_size

# Merge frac active cells with burst size deviation
bsdev_summ = bs_dev.groupby(['var_p_val',
                             'ctd'])[['bs_dev']].mean().reset_index()
actcells_summ = active_cells_frac_mes.groupby(['var_p_val', 'ctd'
                                               ])[['active_cells_frac'
                                                   ]].mean().reset_index()
bs_act_summ = pd.merge(bsdev_summ, actcells_summ, on=['var_p_val', 'ctd'])
bs_act_summ.to_csv('./data/gillespie_bsize_obsvtrue.csv', index=False)
burstprops_app.to_csv('./data/gillespie_burstpropsapp.csv', index=False)