def apply_backtesting(bettor, param_grid, risk_factors, X, scores, odds,
                      cv, random_state, n_runs, n_jobs):
    """Apply backtesting to evaluate bettor."""

    # Check random states
    random_states = check_random_states(random_state, n_runs)

    # Check arrays
    X = check_array(X, dtype=None, force_all_finite=False)
    normalized_scores = []
    for score in scores:
        normalized_scores.append(check_array(score, dtype=None, ensure_2d=False))
    odds = check_array(odds, dtype=None)

    # Extract parameters
    parameters = ParameterGrid(param_grid)

    # Run backtesting
    data = Parallel(n_jobs=n_jobs)(
        delayed(fit_bet)(bettor, params, risk_factors, random_state, X,
                         normalized_scores, odds, train_indices, test_indices)
        for params, random_state, (train_indices, test_indices)
        in tqdm(list(product(parameters, random_states, cv.split(X))), desc='Tasks'))

    # Combine data
    data = pd.concat(data, ignore_index=True)
    data = data.groupby(['parameters', 'risk_factor', 'experiment']).apply(
        lambda df: np.concatenate(df.yields.values)).reset_index()
    data[['coverage', 'mean_yield', 'std_yield']] = pd.DataFrame(
        data[0].apply(lambda yields: extract_yields_stats(yields)).values.tolist())

    # Calculate results
    results = data.drop(columns=['experiment', 0]).groupby(
        ['parameters', 'risk_factor']).mean().reset_index()
    results['std_mean_yield'] = data.groupby(
        ['parameters', 'risk_factor'])['mean_yield'].std().values
    results = results.sort_values('mean_yield', ascending=False).reset_index(drop=True)

    return results
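
# A minimal, self-contained sketch of the fan-out pattern above: every
# (parameter set, random state, CV fold) triple becomes one parallel task and
# the per-task frames are concatenated afterwards. All names below are
# illustrative stand-ins, not part of the original code.
from itertools import product

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.model_selection import KFold, ParameterGrid

X_demo = np.arange(40).reshape(20, 2)
grid = ParameterGrid({'alpha': [0.1, 1.0]})
seeds = [0, 1]
kf = KFold(n_splits=4)

def one_task(params, seed, train_idx, test_idx):
    # Stand-in for fit_bet: one row of results per task
    return pd.DataFrame([{'params': str(params), 'seed': seed,
                          'n_test': len(test_idx)}])

frames = Parallel(n_jobs=2)(
    delayed(one_task)(params, seed, tr, te)
    for params, seed, (tr, te) in product(grid, seeds, kf.split(X_demo)))
demo_results = pd.concat(frames, ignore_index=True)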
def get_stock_returns(stocks, start_date, end_date, freq):
    close_price = Parallel(n_jobs=10, backend='threading', verbose=5)(
        delayed(csf.get_stock_hist_bar)(code, freq,
                                        start_date=start_date,
                                        end_date=end_date,
                                        field=['date', 'close'])
        for code in stocks)
    # Tag each frame with its own code (the original looped over `start_date`,
    # which shadowed and clobbered the function argument)
    for code, p in zip(stocks, close_price):
        p['tick'] = code
    close_price = pd.concat(close_price)
    close_price = close_price.dropna()
    # index.name was originally empty
    close_price.index.name = 'dt'
    # Pivot to one frame, index: dt, columns: tick
    # (DataFrame.to_panel() was removed in pandas 1.0; unstack is equivalent)
    close_price = (close_price.set_index('tick', append=True)['close']
                   .unstack('tick')
                   .sort_index()
                   .ffill()
                   )
    # Keep the last row of each period
    group_key = {'M': [close_price.index.year, close_price.index.month],
                 'W': [close_price.index.year,
                       close_price.index.isocalendar().week],
                 'Q': [close_price.index.year, close_price.index.quarter]
                 }
    close_price = close_price.groupby(group_key[freq]).tail(1)
    returns = close_price.pct_change().shift(-1).dropna(axis=1, how='all')
    returns.index = returns.index.map(lambda dt: str(dt.date()))
    returns.index.name = 'date'
    returns = returns.unstack().to_frame()
    returns.columns = ['ret']
    returns = returns.swaplevel(0, 1).sort_index()
    returns.index.names = ['date', 'code']
    return returns
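
# DataFrame.to_panel() was removed in pandas 1.0; a minimal sketch of the
# equivalent long-to-wide pivot plus period-end sampling on synthetic data
# (the codes and prices below are made up):
import numpy as np
import pandas as pd

dates = pd.date_range('2020-01-01', periods=120, freq='D')
long_px = pd.concat([
    pd.DataFrame({'tick': code,
                  'close': 100 + np.random.randn(len(dates)).cumsum()},
                 index=dates)
    for code in ['000001', '000002']])
long_px.index.name = 'dt'

wide_px = long_px.set_index('tick', append=True)['close'].unstack('tick').ffill()
month_end = wide_px.groupby([wide_px.index.year, wide_px.index.month]).tail(1)
fwd_ret = month_end.pct_change().shift(-1).dropna(how='all')   # next-period returns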
def mergeLinks(self, _inbed):
    GRPA = ['LINKS']
    GRPE = ['LINKS', 'SID']
    COL1 = ['#chrom', 'start_n', 'end_n', 'length_n', 'Order', 'fflag',
            'HTSites', 'query_name']
    COL2 = ['#chrom', 'start_n', 'end_n', 'Type', 'length_n', 'forword_n',
            'LINKS', 'Order']

    #Support = _inbed.loc[(_inbed.fflag.str.contains(';HTBREAKP')), COL1 + GRPE]\
    #            .groupby(by=GRPE, sort=False)\
    #            .apply(lambda x:self.statCircle(x)).reset_index()
    # reduce time
    Support = _inbed.loc[(_inbed.fflag.str.contains(';HTBREAKP')),
                         COL1 + GRPE].groupby(by=GRPE, sort=False)
    Support = Parallel(n_jobs=-1, backend='loky')(delayed(self.statCircle)(_g)
                                                  for _l, _g in Support)
    Support = pd.concat(Support, axis=1).T.infer_objects()

    Supgrpb = Support.groupby(by=['LINKS'], sort=True)
    Suplist = [
        Supgrpb['support_ID_num'].sum().to_frame('support_num'),
        Supgrpb['SID'].size().to_frame('support_ID_num'),
        Supgrpb['Cover'].mean().to_frame('Mean_Cover'),
        Supgrpb['Depth'].mean().to_frame('Mean_Depth'),
        Supgrpb['BPHTNum'].mean().to_frame('Mean_BPHTNum'),
        Supgrpb['SID'].apply(lambda x: x.str.cat(sep=';')).to_frame('support_IDs'),
        Supgrpb['support_ID_num'].apply(
            lambda x: x.astype(str).str.cat(sep=';')).to_frame('support_read_num'),
        Supgrpb['Cover'].apply(
            lambda x: x.astype(str).str.cat(sep=';')).to_frame('Covers'),
        Supgrpb['Depth'].apply(
            lambda x: x.astype(str).str.cat(sep=';')).to_frame('Depths'),
        Supgrpb['BPHTNum'].apply(
            lambda x: x.astype(str).str.cat(sep=';')).to_frame('BPHTNums')
    ]
    Suplist = pd.concat(Suplist, ignore_index=False, join='outer',
                        sort=False, axis=1).reset_index()
    del Supgrpb

    inbed = _inbed[COL2].drop_duplicates(keep='first').copy()
    inbed.rename(columns={'start_n': 'start', 'end_n': 'end',
                          'length_n': 'length', 'forword_n': 'forword'},
                 inplace=True)
    inbed = inbed.merge(Suplist, on='LINKS', how='outer')
    return inbed, Support
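
# The swap above (groupby().apply -> Parallel over the groups) is a generic
# speed-up; a minimal sketch with illustrative names, mirroring the
# concat(axis=1).T.infer_objects() recombination used in mergeLinks:
import pandas as pd
from joblib import Parallel, delayed

demo = pd.DataFrame({'key': list('aabb'), 'val': [1, 2, 3, 4]})

def per_group(g):
    # Stand-in for statCircle: return one pd.Series per group
    return pd.Series({'key': g['key'].iloc[0], 'total': g['val'].sum()})

parts = Parallel(n_jobs=2, backend='loky')(
    delayed(per_group)(g) for _, g in demo.groupby('key', sort=False))
out = pd.concat(parts, axis=1).T.infer_objects()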
def mapanytwo1(self, indf, maxdistance=500, maxreg=True, maxline=3000000,
               oriant=False):
    def _splitmap(_inmap):
        inmap = _inmap.copy()
        for _n, _l in inmap.iterrows():
            # pandas >= 1.3 expects inclusive='both' instead of inclusive=True
            S = inmap.start_n.between(_l.start - maxdistance,
                                      _l.start + maxdistance,
                                      inclusive='both')
            E = inmap.end_n.between(_l.end - maxdistance,
                                    _l.end + maxdistance,
                                    inclusive='both')
            inmap.loc[(S & E), 'start_n'] = inmap[(S & E)]['start'].min()
            inmap.loc[(S & E), 'end_n'] = inmap[(S & E)]['end'].max()
        return inmap

    sortN = ['#chrom', 'start', 'end', 'forword']
    mapsN = ['#chrom', 'start', 'end', 'forword', 'start_n', 'end_n']
    grpby = ['#chrom', 'forword'] if oriant else ['#chrom']

    indf = indf.copy().sort_values(by=sortN)
    indf[['start_n', 'end_n']] = indf[['start', 'end']]
    if indf.shape[0] > maxline:
        inmap = indf[mapsN].drop_duplicates(keep='first')
        inmap = Parallel(n_jobs=-1, backend='loky')(
            delayed(_splitmap)(_g)
            for _, _g in inmap.groupby(by=grpby, sort=False))
        inmap = pd.concat(inmap, axis=0)
        # Drop the provisional start_n/end_n before merging so the merge does
        # not produce duplicated _x/_y columns
        indf = indf.drop(columns=['start_n', 'end_n']).merge(inmap, on=sortN,
                                                             how='left')
    else:
        indf = Parallel(n_jobs=-1, backend='loky')(
            delayed(_splitmap)(_g)
            for _, _g in indf.groupby(by=grpby, sort=False))
        indf = pd.concat(indf, axis=0)
    indf[['start_n', 'end_n']] = indf[['start_n', 'end_n']].astype(int)
    indf['length_n'] = indf['end_n'] - indf['start_n'] + 1
    return indf
def run(self):
    try:
        # Get path to all LAS files in directory
        files = os.listdir(self.dataDir)
        files = [os.path.join(self.dataDir, i) for i in files
                 if i.endswith('.las')]

        # Get LAS data for specified files
        lasData = Parallel(n_jobs=self.cpuCount)(
            delayed(self.readLasFiles)(f) for f in files)
        lasData = [i for i in lasData if i is not None]
        lasData = pd.concat(lasData, ignore_index=True)

        # Scale data: optional 1st/99th percentile clip, then z-score,
        # then min-max rescale to [0, 1]
        if self.clipLog:
            p1 = np.percentile(lasData[self.logName], 1)
            p99 = np.percentile(lasData[self.logName], 99)
            lasData[self.logName] = np.clip(lasData[self.logName], p1, p99)
        stats = lasData[self.logName].describe()
        lasData[self.logName] -= stats.loc['mean']
        lasData[self.logName] /= stats.loc['std']
        stats = lasData[self.logName].describe()
        lasData[self.logName] -= stats.loc['min']
        lasData[self.logName] /= (stats.loc['max'] - stats.loc['min'])

        # Extract patches and save to disk
        wellGrps = lasData.groupby('Well')
        Parallel(n_jobs=self.cpuCount)(
            delayed(self.saveLasPatches)(wellGrps.get_group(i), i)
            for i in lasData['Well'].unique())
    except Exception as e:
        print('Something broke', e)
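
# The scaling above chains an optional 1st/99th percentile clip, a z-score,
# and a min-max rescale to [0, 1]; a minimal sketch on synthetic values:
import numpy as np
import pandas as pd

log = pd.Series(np.random.lognormal(size=1000))
log = log.clip(np.percentile(log, 1), np.percentile(log, 99))  # clip outliers
log = (log - log.mean()) / log.std()                           # z-score
log = (log - log.min()) / (log.max() - log.min())              # min-max to [0, 1]
assert log.min() >= 0.0 and log.max() <= 1.0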
def coloc_sim(data, radius=3, min_count=5, n_cores=1, copy=False):
    """Calculate pairwise gene colocalization similarity with the cross L function.

    Parameters
    ----------
    data : AnnData
        Anndata formatted spatial data.
    radius : int
        Max radius to search for neighboring points, by default 3
    min_count : int
        Minimum points needed to be eligible for analysis.
    n_cores : int
        Number of parallel jobs, by default 1
    copy : bool
        Whether to operate on a copy of `data`, by default False

    Returns
    -------
    adata : AnnData
        .uns['coloc_sim']: Pairwise gene colocalization similarity within each
        cell formatted as a long dataframe.
    """
    adata = data.copy() if copy else data

    # Filter points and counts by min_count
    counts = adata.to_df()

    # Helper function to apply per cell
    def cell_coloc_sim(p, g_density, name):

        # Get xy coordinates
        xy = p[["x", "y"]].values

        # Get neighbors within fixed outer_radius for every point
        nn = NearestNeighbors(radius=radius).fit(xy)
        distances, point_index = nn.radius_neighbors(xy, return_distance=True)

        # Enumerate point-wise gene labels
        gene_index = p["gene"].reset_index(
            drop=True).cat.remove_unused_categories()

        # Convert to adjacency list of points, no double counting
        neighbor_pairs = []
        for g1, neighbors, n_dists in zip(gene_index.values, point_index,
                                          distances):
            for g2, d in zip(neighbors, n_dists):
                neighbor_pairs.append([g1, g2, d])

        # Calculate pair-wise gene similarity
        neighbor_pairs = pd.DataFrame(neighbor_pairs,
                                      columns=["g1", "g2", "p_dist"])

        # Keep minimum distance to g2 point
        neighbor_pairs = neighbor_pairs.groupby(["g1", "g2"]).agg(
            "min").reset_index()
        neighbor_pairs.columns = ["g1", "g2", "point_dist"]

        # Map to gene index
        neighbor_pairs["g2"] = neighbor_pairs["g2"].map(gene_index)

        # Count number of points within distance of increasing radius.
        # Bind r as a default argument: a bare `lambda dists: (dists <= r).sum()`
        # would late-bind r, so every aggregation would use the largest radius.
        r_step = 0.5
        expected_counts = [
            lambda dists, r=r: (dists <= r).sum()
            for r in np.arange(r_step, radius + r_step, r_step)
        ]
        metrics = (neighbor_pairs.groupby(["g1", "g2"]).agg({
            "point_dist": expected_counts
        }).reset_index())

        # Colocalization metric: max of L_ij(r) for r <= radius
        g2_density = g_density.loc[metrics["g2"].tolist()].values
        metrics["sim"] = ((metrics["point_dist"].divide(
            g2_density * np.pi, axis=0)).pow(0.5).max(axis=1))
        metrics["cell"] = name

        # Ignore self colocalization
        # metrics = metrics.loc[metrics["g1"] != metrics["g2"]]
        return metrics[["cell", "g1", "g2", "sim"]]

    # Only keep genes >= min_count in each cell
    gene_densities = []
    counts.apply(lambda row: gene_densities.append(row[row >= min_count]),
                 axis=1)
    # Calculate point density per gene per cell; wrap in a Series so the
    # division aligns each cell's counts with that cell's area
    gene_densities = pd.Series(gene_densities, index=counts.index)
    gene_densities /= adata.obs["cell_area"]
    gene_densities = gene_densities.values

    # TODO dask
    cell_metrics = Parallel(n_jobs=n_cores)(delayed(cell_coloc_sim)(
        get_points(adata,
                   cells=g_density.name,
                   genes=g_density.index.tolist(),
                   asgeo=True),
        g_density,
        g_density.name,
    ) for g_density in tqdm(gene_densities))

    cell_metrics = pd.concat(cell_metrics)
    cell_metrics.columns = cell_metrics.columns.get_level_values(0)

    # Make symmetric (Lij = Lji)
    cell_metrics["pair"] = cell_metrics.apply(
        lambda row: "-".join(sorted([row["g1"], row["g2"]])), axis=1)
    cell_symmetric = cell_metrics.groupby(["cell", "pair"]).mean()

    # Retain gene pair names
    cell_symmetric = (cell_metrics.set_index(["cell", "pair"]).drop(
        "sim", axis=1).join(cell_symmetric).reset_index())

    # Aggregate across cells
    coloc_agg = cell_symmetric.groupby(["pair"])["sim"].mean().to_frame()
    coloc_agg = (coloc_agg.join(
        cell_symmetric.set_index("pair").drop(
            ["sim", "cell"], axis=1)).reset_index().drop_duplicates())

    # Save coloc similarity (assign the cast back: astype never works in place)
    cell_metrics[["cell", "g1", "g2", "pair"]] = cell_metrics[
        ["cell", "g1", "g2", "pair"]].astype("category")
    coloc_agg[["g1", "g2", "pair"]] = coloc_agg[
        ["g1", "g2", "pair"]].astype("category")
    adata.uns["coloc_sim"] = cell_metrics
    adata.uns["coloc_sim_agg"] = coloc_agg

    return adata if copy else None
def typeCat(self, indf, dropcigarover=True, dropneighbdup=True,
            minalignlenght=100):
    dropcigarover = self.dropcigarover    # True
    dropneighbdup = self.dropneighbdup    # True
    GRPBY = ['SID', 'query_name']

    self.log.CI('start dropping overlap of mapping region: ' + self.inid)
    # drop alignments below the minimum align length
    indf.loc[(np.abs(indf.cigarreg.str[1] - indf.cigarreg.str[0])
              < self.minalignlenght - 1), 'fflag'] = 'LOWALIGN'
    LOWA = indf[(indf.fflag == 'LOWALIGN')]
    indf = indf[(indf.fflag != 'LOWALIGN')]

    # dropcigarover
    if dropcigarover:
        indf = Parallel(n_jobs=-1, backend='threading')(
            delayed(self.dropCigarOver)(_g)
            for _, _g in indf.groupby(by=GRPBY, sort=False))
        indf = pd.concat(indf, axis=0, sort=False)
        OVER = indf[(indf.fflag == 'OVER')]
        indf = indf[(indf.fflag != 'OVER')]

    # maxbeddistance
    self.log.CI('start computing maximal distance of mapping region: ' + self.inid)
    indf = Parallel(n_jobs=-1, backend='threading')(
        delayed(self.maxBedDistance)(_g)
        for _, _g in indf.groupby(by=GRPBY, sort=False))
    indf = pd.concat(indf, axis=0, sort=False)
    DIST = indf[(indf.fflag != 'HTDIST')]
    indf = indf[(indf.fflag == 'HTDIST')]

    # mergeNeighb
    self.log.CI('start merging neighbour duplications of mapping region: ' + self.inid)
    indf = Parallel(n_jobs=-1, backend='threading')(
        delayed(self.mergeNeighb)(_g)
        for _, _g in indf.groupby(by=GRPBY, sort=False))
    indf = pd.concat(indf, axis=0, sort=False)
    DUPL = indf[(indf.fflag.str.contains('DUPLIC', regex=False))]
    indf = indf[~(indf.fflag.str.contains('DUPLIC', regex=False))]

    # markEcDNA
    self.log.CI('start marking and merging head-to-tail mapping region: ' + self.inid)
    indf = Parallel(n_jobs=-1, backend='threading')(
        delayed(self.markKeep)(_g)
        for _, _g in indf.groupby(by=GRPBY, sort=False))
    indf = pd.concat(indf, axis=0, sort=False)
    LINE = indf[~((indf.fflag.str.contains('EcDNA'))
                  & ~(indf.fflag.str.contains('MISS')))]
    indf = indf[((indf.fflag.str.contains('EcDNA'))
                 & ~(indf.fflag.str.contains('MISS')))]

    # mergeHeadTail
    self.log.CI('start merging head-to-tail mapping region: ' + self.inid)
    indf = Parallel(n_jobs=-1, backend='threading')(
        delayed(self.mergeHeadTail)(_g)
        for _, _g in indf.groupby(by=GRPBY, sort=False))
    indf = pd.concat(indf, axis=0, sort=False)

    # concat
    MARK = pd.concat([LOWA, OVER, DIST, DUPL, LINE, indf], axis=0, sort=False)
    del LOWA, OVER, DIST, DUPL, LINE

    # headtailregion
    self.log.CI('start adding head/tail site to a new column: ' + self.inid)
    KEEP = indf.merge(
        indf.groupby(by=GRPBY, sort=False)
            .apply(lambda x: x.loc[(x.fflag.str.contains(';HEAD|;TAIL')),
                                   ['start', 'end']].values.tolist())
            .to_frame(name='HTSites').reset_index(),
        on=GRPBY)
    #KEEP.loc[~KEEP.fflag.str.contains('HTBREAKP'), 'HTSites'] = ''
    KEEP = KEEP[~(KEEP.fflag.str.contains('HEAD|TAIL', regex=True))]

    MARK.to_csv(self.arg.outpre + '.Mark', sep='\t', index=False)
    KEEP.to_csv(self.arg.outpre + '.Keep', sep='\t', index=False)
    del MARK, KEEP, indf
plt.plot(data.mjd_short, data.flux_5);
ax.set_title('{} - {}'.format(s_ind, len(data)))
fig.suptitle(obj_id);
fig.tight_layout()

# %%
# passband_diff histogram
plt.hist([dataset_proc.passband_diff], bins='auto');

# %%
# Number of observations histogram
lens = [max(g.n_obs) + 1 for name, g in dataset_proc.groupby('object_id')]
plt.hist(lens, bins='auto');
for l in set(lens):
    l_lens = len([li for li in lens if li == l])
    # {:.2%} renders the share as a percentage (the original {:.2f}% printed
    # the raw fraction followed by a percent sign)
    print('{}: {:.2%} - {}'.format(l, l_lens / len(lens), l_lens))

# %%
# Length of each observation
lens = [len(g) for name, g in dataset_proc.groupby(['object_id', 'n_obs'])]
plt.hist(lens, bins='auto');
filename = Path("C:/Users/Dustin/Desktop/datafile.din")
if not filename.exists():
    with open(str(filename), "wb") as handle:
        for data in tqdm(response.iter_content()):
            handle.write(data)

# Parse file
with open(str(filename), 'r') as f:
    cc_file = Parallel(n_jobs=NUM_CORES)(delayed(validate_line)(line)
                                         for line in f.readlines())
cc_file = [x for x in cc_file if x is not None]

# Convert to dataframe
cc_file = pd.DataFrame(cc_file, columns=['op_type', 'register'])

# Bar plot of frequency by register
print('Plotting bar plot...')
plot_data = cc_file.groupby('register').size().plot.bar()
plt.show(block=True)

print('Frequency by op_type:')
print(cc_file.groupby('op_type').size())

# B #######
A_int = generate_matrix_int(348, 200)
A_dbl = generate_matrix_dbl(348, 200)
B_int = generate_matrix_int(200, 140)
B_dbl = generate_matrix_dbl(200, 140)

row_int = timeit.repeat('mult_row(A_int, B_int)',
                        'from __main__ import mult_row, A_int, B_int',
                        number=1, repeat=10)
row_dbl = timeit.repeat('mult_row(A_dbl, B_dbl)',
                        'from __main__ import mult_row, A_dbl, B_dbl',
                        number=1, repeat=10)
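
# Since Python 3.5, timeit also accepts globals=, which avoids the
# 'from __main__ import ...' setup strings used above; an equivalent sketch
# (assumes mult_row and the matrices are defined in the surrounding script):
import timeit
row_int = timeit.repeat('mult_row(A_int, B_int)', globals=globals(),
                        number=1, repeat=10)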
def run_seir_varying_control_simulations(var, R0):
    # convert to two type:
    index_map = [np.arange(0, 12), np.arange(12, 15)]
    # todo: may be computationally smart to move this outside
    par = h.get_two_type_params(index_map=index_map, R0=R0)
    #par["stay_duration"] = 6
    par["tmax"] = 365 * 10    # todo: check this is long enough
    par["hosp_cap"] = 17800
    # par["hosp_rate"] = 0.0368
    hc_range = np.arange(2200, 60000, 1200)
    hc_range[28] = 35600    # add exactly double 17800 to list
    sd_range = np.arange(1, 25, 1)

    if var == "hosp_cap":
        var_range = hc_range
    elif var == "stay_duration":
        var_range = sd_range

    start_time = time.time()
    # tt = pd.concat([get_two_type_df(hosp_cap=h) for h in
    #                 np.arange(0.1, 1.1, 0.1)*1e5])
    sim_df = Parallel(n_jobs=n_cores)(delayed(two_type_df_wrapper)(par, var, v)
                                      for v in var_range)
    sim_df = pd.concat(sim_df)
    sim_df = sim_df.sort_values([var, "time"])
    print("--- %s seconds ---" % (time.time() - start_time))

    def get_ctl_dur(df):
        df_f = df[df["control"] > 0]
        return df_f.index.max() - df_f.index.min()

    def get_final_full_R0(df):
        return df["Reff_full"].iloc[-1]

    def get_final_sgl_R0(df):
        return df["Reff_single"].iloc[-1]

    def get_S0_start(df):
        df_f = df[df["control"] > 0]
        return df_f.loc[df_f.index.min(), "S0"]

    out_g = sim_df.groupby(var)
    out_stats = pd.DataFrame({"control_duration": out_g.apply(get_ctl_dur),
                              "final_full_R0": out_g.apply(get_final_full_R0),
                              "final_single_R0": out_g.apply(get_final_sgl_R0)})
    out_stats["full_hi"] = out_stats["final_full_R0"] < 1
    out_stats["S0"] = out_g.apply(get_S0_start)

    def mt_wrapper(x):
        par_x = {**par, **{var: x.name, "S0": x["S0"]}}
        return get_min_time_to_herd_immunity_approx(par_x)

    out_stats["approx_time_to_hi"] = out_stats.apply(mt_wrapper, axis=1)
    return sim_df, out_stats
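
# Minimal sketch of the summary pattern above: one groupby, several
# apply-based statistics assembled column-by-column into a frame (all values
# illustrative):
import pandas as pd

sim = pd.DataFrame({'cap': [1, 1, 2, 2], 'control': [0, 1, 1, 0],
                    'Reff_full': [1.2, 0.9, 1.1, 0.8]})
g = sim.groupby('cap')
stats = pd.DataFrame({
    'control_duration': g.apply(lambda df: (df['control'] > 0).sum()),
    'final_full_R0': g.apply(lambda df: df['Reff_full'].iloc[-1]),
})
stats['full_hi'] = stats['final_full_R0'] < 1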
def ocsvm_rules_experiments_pipeline(df_mat, numerical_cols, categorical_cols,
                                     cluster_algorithm, method, rules_used,
                                     dct_params, path_folder, file_template,
                                     store_intermediate=False, plot_fig=False):
    """
    Parameters
    ----------
    df_mat : DataFrame
        Input data matrix.
    numerical_cols : list
        Names of the numerical feature columns.
    categorical_cols : list
        Names of the categorical feature columns.
    cluster_algorithm : str
        Clustering algorithm used by the rule extractor (e.g. "kprototypes").
    method : str
        Rule extraction method.
    rules_used : str
        One of "all", "inliers" or "outliers".
    dct_params : dict
        Hyperparameters passed to the OCSVM.
    path_folder : str
        Folder where models, rules and plots are stored.
    file_template : str
        Template used to name the output files.
    store_intermediate : bool, optional
        Reuse previously stored models/rules when available. The default is
        False.
    plot_fig : bool, optional
        The default is False.

    Returns
    -------
    None.

    """
    if rules_used not in ("all", "inliers", "outliers"):
        raise ValueError("Argument {0} not found -- use ['all', 'outliers' or "
                         "'inliers'] instead".format(rules_used))

    print("Beginning process...")

    if rules_used in ("all", "inliers"):
        print("\n\n")
        print("*" * 100)
        print("Obtaining Rules for Inliers...")
        print("*" * 100)
        use_inverse = False
        file_name = file_naming_ocsvm(file_template=file_template,
                                      cluster_algorithm=cluster_algorithm,
                                      method=method,
                                      use_inverse=use_inverse)

        #### Obtain Rules [Inliers]
        if not store_intermediate:
            # Rules
            print("Fitting OCSVM model...")
            clf, sc, df_result, df_anomalies = ocsvm_rule_extractor(
                dataset_mat=df_mat,
                numerical_cols=numerical_cols,
                categorical_cols=categorical_cols,
                clustering_algorithm=cluster_algorithm,
                method=method,
                use_inverse=use_inverse,
                dct_params=dct_params,
                store_intermediate=store_intermediate,
                path_save_model=path_folder)
            df_all = df_result
            df_no = df_anomalies[df_anomalies['predictions'] == 1]
            df_no = df_no.drop_duplicates()
            print("Max different values (inliers) : {0} | Rules extracted {1}"
                  .format(len(df_no), len(df_all)))
            print("Saving rules...")
            df_all.to_csv(path_folder + '/df_rules_' + file_name + '.csv',
                          index=False)
            df_anomalies.to_csv(path_folder + '/df_anomalies_' + file_name
                                + '.csv', index=False)
        else:
            try:
                df_all = pd.read_csv(path_folder + '/df_rules_' + file_name
                                     + '.csv')
                df_anomalies = pd.read_csv(path_folder + '/df_anomalies_'
                                           + file_name + '.csv')
                clf = pickle.load(open("{0}/backup.p".format(path_folder), "rb"))
                sc = pickle.load(open("{0}/sc.p".format(path_folder), "rb"))
            except FileNotFoundError:
                print("File not found! Fitting OCSVM model...")
                clf, sc, df_result, df_anomalies = ocsvm_rule_extractor(
                    dataset_mat=df_mat,
                    numerical_cols=numerical_cols,
                    categorical_cols=categorical_cols,
                    clustering_algorithm=cluster_algorithm,
                    method=method,
                    use_inverse=use_inverse,
                    dct_params=dct_params,
                    store_intermediate=store_intermediate,
                    path_save_model=path_folder)
                df_all = df_result
                df_no = df_anomalies[df_anomalies['predictions'] == 1]
                df_no = df_no.drop_duplicates()
                print("Max different values (inliers) : {0} | Rules extracted {1}"
                      .format(len(df_no), len(df_all)))
                print("Saving rules...")
                df_all.to_csv(path_folder + '/df_rules_' + file_name + '.csv',
                              index=False)
                df_anomalies.to_csv(path_folder + '/df_anomalies_' + file_name
                                    + '.csv', index=False)

        # If kprototypes, do not consider "categorical cols" for the purpose
        # of the rest of the code
        if cluster_algorithm == "kprototypes":
            feature_cols = list(set(numerical_cols + categorical_cols))
            cat_additional = []
        else:
            feature_cols = numerical_cols
            cat_additional = categorical_cols

        df_rules = df_all
        inliers_used = True
        clustering_algorithm = cluster_algorithm
        path = path_folder

        df_rules['n_inliers_included'] = 0
        df_rules['n_outliers_included'] = 0
        n_inliers = len(df_anomalies[df_anomalies['predictions'] == 1])
        n_outliers = len(df_anomalies[df_anomalies['predictions'] == -1])
        n_vertex = (len(cat_additional) + 1) * 2**(len(feature_cols))

        print("Checking inliers inside rules...")
        df_check = Parallel(n_jobs=N_JOBS)(
            delayed(check_datapoint_inside_only)(data_point, df_rules,
                                                 feature_cols, cat_additional)
            for i, data_point
            in df_anomalies[df_anomalies['predictions'] == 1].iterrows())
        df_check = pd.concat([x[x['check'] > 0] for x in df_check])
        df_check = pd.DataFrame(df_check.groupby(df_check.index).sum()).reset_index()
        df_temp = df_rules[['n_inliers_included']].reset_index()
        df_check = df_temp.merge(df_check, how="outer")[['check']].fillna(0)
        df_rules['n_inliers_included'] = df_check

        print("Checking outliers inside rules...")
        df_check = Parallel(n_jobs=N_JOBS)(
            delayed(check_datapoint_inside_only)(data_point, df_rules,
                                                 feature_cols, cat_additional)
            for i, data_point
            in df_anomalies[df_anomalies['predictions'] == -1].iterrows())
        df_check = pd.concat([x[x['check'] > 0] for x in df_check])
        df_check = pd.DataFrame(df_check.groupby(df_check.index).sum()).reset_index()
        df_temp = df_rules[['n_inliers_included']].reset_index()
        df_check = df_temp.merge(df_check, how="outer")[['check']].fillna(0)
        df_rules['n_outliers_included'] = df_check

        # Check how many datapoints are included in the rules with Precision=1
        print("Checking inliers/outliers inside hypercubes with Precision=1...")
        n_inliers = len(df_anomalies[df_anomalies['predictions'] == 1])
        n_outliers = len(df_anomalies[df_anomalies['predictions'] == -1])

        def wrapper_precision_check(data_point):
            df_rules['check'] = check_datapoint_inside(data_point, df_rules,
                                                       feature_cols,
                                                       cat_additional)['check']
            n_inliers_p1 = 0
            n_inliers_p0 = 0
            n_outliers_p1 = 0
            n_outliers_p0 = 0
            if inliers_used:
                # If inlier
                if data_point['predictions'] == 1:
                    # Rules with any P that include this datapoint
                    df_aux = df_rules[(df_rules['check'] == 1)]
                    if len(df_aux) > 0:
                        n_inliers_p0 += 1
                    # Rules with P=1 that include this datapoint
                    df_aux = df_rules[(df_rules['n_outliers_included'] == 0)
                                      & (df_rules['check'] == 1)]
                    if len(df_aux) > 0:
                        n_inliers_p1 += 1
            else:
                # If outlier
                if data_point['predictions'] == -1:
                    # Rules with any P that include this datapoint
                    df_aux = df_rules[(df_rules['check'] == 1)]
                    if len(df_aux) > 0:
                        n_outliers_p0 += 1
                    # Rules with P=1 that include this datapoint
                    df_aux = df_rules[(df_rules['n_inliers_included'] == 0)
                                      & (df_rules['check'] == 1)]
                    if len(df_aux) > 0:
                        n_outliers_p1 += 1
            return {'n_inliers_p0': n_inliers_p0,
                    'n_inliers_p1': n_inliers_p1,
                    'n_outliers_p0': n_outliers_p0,
                    'n_outliers_p1': n_outliers_p1}

        dct_out = Parallel(n_jobs=N_JOBS)(
            delayed(wrapper_precision_check)(data_point)
            for i, data_point in df_anomalies.iterrows())
        df_out = pd.DataFrame(dct_out).sum()

        if inliers_used:
            df_rules['n_inliers'] = n_inliers
            df_rules['n_inliers_p0'] = df_out['n_inliers_p0']
            df_rules['n_inliers_p1'] = df_out['n_inliers_p1']
            df_rules = df_rules.drop(columns=['check'], errors='ignore')
            path_aux = "inliers"
        else:
            df_rules['n_outliers_p1'] = df_out['n_outliers_p1']
            df_rules['n_outliers_p0'] = df_out['n_outliers_p0']
            df_rules['n_outliers'] = n_outliers
            df_rules = df_rules.drop(columns=['check'], errors='ignore')
            path_aux = "outliers"

        # Save to CSV
        df_rules.to_csv("{path}/{file_name}_rules_{type_r}_pruned_ocsvm.csv"
                        .format(path=path, file_name=file_name,
                                type_r=path_aux), index=False)

        # Use only pure rules
        df_rules = df_rules[df_rules["n_outliers_included"] == 0]

        print("Obtaining metrics...")
        df_rules = rule_overlapping_score(df_rules, df_anomalies,
                                          feature_cols, cat_additional)
        df_rules = check_stability(df_anomalies, df_rules, clf,
                                   feature_cols, cat_additional,
                                   using_inliers=True)

        # Saving rules obtained
        print("Saving rules...")
        df_rules.to_csv(path_folder + '/df_rules_complete_' + file_name
                        + '.csv', index=False)

        if plot_fig:
            #### Plot Rules [Inliers]
            print("Plotting rules for inliers...")
            df_rules = df_rules.copy()
            df_rules = df_rules.drop_duplicates().reset_index(drop=True)
            plot_2D(df_rules, df_anomalies, folder=path_folder,
                    path_name=file_name)

    if rules_used in ("all", "outliers"):
        print("\n\n")
        print("*" * 100)
        print("Obtaining Rules for Outliers...")
        print("*" * 100)

        #### Obtain Rules [Outliers]
        use_inverse = True
        file_name = file_naming_ocsvm(file_template=file_template,
                                      cluster_algorithm=cluster_algorithm,
                                      method=method,
                                      use_inverse=use_inverse)

        if not store_intermediate:
            # Rules
            print("Fitting OCSVM model...")
            clf, sc, df_result, df_anomalies = ocsvm_rule_extractor(
                dataset_mat=df_mat,
                numerical_cols=numerical_cols,
                categorical_cols=categorical_cols,
                clustering_algorithm=cluster_algorithm,
                method=method,
                use_inverse=use_inverse,
                dct_params=dct_params,
                store_intermediate=False,
                path_save_model=path_folder)
            df_all = df_result
            df_no = df_anomalies[df_anomalies['predictions'] == 1]
            df_no = df_no.drop_duplicates()
            print("Max different values (outliers) : {0} | Rules extracted {1}"
                  .format(len(df_no), len(df_all)))
print("Saving rules...") df_all.to_csv(path_folder + '/df_rules_' + file_name + '.csv', index=False) df_anomalies.to_csv(path_folder + '/df_anomalies_' + file_name + '.csv', index=False) else: try: df_all = pd.read_csv(path_folder + '/df_rules_' + file_name + '.csv') df_anomalies = pd.read_csv(path_folder + '/df_anomalies_' + file_name + '.csv') clf = pickle.load(open("{0}/backup.p".format(path_folder), "rb")) sc = pickle.load(open("{0}/sc.p".format(path_folder), "rb")) except: print("File not found! Fitting OCSVM model...") clf, sc, df_result, df_anomalies = ocsvm_rule_extractor(dataset_mat=df_mat, numerical_cols=numerical_cols, categorical_cols=categorical_cols, clustering_algorithm=cluster_algorithm, method=method, use_inverse=use_inverse, dct_params=dct_params, store_intermediate=store_intermediate, path_save_model=path_folder) df_all = df_result df_no = df_anomalies[df_anomalies['predictions'] == 1] df_no = df_no.drop_duplicates() print( "Max different values (outliers) : {0} | Rules extracted {1}".format( len(df_no), len(df_all))) print("Saving rules...") df_all.to_csv(path_folder + '/df_rules_' + file_name + '.csv', index=False) df_anomalies.to_csv(path_folder + '/df_anomalies_' + file_name + '.csv', index=False) # If kprototypes, do not consider "categorical cols" for the purpose of the rest of the code if cluster_algorithm == "kprototypes": feature_cols = list(set(numerical_cols + categorical_cols)) cat_additional = [] else: feature_cols = numerical_cols cat_additional = categorical_cols # Complete Rules print("Checking outliers inside hypercubes...") df_anomalies['predictions'] = df_anomalies['predictions']*-1 df_anomalies['distances'] = df_anomalies['distances']*-1 df_anomalies = df_anomalies df_rules = df_all inliers_used=False clustering_algorithm=cluster_algorithm path=path_folder file_name=file_name df_rules['n_inliers_included'] = 0 df_rules['n_outliers_included'] = 0 n_inliers = len(df_anomalies[df_anomalies['predictions']==1]) n_outliers = len(df_anomalies[df_anomalies['predictions']==-1]) n_vertex = (len(cat_additional) + 1)*2**(len(feature_cols)) print("Checking inliers inside rules...") df_check = Parallel(n_jobs=N_JOBS)(delayed(check_datapoint_inside_only)(data_point,df_rules,feature_cols,cat_additional) for i, data_point in df_anomalies[df_anomalies['predictions']==1].iterrows()) df_check = pd.concat([x[x['check']>0] for x in df_check]) df_check = pd.DataFrame(df_check.groupby(df_check.index).sum()).reset_index() df_temp = df_rules[['n_inliers_included']].reset_index() df_check = df_temp.merge(df_check, how="outer")[['check']].fillna(0) df_rules['n_inliers_included'] = df_check print("Checking outliers inside rules...") df_check = Parallel(n_jobs=N_JOBS)(delayed(check_datapoint_inside_only)(data_point,df_rules,feature_cols,cat_additional) for i, data_point in df_anomalies[df_anomalies['predictions']==-1].iterrows()) df_check = pd.concat([x[x['check']>0] for x in df_check]) df_check = pd.DataFrame(df_check.groupby(df_check.index).sum()).reset_index() df_temp = df_rules[['n_inliers_included']].reset_index() df_check = df_temp.merge(df_check, how="outer")[['check']].fillna(0) df_rules['n_outliers_included'] = df_check # Check how many datapoints are included with the rules with Precision=1 print("Checking inliers/outliers inside hypercubes with Precision=1...") n_inliers_p1 = 0 n_inliers_p0 = 0 n_outliers_p1 = 0 n_outliers_p0 = 0 n_inliers = len(df_anomalies[df_anomalies['predictions']==1]) n_outliers = len(df_anomalies[df_anomalies['predictions']==-1]) def 
wrapper_precision_check(data_point): df_rules['check'] = check_datapoint_inside(data_point, df_rules, feature_cols, cat_additional)['check'] n_inliers_p1 = 0 n_inliers_p0 = 0 n_outliers_p1 = 0 n_outliers_p0 = 0 if inliers_used: # If inlier if data_point['predictions']==1: # Rules with any P and that include this datapoint df_aux = df_rules[(df_rules['check']==1)] if len(df_aux) > 0: n_inliers_p0 += 1 # Rules with P=1 and that include this datapoint df_aux = df_rules[(df_rules['n_outliers_included']==0) & (df_rules['check']==1)] if len(df_aux) > 0: n_inliers_p1 += 1 else: # If outlier if data_point['predictions']==-1: # Rules with any P and that include this datapoint df_aux = df_rules[(df_rules['check']==1)] if len(df_aux) > 0: n_outliers_p0 += 1 # Rules with P=1 and that include this datapoint df_aux = df_rules[(df_rules['n_inliers_included']==0) & (df_rules['check']==1)] if len(df_aux) > 0: n_outliers_p1 += 1 return {'n_inliers_p0':n_inliers_p0, 'n_inliers_p1':n_inliers_p1, 'n_outliers_p0':n_outliers_p0, 'n_outliers_p1':n_outliers_p1} dct_out = Parallel(n_jobs=N_JOBS)(delayed(wrapper_precision_check)(data_point) for i, data_point in df_anomalies.iterrows()) df_out = pd.DataFrame(dct_out).sum() for i, data_point in df_anomalies.iterrows(): df_rules['check'] = check_datapoint_inside(data_point, df_rules, feature_cols, cat_additional)['check'] if inliers_used: # If inlier if data_point['predictions']==1: # Rules with any P and that include this datapoint df_aux = df_rules[(df_rules['check']==1)] if len(df_aux) > 0: n_inliers_p0 += 1 # Rules with P=1 and that include this datapoint df_aux = df_rules[(df_rules['n_outliers_included']==0) & (df_rules['check']==1)] if len(df_aux) > 0: n_inliers_p1 += 1 else: # If outlier if data_point['predictions']==-1: # Rules with any P and that include this datapoint df_aux = df_rules[(df_rules['check']==1)] if len(df_aux) > 0: n_outliers_p0 += 1 # Rules with P=1 and that include this datapoint df_aux = df_rules[(df_rules['n_inliers_included']==0) & (df_rules['check']==1)] if len(df_aux) > 0: n_outliers_p1 += 1 if inliers_used: df_rules['n_inliers'] = n_inliers df_rules['n_inliers_p0'] = df_out['n_inliers_p0'] df_rules['n_inliers_p1'] = df_out['n_inliers_p1'] try: del df_rules['check'] except: pass path_aux = "inliers" else: df_rules['n_outliers_p1'] = df_out['n_outliers_p1'] df_rules['n_outliers_p0'] = df_out['n_outliers_p0'] df_rules['n_outliers'] = n_outliers try: del df_rules['check'] except: pass path_aux = "outliers" # Save to CSV df_rules.to_csv("{path}/{file_name}_rules_{type_r}_pruned_ocsvm.csv".format(path=path, file_name=file_name, type_r = path_aux), index=False) df_rules = df_rules[df_rules["n_inliers_included"]==0] print("Obtaining metrics...") df_rules = rule_overlapping_score(df_rules, df_anomalies, feature_cols, cat_additional) df_rules = check_stability(df_anomalies, df_rules, clf, feature_cols, cat_additional, using_inliers=False) # Saving rules obtained print("Saving rules...") df_rules.to_csv(path_folder + '/df_rules_complete_' + file_name + '.csv', index=False) if plot_fig: #### Plot Rules [Outliers] print("Plotting rules for outliers...") df_rules = df_rules.copy() df_rules = df_rules.drop_duplicates().reset_index(drop=True) plot_2D(df_rules, df_anomalies, folder = path_folder, path_name = file_name) else: raise ValueError("Argument {0} not found -- use ['all', 'outliers' or 'inliers'] instead".format(rules_used) )
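
# The Precision=1 counting above fans a per-row check out with joblib and then
# sums the returned dicts; a compact sketch of that aggregation pattern
# (check() is an illustrative stand-in for wrapper_precision_check):
import pandas as pd
from joblib import Parallel, delayed

def check(v):
    return {'p0': int(v > 0), 'p1': int(v > 1)}

dct_out = Parallel(n_jobs=2)(delayed(check)(v) for v in [0, 1, 2, 3])
totals = pd.DataFrame(dct_out).sum()   # Series of per-key totals: p0=3, p1=2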
    if remove_stop:
        # remove stopwords
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    # store in dataframe
    df = pd.DataFrame(words, columns=['word'])
    df['author'] = row.author
    df['datetime'] = row.datetime
    df['text'] = text
    return df

convo_token = Parallel(n_jobs=12)(delayed(tokenize_row)(row)
                                  for _, row in tqdm(convo_df.iterrows()))

# unpack text dataframes
convo_token = pd.concat(convo_token, ignore_index=False)

word_counts = convo_token.groupby('author').word.value_counts()
word_counts.name = 'counts'
word_counts = word_counts.reset_index()

plot_ecdf(word_counts.counts.values)

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# lower max_font_size, change the maximum number of words and lighten the background:
wordcloud = WordCloud(max_font_size=50, max_words=100,
                      background_color="white").generate(
    ' '.join(convo_token[convo_token.author == 'Porfi'].word.values))

# Display the generated image:
plt.figure()
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")

# lower max_font_size, change the maximum number of words and lighten the background:
wordcloud = WordCloud(max_font_size=50, max_words=100,
                      background_color="white").generate(
    ' '.join(convo_token[convo_token.author == 'Ellen'].word.values))

# Display the generated image:
plt.figure()
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
def transferability_plink(args):
    """Execute transferability code."""
    sumstats = pd.read_table(args.sumstats, delim_whitespace=True)
    sum_snps = sumstats.SNP.tolist()
    if not os.path.isfile(args.refld):
        compute_ld(args.reference, args.refld, args.plinkexe,
                   window=args.window)
    if not os.path.isfile(args.tarld):
        compute_ld(args.target, args.tarld, args.plinkexe,
                   window=args.window)
    df1, snps1 = readLD(args.refld)
    df2, snps2 = readLD(args.tarld)
    available_snps = set(snps1).intersection(snps2).intersection(sum_snps)
    matfile = '%s_matrices.pickle' % args.prefix
    if not os.path.isfile(matfile):
        ld1 = get_blocks(df1, available_snps, args.refld,
                         sliding=args.sliding, cpus=args.threads)
        ld2 = get_blocks(df2, available_snps, args.tarld,
                         sliding=args.sliding, cpus=args.threads)
        pick = pickle.dumps((ld1, ld2))
        with gzip.open(matfile, 'w') as F:
            F.write(pick)
    else:
        print('Loading previously computed blocks')
        with gzip.open(matfile, 'r') as F:
            ld1, ld2 = pickle.loads(F.read())
    print('Setting the loci')
    # loci = Parallel(n_jobs=int(args.threads))(delayed(thelocus)(i, ld1, ld2,
    #                                                             sum_snps)
    #                                           for i in range(len(ld1)))
    loci = [thelocus(index, ld1, ld2, sum_snps) for index in range(len(ld1))]
    avh2 = args.h2 / len(sum_snps)
    with open('%s_loci.pickle' % args.prefix, 'wb') as L:
        pickle.dump(loci, L)
    N = map_count('%s.fam' % args.target)
    resfile = '%s_res.tsv' % args.prefix
    print('Compute expected beta square per locus...')
    if not os.path.isfile(resfile):
        res = Parallel(n_jobs=int(args.threads))(delayed(per_locus)(
            locus, sumstats, avh2, args.h2, N, ld1[i], ld2[i], len(loci))
            for i, locus in tqdm(enumerate(loci), total=len(loci)))
        res = pd.concat(res)
        res.to_csv(resfile, index=False, sep='\t')
    else:
        res = pd.read_csv(resfile, sep='\t')
    if args.sliding:
        res = res.groupby('SNP').mean()
        res['SNP'] = res.index.tolist()
    # product, _ = smartcotagsort(args.prefix, res, column='ese')
    product = res.sort_values('ese', ascending=False).reset_index(drop=True)
    product['Index'] = product.index.tolist()
    nsnps = product.shape[0]
    percentages = set_first_step(nsnps, 5, every=False)
    snps = np.around((percentages * nsnps) / 100).astype(int)
    qfile = '%s.qfile' % args.prefix
    if args.qrange is None:
        qrange = '%s.qrange' % args.prefix
        qr, qrange = gen_qrange(args.prefix, nsnps, 5, qrange, every=False)
    else:
        qrange = args.qrange
        order = ['label', 'Min', 'Max']
        qr = pd.read_csv(qrange, sep=' ', header=None, names=order)
    product.loc[:, ['SNP', 'Index']].to_csv(qfile, sep=' ', header=False,
                                            index=False)
    df = qrscore(args.plinkexe, args.target, args.sumstats, qrange, qfile,
                 args.allele_file, args.pheno, args.prefix, qr, args.maxmem,
                 args.threads, 'None', args.prefix)
    # get ppt results
    # ppts = []
    # for i in glob('*.results'):
    #     three_code = i[:4]
    #     results = pd.read_table(i, sep='\t')
    #     R2 = results.nlargest(1, 'R2').R2.iloc[0]
    #     ppts.append((three_code, R2))
    # ppts = sorted(ppts, key=lambda x: x[1], reverse=True)
    # aest = [('0.5', '*'), ('k', '.')]
    if args.merged is not None:
        merged = pd.read_table(args.merged, sep='\t')
        merged = merged.merge(df, on='Number of SNPs')
        f, ax = plt.subplots()
        merged.plot.scatter(x='Number of SNPs', y='R2', alpha=0.5, c='purple',
                            s=5, ax=ax, label='Transferability', linestyle=':')
        merged.plot.scatter(x='Number of SNPs', y=r'$R^{2}$_cotag',
                            label='Cotagging', c='r', s=2, alpha=0.5, ax=ax)
        merged.plot.scatter(x='Number of SNPs', y='R2_hybrid', c='g', s=5,
                            alpha=0.5, ax=ax, label='Hybrid (COT & P+T)')
        merged.plot.scatter(x='Number of SNPs', y='$R^{2}$_clumEUR', c='0.5',
                            s=5, alpha=0.5, marker='*', ax=ax, label='EUR P+T')
        merged.plot.scatter(x='Number of SNPs', y='$R^{2}$_clumAFR', c='k',
                            s=5, alpha=0.5, marker='.', ax=ax, label='AFR P+T')
        # for i, item in enumerate(ppts):
        #     pop, r2 = item
        #     ax.axhline(r2, label='%s P + T Best' % pop, color=aest[i][0],
        #                ls='--', marker=aest[i][1], markevery=10)
        plt.ylabel('$R^2$')
        plt.legend()
        plt.tight_layout()
        plt.savefig('%s_transferability.pdf' % args.prefix)
        plt.close()
    return res
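
# The matrices cache above is a gzip-compressed pickle; a minimal round-trip
# sketch (file name illustrative):
import gzip
import pickle

with gzip.open('demo_matrices.pickle', 'w') as F:
    F.write(pickle.dumps(({'block': 1}, {'block': 2})))
with gzip.open('demo_matrices.pickle', 'r') as F:
    ld1_demo, ld2_demo = pickle.loads(F.read())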
# get apparent burst size
burstprops_app = Parallel(n_jobs=12)(
    delayed(quant.get_app_bs)(_model, _ctd, _run, _delta, tr)
    for (_model, _ctd, _run, _delta), tr in tqdm(traces.iterrows()))
burstprops_app = pd.concat(burstprops_app, ignore_index=True)

# Fraction of active cells
active_cells_frac_mes = samples[samples.time > 10].groupby(
    ['ctd', 'var_p_val', 'time']).apply(
        lambda x: np.sum(x.pol_p > 0) / len(x)).reset_index(
            name='active_cells_frac')

# Get difference between true and apparent burst size
bs_med = burstprops.groupby(['var_p_val', 'ctd', 'run'])['burst_size'].apply(
    np.mean).reset_index()
appbs_med = burstprops_app.groupby(['var_p_val', 'ctd', 'run'])['app_bs'].apply(
    np.mean).reset_index()
bs_med = pd.merge(bs_med, appbs_med, on=['var_p_val', 'ctd', 'run'])
bs_dev = bs_med.groupby(['var_p_val', 'ctd'])[['burst_size', 'app_bs']].apply(
    np.mean).reset_index()
bs_dev['bs_dev'] = bs_dev.app_bs - bs_dev.burst_size

# Merge frac active cells with burst size deviation
bsdev_summ = bs_dev.groupby(['var_p_val', 'ctd'])[['bs_dev']].mean().reset_index()
actcells_summ = active_cells_frac_mes.groupby(
    ['var_p_val', 'ctd'])[['active_cells_frac']].mean().reset_index()
bs_act_summ = pd.merge(bsdev_summ, actcells_summ, on=['var_p_val', 'ctd'])

bs_act_summ.to_csv('./data/gillespie_bsize_obsvtrue.csv', index=False)
burstprops_app.to_csv('./data/gillespie_burstpropsapp.csv', index=False)