import time

import numpy as np
import pandas as pd
from joblib import Parallel, delayed


def bootstrap_reconstruction(testing_activity, targets, iterations, WM, WM_t, Inter,
                             region, condition, subject, ref_angle=180):
    # NOTE: nscans_wm, numcores, TR, f2 and Representation are assumed to be
    # defined at module level.
    Reconstructions_boots = []
    for n_rep in range(iterations):
        time_rec_boot_start = time.time()
        # Bootstrapped indexes for the reconstruction (sampled with replacement)
        indexes_boots = np.random.randint(0, len(targets), len(targets))
        ### make the reconstructions and append them
        targets_boot = targets[indexes_boots]
        signal_boots = testing_activity[indexes_boots, :, :]
        signal_boots_paralel = [signal_boots[:, i, :] for i in range(nscans_wm)]
        #### reconstruction standard (parallel across scans)
        Reconstructions_boot = Parallel(n_jobs=numcores)(
            delayed(Representation)(signal, targets_boot, WM, WM_t,
                                    ref_angle=ref_angle, plot=False, intercept=Inter)
            for signal in signal_boots_paralel)
        Reconstructions_boot = pd.concat(Reconstructions_boot, axis=1)  # mean of all the trials
        Reconstructions_boot.columns = [str(i * TR) for i in range(nscans_wm)]  # column names
        Reconstructions_boots.append(Reconstructions_boot)  # append the reconstruction (of the current iteration)
        time_rec_boot_end = time.time()
        time_rec_boot = time_rec_boot_end - time_rec_boot_start
        print('boot_' + str(n_rep) + ': ' + str(time_rec_boot))  # print the time this bootstrap iteration took

    ### Get just the supposed target location
    df_boots = []
    for i in range(len(Reconstructions_boots)):
        # Row around the ref_angle (x2 because we now have 720 positions instead of 360)
        n = Reconstructions_boots[i].iloc[ref_angle * 2, :]
        n = n.reset_index()
        n.columns = ['times', 'decoding']
        # Population vector method (scalar product with the reference channel)
        n['decoding'] = [sum(Reconstructions_boots[i].iloc[:, ts] * f2(ref_angle))
                         for ts in range(len(n))]
        n['times'] = n['times'].astype(float)
        n['region'] = region
        n['subject'] = subject
        n['condition'] = condition
        df_boots.append(n)  # save this iteration

    df_boots = pd.concat(df_boots)  # same shape as the decoding of the signal
    return df_boots
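# Usage sketch (illustrative, not from the source): the returned frame stacks
# one decoding time course per bootstrap iteration, so a 95% confidence band
# per timepoint can be read directly off the bootstrap distribution. The
# argument values below are placeholders.
# df_boots = bootstrap_reconstruction(testing_activity, targets, 100,
#                                     WM, WM_t, Inter, 'V1', 'delay', 's01')
# ci = df_boots.groupby('times')['decoding'].quantile([0.025, 0.975]).unstack()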
import os

import pandas as pd
from joblib import Parallel, delayed


def create_training_data():
    num_cores = 8
    # Collect every (driver, trip) pair from the drivers/ folder
    list_of_files = [[folder, f.replace('.csv', '')]
                     for folder in os.listdir('drivers') if 'DS_Store' not in folder
                     for f in os.listdir('drivers/' + folder) if '.csv' in f]
    raw_data = Parallel(n_jobs=num_cores)(delayed(create_attributes)(i) for i in list_of_files)
    raw_data = pd.DataFrame(raw_data)
    raw_data.columns = ['driver_trip', 'trip_time', 'total_distance', 'skyway_distance',
                        'avg_speed', 'std_speed', 'avg_speed_up', 'avg_speed_down',
                        'avg_acc', 'std_acc', 'avg_turn', 'std_turn',
                        'standing_time', 'standing_speed']
    # Save to file for later training
    raw_data.to_csv('training_set.csv', index=False)
    return raw_data
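# `create_attributes` is implemented elsewhere in the project; the stub below
# is an illustrative sketch of its contract only: take a [driver_folder,
# trip_id] pair, read that trip's per-second (x, y) trace, and return one flat
# record matching the fourteen column names above. The feature math here is
# hypothetical, not the project's actual computation.
import numpy as np

def create_attributes_sketch(driver_trip):
    folder, trip = driver_trip
    xy = pd.read_csv('drivers/%s/%s.csv' % (folder, trip))  # columns: x, y
    step = np.hypot(xy['x'].diff(), xy['y'].diff()).fillna(0)  # distance per second
    return ['%s_%s' % (folder, trip),            # driver_trip
            len(xy),                             # trip_time (seconds)
            step.sum(),                          # total_distance
            np.hypot(xy['x'].iloc[-1] - xy['x'].iloc[0],
                     xy['y'].iloc[-1] - xy['y'].iloc[0]),  # skyway_distance
            step.mean(), step.std()              # avg_speed, std_speed
            ] + [0.0] * 8                        # remaining eight features omitted here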
    return new_image


def ImageLoad(i):
    img = load_image(i, im_sz=224)
    Petid = i.split('.')[0]
    Petid = Petid.split('/')[2]
    return img, Petid


images = os.listdir(data_dir + '/train_images/')
images = [i for i in images if 'jpg' in i]
imgs = Parallel(n_jobs=-1)(delayed(ImageLoad)(data_dir + '/train_images/' + i) for i in images)
imgs = pd.DataFrame(imgs)
imgs.columns = ['ndarray', 'PetId']
img = list(imgs['ndarray'])
img = np.array(img)
PetID = imgs['PetId']

############### Don't run always
TrainMetaData = pd.read_csv('Results/TrainMetaData.csv')
TrainSentiment = pd.read_csv('Results/TrainSentiment.csv')

####### Section 4: Reordering the Data Set
Data_in = pd.read_csv('Data/train.csv')
Data1 = Data_in[['Age', 'MaturitySize', 'FurLength',
        VariantTable['variant_ID'][i], BindingSeries, np.array(Times),
        concentrations, plotLocation,
        VariantTable[VariantTable['variant_ID'] == VariantTable['variant_ID'][i]].iloc[0, 3],
        normalization, numBootstraps=nBootstraps, plotBootstrap=Plot)
        for i in range(len(VariantTable)))
    results = pd.DataFrame(results)
    results.columns = ('variant_ID', 'kobs', 'kobs_err', 'fmin', 'fminerror',
                       'fmax', 'fmaxerror', 'rsquared', 'ier', 'rmse',
                       'Kon_50', 'Kon_2p5', 'Kon_97p5',
                       'fmin_50', 'fmin_2p5', 'fmin_97p5',
                       'fmax_50', 'fmax_2p5', 'fmax_97p5',
                       'nClusters', 'fmax_values')
    results = pd.merge(VariantTable, results, on='variant_ID')
    results.to_csv(figSaveLocPrefix + "fitOnRates.csv", index=False)
else:
    # Non-parallelized version:
    VariantTable[['variant_ID', 'kobs', 'kobs_err', 'fmin', 'fminerror',
                  'fmax', 'fmaxerror', 'rsquared', 'ier', 'rmse',
                  'Kon_50', 'Kon_2p5', 'Kon_97p5',
                  'fmin_50', 'fmin_2p5', 'fmin_97p5',
                  'fmax_50', 'fmax_2p5', 'fmax_97p5',
                  'nClusters', 'fmax_values']] = VariantTable['variant_ID'].apply(
        lambda x: fitkonWrapperBootstrap(x, BindingSeries, np.array(Times),
time_binary = pd.DataFrame(time_binary)
time_binary.reset_index(inplace=True)
time_binary.drop('index', axis=1, inplace=True)

# If the observation took place before the spray, zero out the time;
# otherwise keep the elapsed time between spray and observation.
print('negating sprays after traps...')
for col in time_binary.columns:
    time_binary[col] = time_binary[col].map(lambda x: 0 if x < 0 else x)
print('done')

# https://chrisalbon.com/python/data_wrangling/pandas_rename_multiple_columns/
time_binary.columns = distance_binary.columns

time_binary_backup = time_binary.copy()
distance_binary_backup = distance_binary.copy()

time_tp = time_binary.transpose()
distance_tp = distance_binary.transpose()


def CalculateDistance(i):
    distances = []
    if i % 500 == 0:
        print('evaluating time binaries ' + str(i))
    d = i
        ]
        # print("FOS_list:", FOS_list)
        value = [value[i] * FOS_list[i] for i in range(len(value))]
        out.append(value)
    return out


adjusted_scores = Parallel(n_jobs=-1)(
    delayed(get_adjusted_motif_score)(motif_score, bw_values, top_n)
    for motif_score in score_list_A)


def set_col_names(motifs, top_n, label):
    out = []
    for i in motifs:
        for j in range(top_n):
            out.append("%s_%s_%s" % (label, i, j))
    return out


## get feature table
adjusted_scores = np.array(adjusted_scores)
adjusted_scores = np.swapaxes(adjusted_scores, 0, 1)
adjusted_scores = adjusted_scores.reshape((len(high + low), top_n * len(motifs)))
adjusted_scores = pd.DataFrame(adjusted_scores)
adjusted_scores.columns = set_col_names(motifs, top_n, "motif_footprint_score")
adjusted_scores.index = high + low
df = pd.concat([adjusted_scores, data], axis=1)
df.to_csv("ML_data.csv")
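# Quick illustration (not from the source) of the column-name scheme produced
# by set_col_names: one column per (motif, rank) pair under the given label.
# The motif names here are placeholders.
# set_col_names(["CTCF", "GATA1"], 2, "motif_footprint_score")
# -> ['motif_footprint_score_CTCF_0', 'motif_footprint_score_CTCF_1',
#     'motif_footprint_score_GATA1_0', 'motif_footprint_score_GATA1_1']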
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm


def coloc_sim(data, radius=3, min_count=5, n_cores=1, copy=False):
    """Calculate pairwise gene colocalization similarity with the cross L function.

    Parameters
    ----------
    data : AnnData
        AnnData formatted spatial data.
    radius : int
        Max radius to search for neighboring points, by default 3.
    min_count : int
        Minimum points needed to be eligible for analysis.
    n_cores : int
        Number of parallel jobs, by default 1.
    copy : bool
        Whether to operate on a copy of `data`, by default False.

    Returns
    -------
    adata : AnnData
        .uns['coloc_sim']: Pairwise gene colocalization similarity within
        each cell formatted as a long dataframe.
    """
    adata = data.copy() if copy else data

    # Filter points and counts by min_count
    counts = adata.to_df()

    # Helper function to apply per cell
    def cell_coloc_sim(p, g_density, name):
        # Get xy coordinates
        xy = p[["x", "y"]].values

        # Get neighbors within fixed outer_radius for every point
        nn = NearestNeighbors(radius=radius).fit(xy)
        distances, point_index = nn.radius_neighbors(xy, return_distance=True)

        # Enumerate point-wise gene labels
        gene_index = p["gene"].reset_index(drop=True).cat.remove_unused_categories()

        # Convert to adjacency list of points, no double counting
        neighbor_pairs = []
        for g1, neighbors, n_dists in zip(gene_index.values, point_index, distances):
            for g2, d in zip(neighbors, n_dists):
                neighbor_pairs.append([g1, g2, d])

        # Calculate pair-wise gene similarity
        neighbor_pairs = pd.DataFrame(neighbor_pairs, columns=["g1", "g2", "p_dist"])

        # Keep minimum distance to g2 point
        neighbor_pairs = neighbor_pairs.groupby(["g1", "g2"]).agg("min").reset_index()
        neighbor_pairs.columns = ["g1", "g2", "point_dist"]

        # Map to gene index
        neighbor_pairs["g2"] = neighbor_pairs["g2"].map(gene_index)

        # Count number of points within distance of increasing radius.
        # NOTE: the r=r default binds the current radius; a bare closure would
        # make every lambda count with the final value of r.
        r_step = 0.5
        expected_counts = [
            (lambda dists, r=r: (dists <= r).sum())
            for r in np.arange(r_step, radius + r_step, r_step)
        ]
        metrics = (neighbor_pairs.groupby(["g1", "g2"])
                   .agg({"point_dist": expected_counts})
                   .reset_index())

        # Colocalization metric: max of L_ij(r) for r <= radius
        g2_density = g_density.loc[metrics["g2"].tolist()].values
        metrics["sim"] = (metrics["point_dist"]
                          .divide(g2_density * np.pi, axis=0)
                          .pow(0.5)
                          .max(axis=1))
        metrics["cell"] = name

        # Ignore self colocalization
        # metrics = metrics.loc[metrics["g1"] != metrics["g2"]]
        return metrics[["cell", "g1", "g2", "sim"]]

    # Only keep genes >= min_count in each cell
    gene_densities = []
    counts.apply(lambda row: gene_densities.append(row[row >= min_count]), axis=1)
    # Calculate point density per gene per cell; each row is a Series named
    # after its cell, so divide by that cell's area
    gene_densities = [g / adata.obs["cell_area"][g.name] for g in gene_densities]

    # TODO dask
    cell_metrics = Parallel(n_jobs=n_cores)(delayed(cell_coloc_sim)(
        get_points(adata,
                   cells=g_density.name,
                   genes=g_density.index.tolist(),
                   asgeo=True),
        g_density,
        g_density.name,
    ) for g_density in tqdm(gene_densities))

    cell_metrics = pd.concat(cell_metrics)
    cell_metrics.columns = cell_metrics.columns.get_level_values(0)

    # Make symmetric (Lij = Lji)
    cell_metrics["pair"] = cell_metrics.apply(
        lambda row: "-".join(sorted([row["g1"], row["g2"]])), axis=1)
    cell_symmetric = cell_metrics.groupby(["cell", "pair"]).mean()

    # Retain gene pair names
    cell_symmetric = (cell_metrics.set_index(["cell", "pair"])
                      .drop("sim", axis=1)
                      .join(cell_symmetric)
                      .reset_index())

    # Aggregate across cells
    coloc_agg = cell_symmetric.groupby(["pair"])["sim"].mean().to_frame()
    coloc_agg = (coloc_agg
                 .join(cell_symmetric.set_index("pair").drop(["sim", "cell"], axis=1))
                 .reset_index()
                 .drop_duplicates())

    # Save coloc similarity (assign back: astype returns a new object, so a
    # bare call would have no effect)
    cell_metrics[["cell", "g1", "g2", "pair"]] = (
        cell_metrics[["cell", "g1", "g2", "pair"]].astype("category"))
    coloc_agg[["g1", "g2", "pair"]] = coloc_agg[["g1", "g2", "pair"]].astype("category")
    adata.uns["coloc_sim"] = cell_metrics
    adata.uns["coloc_sim_agg"] = coloc_agg

    return adata if copy else None
# structResultsPath = "~/Tesis/rriPredMethod/data/bench_joan_2411/bipspi_v2/bench5_and_bench_2411/results/struct_2/"

seqPrefixes = getPrefixes(seqResultsPath)
mixedPrefixes = getPrefixes(mixedResultsPath)
structPrefixes = getPrefixes(structResultsPath)
# Union of all prefixes seen in any result set; prefixes that fail to process
# come back as None and are filtered out below.
sharedPrefixes = seqPrefixes.union(mixedPrefixes).union(structPrefixes)
# sharedPrefixes = list(sharedPrefixes)[:10]

results = Parallel(n_jobs=N_JOBS)(delayed(processOnePrefix)(
    prefix, ligOrRec, seqResultsPath, mixedResultsPath, structResultsPath)
    for prefix in sharedPrefixes for ligOrRec in ["lig", "rec"])
results = [elem for elem in results if elem is not None]
results = pd.DataFrame(results)
results.columns = ["deltaRoc", "rocSeq", "rocMixed", "rocStruct", "ligOrRec", "prefix"]
results.sort_values(by=["deltaRoc"], ascending=True, inplace=True)
results = results[results["rocMixed"] > 0.7]
print(results)
# results.to_csv("seqVsMixVsStruct.csv", sep="\t", index=False)
# fname = "seqVsMixVsStruct.csv"
# x = pd.read_csv(fname, header="infer", sep="\t")
# x[(0.8 < x.rocStruct) & (x.rocStruct < 0.95) & (x.rocSeq < 0.8) & (x.rocSeq < x.rocMixed)].head()
twitter_df['clean_text'] = twitter_df['tweet_text'].map(lambda x: cleaner(str(x)))

# Drop data without geospatial coordinates
# (note: a `!= np.nan` comparison is always True, so use notna() instead)
twitter_df = twitter_df[(twitter_df['latitude'] != 0) & (twitter_df['latitude'].notna())]
twitter_df = twitter_df.reset_index(drop=True)

# Set cores
ncores = multiprocessing.cpu_count() - 1

# Set sentiment analyser
sentiment_analyser = vader.SentimentIntensityAnalyzer()

# Run code
output = Parallel(n_jobs=ncores)(
    delayed(map_sentiment_vader)(str(tweet), sentiment_analyser)
    for tweet in twitter_df['clean_text'].tolist())

# Store in dataframe
output = pd.DataFrame(output)
output.columns = ['neg', 'neu', 'pos', 'compound']
output = output.reset_index(drop=True)

# Concatenate with twitter dataframe
sentiment_df_twitter = pd.concat([twitter_df, output], axis=1)
sentiment_df_twitter.to_csv('/Users/Hackathon/CopenhagenHack/Data/' +
                            os.path.basename(input_file).split('_')[0] +
                            '_sentiment_twitter.csv')
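# `map_sentiment_vader` is defined elsewhere; a minimal sketch of the expected
# contract is below, assuming it returns VADER's four polarity scores in the
# column order assigned above ('neg', 'neu', 'pos', 'compound').
def map_sentiment_vader_sketch(text, analyser):
    scores = analyser.polarity_scores(text)  # dict with neg/neu/pos/compound keys
    return [scores['neg'], scores['neu'], scores['pos'], scores['compound']]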
time_binary = Parallel(n_jobs=num_cores)(delayed(time_calc)(i) for i in inputs)

distance_binary = pd.DataFrame(distance_binary)
time_binary = pd.DataFrame(time_binary)
time_binary.reset_index(inplace=True)
time_binary.drop('index', axis=1, inplace=True)

# If observation took place before spray, zero out time;
# else keep elapsed time between spray and observation.
for col in time_binary.columns:
    time_binary[col] = time_binary[col].map(lambda x: 0 if x < 0 else x)

# https://chrisalbon.com/python/data_wrangling/pandas_rename_multiple_columns/
time_binary.columns = distance_binary.columns

time_binary_backup = time_binary.copy()
distance_binary_backup = distance_binary.copy()

time_tp = time_binary.transpose()
distance_tp = distance_binary.transpose()

binary = pd.merge(distance_tp, time_tp, how='inner',
                  left_index=True, right_index=True, suffixes=('_d', '_t'))
binary.shape
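# Sketch (not from the source): the per-column zeroing loop above has a
# one-call vectorized equivalent, and the merge can be sanity-checked by
# confirming the _d/_t suffixes split the columns evenly.
# time_binary = time_binary.clip(lower=0)
# assert binary.shape[1] == distance_tp.shape[1] + time_tp.shape[1]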