def bootstrap_reconstruction(testing_activity, targets, iterations, WM, WM_t, Inter, region, condition, subject, ref_angle=180):
    Reconstructions_boots = []
    for n_rep in range(iterations):
        time_rec_boot_start = time.time()
        indexes_boots = np.random.randint(0, len(targets), len(targets))  # bootstrapped trial indexes (resampled with replacement)
        ### make the reconstructions and append them
        targets_boot = targets[indexes_boots]
        signal_boots = testing_activity[indexes_boots, :, :]
        signal_boots_parallel = [signal_boots[:, i, :] for i in range(nscans_wm)]  # one slice per TR

        Reconstructions_boot = Parallel(n_jobs=numcores)(delayed(Representation)(signal, targets_boot, WM, WM_t, ref_angle=ref_angle, plot=False, intercept=Inter) for signal in signal_boots_parallel)  #### standard reconstruction (parallel over TRs)
        Reconstructions_boot = pd.concat(Reconstructions_boot, axis=1)  # concatenate the per-TR reconstructions column-wise
        Reconstructions_boot.columns = [str(i * TR) for i in range(nscans_wm)]  # column names: time of each scan (i * TR)
        Reconstructions_boots.append(Reconstructions_boot)  # append the reconstruction of the current iteration
        time_rec_boot_end = time.time()
        time_rec_boot = time_rec_boot_end - time_rec_boot_start
        print('boot_' + str(n_rep) + ': ' + str(time_rec_boot))  # print the duration of this bootstrap iteration
        
    ### Get just the supposed target location
    df_boots=[]
    for i in range(len(Reconstructions_boots)):
        n = Reconstructions_boots[i].iloc[ref_angle*2, :] # around the ref_angle (x2 because the reconstruction has 720 points instead of 360)
        n = n.reset_index()
        n.columns = ['times', 'decoding']
        n['decoding'] = [sum(Reconstructions_boots[i].iloc[:, ts] * f2(ref_angle)) for ts in range(len(n))] # population vector method (scalar product with the basis function f2, defined elsewhere)
        n['times']=n['times'].astype(float)
        n['region'] = region
        n['subject'] = subject
        n['condition'] = condition
        df_boots.append(n) # save this iteration
    
    ##
    df_boots = pd.concat(df_boots)    # same shape as the decoding of the original signal
    return df_boots
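# The helpers Representation() and f2(), and the globals nscans_wm, numcores, and TR, are
# defined elsewhere in the module this snippet comes from. Below is a minimal, self-contained
# sketch of the same resample-with-replacement + joblib.Parallel pattern, with a toy stand-in
# for the reconstruction step (all names, shapes, and values are assumptions for illustration):
import numpy as np
import pandas as pd
from joblib import Parallel, delayed

def toy_reconstruction(signal, targets):
    # Stand-in for Representation(): one value per feature (here just the mean across trials)
    return pd.Series(signal.mean(axis=0))

rng = np.random.default_rng(0)
activity = rng.normal(size=(100, 8, 50))            # trials x TRs x voxels (assumed)
targets = rng.uniform(0, 360, size=100)             # presented angles in degrees (assumed)

idx = rng.integers(0, len(targets), len(targets))   # bootstrap: resample trials with replacement
boot = activity[idx]
per_tr = [boot[:, t, :] for t in range(boot.shape[1])]
recs = Parallel(n_jobs=2)(delayed(toy_reconstruction)(s, targets[idx]) for s in per_tr)
recs = pd.concat(recs, axis=1)                      # one column per TR, as in the function above
print(recs.shape)                                   # (50, 8)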
def create_training_data():
  num_cores = 8

  # collect the list of all (driver, trip) CSV files under drivers/
  list_of_files = [[folder, f.replace('.csv','')] for folder in os.listdir('drivers') if 'DS_Store' not in folder
                 for f in os.listdir('drivers/'+folder) if '.csv' in f]

  raw_data = Parallel(n_jobs=num_cores)(delayed(create_attributes)(i) for i in list_of_files)
  raw_data = pd.DataFrame(raw_data)
  raw_data.columns = ['driver_trip','trip_time','total_distance','skyway_distance','avg_speed','std_speed',
                      'avg_speed_up','avg_speed_down',
                      'avg_acc','std_acc','avg_turn','std_turn','standing_time','standing_speed']
  # save to file for later training
  raw_data.to_csv('training_set.csv', index=False)
  return raw_data
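# create_attributes() is not shown in this excerpt. Below is a plausible minimal sketch (an
# assumption, not the original): each trip CSV under drivers/<driver>/<trip>.csv is taken to
# hold x,y coordinates sampled once per second, and only a few of the fourteen features listed
# above are computed here.
import os
import numpy as np
import pandas as pd

def create_attributes(folder_and_file):
    folder, trip_id = folder_and_file
    trip = pd.read_csv(os.path.join('drivers', folder, trip_id + '.csv'))
    dx, dy = np.diff(trip['x']), np.diff(trip['y'])
    speed = np.hypot(dx, dy)                      # distance per second, i.e. speed
    straight = np.hypot(trip['x'].iloc[-1] - trip['x'].iloc[0],
                        trip['y'].iloc[-1] - trip['y'].iloc[0])
    return [folder + '_' + trip_id,               # driver_trip
            len(trip),                            # trip_time (seconds)
            speed.sum(),                          # total_distance
            straight,                             # skyway_distance (straight line start-to-end)
            speed.mean(), speed.std()]            # avg_speed, std_speed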
Example #3
    return new_image
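# This example opens mid-function: the body of the image loader is cut off above, leaving only
# its final return statement. A minimal, PIL-based sketch of what load_image() (used by
# ImageLoad below) is assumed to do:
from PIL import Image
import numpy as np

def load_image(path, im_sz=224):
    img = Image.open(path).convert('RGB')
    new_image = np.array(img.resize((im_sz, im_sz)))  # square resize to im_sz x im_sz
    return new_image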


def ImageLoad(i):
    img = load_image(i, im_sz=224)
    Petid = i.split('.')[0]
    Petid = Petid.split('/')[2]
    return img, Petid


images = os.listdir(data_dir + '/train_images/')
images = [i for i in images if 'jpg' in i]
imgs = Parallel(n_jobs=-1)(delayed(ImageLoad)(data_dir + '/train_images/' + i)
                           for i in images)
imgs = pd.DataFrame(imgs)
imgs.columns = np.array(['ndarray', 'PetId'])
img = list(imgs['ndarray'])

img = np.array(img)
PetID = imgs['PetId']

############### Don't run this every time (loads precomputed results)
TrainMetaData = pd.read_csv('Results/TrainMetaData.csv')
TrainSentiment = pd.read_csv('Results/TrainSentiment.csv')

####### Section 4: Reordering the Data Set
Data_in = pd.read_csv('Data/train.csv')
Data1 = Data_in[[
    'Age',
    'MaturitySize',
    'FurLength',
                           VariantTable['variant_ID'][i],
                           BindingSeries,
                           np.array(Times),
                           concentrations,
                           plotLocation,
                           VariantTable[VariantTable['variant_ID'] ==
                                        VariantTable['variant_ID'][i]].iloc[0,
                                                                            3],
                           normalization,
                           numBootstraps=nBootstraps,
                           plotBootstrap=Plot)
                                  for i in range(len(VariantTable)))
    results = pd.DataFrame(results)
    results.columns = ('variant_ID', 'kobs', 'kobs_err', 'fmin', 'fminerror',
                       'fmax', 'fmaxerror', 'rsquared', 'ier', 'rmse',
                       'Kon_50', 'Kon_2p5', 'Kon_97p5', 'fmin_50', 'fmin_2p5',
                       'fmin_97p5', 'fmax_50', 'fmax_2p5', 'fmax_97p5',
                       'nClusters', 'fmax_values')
    results = pd.merge(VariantTable, results, on='variant_ID')
    results.to_csv(figSaveLocPrefix + "fitOnRates.csv", index=False)
else:
    # Non-parallelized version:
    VariantTable[[
        'variant_ID', 'kobs', 'kobs_err', 'fmin', 'fminerror', 'fmax',
        'fmaxerror', 'rsquared', 'ier', 'rmse', 'Kon_50', 'Kon_2p5',
        'Kon_97p5', 'fmin_50', 'fmin_2p5', 'fmin_97p5', 'fmax_50', 'fmax_2p5',
        'fmax_97p5', 'nClusters', 'fmax_values'
    ]] = VariantTable['variant_ID'].apply(lambda x: fitkonWrapperBootstrap(
        x,
        BindingSeries,
        np.array(Times),
    time_binary = pd.DataFrame(time_binary)

    time_binary.reset_index(inplace=True)
    time_binary.drop('index', axis=1, inplace=True)

    # if observation took place before spray, zero out time
    # else return elapsed time between spray and observation

    print('negating sprays after traps...')

    for col in time_binary.columns:
        time_binary[col] = time_binary[col].map(lambda x: 0 if x < 0 else x)
    print('done')

    # https://chrisalbon.com/python/data_wrangling/pandas_rename_multiple_columns/
    time_binary.columns = distance_binary.columns

    time_binary_backup = time_binary.copy()
    distance_binary_backup = distance_binary.copy()

    time_tp = time_binary.transpose()
    distance_tp = distance_binary.transpose()

    def CalculateDistance(i):
        distances = []
        if i % 500 == 0:
            print('evaluating time binaries ' + str(i))
        d = i
Example #6
        ]
        # print ("FOS_list:",FOS_list)
        value = [value[i] * FOS_list[i] for i in range(len(value))]
        out.append(value)
    return out


adjusted_scores = Parallel(n_jobs=-1)(
    delayed(get_adjusted_motif_score)(motif_score, bw_values, top_n)
    for motif_score in score_list_A)


def set_col_names(motifs, top_n, label):
    out = []
    for i in motifs:
        for j in range(top_n):
            out.append("%s_%s_%s" % (label, i, j))
    return out
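# For illustration, with assumed inputs the naming scheme above produces:
#   set_col_names(['CTCF', 'GATA1'], 2, 'motif_footprint_score')
#   -> ['motif_footprint_score_CTCF_0', 'motif_footprint_score_CTCF_1',
#       'motif_footprint_score_GATA1_0', 'motif_footprint_score_GATA1_1']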


## get feature table
adjusted_scores = np.array(adjusted_scores)
adjusted_scores = np.swapaxes(adjusted_scores, 0, 1)
adjusted_scores = adjusted_scores.reshape(
    (len(high + low), top_n * len(motifs)))
adjusted_scores = pd.DataFrame(adjusted_scores)
adjusted_scores.columns = set_col_names(motifs, top_n, "motif_footprint_score")
adjusted_scores.index = high + low
df = pd.concat([adjusted_scores, data], axis=1)
df.to_csv("ML_data.csv")
Example #7
def coloc_sim(data, radius=3, min_count=5, n_cores=1, copy=False):
    """Calculate pairwise gene colocalization similarity with the cross L function.

    Parameters
    ----------
    data : AnnData
        AnnData-formatted spatial data.
    radius : int
        Maximum radius to search for neighboring points, by default 3.
    min_count : int
        Minimum number of points a gene needs in a cell to be eligible for analysis, by default 5.
    n_cores : int
        Number of parallel jobs, by default 1.
    copy : bool
        Whether to operate on a copy of `data`, by default False.

    Returns
    -------
    adata : AnnData
        .uns['coloc_sim']: Pairwise gene colocalization similarity within each cell formatted as a long dataframe.
    """
    adata = data.copy() if copy else data

    # Filter points and counts by min_count
    counts = adata.to_df()

    # Helper function to apply per cell
    def cell_coloc_sim(p, g_density, name):

        # Get xy coordinates
        xy = p[["x", "y"]].values

        # Get neighbors within fixed outer_radius for every point
        nn = NearestNeighbors(radius=radius).fit(xy)
        distances, point_index = nn.radius_neighbors(xy, return_distance=True)

        # Enumerate point-wise gene labels
        gene_index = p["gene"].reset_index(
            drop=True).cat.remove_unused_categories()

        # Convert to adjacency list of points, no double counting
        neighbor_pairs = []
        for g1, neighbors, n_dists in zip(gene_index.values, point_index,
                                          distances):
            for g2, d in zip(neighbors, n_dists):
                neighbor_pairs.append([g1, g2, d])

        # Calculate pair-wise gene similarity
        neighbor_pairs = pd.DataFrame(neighbor_pairs,
                                      columns=["g1", "g2", "p_dist"])

        # Keep minimum distance to g2 point
        neighbor_pairs = neighbor_pairs.groupby(["g1", "g2"
                                                 ]).agg("min").reset_index()
        neighbor_pairs.columns = ["g1", "g2", "point_dist"]

        # Map neighbor point indices to their gene labels
        neighbor_pairs["g2"] = neighbor_pairs["g2"].map(gene_index)

        # Count number of points within distance of increasing radius
        r_step = 0.5
        expected_counts = [
            lambda dists, r=r: (dists <= r).sum()  # bind r now to avoid the late-binding closure pitfall
            for r in np.arange(r_step, radius + r_step, r_step)
        ]
        metrics = (neighbor_pairs.groupby(["g1", "g2"]).agg({
            "point_dist":
            expected_counts
        }).reset_index())

        # Colocalization metric: max of L_ij(r) for r <= radius
        g2_density = g_density.loc[metrics["g2"].tolist()].values
        metrics["sim"] = ((metrics["point_dist"].divide(
            g2_density * np.pi, axis=0)).pow(0.5).max(axis=1))
        metrics["cell"] = name

        # Ignore self colocalization
        # metrics = metrics.loc[metrics["g1"] != metrics["g2"]]

        return metrics[["cell", "g1", "g2", "sim"]]

    # Only keep genes >= min_count in each cell
    gene_densities = []
    counts.apply(lambda row: gene_densities.append(row[row >= min_count]),
                 axis=1)
    # Calculate point density per gene per cell (counts divided by that cell's area)
    gene_densities = [
        cell_counts / adata.obs["cell_area"][cell_counts.name]
        for cell_counts in gene_densities
    ]

    # TODO dask
    cell_metrics = Parallel(n_jobs=n_cores)(delayed(cell_coloc_sim)(
        get_points(adata,
                   cells=g_density.name,
                   genes=g_density.index.tolist(),
                   asgeo=True),
        g_density,
        g_density.name,
    ) for g_density in tqdm(gene_densities))

    cell_metrics = pd.concat(cell_metrics)
    cell_metrics.columns = cell_metrics.columns.get_level_values(0)

    # Make symmetric (Lij = Lji)
    cell_metrics["pair"] = cell_metrics.apply(
        lambda row: "-".join(sorted([row["g1"], row["g2"]])), axis=1)
    cell_symmetric = cell_metrics.groupby(["cell", "pair"]).mean()

    # Retain gene pair names
    cell_symmetric = (cell_metrics.set_index(["cell", "pair"]).drop(
        "sim", axis=1).join(cell_symmetric).reset_index())

    # Aggregate across cells
    coloc_agg = cell_symmetric.groupby(["pair"])["sim"].mean().to_frame()
    coloc_agg = (coloc_agg.join(
        cell_symmetric.set_index("pair").drop(
            ["sim", "cell"], axis=1)).reset_index().drop_duplicates())

    # Save coloc similarity
    cell_metrics[["cell", "g1", "g2", "pair"]] = cell_metrics[[
        "cell", "g1", "g2", "pair"
    ]].astype("category")
    coloc_agg[["g1", "g2", "pair"]] = coloc_agg[["g1", "g2", "pair"]].astype("category")
    adata.uns["coloc_sim"] = cell_metrics
    adata.uns["coloc_sim_agg"] = coloc_agg

    return adata if copy else None
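# A minimal, self-contained sketch of the neighbor-search step inside cell_coloc_sim():
# radius_neighbors() returns, for every point, the indices of and distances to all points
# within `radius`; these are flattened into (g1, g2, distance) pairs and reduced to the
# minimum distance per pair. Point coordinates and gene labels below are made up.
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

rng = np.random.default_rng(0)
xy = rng.uniform(0, 10, size=(50, 2))              # point coordinates within one cell (assumed)
genes = rng.choice(['geneA', 'geneB'], size=50)    # gene label of each point (assumed)

nn = NearestNeighbors(radius=3).fit(xy)
distances, point_index = nn.radius_neighbors(xy, return_distance=True)

pairs = [[g1, genes[j], d]
         for g1, neighbors, dists in zip(genes, point_index, distances)
         for j, d in zip(neighbors, dists)]
pairs = pd.DataFrame(pairs, columns=['g1', 'g2', 'p_dist'])
print(pairs.groupby(['g1', 'g2'])['p_dist'].min())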
Example #8
# structResultsPath="~/Tesis/rriPredMethod/data/bench_joan_2411/bipspi_v2/bench5_and_bench_2411/results/struct_2/"

seqPrefixes = getPrefixes(seqResultsPath)
mixedPrefixes = getPrefixes(mixedResultsPath)
structPrefixes = getPrefixes(structResultsPath)

sharedPrefixes = seqPrefixes.union(mixedPrefixes).union(structPrefixes)

# sharedPrefixes=list(sharedPrefixes)[:10]

results = Parallel(n_jobs=N_JOBS)(delayed(processOnePrefix)(
    prefix, ligOrRec, seqResultsPath, mixedResultsPath, structResultsPath)
                                  for prefix in sharedPrefixes
                                  for ligOrRec in ["lig", "rec"])

results = [elem for elem in results if elem is not None]
results = pd.DataFrame(results)
results.columns = [
    "deltaRoc", "rocSeq", "rocMixed", "rocStruct", "ligOrRec", "prefix"
]
results.sort_values(by=["deltaRoc"], ascending=True, inplace=True)

results = results[results["rocMixed"] > 0.7]

print(results)
# results.to_csv("seqVsMixVsStruct.csv", sep="\t", index=False)

#fname="seqVsMixVsStruct.csv"
#x=  pd.read_csv(fname, header="infer", sep="\t")
#x[ (0.8<x.rocStruct) & (x.rocStruct<0.95) & (x.rocSeq<0.8) & (x.rocSeq< x.rocMixed)].head()
Example #9
    twitter_df['clean_text'] = twitter_df['tweet_text'].map(
        lambda x: cleaner(str(x)))

    # Drop data without geospatial coordinates
    twitter_df = twitter_df[(twitter_df['latitude'] != 0)
                            & twitter_df['latitude'].notna()]
    twitter_df = twitter_df.reset_index(drop=True)

    # Set cores
    ncores = multiprocessing.cpu_count() - 1

    # Set sentiment analyser
    sentiment_analyser = vader.SentimentIntensityAnalyzer()

    # Run code
    output = Parallel(n_jobs=ncores)(
        delayed(map_sentiment_vader)(str(tweet), sentiment_analyser)
        for tweet in twitter_df['clean_text'].tolist())

    # Store in dataframe
    output = pd.DataFrame(output)
    output.columns = ['neg', 'neu', 'pos', 'compound']
    output = output.reset_index(drop=True)

    # Concatenate with twitter dataframe
    sentiment_df_twitter = pd.concat([twitter_df, output], axis=1)

    sentiment_df_twitter.to_csv('/Users/Hackathon/CopenhagenHack/Data/' +
                                os.path.basename(input_file).split('_')[0] +
                                '_sentiment_twitter.csv')
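# map_sentiment_vader() is defined elsewhere in this script. Given the column names assigned to
# the output above, a plausible minimal version (an assumption, not the original) would simply
# return VADER's four polarity scores for a tweet:
def map_sentiment_vader(text, analyser):
    scores = analyser.polarity_scores(text)   # {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...}
    return [scores['neg'], scores['neu'], scores['pos'], scores['compound']]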
Example #10
time_binary = Parallel(n_jobs=num_cores)(delayed(time_calc)(i) for i in inputs)

distance_binary = pd.DataFrame(distance_binary)
time_binary = pd.DataFrame(time_binary)

time_binary.reset_index(inplace=True)
time_binary.drop('index', axis=1, inplace=True)

# if observation took place before spray, zero out time
# else return elapsed time between spray and observation

for col in time_binary.columns:
    time_binary[col] = time_binary[col].map(lambda x: 0 if x < 0 else x)

# https://chrisalbon.com/python/data_wrangling/pandas_rename_multiple_columns/
time_binary.columns = distance_binary.columns

time_binary_backup = time_binary.copy()
distance_binary_backup = distance_binary.copy()

time_tp = time_binary.transpose()
distance_tp = distance_binary.transpose()

binary = pd.merge(distance_tp,
                  time_tp,
                  how='inner',
                  left_index=True,
                  right_index=True,
                  suffixes=('_d', '_t'))
binary.shape
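# A tiny, self-contained illustration of the index-aligned merge above: rows are matched on the
# shared index, and overlapping column names receive the '_d' and '_t' suffixes (toy data).
import pandas as pd

d = pd.DataFrame({'trap_1': [1.2, 3.4]}, index=['obs_a', 'obs_b'])
t = pd.DataFrame({'trap_1': [5.0, 0.0]}, index=['obs_a', 'obs_b'])
merged = pd.merge(d, t, how='inner', left_index=True, right_index=True, suffixes=('_d', '_t'))
print(merged.columns.tolist())  # ['trap_1_d', 'trap_1_t']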