Example #1
def brute_force_pairs(df_0, df_1, threshold_point=2, n_jobs=-2, tqdm=True):
    # match every tile in df_0 against every site in df_1, keeping the
    # fitted rotation/translation and a match score for each pair
    work = df_1.groupby('site')
    if tqdm:
        from tqdm import tqdm_notebook as tqdn
        work = tqdn(work, 'site')
    arr = []
    for site, df_s in work:

        def work_on(df_t):
            # align one tile's points to this site's points
            rotation, translation, score = evaluate_match(df_t, df_s, threshold_point=threshold_point)
            determinant = None if rotation is None else np.linalg.det(rotation)
            return pd.Series({'rotation': rotation,
                              'translation': translation,
                              'score': score,
                              'determinant': determinant})

        # evaluate all tiles of df_0 against this site in parallel
        (df_0
         .pipe(ops.utils.gb_apply_parallel, 'tile', work_on, n_jobs=n_jobs)
         .assign(site=site)
         .pipe(arr.append)
        )
        
    return (pd.concat(arr).reset_index()
            .sort_values('score', ascending=False)
            )
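
A minimal usage sketch with toy point tables; `evaluate_match` and `ops.utils.gb_apply_parallel` are taken to come from the surrounding `ops` package, and the coordinate columns here are assumptions for illustration:

import pandas as pd

# df_0: points grouped by 'tile'; df_1: points grouped by 'site'
df_tiles = pd.DataFrame({'tile': [0, 0, 1, 1],
                         'i': [10.0, 20.0, 11.0, 21.0],
                         'j': [5.0, 6.0, 5.5, 6.5]})
df_sites = pd.DataFrame({'site': [0, 0],
                         'i': [10.2, 19.8],
                         'j': [5.1, 6.0]})

df_matches = brute_force_pairs(df_tiles, df_sites, threshold_point=2)
print(df_matches.head())  # highest-scoring (tile, site) alignments first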
Example #2
def find_group_cliques(df_input,
                       prefix_length=12,
                       edit_distance=2,
                       gene_id=GENE_ID,
                       n_cores=-2):

    from tqdm import tqdm_notebook as tqdn

    # prefix of each sgRNA used for distance comparisons (prefix_length + 1 characters)
    prefixes = df_input['sgRNA'].str[:prefix_length + 1].pipe(list)

    # bucket prefixes so only nearby candidates are compared exactly
    hash_buckets = build_khash(tqdn(prefixes, 'hash'), edit_distance)

    # for parallel distance calculation
    arr = [[x] for x in hash_buckets]

    print('hashed')

    from joblib import Parallel, delayed
    results = Parallel(n_cores)(
        delayed(sparse_dist)(bucket, threshold=edit_distance,
                             distance_func=distance_prefix)
        for bucket in tqdn(arr, 'distance'))

    print('distanced')

    # merge the per-bucket sparse distance dicts into one
    distance = dict()
    for result in results:
        distance.update(result)

    sparse_distance = sparse_view(prefixes, distance)

    selected = maxy_clique_groups(sparse_distance,
                                  df_input[gene_id].pipe(list),
                                  df_input['sgRNAs_per_gene'].pipe(list))

    return df_input.iloc[selected]
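
`build_khash` is not shown above; here is a self-contained sketch of the general bucketing idea it relies on (grouping strings by shared k-mers so that only strings within a bucket need an exact distance check). This illustrates the technique, not the `ops` implementation:

from collections import defaultdict

def khash_sketch(strings, k):
    """Bucket strings by their k-mers. Strings within a small edit distance
    share at least one k-mer when k is small enough relative to the string
    length and the distance threshold (pigeonhole argument)."""
    buckets = defaultdict(set)
    for s in strings:
        for i in range(len(s) - k + 1):
            buckets[s[i:i + k]].add(s)
    # each multi-member bucket is a candidate group for exact comparison
    return [sorted(b) for b in buckets.values() if len(b) > 1]

print(khash_sketch(['ACGTACGT', 'ACGTTCGT', 'TTTTGGGG'], k=4))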
Example #3
def groupby_apply2(df_1, df_2, cols, f):
    """Apply a function `f` that takes two dataframes and returns a dataframe.
    Groups inputs by `cols`, evaluates for each group, and concatenates the result.

    """
    from tqdm import tqdm_notebook as tqdn

    d_1 = {k: v for k, v in df_1.groupby(cols)}
    d_2 = {k: v for k, v in df_2.groupby(cols)}

    arr = []
    for k in tqdn(d_1):
        # assumes every group key in df_1 also appears in df_2
        arr.append(f(d_1[k], d_2[k]))

    return pd.concat(arr)
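
A minimal usage sketch with toy dataframes; the column names and the merge function are assumptions for illustration:

import pandas as pd

df_a = pd.DataFrame({'plate': ['p1', 'p1', 'p2'], 'x': [1, 2, 3]})
df_b = pd.DataFrame({'plate': ['p1', 'p2'], 'y': [10, 20]})

# f receives the matching group from each dataframe
def merge_groups(left, right):
    return left.merge(right, on='plate')

print(groupby_apply2(df_a, df_b, 'plate', merge_groups))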
def applyIJ_parallel(f, arr, n_jobs=-2, backend='threading', tqdm=False, *args, **kwargs):
    """Apply a function that expects 2D input to the trailing two
    dimensions of an array, parallelizing computation across 2D frames. 
    The function must output an array whose shape depends only on the 
    input shape. 
    """
    from joblib import Parallel, delayed

    # flatten leading dimensions so each 2D frame is dispatched separately
    h, w = arr.shape[-2:]
    reshaped = arr.reshape((-1, h, w))

    if tqdm:
        from tqdm import tqdm_notebook as tqdn
        work = tqdn(reshaped, 'frame')
    else:
        work = reshaped

    arr_ = Parallel(n_jobs=n_jobs, backend=backend)(
        delayed(f)(frame, *args, **kwargs) for frame in work)

    output_shape = arr.shape[:-2] + arr_[0].shape
    return np.array(arr_).reshape(output_shape)
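
A minimal usage sketch on a toy 4D stack; the downsampling function is a hypothetical stand-in for any per-frame operation whose output shape depends only on the input shape:

import numpy as np

stack = np.random.rand(3, 2, 16, 16)  # e.g. (cycle, channel, height, width)

def downsample(frame):
    # output shape depends only on input shape, as required
    return frame[::2, ::2]

out = applyIJ_parallel(downsample, stack)
print(out.shape)  # (3, 2, 8, 8)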
def csv_frame(files_or_search, tqdm=False, **kwargs):
    """Convenience function, pass either a list of files or a 
    glob wildcard search term.
    """
    from glob import glob
    from natsort import natsorted
    
    def read_csv(f):
        try:
            return pd.read_csv(f, **kwargs)
        except pd.errors.EmptyDataError:
            return None
    
    if isinstance(files_or_search, str):
        files = natsorted(glob(files_or_search))
    else:
        files = files_or_search

    if tqdm:
        from tqdm import tqdm_notebook as tqdn
        return pd.concat([read_csv(f) for f in tqdn(files)], sort=True)
    else:
        return pd.concat([read_csv(f) for f in files], sort=True)
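
A minimal usage sketch; the file names and columns are hypothetical, and keyword arguments are forwarded to pd.read_csv:

import pandas as pd

pd.DataFrame({'well': ['A1'], 'count': [3]}).to_csv('demo_0.csv', index=False)
pd.DataFrame({'well': ['A2'], 'count': [5]}).to_csv('demo_1.csv', index=False)

df = csv_frame('demo_*.csv')  # natural-sorted glob match, empty files skipped
print(df)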
def parallel_levenshtein_group(group, dist_func=None, n_cores=-2):
    # compare each string only to the strings after it (upper triangle)
    remainders = [group[i + 1:] for i, _ in enumerate(group)]
    if dist_func is None:
        import Levenshtein
        dist_func = Levenshtein.distance

    def measure_distances(string, remainder):
        arr = []
        for test_string in remainder:
            d = dist_func(string, test_string)
            if d < 2:
                # report near-duplicate pairs as they are found
                print(string, test_string)
            arr.append(d)
        return arr

    from tqdm import tqdm_notebook as tqdn
    from joblib import Parallel, delayed
    results = Parallel(n_cores)(delayed(measure_distances)(*subset)
                                for subset
                                in tqdn(zip(group, remainders), total=len(group)))
    distances = []
    for result in results:
        distances.extend(result)
        
    return distances
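
A minimal usage sketch; passing an explicit dist_func avoids the python-Levenshtein dependency, with Hamming distance as a stand-in for equal-length strings:

def hamming(a, b):
    return sum(c1 != c2 for c1, c2 in zip(a, b))

barcodes = ['ACGT', 'ACGA', 'TTTT']
flat = parallel_levenshtein_group(barcodes, dist_func=hamming)
print(flat)  # flattened upper-triangle distances, e.g. [1, 3, 3]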