Example #1
def distance_matrix(untrans, trans, minimum_length, steps=10, parallel_compute=True, free_cores=2,
        return_partials=False, gpu=False):

    # initialize the matrix
    d = np.zeros((len(untrans), len(untrans)))

    cpus = max(multiprocessing.cpu_count() - free_cores, 1)

    if parallel_compute:
        out = Parallel(n_jobs=cpus,
                verbose=5)(delayed(distance_matrix_loop_func)(ii,jj,untrans,trans,
                    minimum_length,steps,return_partials,gpu) 
                    for ii in range(len(untrans)) 
                    for jj in range(ii, len(untrans)))
    else:
        out = []
        for ii in range(len(untrans)):
            for jj in range(ii, len(untrans)):
                    out.append(distance_matrix_loop_func(ii,jj,untrans,trans,
                        minimum_length,steps,return_partials,gpu))

    for cell in out:
        d[cell[0], cell[1]] = d[cell[1], cell[0]] = cell[2]

    if return_partials:
        pd = np.zeros((len(untrans), len(untrans)))
        for cell in out:
            pd[cell[0], cell[1]] = pd[cell[1], cell[0]] = cell[3]
        return d, pd
    else:
        return d
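# Note: the worker `distance_matrix_loop_func` is not shown above. The caller only
# relies on its return contract: a tuple (ii, jj, distance) or, when return_partials
# is set, (ii, jj, distance, partial_distance). A hypothetical sketch of that contract:
def distance_matrix_loop_func(ii, jj, untrans, trans, minimum_length, steps,
                              return_partials, gpu):
    # Placeholder metric; the real project computes an alignment/transport
    # distance between items ii and jj here.
    distance = 0.0 if ii == jj else 1.0
    if return_partials:
        partial = distance
        return ii, jj, distance, partial
    return ii, jj, distance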
Example #2
def main_get_data(paths, parallel: bool = False, n_jobs: int = -2, modules_name: list = modules_name,
                  skip_countries: list = [], gsheets_api=None):
    """Get data from sources and export to output folder.

    Is equivalent to script `run_python_scripts.py`
    """
    print("-- Getting data... --")
    skip_countries = [x.lower() for x in skip_countries]
    country_data_getter = CountryDataGetter(paths, skip_countries, gsheets_api)
    if parallel:
        modules_execution_results = Parallel(n_jobs=n_jobs, backend="threading")(
            delayed(country_data_getter.run)(
                module_name,
            ) for module_name in modules_name
        )
    else:
        modules_execution_results = []
        for module_name in modules_name:
            modules_execution_results.append(country_data_getter.run(
                module_name,
            ))

    modules_failed = [m["module_name"] for m in modules_execution_results if m["success"] is False]
    # Retry failed modules
    logger.info(f"\n---\n\nRETRIALS ({len(modules_failed)})")
    modules_execution_results = []
    for module_name in modules_failed:
        modules_execution_results.append(
            country_data_getter.run(module_name)
        )
    modules_failed_retrial = [m["module_name"] for m in modules_execution_results if m["success"] is False]
    if len(modules_failed_retrial) > 0:
        failed_str = "\n".join([f"* {m}" for m in modules_failed_retrial])
        print(f"\n---\n\nThe following scripts failed to run ({len(modules_failed_retrial)}):\n{failed_str}")
    print_eoe()
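# Note: `country_data_getter.run` is assumed to return a small status dict that the
# code above filters on ("module_name", "success"). A minimal hypothetical sketch of
# that contract (compare the inline `_get_data_country` helper in a later example);
# this is not the project's actual class:
class CountryDataGetter:
    def __init__(self, paths, skip_countries, gsheets_api):
        self.paths = paths
        self.skip_countries = skip_countries
        self.gsheets_api = gsheets_api

    def run(self, module_name):
        # Hypothetical: import and execute the scraper module, report the outcome.
        import importlib
        country = module_name.split(".")[-1].lower()
        if country in self.skip_countries:
            return {"module_name": module_name, "success": None, "skipped": True}
        try:
            importlib.import_module(module_name).main()
        except Exception:
            return {"module_name": module_name, "success": False, "skipped": False}
        return {"module_name": module_name, "success": True, "skipped": False}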
Example #3
def simulate_one_cv(lm_eye: pd.DataFrame,
                    lm_mouse: pd.DataFrame,
                    cv_run: int,
                    n_jobs: int = 4) -> List[Dict]:
    start = time.time()
    sim_IDs = list(np.arange(1, constants.NUM_SIMS + 1))

    lin_models_eye = [lm_eye] * len(sim_IDs)
    lin_models_mouse = [lm_mouse] * len(sim_IDs)
    save_results = [False] * len(sim_IDs)

    try:
        results = Parallel(n_jobs=n_jobs, backend='loky', verbose=True)(
            delayed(simulate_batch)(sim_ID, lm_s, lm_m, r)
            for sim_ID, lm_s, lm_m, r in zip(sim_IDs, lin_models_eye,
                                             lin_models_mouse, save_results))
    except Exception:
        print(
            'Failed multiprocessing for single CV simulation. Attempting single core...'
        )
        results = []
        for sim_ID in sim_IDs:
            results.append(simulate_batch(sim_ID, lm_eye, lm_mouse, False))
            print(f'Simulated {sim_ID} of {len(sim_IDs)} in CV run {cv_run}')

    save_simulation_results(results, sim_IDs, cv_run)
    print(
        f'CV run {cv_run} took {round((time.time() - start) / 60, 1)} minutes')

    return results
Example #4
def load_training_set(video_set,
                      grid_size,
                      bins,
                      skip_val,
                      force_refresh=False):
    """
    Load and process all videos in provided training set.

    video_set: List of integers corresponding to video files (5 char left zero padded).
    """

    if force_refresh:

        # Process in parallel if we need to refresh (much faster)
        videos = Parallel(n_jobs=-1, prefer="threads")(
            delayed(process_video)(i, grid_size, bins, skip_val, force_refresh)
            for i in video_set)

        # Remove None values (the filtered list must be reassigned)
        videos = list(filter(None.__ne__, videos))

    else:

        videos = []
        for i in video_set:
            video = process_video(i, grid_size, bins, skip_val, force_refresh)
            if video is not None: videos.append(video)

    return videos
Example #5
def lgb_cv_tuning(grid,
                  data,
                  nfold,
                  return_best=is_return_best,
                  parallel=True,
                  **kwargs):
    # Modified implementing parallelism
    print('* Start hyperparameter tuning with {}-fold CV...'.format(nfold))
    print('* Hyperparameter grid:')
    print(grid)

    cv_results = []
    all_params = ParameterGrid(grid)
    if parallel:
        # num_cores = multiprocessing.cpu_count()
        print("* Parallel mode activated")
        print("* Number of cores used: ", 50)
        print('* Begin CV')
        cv_results = Parallel(n_jobs=50)(
            delayed(tune_ind_params)(all_params[i], data, nfold)
            for i in range(len(all_params)))
    else:
        for i in range(len(all_params)):
            print('Hyperparameter set: {}'.format(i))
            print('* Begin CV')
            cv_results.append(tune_ind_params(all_params[i], data, nfold))

    if return_best:
        return min(cv_results, key=lambda x: x[1])
    else:
        return cv_results
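# Note: `tune_ind_params` is not shown above; `min(cv_results, key=lambda x: x[1])`
# implies it returns a (params, cv_score) pair with lower scores being better.
# A hypothetical sketch of that contract:
def tune_ind_params(params, data, nfold):
    # Placeholder: a real version would run lightgbm.cv(params, data, nfold=nfold)
    # and report the best mean validation metric for this parameter set.
    cv_score = 0.0
    return params, cv_score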
Example #6
def _find_centroid(X, Sx, n_samples, seedlen, parallel=True):
    u = uniformset(X, Sx, n_samples, seedlen)
    # print(u.subs[:u.n_seg], u.n_seg)

    if parallel is True:
        results = Parallel(n_jobs=4)([
            delayed(_find_centroid_wrap)(X, Sx, seedlen, iter1, iter2, u)
            for iter1, iter2 in combinations(range(u.n_seg), 2)
        ])
    else:
        results = []
        for iter1, iter2 in tqdm(combinations(range(u.n_seg), 2),
                                 desc='SearchCentroid'):
            results.append(_find_centroid_wrap(X, Sx, seedlen, iter1, iter2,
                                               u))

    # pp.pprint(results)
    if not results:
        print('fixed sampling')
        s0, s1 = fixed_sampling(X, Sx, seedlen)
        return s0, s1
    centroid = np.argmin([res[0] for res in results])
    # print(results[centroid])
    costMin, seg0, seg1 = results[centroid]
    if costMin == np.inf:
        print('!! --- centroid not found')
        # s0, s1 = fixed_sampling(X, Sx, seedlen)
        # print('fixed_sampling', s0.subs, s1.subs)
        return Regime(), Regime()
    s0, s1 = Regime(), Regime()
    s0.add_segment(seg0[0], seg0[1])
    s1.add_segment(seg1[0], seg1[1])
    # print(s0.n_seg, s1.n_seg)
    # time.sleep(3)
    return s0, s1
Example #7
def create_beat_dataset_fixed(metadf, Xmat, tgrid, do_parallel=True, detrend=True):
    if do_parallel:
        from joblib import Parallel, delayed
        bl_list = Parallel(n_jobs=30, verbose=5)(
            delayed(segment_beat)(Xmat[idx], tgrid, alg="christov-aligned", detrend=detrend)
                for idx in range(Xmat.shape[0]))
    else:
        bl_list = []
        for idx in range(Xmat.shape[0]):
            bl_list.append(segment_beat(Xmat[idx], tgrid, alg="christov-aligned", detrend=detrend))

    # go through and determine bad idx (bad splits)
    beat_list = [b for b, _ in bl_list]
    len_list  = [l for _, l in bl_list]
    idx_bad  = np.array([ b.shape[-1] != 100 for b in beat_list ])
    idx_good = np.where(~idx_bad)[0]

    # go through each beat and construct a beat dataframe
    beat_meta, beat_lens = [], []
    for idx in idx_good:
        beat_meta += [metadf.iloc[idx]]*len(beat_list[idx])
        beat_lens.append(len_list[idx])
    beat_list = [beat_list[i] for i in idx_good]

    # stack in to dataframe + data matrix
    beat_metadf = pd.DataFrame(beat_meta)
    beat_metadf.reset_index(inplace=True)
    beat_metadf['beat_len'] = np.concatenate(beat_lens)
    beat_mat = np.row_stack(beat_list)
    beat_mat = np.rollaxis(beat_mat, 0) # transpose s.t. Nbeat x Nchannel x Nsamp
    return beat_metadf, beat_mat
Example #8
def _nlp_sub(disc_clsdict, gold_clsdict, names, label, verbose, n_jobs):
    # ned
    ned = NED
    cov = coverage
    if verbose:
        print('  nlp ({2}): subsampled {0} files in {1} sets'
              .format(sum(map(len, names)), len(names), label))
    with verb_print('  nlp ({0}): calculating scores' .format(label), verbose, False, True, False):

        if n_jobs>1:
            ned_score = Parallel(n_jobs=n_jobs, verbose=5 if verbose else 0,
                          pre_dispatch='n_jobs')(delayed(ned)(disc_clsdict.restrict(ns, True))
                          for ns in names)
            cov_score = Parallel(n_jobs=n_jobs, verbose=5 if verbose else 0,
                          pre_dispatch='n_jobs')(delayed(cov)(disc_clsdict.restrict(ns, False),
                          gold_clsdict.restrict(ns, False)) for ns in names)
        else:
            ned_score, cov_score = [], []
            for ns in names:
                ned_score.append(ned(disc_clsdict.restrict(ns, True)))
                cov_score.append(cov(disc_clsdict.restrict(ns, False),
                                     gold_clsdict.restrict(ns, False)))

    # don't replace nan's by 1, but ignore them, unless all values in ned_score
    # are nan
    ned_score, cov_score = np.array(ned_score), np.array(cov_score)
    ned_score, cov_score = aggregate(ned_score, default_score=1), \
                           aggregate(cov_score, default_score=0)
    return np.array(ned_score), np.array(cov_score)
Example #9
def process_run(template_k,traindata,testdata,allsubs):

    roi_num = np.unique(parcellation_data[:,template_k])
    roi_num = roi_num[roi_num>0]

    roi_inds = []
    for roi in roi_num:
        roi_inds.append(np.where(parcellation_data[:,template_k] == roi)[0])

    arg_instances = [(k+1,ind,traindata,testdata,allsubs) for k,ind in enumerate(roi_inds)]

    print('starting computing with %i workers\n' % (n_workers), flush=True)
    start_time = time.time()
    if n_workers>1:
        results = Parallel(n_jobs=n_workers, max_nbytes='500M', mmap_mode='r')(map(delayed(process_roi), arg_instances))
    else:
        results=[]
        for args in arg_instances:
            results.append(process_roi(args))
    print('Done (took %.1fs)' % (time.time() - start_time), flush=True)

    print('writing aligned datasets')
    for sub_ind,sub in enumerate(allsubs):
        res = {'R_common':[],'S_stat':[]}
        for roi_res in results:
            res['R_common'].append(roi_res[0][sub_ind])
            res['S_stat'].append(roi_res[1][sub_ind])
        dd.io.save(testdata['target'][sub][0],res, compression=None)

    print('')
Example #10
def read_video_sample(vid_files,
                      fid,
                      cam_range,
                      calib_file,
                      read_dist=True,
                      read_parallel=False):
    # read calib
    calib_path = os.path.join(os.path.dirname(vid_files[0]), calib_file)
    calib = json_load(calib_path)

    K_list = [np.array(calib['K']['cam%d' % cid]) for cid in cam_range]
    M_list = [
        np.linalg.inv(np.array(calib['M']['cam%d' % cid])) for cid in cam_range
    ]
    if read_dist:
        dist_list = [
            np.array(calib['dist']['cam%d' % cid]) for cid in cam_range
        ]

    # read image
    img_list = list()
    if read_parallel:
        img_list = Parallel(n_jobs=len(cam_range))(
            delayed(read_vid_frame)(vid, fid) for vid in vid_files)

    else:
        for vid in vid_files:
            img_list.append(read_vid_frame(vid, fid))

    if read_dist:
        return img_list, K_list, M_list, dist_list
    return img_list, K_list, M_list
Example #11
def main(file_path,
         output_folder,
         file_type,
         download_n_files,
         max_size=None,
         n_jobs=1):
    df = pds.read_csv(file_path, sep=";").sample(frac=1)

    # naively filter the df to get only the desired file_type
    df = df[df.format == file_type]
    if download_n_files:
        df = df.iloc[:download_n_files]
    print(f"There are {len(df)} resources of type {file_type}")
    urls = df["url"].values
    resource_ids = df["id"].values
    dataset_ids = df["dataset.id"].values
    new_ids = dataset_ids + "--" + resource_ids
    organizations = df["dataset.organization"].fillna("NA").apply(
        lambda x: unidecode.unidecode(get_valid_filename(x))).values
    assert len(urls) == len(new_ids)

    if n_jobs > 1:
        success_downloaded = Parallel(n_jobs=n_jobs)(
            delayed(downloader)(url, id, org, output_folder, file_type,
                                max_size)
            for url, id, org in tqdm(list(zip(urls, new_ids, organizations))))
    else:
        success_downloaded = []
        for url, id, org in tqdm(list(zip(urls, new_ids, organizations))):
            success_downloaded.append(
                downloader(url, id, org, output_folder, file_type, max_size))
    print(
        f"I successfully downloaded {sum(success_downloaded)} of {len(success_downloaded)} files"
    )
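# Note: `downloader` is not shown above; since its results are summed as a count of
# successes, it is assumed to return True/False per resource. A hypothetical sketch
# (the destination layout is a guess):
import os
import requests

def downloader(url, new_id, organization, output_folder, file_type, max_size=None):
    """Stream one resource to <output_folder>/<organization>/<new_id>.<file_type>."""
    try:
        r = requests.get(url, stream=True, timeout=30)
        r.raise_for_status()
        size = int(r.headers.get("Content-Length", 0))
        if max_size is not None and size > max_size:
            return False
        dest_dir = os.path.join(output_folder, organization)
        os.makedirs(dest_dir, exist_ok=True)
        with open(os.path.join(dest_dir, f"{new_id}.{file_type}"), "wb") as f:
            for chunk in r.iter_content(chunk_size=1 << 20):
                f.write(chunk)
        return True
    except Exception:
        return False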
Example #12
def find_closest_auto(demofile, new_xyz):
    if args.parallel:
        from joblib import Parallel, delayed
    demo_clouds = [asarray(seg["cloud_xyz"]) for seg in demofile.values()]
    keys = list(demofile.keys())
    if args.parallel:
        costs = Parallel(n_jobs=3, verbose=100)(
            delayed(registration_cost)(demo_cloud, new_xyz)
            for demo_cloud in demo_clouds)
    else:
        costs = []
        for (i, ds_cloud) in enumerate(demo_clouds):
            costs.append(registration_cost(ds_cloud, new_xyz))
            print "completed %i/%i" % (i + 1, len(demo_clouds))

    print "costs\n", costs
    if args.show_neighbors:
        nshow = min(5, len(keys))
        import cv2, rapprentice.cv_plot_utils as cpu
        sortinds = np.argsort(costs)[:nshow]
        near_rgbs = [asarray(demofile[keys[i]]["rgb"]) for i in sortinds]
        bigimg = cpu.tile_images(near_rgbs, 1, nshow)
        cv2.imshow("neighbors", bigimg)
        print "press any key to continue"
        cv2.waitKey()

    ibest = np.argmin(costs)
    return keys[ibest]
Example #13
def extract_patches(img, offsets, patch_size, extract_batch_parallel=False):
    img = img.permute(0, 3, 1, 2)

    num_patches = offsets.shape[1]
    batch_size = img.shape[0]

    # Pad the images with zeros for cases where part of the patch falls outside the image
    pad_const = int(patch_size[0].item() / 2)
    pad_func = torch.nn.ConstantPad2d(pad_const, 0.0)
    img = pad_func(img)

    # Add the pad_const to the offsets, because everything is now shifted by pad_const
    offsets = offsets + pad_const

    all_patches = []

    # Extracting in parallel is more expensive than doing it sequentially, so this is left in only as an option
    if extract_batch_parallel:
        num_jobs = min(os.cpu_count(), batch_size)
        all_patches = Parallel(n_jobs=num_jobs)(
            delayed(_extract_patches_batch)(b, img, offsets, patch_size, num_patches) for b in range(batch_size))

    else:
        # Run sequentially over the elements in the batch
        for b in range(batch_size):
            patches = _extract_patches_batch(b, img, offsets, patch_size, num_patches)
            all_patches.append(patches)

    return torch.stack(all_patches)
Example #14
def read_exif_data(images):

    if use_joblib:

        from joblib import Parallel, delayed
        # results = Parallel(n_jobs=n_threads)(delayed(add_exif_data)(im) for im in images[0:10])
        results = Parallel(n_jobs=n_threads)(delayed(add_exif_data)(im)
                                             for im in images)

    elif n_threads == 1:

        results = []
        for im in images:
            results.append(add_exif_data(im))

    else:

        if use_threads:
            pool = ThreadPool(n_threads)
        else:
            pool = Pool(n_threads)

        results = list(pool.map(add_exif_data, images))

    return results
Example #15
def auto_choose(actionfile, new_xyz, nparallel=-1):
    """
    @param demofile: h5py.File object
    @param new_xyz : new rope point-cloud
    @nparallel     : number of parallel jobs to run for tps cost calculaion.
                     If -1 only 1 job is used (no parallelization).
    
    @return          : return the name of the segment with the lowest warping cost.
    """
    if not nparallel == -1:
        from joblib import Parallel, delayed
        nparallel = min(nparallel, 8)

    demo_data = list(actionfile.items())

    if nparallel != -1:
        before = time.time()
        redprint("auto choose parallel with njobs = %d"%nparallel)
        costs  = Parallel(n_jobs=nparallel, verbose=0)(delayed(registration_cost)(ddata[1]['cloud_xyz'][:], new_xyz) for ddata in demo_data)
        after  = time.time()
        print "Parallel registration time in seconds =", after - before
    else:
        costs = []
        redprint("auto choose sequential..")
        for i, ddata in enumerate(demo_data):
            costs.append(registration_cost(ddata[1]['cloud_xyz'][:], new_xyz))
            print(("tps-cost completed %i/%i" % (i + 1, len(demo_data))))

    ibest = np.argmin(costs)
    redprint ("auto choose returning..")
    return demo_data[ibest][0]
Example #16
    def analysis(self, permute=False):
        """
        Classify based an iteratively increasing the number of features (electrodes) included in the model. Starts with
        the single best electrode (N=1) and increase until N = the number of electrodes.

        Note: permute is not used in this analysis, but kept to match the same signature as super.
        """
        if self.subject_data is None:
            print('%s: compute or load data first with .load_data()!' % self.subject)

        # Get recalled or not labels
        if self.recall_filter_func is None:
            print('%s classifier: please provide a .recall_filter_func function.' % self.subject)
        y = self.recall_filter_func(self.subject_data)

        # zscore the data by session
        x = self.zscore_data()

        # create the classifier
        classifier = LogisticRegression(C=self.C, penalty=self.norm, solver='liblinear')

        # create .num_rand_splits of cv_dicts
        cv_dicts = [self._make_cross_val_labels() for _ in range(self.num_rand_splits)]

        # run permutations with joblib
        f = _par_compute_and_run_split
        if self.use_joblib:
            aucs = Parallel(n_jobs=12, verbose=5)(delayed(f)(cv, classifier, x, y) for cv in cv_dicts)
        else:
            aucs = []
            for cv in tqdm(cv_dicts):
                aucs.append(f(cv, classifier, x, y))

        # store results
        self.res['auc_x_n'] = np.stack(aucs)
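# Note: `_par_compute_and_run_split` is not shown above. Judging from the docstring
# and `np.stack(aucs)`, it is assumed to return one AUC per feature count (best
# 1..N electrodes) for a single train/test split. A hypothetical sketch only, not
# the project's implementation; the cv keys 'train_bool'/'test_bool', the 2-D
# (events x features) shape of x, and the boolean y are all assumptions:
import numpy as np
from sklearn.base import clone
from sklearn.metrics import roc_auc_score

def _par_compute_and_run_split(cv, classifier, x, y):
    train, test = cv['train_bool'], cv['test_bool']
    # rank electrodes by the absolute difference of class means on the training data
    ranking = np.argsort(-np.abs(x[train][y[train]].mean(0) - x[train][~y[train]].mean(0)))
    aucs = []
    for n in range(1, x.shape[1] + 1):
        cols = ranking[:n]
        clf = clone(classifier).fit(x[train][:, cols], y[train])
        probs = clf.predict_proba(x[test][:, cols])[:, 1]
        aucs.append(roc_auc_score(y[test], probs))
    return np.array(aucs)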
Example #17
def _extract_multi_patches_batch(b,
                                 imgs,
                                 offsets,
                                 patch_size,
                                 num_patches,
                                 samples_index,
                                 scales,
                                 extract_patch_parallel=False):
    """ Extract patches for a single batch. This function can be called in a for loop or in parallel.
        This functions returns a tensor of patches of size [num_patches, channels, width, height] """
    patches = []

    # Extracting in parallel is more expensive than doing it sequentially, so this is left in only as an option
    if extract_patch_parallel:
        num_jobs = min(os.cpu_count(), num_patches)
        patches = Parallel(n_jobs=num_jobs)(delayed(_extract_multi_patch)(imgs[
            samples_index[b, p]][b], offsets[b, p], patch_size)
                                            for p in range(num_patches))

    else:
        # Run extraction sequentially
        for p in range(num_patches):
            s = samples_index[b, p]
            patch = _extract_multi_patch(imgs[s][b], offsets[b, p], patch_size)
            # print("Extract patch from image scale %d"%s)
            # print("offset ", offsets[b, p])
            # print("img size ", imgs[s][b].shape)
            # showPatch(patch, imgs[s][b])
            patches.append(patch)

    return torch.stack(patches)
Example #18
    def evaluate_new_feature(self, prev_subset: list, new_feature, X_f: dict,
                             X_t: dict, y: np.array) -> float:
        A = prev_subset + [new_feature]
        scores = list()
        if self.n_jobs > 1:
            scores = Parallel(n_jobs=self.n_jobs)(
                delayed(self.score_function)(
                    A=A,
                    X_f=result['train']['transformed'],
                    X_f_test=result['test']['transformed'],
                    X_t=result['train']['plain'],
                    X_t_test=result['test']['plain'],
                    y=result['train']['target'],
                    y_test=result['test']['target'],
                    decision_function=clone(self.decision_function))
                for result in (split_dataset(X_t, X_f, y, self.seeds[i], 1 -
                                             self.train_share)
                               for i in range(self.n_cv_ffs)))
        else:
            for i in range(self.n_cv_ffs):
                result = split_dataset(X_t, X_f, y, self.seeds[i],
                                       1 - self.train_share)

                scores.append(
                    self.score_function(
                        A=A,
                        X_f=result['train']['transformed'],
                        X_f_test=result['test']['transformed'],
                        X_t=result['train']['plain'],
                        X_t_test=result['test']['plain'],
                        y=result['train']['target'],
                        y_test=result['test']['target'],
                        decision_function=self.decision_function))

        return float(np.mean(scores))
Example #19
def basic_compute_loop(compute_function,
                       looper,
                       run_parallel=True,
                       debug=None):
    """
	Canonical form of the basic compute loop.
	!!! remove this from contacts.py when it works
	"""
    #---send the frame as the debug argument
    if debug is not None and debug is not False:
        fr = debug
        incoming = compute_function(**looper[fr])
        import ipdb
        ipdb.set_trace()
        sys.exit()
    start = time.time()
    if run_parallel:
        incoming = Parallel(n_jobs=8, verbose=10 if debug else 0)(
            delayed(compute_function, has_shareable_memory)(**looper[ll])
            for ll in framelooper(len(looper), start=start))
    else:
        incoming = []
        for ll in framelooper(len(looper)):
            incoming.append(compute_function(**looper[ll]))
    return incoming
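# Note: a minimal usage sketch for the loop above (hypothetical names). `looper`
# is assumed to be a list of keyword-argument dicts, one per frame, and
# `framelooper` to yield indices into it; `compute_function` receives those
# keywords. The call itself is commented out because `framelooper` and
# `has_shareable_memory` come from the source project.
def compute_contacts(fr, cutoff=3.4):
    # placeholder per-frame computation
    return {'frame': fr, 'cutoff': cutoff}

looper = [dict(fr=fr, cutoff=3.4) for fr in range(100)]
# incoming = basic_compute_loop(compute_contacts, looper, run_parallel=True)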
Example #20
    def data(self):
        '''Do I need to worry about intake caching?
        
        Data will be dataframes for csvs and 
        Datasets for netcdf files.
        '''

        if not hasattr(self, '_data'):

            if self.parallel:
                num_cores = multiprocessing.cpu_count()
                downloads = Parallel(n_jobs=num_cores)(
                    delayed(self.data_by_dataset)(dataset_id)
                    for dataset_id in self.dataset_ids)
            else:
                downloads = []
                for dataset_id in self.dataset_ids:
                    downloads.append(self.data_by_dataset(dataset_id))


#             if downloads is not None:
            dds = {dataset_id: dd for (dataset_id, dd) in downloads}
            #             else:
            #                 dds = None

            self._data = dds

        return self._data
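# Note: `self.data_by_dataset` is not shown; the dict comprehension above unpacks
# its results as (dataset_id, data) pairs. A hypothetical sketch of that method's
# contract:
def data_by_dataset(self, dataset_id):
    # Placeholder: a real version would read the csv or netcdf source for this
    # dataset_id (DataFrame or Dataset) and return it alongside the id.
    data = None
    return dataset_id, data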
Example #21
def main():
    parser = argparse.ArgumentParser(description='Register & align images')
    parser.add_argument('filenames',nargs='+',help='List of target files to register. Images are aligned to first in list.')
    parser.add_argument('-odir',metavar='outdir',required=True,type=str,help='Output directory for files.')
    parser.add_argument('-m',metavar='method',choices=('point','extended'),default='extended',help='Specify alignment method (point or extended); default=extended.')
    parser.add_argument('-xy',nargs=2,type=float,default=None,help='Specify approximate "x y" pixel coordinate of object to centroid on.  Required for point mode; useful for extended mode (default=center of image).')
    parser.add_argument('-box',nargs=2,type=int,default=None,help='Specify box size (w h) to restrict alignment search.  Useful for both point & extended modes (default=full size of array).')
    parser.add_argument('--c',action='store_true',help='Clobber (overwrite) on output')
    parser.add_argument('-njobs',type=int,default=1,help='Process images in parallel. "-1" is all CPUs (default=1).')
    
    args = parser.parse_args()

    if args.m == 'point' and args.xy is None:
        parser.error("-m point requires -xy coordinate")

    # create output directory
    if args.odir not in ['','.']:
        makedirs(args.odir,exist_ok=True)

    # align all images to first filename
    ref = args.filenames[0]
    align = args.filenames[1:]

    imref = partial(register,ref=ref,outdir=args.odir,
                    method=args.m,center=args.xy,size=args.box,
                    overwrite=args.c)
    
    outfiles = Parallel(n_jobs=args.njobs,verbose=11)(delayed(imref)(toshift=a) for a in align)

    # Write ref to outdir
    refnew = os.path.join(args.odir,os.path.basename(ref))
    copy(ref,refnew)

    outfiles.append(refnew)
    print('Wrote %i files to %s' % (len(outfiles), args.odir))
Example #22
    def meta(self):
        
        if not hasattr(self, '_meta'):
            
            if self.parallel:
            
                # get metadata for datasets
                # run in parallel to save time
                num_cores = multiprocessing.cpu_count()
                downloads = Parallel(n_jobs=num_cores)(
                    delayed(self.meta_by_dataset)(dataset_id) for dataset_id in self.dataset_ids
                )
                
            else:

                downloads = []
                for dataset_id in self.dataset_ids:
                    downloads.append(self.meta_by_dataset(dataset_id))

            # make dict from individual dicts
            from collections import ChainMap
            meta = dict(ChainMap(*downloads)) 

            # Make dataframe of metadata
            # variable names are the column names for the dataframe
            self._meta = pd.DataFrame.from_dict(meta, orient='index', 
                                                columns=['database','download_url'] \
                                                + self.columns + ['variable names'])
           
        return self._meta       
Example #24
    def fit_discover(self, D, return_tids=False):
        """fit LCM on the transactional database, and return the set of
        closed itemsets in this database, with respect to the minium support
        Different from ``fit_transform``, see the `Returns` section below.
        Parameters
        ----------
        D : pd.Series or Iterable
            The input transactional database
            Where every entry contain singular items
            Items must be both hashable and comparable
        return_tids: bool
            Either to return transaction ids along with itemset.
            Default to False, will return supports instead
        Returns
        -------
        pd.DataFrame
            DataFrame with the following columns
                ==========  =================================
                itemset     a `tuple` of co-occured items
                support     frequence for this itemset
                ==========  =================================
            if `return_tids=True` then
                ==========  =================================
                itemset     a `tuple` of co-occured items
                tids        a bitmap tracking positions
                ==========  =================================
        Example
        -------
        from skmine.preprocessing import LCM
        D = [[1, 2, 3, 4, 5, 6], [2, 3, 5], [2, 5]]
        LCM(min_supp=2).fit_discover(D)
             itemset  support
        0     (2, 5)        3
        1  (2, 3, 5)        2
        LCM(min_supp=2).fit_discover(D, return_tids=True)  # doctest: +SKIP
             itemset       tids
        0     (2, 5)  [0, 1, 2]
        1  (2, 3, 5)     [0, 1]
        """

        self._fit(D)

        empty_df = pd.DataFrame(columns=['itemset', 'tids'])

        # reverse order of support
        supp_sorted_items = sorted(self.item_to_tids.items(),
                                   key=lambda e: len(e[1]),
                                   reverse=True)

        dfs = Parallel(n_jobs=self.n_jobs, prefer='processes')(
            delayed(self._explore_item)(item, tids)
            for item, tids in supp_sorted_items)

        dfs.append(empty_df)  # make sure we have something to concat
        df = pd.concat(dfs, axis=0, ignore_index=True)
        if not return_tids:
            df.loc[:, 'support'] = df['tids'].map(len).astype(np.uint32)
            df.drop('tids', axis=1, inplace=True)
        return df
Example #25
 def calc_fitness(self,use_parallel=False):
     if use_parallel:
         errors = Parallel(n_jobs=num_cores)(delayed(self.fitness)(i) for i in tqdm(self.poolarray))
     else:
         errors = []
         for i in self.poolarray:
             errors.append(self.fitness(i))
     self.fitnessmap = errors
Example #26
def run_jobs(jobs,
             joblib=True,
             n_jobs=4,
             chunks=1,
             chunk_callback=None,
             *args,
             **kwargs):
    if len(jobs) == 0:
        return None, None

    if joblib:
        jobs = [delayed(job)() for job in jobs]

        chunk_size = max(1, len(jobs) // chunks)
        chunks = [
            jobs[i:i + chunk_size] for i in range(0, len(jobs), chunk_size)
        ]

        out = []
        for chunk in chunks:
            chunk_out = Parallel(n_jobs=n_jobs, *args, **kwargs)(chunk)
            if chunk_callback is not None:
                ret = chunk_callback(chunk_out,
                                     args=[job[0].args for job in chunk],
                                     kwargs=[job[0].keywords for job in chunk])
                out.append((chunk_out, ret))
            else:
                out.append((chunk_out, ))
    else:
        out = []
        # create chunks
        nr_chunks = chunks
        chunk_size = max(1, len(jobs) // chunks)
        chunks = [
            jobs[i:i + chunk_size] for i in range(0, len(jobs), chunk_size)
        ]

        for j, chunk in enumerate(chunks):
            chunk_out = []

            for i, job in enumerate(chunk):
                if 'verbose' in kwargs and kwargs['verbose']:
                    print('\r\r Chunk %d / %d' % (j, nr_chunks) +
                          '\n Working on job %d/%d, ' % (i, len(chunk)) +
                          '\n args: %s, \n kwargs: %s' %
                          (', '.join(map(str, job.args)), ', '.join(
                              [str(tup) for tup in job.keywords.items()])))
                chunk_out.append(job())

            if chunk_callback is not None:
                ret = chunk_callback(chunk_out,
                                     args=[job.args for job in chunk],
                                     kwargs=[job.keywords for job in chunk])
                out.append((chunk_out, ret))
            else:
                out.append((chunk_out, ))

    return list(el[0] for el in zip(*out))
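# Note: `run_jobs` above invokes each job with no arguments and, for the chunk
# callback, reads `job.args` and `job.keywords`; `functools.partial` objects
# satisfy that contract. A hypothetical usage sketch (the call is commented out
# because run_jobs itself needs joblib's Parallel/delayed imported):
from functools import partial

def simulate(seed, n=1000):
    # placeholder workload
    return seed * n

jobs = [partial(simulate, seed, n=500) for seed in range(8)]
# outputs = run_jobs(jobs, joblib=True, n_jobs=4, chunks=2, verbose=5)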
Example #27
    def prepare_data(self, midi_paths):
        if self.train_from_scratch:
            midis = Parallel(n_jobs=len(os.sched_getaffinity(0)))(
                delayed(data_augmentation)(midi_paths[midi]) for midi in tqdm.trange(len(midi_paths)))
            midis = [item for sublist in midis for item in sublist]
            # midis = midi_paths
            all_events = Parallel(n_jobs=len(os.sched_getaffinity(0)))(
                delayed(extract_events)(midis[path]) for path in tqdm.trange(len(midis)))
            all_events = list(filter(None, all_events))
            total_events = [item for sublist in all_events for item in sublist]
            dictionary = list(set(total_events))
            self.word2event = dict(zip(range(len(dictionary)), dictionary))
            self.event2word = dict(zip(dictionary, range(len(dictionary))))
            with open(self.dictionary_path, "wb") as file:
                pickle.dump((self.event2word, self.word2event), file)
        # extract events
        all_events = []
        for path in midi_paths:
            events = extract_events(path, self.use_chord)
            all_events.append(events)
        # event to word
        all_words = []
        for events in all_events:
            words = []
            for event in events:

                if event in self.event2word:
                    words.append(self.event2word[event])
                else:
                    # OOV
                    if event.name == 'Note Velocity':
                        # replace with max velocity based on our training data
                        words.append(self.event2word['Note Velocity_21'])
                    else:
                        # something is wrong
                        # you should handle it for your own purpose
                        print('something is wrong! {}'.format(event))
            all_words.append(words)
        # to training data
        self.group_size = 5
        segments = []
        for words in all_words:
            pairs = []
            for i in range(0, len(words) - self.x_len - 1, self.x_len):
                x = words[i:i + self.x_len]
                y = words[i + 1:i + self.x_len + 1]
                pairs.append([x, y])
            pairs = np.array(pairs)
            # abandon the last
            for i in np.arange(0, len(pairs) - self.group_size, self.group_size * 2):
                data = pairs[i:i + self.group_size]
                if len(data) == self.group_size:
                    segments.append(data)
        segments = np.array(segments)
        if self.train_from_scratch:
            self.n_token = len(self.event2word)
            self.load_model()
        return segments
Example #28
def main_get_data(parallel: bool = False, n_jobs: int = -2):
    """Get data from sources and export to output folder.
    
    Is equivalent to script `run_python_scripts.py`
    """
    def _get_data_country(module_name):
        country = module_name.split(".")[-1]
        if country.lower() in SCRAPING_SKIP_COUNTRIES:
            logger.info(f"{module_name} skipped!")
            return {
                "module_name": module_name,
                "success": None,
                "skipped": True
            }
        logger.info(f"{module_name}: started")
        module = importlib.import_module(module_name)
        try:
            module.main()
        except Exception as err:
            success = False
            logger.error(f"{module_name}: {err}", exc_info=True)
        else:
            success = True
            logger.info(f"{module_name}: SUCCESS")
        return {
            "module_name": module_name,
            "success": success,
            "skipped": False
        }
    if parallel:
        modules_execution_results = Parallel(n_jobs=n_jobs, backend="threading")(
            delayed(_get_data_country)(module_name) for module_name in modules_name
        )
    else:
        modules_execution_results = []
        for module_name in modules_name:
            modules_execution_results.append(_get_data_country(module_name))

    modules_failed = [m["module_name"] for m in modules_execution_results if m["success"] is False]
    # Retry failed modules
    print(f"\n---\n\nRETRIALS ({len(modules_failed)})")
    modules_failed_retrial = []
    for module_name in modules_failed:
        date_str = datetime.now().strftime("%Y-%m-%d %X")
        print(f">> {date_str} - {module_name} - (RETRIAL)")
        module = importlib.import_module(module_name)
        try:
            module.main()
        except Exception as err:
            modules_failed_retrial.append(module_name)
            logger.error(err, exc_info=True)
            print()

    if len(modules_failed_retrial) > 0:
        print(f"\n---\n\nThe following scripts failed to run ({len(modules_failed_retrial)}):")
        print("\n".join([f"* {m}" for m in modules_failed_retrial]))
Example #29
 def run(self, parallel=True, combine=True):
     simulations = []
     if parallel:
         simulations = Parallel(n_jobs=-1)(delayed(self.simulate)()
                                           for i in range(self.n_sims))
     else:
         for i in range(self.n_sims):
             simulations.append(self.simulate())
     self.simulations = Simulations(simulations, combine)
     return self
Example #30
    def fit_Gaussian2D_wrapper(self, PSF_List, scale=5, internal_parallel_flag=False):
        """
        PSF localization using fit_Gaussian2D.

        Parameters
        ----------
        PSF_List: pandas dataframe
            The data frame contains PSFs locations( x, y, frame, sigma)

        scale: int
            The ROI around PSFs is defined using this scale, which is based on their sigmas.

        internal_parallel_flag: bool
            Internal flag for activating parallel computation. Default is False.

        Returns
        -------
        df: pandas dataframe
            The data frame contains PSFs locations ( 'y', 'x', 'frame', 'center_intensity', 'sigma', 'Sigma_ratio') and fitting information.
            fit_params is a list include ('Fit_Amplitude', 'Fit_X-Center', 'Fit_Y-Center', 'Fit_X-Sigma', 'Fit_Y-Sigma',
            'Fit_Bias', 'Fit_errors_Amplitude', 'Fit_errors_X-Center', 'Fit_errors_Y-Center', 'Fit_errors_X-Sigma', 'Fit_errors_Y-Sigma', 'Fit_errors_Bias'].
        """

        if type(PSF_List) is list:
            df_PSF = data_handeling.list2dataframe(feature_position=PSF_List, video=self.video)
        elif type(PSF_List) is pd.core.frame.DataFrame:
            df_PSF = PSF_List
        else:
            raise ValueError('PSF_List does not have correct bin_type')

        self.df2numpy = df_PSF.to_numpy()

        if self.cpu.parallel_active and internal_parallel_flag:
            print('\n---Fitting 2D gaussian with parallel loop---')
            list_df = Parallel(n_jobs=self.cpu.n_jobs, backend=self.cpu.backend, verbose=self.cpu.verbose)(delayed(
                self.fit_2D_gussian_kernel)(i_, scale) for i_ in tqdm(range(self.df2numpy.shape[0])))

        else:
            print('\n---Fitting 2D gaussian without parallel loop---')
            list_df = []
            for i_ in tqdm(range(self.df2numpy.shape[0])):
                tmp = self.fit_2D_gussian_kernel(i_, scale)
                list_df.append(tmp)

        df2numpy = np.asarray(list_df)

        if df2numpy.shape[0] != 0:
            df = pd.DataFrame(data=df2numpy, columns=['y', 'x', 'frame', 'center_intensity', 'sigma', 'Sigma_ratio',
                                                       'Fit_Amplitude', 'Fit_X-Center', 'Fit_Y-Center', 'Fit_X-Sigma',
                                                       'Fit_Y-Sigma', 'Fit_Bias', 'Fit_errors_Amplitude',
                                                       'Fit_errors_X-Center', 'Fit_errors_Y-Center', 'Fit_errors_X-Sigma',
                                                       'Fit_errors_Y-Sigma', 'Fit_errors_Bias'])
        else:
            df = None
        return df
Example #31
def custom_query_validation(query, request, request_page):
    global query_appendix
    global total
    global product_appendix

    if product_appendix:
        product_appendix = []

    available = products_with_details(request.user)
    query = list(set(query.split(' ')))
    queryset = Q()
    for q in query:
        query_appendix[q] = 0
        queryset = queryset | Q(details__icontains=q)
    product_details = CleanProductDetails.objects.filter(
        product_id__in=available).filter(queryset)

    if product_details:
        for q in query_appendix:
            for item in product_details:
                if q in item.details:
                    query_appendix[q] += 1

        total = len(product_details)
        Parallel(n_jobs=psutil.cpu_count() * 2,
                 verbose=50,
                 require='sharedmem')(map(delayed(check_similarity),
                                          product_details))
        print('Job Done')
        product_appendix = sorted(product_appendix,
                                  key=itemgetter('similarity'),
                                  reverse=True)
        product_appendix = [item['id'].pk for item in product_appendix]
        print('Sorted')
        start = (settings.PAGINATE_BY * (request_page - 1))
        end = start + (settings.PAGINATE_BY)
        products = product_appendix[start:end]

        results = []
        results = Parallel(
            n_jobs=psutil.cpu_count() * 2,
            verbose=50,
            require='sharedmem',
            backend="threading")(delayed(render_item)(Product.objects.get(
                id=item), request.discounts, request.currency)
                                 for item in products)
        front = [i for i in range((start))]

        results = front + results

        for item in product_appendix[end:]:
            results.append(item)
        return results
    else:
        return []
Example #32
def _load_corpus(self, **kwargs):
    """
    Generic loader for corpus or contents
    """
    from .corpus import Corpus
    from .dataset import Dataset
    from . import multi

    # current favourite line in buzz codebase :P
    multiprocess = multi.how_many(kwargs.pop("multiprocess", self.is_parsed))
    to_iter = self.files if isinstance(self, Corpus) else self
    order = {f.path: i for i, f in enumerate(to_iter, start=1)}

    # i would love to only ever use joblib, and therefore just use the first
    # part of these conditionals, but django and joblib don't play nice.
    if multiprocess and multiprocess > 1:
        chunks = np.array_split(to_iter, multiprocess)
        if self.is_parsed:
            delay = (multi.load(x, i, order=order, **kwargs)
                     for i, x in enumerate(chunks))
        else:
            delay = (multi.read(x, i) for i, x in enumerate(chunks))
        loaded = Parallel(n_jobs=multiprocess)(delay)
        # unpack the nested list that multiprocessing creates
        loaded = [item for sublist in loaded for item in sublist]
    else:
        kwa = dict(ncols=120, unit="file", desc="Loading", total=len(self))
        t = tqdm(**kwa) if len(to_iter) > 1 else None
        loaded = list()
        for i, file in enumerate(to_iter, start=1):
            data = file.load(**kwargs) if file.is_parsed else file.read()
            if data is not None:
                if "order" not in data.columns:
                    data["order"] = i
                loaded.append(data)
            _tqdm_update(t)
        _tqdm_close(t)

    # for unparsed corpora, we give a dict of {path: text}
    # this used to be an OrderedDict, but dict order is now guaranteed.
    if not self.is_parsed:
        keys = self.filepaths if self.is_parsed else [
            i.path for i in self.files
        ]
        return dict(sorted(zip(keys, loaded)))

    # for parsed corpora, we merge each file contents into one huge dataframe
    df = pd.concat(loaded, sort=False)

    df["_n"] = range(len(df))
    if kwargs.get("set_data_types", True):
        df = _set_best_data_types(df)
    df = _order_df_columns(df)
    print("\n" * multiprocess)  # not sure if this really helps
    return Dataset(df, reference=df, name=self.name)
Example #33
    def extract(self, plot=True):
        # Prep jobs (one per coordinate)
        print("preparing jobs...")
        J = []  # jobs
        for i, sample in self.coords.iterrows():
            coord = np.array([sample.coordZ, sample.coordY, sample.coordX])
            if not pd.isnull(sample.coordZ):
                # job: (path to scan, coordinate, instance shape, coord system 'vox' or 'world')
                J.append([
                    os.path.join(self.src_dir, sample.seriesuid + '.mhd'),
                    coord, config['cube_shape'], self.coordSystem
                ])

        print("extracting and augmenting samples...")
        if self.parallelize:
            num_cores = int(
                np.ceil(
                    min(np.ceil(multiprocessing.cpu_count() * 0.75), len(J))))
            X = Parallel(n_jobs=num_cores)(delayed(self._processJob)(j)
                                           for j in J)
        else:
            X = []
            for job in J:
                try:
                    X.append(self._processJob(job))  # this step can raise an error
                except Exception:
                    print("Failed to process sample")

        instances = np.array(
            list(itertools.chain.from_iterable(X))
        )  # each job creates a batch of augmented instances, so collect them
        print('instance_shape:', instances.shape)
        # Histogram Equalization:
        print("equalizing the data...")
        eq = histEq(instances)
        instances = eq.equalize(instances)
        os.makedirs(self.norm_save_dir, exist_ok=True)
        eq.save(path=os.path.join(self.norm_save_dir, 'equalization.pkl'))

        # -1 1 Normalization
        print("normalizing the data...")
        min_v = np.min(instances)
        max_v = np.max(instances)
        mean_v = np.mean(instances)
        norm_data = np.array([mean_v, min_v, max_v])
        instances = (instances - mean_v) / (max_v - min_v)
        np.save(os.path.join(self.norm_save_dir, 'normalization.npy'),
                norm_data)

        if plot:
            self.plot_sample(instances)

        print("saving the dataset")
        np.save(self.dst_path, instances)
Example #34
def preprocess_from_ray_parallel_inference(dirpath, mode, use_parallel=True):
   filenames = os.listdir(os.path.join(dirpath, mode))
   if use_parallel:
       num_cores = multiprocessing.cpu_count()
       preproc_list = Parallel(n_jobs=num_cores)(
           delayed(process_audio_files_inference)(filename, dirpath, mode) for filename in tqdm(filenames))
   else:
       preproc_list=[]
       for filename in tqdm(filenames):
           preproc_list.append(process_audio_files_inference(filename, dirpath, mode))
   return preproc_list
Example #35
def basic_compute_loop(compute_function,looper,run_parallel=True,debug=False):
	"""Canonical form of the basic compute loop."""
	start = time.time()
	if run_parallel:
		incoming = Parallel(n_jobs=8,verbose=10 if debug else 0)(
			delayed(compute_function,has_shareable_memory)(**looper[ll]) 
			for ll in framelooper(len(looper),start=start))
	else: 
		incoming = []
		for ll in framelooper(len(looper)):
			incoming.append(compute_function(**looper[ll]))
	return incoming
Example #36
def auto_choose(actionfile, new_xyz, softmin_k=1, softmin_alpha=1, nparallel=-1):
    """
    @param demofile  : h5py.File object
    @param new_xyz   : new rope point-cloud
    @param softmin   : use softmin distribution over first <softmin> demonstrations
                       set to 1 for nearest neighbor
    @param nparallel : number of parallel jobs to run for tps cost calculaion
                       set to -1 for no parallelization
    
    @return          : return the name of the segment with the lowest warping cost.
    """
    if not nparallel == -1:
        from joblib import Parallel, delayed
        nparallel = min(nparallel, 8)

    demo_data = list(actionfile.items())

    if nparallel != -1:
        before = time.time()
        redprint("auto choose parallel with njobs = %d"%nparallel)
        costs  = Parallel(n_jobs=nparallel, verbose=100)(delayed(registration_cost)(ddata[1]['cloud_xyz'][:], new_xyz) for ddata in demo_data)
        after  = time.time()
        print "Parallel registration time in seconds =", after - before
    else:
        costs = []
        redprint("auto choose sequential..")
        for i, ddata in enumerate(demo_data):
            costs.append(registration_cost(ddata[1]['cloud_xyz'][:], new_xyz))
            print(("tps-cost completed %i/%i" % (i + 1, len(demo_data))))
    
    # use a random draw from the softmin distribution
    demo_costs = zip(costs, demo_data)
    if softmin_k == 1:
        ibest = np.argmin(costs)
        return demo_data[ibest][0]
    best_k_demos = np.asarray(sorted(demo_costs)[:softmin_k])
    best_k_exps = np.exp(-1*softmin_alpha*best_k_demos[:, 0].astype(float))  #multiply by -1 b/c we're actually min-ing
    if len(best_k_exps) > 1:
        denom = sum(best_k_exps)
    else:
        denom = best_k_exps
    mass_fn = best_k_exps/denom

    draw = random.random()
    for i in range(len(best_k_demos)):
        if draw <= mass_fn[i]:
            ret_val = demo_data[i][0]
            break
        draw -= mass_fn[i]
    
    redprint ("auto choose returning..")
    return ret_val
Example #37
	def train(self):
		regressors = []
		if self.parallel:
			regressors = Parallel(n_jobs=-1)(delayed(trainBin)(self.params[b], np.atleast_2d(self.ind).T, self.dep[b],self.indWeights) for b in self.OD.bins)
		else:
			for b in self.OD.bins:
				regressors.append(trainBin(self.params[b],np.atleast_2d(self.ind).T, self.dep[b],self.indWeights))
				#self.svr[b] = SVR(cache_size=1000,kernel='rbf', C=self.params[b]['C'], gamma=self.params[b]['gamma'])
				#self.svr[b].fit(np.array([self.ind]).T,self.dep[b])
				
		
		for i,model in enumerate(regressors):
			self.svr[self.OD.bins[i]] = model
Example #38
def run_all(cnf, samples, process_one, finalize_one, finalize_all):
    if len(samples) == 1:
        sample_name, sample_cnf = list(samples.items())[0]
        run_one(sample_cnf, process_one, finalize_one)
    else:
        results = []
        if cnf.get('parallel'):
            try:
                from joblib import Parallel, delayed
            except ImportError:
                critical(
                    '\nERROR: Joblib not found. You may want samples to be processed '
                    'in parallel; in this case, make sure the python joblib package is installed '
                    '(pip install joblib).')
            else:
                for sample_name, sample_cnf in samples.items():
                    sample_cnf['verbose'] = False

                results = Parallel(n_jobs=len(samples)) \
                    (delayed(run_one)(sample_cnf, process_one, finalize_one,
                                      multiple_samples=True)
                        for sample_name, sample_cnf in samples.items())
        else:
            results = []
            for sample_name, sample_cnf in samples.items():
                results.append(
                    run_one(sample_cnf, process_one, finalize_one,
                            multiple_samples=True))

        if samples:
            info('')
            info('*' * 70)
            info('Results for each sample:')
            finalize_all(cnf, samples, results)

    # Cleaning
    for name, data in samples.items():
        work_dirpath = data['work_dir']
        tx_dirpath = join(work_dirpath, 'tx')

        if isdir(tx_dirpath):
            shutil.rmtree(tx_dirpath)

        if not data.get('keep_intermediate') \
                and isdir(work_dirpath):
            shutil.rmtree(work_dirpath)
Example #39
def auto_choose(demofile, new_xyz, only_original_segments):
    """
    @param demofile:
    @param new_xyz:
    @param only_original_segments: if true, only the original segments will be registered against
    @return: the key of the segment with the lowest warping cost
    """
    import pprint

    """Return the segment with the lowest warping cost. Takes about 2 seconds."""
    parallel = True
    if parallel:
        from joblib import Parallel, delayed
    items = demofile.items()
    if only_original_segments:
        #remove all derived segments from items
        print("Only registering with the original segments")
        items = [item for item in items if not "derived" in item[1].keys()]
    unzipped_items = list(zip(*items))
    keys = unzipped_items[0]
    values = unzipped_items[1]
    ds_clouds, shapes = get_downsampled_clouds(values)
    ds_new = clouds.downsample(new_xyz, 0.01 * DS_SIZE)
    #print 'ds_new_len shape', ds_new.shape
    if parallel:
        before = time.time()
        #TODO: change back n_jobs=12 ?
        costs = Parallel(n_jobs=8, verbose=0)(delayed(registration_cost)(ds_cloud, ds_new) for ds_cloud in ds_clouds)
        after = time.time()
        print "Parallel registration time in seconds =", after - before
    else:
        costs = []
        for (i, ds_cloud) in enumerate(ds_clouds):
            costs.append(registration_cost(ds_cloud, ds_new))
            print(("completed %i/%i" % (i + 1, len(ds_clouds))))
            #print(("costs\n", costs))
    ibest = np.argmin(costs)
    print "ibest = ", ibest
    #pprint.pprint(zip(keys, costs, shapes))
    #print keys
    print "best key = ", keys[ibest]
    print "best cost = ", costs[ibest]
    return keys[ibest]
Example #40
 def find_TADs(self, data, gammalist=range(10, 110, 10), segmentation='potts',
               minlen=3, drop_gamma=False, n_jobs='auto'):
     '''
     Finds TADs in data with a list of gammas. Returns a pandas DataFrame
     with columns 'Start', 'End' and 'Gamma'. Use genome_intervals_to_chr on
     the returned object to get coordinates in bed-style format and not in
     coordinates of concatenated genome.
     If *drop_gamma*, drops the 'Gamma' column (useful when using 1 gamma)
     '''
     raise DeprecationWarning('Will be deprecated or rewritten to use '
                              'lavaburst: github.com/nezar-compbio/lavaburst')
     if n_jobs == 'auto': #Empirical values on my computer; with >8 Gb memory try increasing n_jobs
         if segmentation == 'potts':
             n_jobs = 3
         elif segmentation == 'armatus':
             n_jobs = 6
     if not np.isfinite(data).all():
         print('Non-finite values in data, substituting them with zeroes')
         data[~np.isfinite(data)] = 0
     Wcomm, Wnull, pass_mask, length = _precalculate_TADs_in_array(data)
     f = _calculate_TADs
     if n_jobs >= 1:
         from joblib import Parallel, delayed
         domains = Parallel(n_jobs=n_jobs, max_nbytes=1e6)(
                           delayed(f)(Wcomm, Wnull, pass_mask, length, g, segmentation)
                                                                    for g in gammalist)
     elif n_jobs is None or n_jobs == False or n_jobs == 0:
         domains = []
         for g in gammalist:
             domains_g = f(Wcomm, Wnull, pass_mask, length, g, segmentation)
             domains.append(domains_g)
     domains = pd.concat(domains, ignore_index=True)
     domains = domains.query('End-Start>='+str(minlen)).copy()
     domains = domains.sort(columns=['Gamma', 'Start', 'End'])
     domains.reset_index(drop=True, inplace=True)
     domains[['Start', 'End']] = domains[['Start', 'End']].astype(int)
     domains[['Start', 'End']] *= self.resolution
     domains = domains[['Start', 'End', 'Score', 'Gamma']]
     if drop_gamma:
         domains.drop('Gamma', axis=1, inplace=True)
     domains = self.genome_intervals_to_chr(domains).reset_index(drop=True)
     return domains
Example #41
0
def main():
    """
    Main function.

    1. Setup logging
    2. Get arguments
    3. Get index
    4. Process files
    5. Write output
    """

    setup_logging()

    logger = logging.getLogger("stats." + __name__)

    args = get_args()

    index = get_index(args)

    logger.warning("Positions not in annotation will be ignored.")

    logger.info("Found " + str(len(args.inputs)) + " input file(s):")
    for input_file in sorted(args.inputs):
        logger.debug(input_file)

    if args.is_parallel:
        stats = Parallel(n_jobs=args.parallel,
                         verbose=100,
                         batch_size=1)(delayed(process_file)(input_file,
                                                             args.type,
                                                             index,
                                                             args.is_parallel)
                                       for input_file in args.inputs)
    else:
        stats = []
        for input_file in args.inputs:
            output_table = process_file(input_file, args.type, index,
                                        args.is_parallel)
            stats.append(output_table)

    write_stats(args.out, stats)
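The call above passes batch_size=1 so joblib dispatches one input file per task, which balances the load when per-file runtimes vary widely. A small, self-contained sketch of that behaviour, with sleep times standing in for real files (process_item is hypothetical):

import time
from joblib import Parallel, delayed

def process_item(duration):
    # stand-in for process_file(): items vary a lot in runtime
    time.sleep(duration)
    return duration

durations = [0.05, 0.5, 0.05, 0.5, 0.05, 0.5]
stats = Parallel(n_jobs=2, verbose=10, batch_size=1)(
    delayed(process_item)(d) for d in durations)
print(stats)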
Example #42
0
def findPeaks(imgdict, maplist, params, maptype="ccmaxmap", pikfile=True):
	peaktreelist = []
	count = 0

	thresh =    float(params["thresh"])
	bin =       int(params["bin"])
	diam =      float(params["diam"])
	apix =      float(params["apix"])
	olapmult =  float(params["overlapmult"])
	maxpeaks =  int(params["maxpeaks"])
	maxthresh = params["maxthresh"]
	maxsizemult = float(params["maxsize"])
	peaktype =  params["peaktype"]
	msg =       not params['background']
	pixdiam =   diam/apix/float(bin)
	pixrad =    diam/apix/2.0/float(bin)

	numpyVersion = float(numpy.version.version[:3])
	if numpyVersion > 1.7:
		peaktreelist = Parallel(n_jobs=params['nproc'])(delayed(runFindPeaks)(params,
			maplist,maptype,pikfile,thresh,pixdiam,count,olapmult,maxpeaks,maxsizemult,
			msg,bin,peaktype,pixrad,imgdict) for count in range(0,len(maplist)))
	else:
		## backup for AttributeError: 'memmap' object has no attribute 'offset', bug #3322
		peaktreelist = []
		for count in range(0,len(maplist)):
			mappeaktree = runFindPeaks(params,maplist,maptype,pikfile,thresh,pixdiam,count,olapmult,
				maxpeaks,maxsizemult,msg,bin,peaktype,pixrad,imgdict)
			peaktreelist.append(mappeaktree)

	peaktree = mergePeakTrees(imgdict, peaktreelist, params, msg, pikfile)

	#max threshold
	if maxthresh is not None:
		precount = len(peaktree)
		peaktree = maxThreshPeaks(peaktree, maxthresh)
		postcount = len(peaktree)
		#if precount != postcount:
		apDisplay.printMsg("Filtered %d particles above threshold %.2f"%(precount-postcount,maxthresh))

	return peaktree
Example #43
0
def pmultiquery(corpus, 
                search,
                show='words',
                query='any', 
                sort_by='total', 
                save=False,
                multiprocess='default', 
                just_speakers=False,
                root=False,
                note=False,
                print_info=True,
                **kwargs
               ):
    """
    - Parallel process multiple queries or corpora.
    - This function is used by corpkit.interrogator.interrogator()
    - for multiprocessing.
    - There's no reason to call this function yourself."""
    import os
    from pandas import DataFrame, Series
    import pandas as pd
    import collections
    from collections import namedtuple, OrderedDict
    from time import strftime, localtime
    import corpkit
    from corpkit.interrogator import interrogator
    from corpkit.interrogation import Interrogation
    try:
        from joblib import Parallel, delayed
    except ImportError:
        pass
    import multiprocessing

    locs = locals()
    for k, v in kwargs.items():
        locs[k] = v
    in_notebook = locs.get('in_notebook')

    def best_num_parallel(num_cores, num_queries):
        """decide how many parallel processes to run

        the idea, more or less, is to balance the load when possible"""
        import corpkit
        if num_queries <= num_cores:
            return num_queries
        if num_queries > num_cores:
            if (num_queries / num_cores) == num_cores:
                return int(num_cores)
            if num_queries % num_cores == 0:
                try:
                    return max([int(num_queries / n) for n in range(2, num_cores) \
                               if int(num_queries / n) <= num_cores])   
                except ValueError:
                    return num_cores
            else:
                import math
                if (float(math.sqrt(num_queries))).is_integer():
                    square_root = math.sqrt(num_queries)
                    if square_root <= num_queries / num_cores: 
                        return int(square_root)    
        return num_cores

    num_cores = multiprocessing.cpu_count()

    # what is our iterable? ...
    multiple_option = False
    multiple_queries = False
    multiple_speakers = False
    multiple_corpora = False
    multiple_search = False
    mult_corp_are_subs = False
    denom = 1

    if hasattr(corpus, '__iter__'):
        multiple_corpora = True
        num_cores = best_num_parallel(num_cores, len(corpus))
        denom = len(corpus)
        if all(c.__class__ == corpkit.corpus.Subcorpus for c in corpus):
            mult_corp_are_subs = True
    elif (isinstance(query, (list, dict)) and not hasattr(search, '__iter__')):
            multiple_queries = True
            num_cores = best_num_parallel(num_cores, len(query))
            denom = len(query)
    elif hasattr(search, '__iter__') and all(isinstance(i, dict) for i in list(search.values())):
        multiple_search = True
        num_cores = best_num_parallel(num_cores, len(list(search.keys())))
        denom = len(list(search.keys()))

    elif just_speakers:
        from build import get_speaker_names_from_xml_corpus
        multiple_speakers = True
        if just_speakers == 'each' or just_speakers == ['each']:
            just_speakers = get_speaker_names_from_xml_corpus(corpus.path)
        if len(just_speakers) == 0:
            print('No speaker name data found.')
            return
        num_cores = best_num_parallel(num_cores, len(just_speakers))
        denom = len(just_speakers)

    if multiple_corpora and any(x is True for x in [multiple_speakers, multiple_queries, 
                                                    multiple_search, multiple_option]):
        from corpkit.corpus import Corpus, Corpora
        if isinstance(corpus, Corpora):
            multiprocess = False
        else:
            corpus = Corpus(corpus)

    if isinstance(multiprocess, int):
        num_cores = multiprocess
    if multiprocess is False:
        num_cores = 1

    # make sure saves are right type
    if save is True:
        raise ValueError('save must be string when multiprocessing.')
    
    # the options that don't change
    d = {'function': 'interrogator',
         'root': root,
         'note': note,
         'denominator': denom}
    
    # add kwargs to query
    for k, v in list(kwargs.items()):
        d[k] = v

    # make a list of dicts to pass to interrogator,
    # with the iterable unique in every one
    ds = []
    if multiple_corpora:
        for index, p in enumerate(corpus):
            name = p.name
            a_dict = dict(d)
            a_dict['corpus'] = p
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name.replace('-parsed', '')
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_queries:
        for index, (name, q) in enumerate(query.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = q
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_speakers:
        for index, name in enumerate(just_speakers):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = [name]
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_search:
        for index, (name, val) in enumerate(search.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = val
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)

    if kwargs.get('conc') is False:
        message = 'Interrogating'
    elif kwargs.get('conc') is True:
        message = 'Interrogating and concordancing'
    elif kwargs.get('conc').lower() == 'only':
        message = 'Concordancing'
    time = strftime("%H:%M:%S", localtime())
    sformat = ''
    if multiple_queries:
        to_it_over = query
    else:
        to_it_over = search
    for i, (k, v) in enumerate(list(to_it_over.items())):
        if isinstance(v, list):
            vformat = ', '.join(v[:5])
            if len(v) > 5:
                vformat += ' ...'
        elif isinstance(v, dict):
            vformat = ''
            for kk, vv in v.items():
                if isinstance(vv, list):
                    if len(vv) > 5:
                        vv = ', '.join(vv[:5]) + ' ...'
                    else:
                        vv = ', '.join(vv)
                vformat += '\n                     %s: %s' % (kk, vv)
        else:
            try:
                vformat = v.pattern
            except AttributeError:
                vformat = v
        sformat += '%s: %s' %(k, vformat)
        if i < len(to_it_over.keys()) - 1:
            sformat += '\n                   '

    if print_info:
        # proper printing for plurals
        # in truth this needs to be revised, it's horrible.
        if num_cores == 1:
            add_es = ''
        else:
            add_es = 'es'
        if multiple_corpora and not multiple_option:
            corplist = "\n              ".join([i.name for i in corpus[:20]])
            if len(corpus) > 20:
                corplist += '\n ... and %d more ...\n' % (len(corpus) - 20)
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s):\n              %s" \
               "\n          Query: %s\n          %s corpus ... \n"  % (time, len(corpus), num_cores, add_es, corplist, sformat, message)))

        elif multiple_queries:
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s): %s" \
               "\n          Queries: %s\n          %s corpus ... \n" % (time, len(query), num_cores,  add_es, corpus.name, sformat, message) ))

        elif multiple_search:
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s): %s" \
               "\n          Queries: %s\n          %s corpus ... \n" % (time, len(list(search.keys())), num_cores, add_es, corpus.name, sformat, message)))

        elif multiple_option:
            print(("\n%s: Beginning %d parallel corpus interrogation%s (multiple options): %s" \
               "\n          Query: %s\n          %s corpus ... \n" % (time, num_cores, add_es.lstrip('e'), corpus.name, sformat,  message) ))

        elif multiple_speakers:
            print(("\n%s: Beginning %d parallel corpus interrogation%s: %s" \
               "\n          Query: %s\n          %s corpus ... \n" % (time, num_cores, add_es.lstrip('e'), corpus.name, sformat, message) ))

    # run in parallel, get either a list of tuples (non-c option)
    # or a dataframe (c option)
    #import sys
    #reload(sys)
    #stdout=sys.stdout
    failed = False
    terminal = False
    used_joblib = False
    #ds = ds[::-1]
    if not root and print_info:
        from blessings import Terminal
        terminal = Terminal()
        print('\n' * (len(ds) - 2))
        for dobj in ds:
            linenum = dobj['paralleling']
            # this try handles nosetest problems in sublime text
            try:
                with terminal.location(0, terminal.height - (linenum + 1)):
                    # this is a really bad idea.
                    thetime = strftime("%H:%M:%S", localtime())
                    num_spaces = 26 - len(dobj['outname'])
                    print('%s: QUEUED: %s' % (thetime, dobj['outname']))
            except:
                pass

    if not root and multiprocess:
        #res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
        try:
            #ds = sorted(ds, key=lambda k: k['paralleling'], reverse = True) 
            res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
            used_joblib = True
        except:
            failed = True
            print('Multiprocessing failed.')
            raise
        if not res:
            failed = True
    else:
        res = []
        for index, d in enumerate(ds):
            d['startnum'] = (100 / denom) * index
            res.append(interrogator(**d))
        try:
            res = sorted([i for i in res if i])
        except:
            pass

    # remove unpicklable bits from query
    from types import ModuleType, FunctionType, BuiltinMethodType, BuiltinFunctionType
    badtypes = (ModuleType, FunctionType, BuiltinFunctionType, BuiltinMethodType)
    qlocs = {k: v for k, v in locs.items() if not isinstance(v, badtypes)}

    if hasattr(qlocs['corpus'], 'name'):
        qlocs['corpus'] = qlocs['corpus'].path
    else:
        qlocs['corpus'] = list([i.path for i in qlocs['corpus']])

    from corpkit.interrogation import Concordance
    if kwargs.get('conc') == 'only':
        concs = pd.concat([x for x in res])
        thetime = strftime("%H:%M:%S", localtime())
        concs = concs.reset_index(drop=True)
        lines = Concordance(concs)
        
        if save:
            lines.save(save, print_info=print_info)

        if print_info:
            print('\n\n%s: Finished! %d results.\n\n' % (thetime, len(concs.index)))

        return lines

    if not all(isinstance(i.results, Series) for i in res):
        out = OrderedDict()
        for interrog, d in zip(res, ds):
            for unpicklable in ['note', 'root']:
                interrog.query.pop(unpicklable, None)
            try:
                out[interrog.query['outname']] = interrog
            except KeyError:
                out[d['outname']] = interrog

        from corpkit.interrogation import Interrodict
        idict = Interrodict(out)
        
        if print_info:
            time = strftime("%H:%M:%S", localtime())
            print("\n\n%s: Finished! Output is a dictionary with keys:\n\n         '%s'\n" % \
                (time, "'\n         '".join(sorted(out.keys()))))

        idict.query = qlocs

        if save:
            idict.save(save, print_info=print_info)

        return idict
    

    # make query and total branch, save, return
    # todo: standardise this so we don't have to guess transposes
    else:
        if multiple_corpora and not mult_corp_are_subs:
            sers = [i.results for i in res]
            out = DataFrame(sers, index=[i.query['outname'] for i in res])
            out = out.reindex_axis(sorted(out.columns), axis=1) # sort cols
            out = out.fillna(0) # nan to zero
            out = out.astype(int) # float to int
            out = out.T            
        else:
            try:
                out = pd.concat([r.results for r in res], axis=1)
                out = out.T
                out.index = [i.query['outname'] for i in res]
            except ValueError:
                return None
            # format like normal
            # this sorts subcorpora, which are cls
            out = out[sorted(list(out.columns))]
            # puts subcorpora in the right place
            if not mult_corp_are_subs:
                out = out.T
            out = out.fillna(0) # nan to zero
            out = out.astype(int)
            if 'c' in show and mult_corp_are_subs:
                out = out.sum()
                out.index = sorted(list(out.index))

        # sort by total
        if isinstance(out, DataFrame):
            out = out[list(out.sum().sort_values(ascending=False).index)]

            # really need to figure out the deal with tranposing!
            if all(x.endswith('.xml') for x in list(out.columns)) \
            or all(x.endswith('.txt') for x in list(out.columns)):
                out = out.T
        out = out.edit(sort_by=sort_by, print_info=False, keep_stats=False, \
                      df1_always_df=kwargs.get('df1_always_df'))
        out.query = qlocs

        if len(out.results.columns) == 1:
            out.results = out.results.sort_index()   
        if kwargs.get('conc') is True:
            concs = pd.concat([x.concordance for x in res], ignore_index=True)
            concs = concs.sort_values(by='c')
            concs = concs.reset_index(drop=True)
            out.concordance = Concordance(concs)
        thetime = strftime("%H:%M:%S", localtime())
        if terminal and print_info:
            with terminal.location(0, terminal.height):
                print('\n\n%s: Finished! %d unique results, %d total.%s' % (thetime, len(out.results.columns), out.totals.sum(), '\n'))
        else:
            if print_info:
                print('\n\n%s: Finished! %d unique results, %d total.%s' % (thetime, len(out.results.columns), out.totals.sum(), '\n'))
        if save:
            out.save(save, print_info = print_info)
        return out
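The Parallel call above expands one options dict per job into keyword arguments with delayed(interrogator)(**x). A minimal sketch of that keyword-dict pattern; interrogate_one and its option values are hypothetical:

from joblib import Parallel, delayed

def interrogate_one(corpus, query, outname, paralleling=0):
    # stand-in worker: return something identifiable per job
    return {"outname": outname, "hits": len(query)}

jobs = [
    {"corpus": "corpus-a", "query": "nouns", "outname": "a", "paralleling": 0},
    {"corpus": "corpus-b", "query": "verbs", "outname": "b", "paralleling": 1},
]
results = Parallel(n_jobs=2)(delayed(interrogate_one)(**opts) for opts in jobs)
print(results)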
Example #44
0
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    num_nf_errors = logger._num_nf_errors
    success_compilation = compile_aligner(logger)
    if qconfig.test and is_emem_aligner():
        success_compilation = check_emem_functionality(logger)
    if not success_compilation:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        return dict(zip(contigs_fpaths, [NucmerStatus.FAILED] * len(contigs_fpaths))), None

    create_nucmer_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if qconfig.memory_efficient:
        threads = 1
    else:
        threads = max(1, qconfig.max_threads // n_jobs)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if not qconfig.splitted_ref:
        statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(delayed(align_and_analyze)(
        is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, threads=threads)
             for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)))
    else:
        if len(contigs_fpaths) >= len(qconfig.splitted_ref) and not qconfig.memory_efficient:
            statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(delayed(align_and_analyze)(
            is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, threads=threads)
                for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)))
        else:
            statuses_results_lengths_tuples = []
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)):
                statuses_results_lengths_tuples.append(align_and_analyze(
                is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath,
                parallel_by_chr=True, threads=qconfig.max_threads))

    # unzipping
    statuses, results, aligned_lengths = [x[0] for x in statuses_results_lengths_tuples], \
                                         [x[1] for x in statuses_results_lengths_tuples], \
                                         [x[2] for x in statuses_results_lengths_tuples]
    reports = []

    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        if statuses[index] == NucmerStatus.OK:
            reports.append(save_result(results[index], report, fname))
        elif statuses[index] == NucmerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    nucmer_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))

    if NucmerStatus.OK in nucmer_statuses.values():
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        if qconfig.draw_plots:
            from . import plotter
            plotter.draw_misassembl_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')
        if qconfig.is_combined_ref:
            save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger)

    oks = list(nucmer_statuses.values()).count(NucmerStatus.OK)
    not_aligned = list(nucmer_statuses.values()).count(NucmerStatus.NOT_ALIGNED)
    failed = list(nucmer_statuses.values()).count(NucmerStatus.FAILED)
    errors = list(nucmer_statuses.values()).count(NucmerStatus.ERROR)
    problems = not_aligned + failed + errors
    all = len(nucmer_statuses)

    logger._num_nf_errors = num_nf_errors + errors

    if oks == all:
        logger.main_info('Done.')
    if oks < all and problems < all:
        logger.main_info('Done for ' + str(all - problems) + ' out of ' + str(all) + '. For the rest, only basic stats are going to be evaluated.')
    if problems == all:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        if not qconfig.test and is_emem_aligner():
            logger.warning('Please rerun QUAST using --test option to ensure that E-MEM aligner works properly.')

    return nucmer_statuses, aligned_lengths_per_fpath
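The function above divides a global thread budget between joblib workers (n_jobs) and the threads each worker may use internally. A small sketch of that split; max_threads and align_one are hypothetical stand-ins for qconfig.max_threads and align_and_analyze:

from joblib import Parallel, delayed

def align_one(name, threads):
    # stand-in worker that would pass `threads` to an external aligner
    return "%s aligned with %d thread(s)" % (name, threads)

assemblies = ["asm1", "asm2", "asm3"]
max_threads = 8
n_jobs = min(len(assemblies), max_threads)      # at most one job per assembly
threads_per_job = max(1, max_threads // n_jobs) # share the remaining budget

messages = Parallel(n_jobs=n_jobs)(
    delayed(align_one)(name, threads_per_job) for name in assemblies)
print(messages)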
Example #45
0
def _bivar_factor_operation(phi1, phi2, operation, n_jobs=1):
    """
    Returns product of two factors.

    Parameters
    ----------
    phi1: factors

    phi2: factors

    operation: M | D
            M: multiplies phi1 and phi2
            D: divides phi1 by phi2
    """
    try:
        from joblib import Parallel, delayed
        use_joblib = True
    except ImportError:
        use_joblib = False

    def err_handler(type, flag):
        raise Exceptions.InvalidValueError(type)

    np.seterrcall(err_handler)
    np.seterr(divide='raise', over='raise', under='raise', invalid='call')

    phi1_vars = list(phi1.variables)
    phi2_vars = list(phi2.variables)
    common_var_list = [var for var in phi1_vars if var in phi2_vars]
    if common_var_list:
        variables = phi1_vars
        variables.extend([var for var in phi2.variables
                         if var not in common_var_list])
        cardinality = list(phi1.cardinality)
        cardinality.extend(phi2.get_cardinality(var) for var in phi2.variables
                           if var not in common_var_list)

        phi1_indexes = [i for i in range(len(phi1.variables))]
        phi2_indexes = [variables.index(var) for var in phi2.variables]
        values = []
        phi1_cumprod = np.delete(np.concatenate(
            (np.array([1]), np.cumprod(phi1.cardinality[::-1])), axis=1)[::-1], 0)
        phi2_cumprod = np.delete(np.concatenate(
            (np.array([1]), np.cumprod(phi2.cardinality[::-1])), axis=1)[::-1], 0)

        if operation == 'M':
            if use_joblib and n_jobs != 1:
                values = Parallel(n_jobs=n_jobs, backend='threading')(
                    delayed(_parallel_helper_m)(index, phi1, phi2,
                                                phi1_indexes, phi2_indexes,
                                                phi1_cumprod, phi2_cumprod)
                    for index in product(*[range(card) for card in cardinality]))
            else:
                # TODO: @ankurankan Make this cleaner
                indexes = np.array(list(map(list, product(*[range(card) for card in cardinality]))))
                values = (phi1.values[np.sum(indexes[:, phi1_indexes] * phi1_cumprod, axis=1).ravel()] *
                          phi2.values[np.sum(indexes[:, phi2_indexes] * phi2_cumprod, axis=1).ravel()])

        elif operation == 'D':
            if use_joblib and n_jobs != 1:
                values = Parallel(n_jobs, backend='threading')(
                    delayed(_parallel_helper_d)(index, phi1, phi2,
                                                phi1_indexes, phi2_indexes,
                                                phi1_cumprod, phi2_cumprod)
                    for index in product(*[range(card) for card in cardinality]))
            else:
                # TODO: @ankurankan Make this cleaner and handle case of division by zero
                for index in product(*[range(card) for card in cardinality]):
                    index = np.array(index)
                    try:
                        values.append(phi1.values[np.sum(index[phi1_indexes] * phi1_cumprod)] /
                                      phi2.values[np.sum(index[phi2_indexes] * phi2_cumprod)])
                    except (Exceptions.InvalidValueError, FloatingPointError):
                        # zero division error should return 0 if both operands
                        # equal to 0. Ref Koller page 365, Fig 10.7
                        values.append(0)

        phi = Factor(variables, cardinality, values)
        return phi
    else:
        values = np.zeros(phi1.values.shape[0] * phi2.values.shape[0])
        phi2_shape = phi2.values.shape[0]
        if operation == 'M':
            for value_index in range(phi1.values.shape[0]):
                values[value_index * phi2_shape: (value_index + 1) * phi2_shape] = (phi1.values[value_index] *
                                                                                    phi2.values)
        elif operation == 'D':
            # reference: Koller Definition 10.7
            raise ValueError("Factors Division not defined for factors with no"
                             " common scope")
        variables = phi1_vars + phi2_vars
        cardinality = list(phi1.cardinality) + list(phi2.cardinality)
        phi = Factor(variables, cardinality, values)
        return phi
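The parallel branches above use backend='threading', which lets workers share the factor arrays instead of pickling them; this pays off when the per-task work is NumPy-heavy, since NumPy releases the GIL for many array operations. A minimal sketch with a hypothetical per-index function:

import numpy as np
from joblib import Parallel, delayed

values = np.random.rand(200, 200)

def row_product(index, matrix):
    # toy stand-in for the per-index factor computation
    return float(np.prod(matrix[index] + 1.0))

out = Parallel(n_jobs=4, backend="threading")(
    delayed(row_product)(i, values) for i in range(values.shape[0]))
print(len(out))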
Example #46
0
    def getAllPredictions(self, mode='multi'):
        logging.basicConfig(filename='results.log', level=logging.INFO)
        #predictor.displayNumbers(X,y_labels)

        ## if mode is multiprocessing, individual algorithms must run with a single job, otherwise the joblib
        ## library would throw an exception. If mode is sequential, individual algorithms can run in parallel.
        if mode == 'multi':
            n_jobs = 1
        else:
            n_jobs = -1

        models = []

        ## SVM configs
        svm_C = [0.1, 1, 10, 100]
        svm_gamma = ['auto', 0.03, 0.003]
        svm_kernel = ['rbf', 'linear', 'poly', 'sigmoid']
        svm_parameters = [(x, y, z) for x in svm_C for y in svm_gamma for z in svm_kernel]

        for params in svm_parameters:
            models.append(['SVM',params, svm.SVC(gamma=params[1], C=params[0], kernel=params[2])])

        ## random forest configs
        rf_nestimators = [10, 100, 300, 500]
        rf_max_features = ['auto', 'sqrt', 'log2']
        rf_max_depth =  [None, 5]
        rf_parameters = [(x, y, z) for x in rf_nestimators for y in rf_max_features for z in rf_max_depth]

        for params in rf_parameters:
            models.append(['RandomForest', params, RandomForestClassifier(n_estimators=params[0],
                                                                    max_features=params[1],
                                                                    max_depth=params[2], n_jobs = n_jobs)])

        ## adaboost configs
        ab_nestimators = [10, 100, 300, 500]
        ab_learning_rate = [0.1, 0.3, 1]
        ab_base_estimator = [DecisionTreeClassifier(max_depth=2, max_features ='auto'),
                             DecisionTreeClassifier(max_depth=5, max_features ='auto'),
                             DecisionTreeClassifier(max_features='auto')]
        ab_parameters = [(x, y, z) for x in ab_nestimators for y in ab_learning_rate for z in ab_base_estimator]

        for params in ab_parameters:
            models.append(['AdaBoost', params, AdaBoostClassifier(n_estimators=params[0], learning_rate=params[1],
                                                            base_estimator=params[2])])

        ## decisiontrees configs
        dt_max_depth = [None, 2, 5]
        dt_max_features = ['auto', 'sqrt', 'log2']
        dt_parameters = [(x, y) for x in dt_max_depth for y in dt_max_features]

        for params in dt_parameters:
            models.append(['DecisionTrees', params,
                           DecisionTreeClassifier(max_depth=params[0], max_features=params[1])])

        ## MultinomialNB configs
        mnb_alpha = [0.1, 0.3, 1]

        for params in mnb_alpha:
            models.append(['MultinomialNB', params, MultinomialNB(alpha=params)])

        ## GaussianNB configs
        models.append(['GaussianNB', '', GaussianNB()])

        ## LogisticRegression configs
        lr_C = [0.1, 1, 10, 100]
        lr_multi_class = ['ovr']
        lr_parameters = [(x, y) for x in lr_C for y in lr_multi_class]

        for params in lr_parameters:
            models.append(['LogisticRegression', params,
                           LogisticRegression(C=params[0], multi_class=params[1], n_jobs= n_jobs)])

        ## KNeighborsClassifier configs
        knn_n_neighbors = [3, 5, 7]
        knn_p = [1, 2, 3]
        knn_algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
        knn_parameters = [(x, y, z) for x in knn_n_neighbors for y in knn_p for z in knn_algorithm]

        for params in knn_parameters:
            models.append(['KNeighbors', params,
                           KNeighborsClassifier(n_neighbors=params[0], p=params[1], algorithm=params[2], n_jobs=n_jobs)])


        ## LinearDiscriminantAnalysis configs
        lda_solver = ['svd', 'lsqr', 'eigen']
        lda_n_components = [3, 5, 8]
        lda_parameters = [(x, y) for x in lda_solver for y in lda_n_components]

        for params in lda_parameters:
            models.append(['LinearDiscriminantAnalysis', params,
                           LinearDiscriminantAnalysis(solver=params[0], n_components=params[1])])

        ## run models in multiprocessing or sequential way
        results = []
        if mode == 'multi':
            num_cores = multiprocessing.cpu_count()
            results = Parallel(n_jobs=num_cores)\
                (delayed(self.predictor.predict)(models[i][2], self.X, self.y_labels) for i in range(len(models)))
            results_all = zip(models, results)
        else:
            for i in range(len(models)):
                results.append(self.predictor.predict(models[i][2], self.X, self.y_labels))
            results_all = zip(models, results)

        sorted_results = sorted(results_all, key= lambda item: item[1], reverse = True)
        [logging.info(x) for x in sorted_results]
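The method above forces each estimator's own n_jobs to 1 whenever the outer model loop runs under joblib, so the two levels of parallelism do not compete for the same cores. A small sketch of that rule with hypothetical random data and a reduced model list:

import multiprocessing
import numpy as np
from joblib import Parallel, delayed
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

X = np.random.rand(200, 10)
y = np.random.randint(0, 2, size=200)

# keep inner n_jobs at 1: the outer Parallel already uses all cores
models = [
    RandomForestClassifier(n_estimators=50, n_jobs=1),
    LogisticRegression(max_iter=200, n_jobs=1),
]

num_cores = multiprocessing.cpu_count()
scores = Parallel(n_jobs=num_cores)(
    delayed(cross_val_score)(model, X, y, cv=3) for model in models)
print(scores)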
Example #47
0
def get_isochrone_grid(grid_feh,
                       grid_logt,
                       model='parsec12s',
                       phot='sloan',
                       Zsun=0.0152,
                       parflag=True,
                       n_jobs=8,
                       **kwargs):
    """ get a list of isochrones using EZPADOVA

    Parameters
    ----------
    grid_feh: array
        [Fe/H] grid
    grid_logt: array
        logt grid
    model: string
        default is 'parsec12s'
    phot: string
        default is 'sloan'
    Zsun: float
        default is 0.0152
    parflag: bool
        default is True
        if True, use JOBLIB to get isochrones in parallel
    n_jobs: int
        if parflag is True, specify number of jobs in JOBLIB

    Returns
    -------
    vgrid_feh, vgrid_logt, isoc_list, grid_list

    """
    # validate grid
    vgrid_feh, vgrid_logt = _find_valid_grid(grid_feh, grid_logt, Zsun=Zsun)

    # construct list
    grid_list = []
    for grid_feh_ in vgrid_feh:
        for grid_logt_ in vgrid_logt:
            grid_list.append((10.**grid_logt_, 10.**grid_feh_*Zsun))

    print('@Cham: you have requested %s isochrones!' % len(grid_list))
    print('@Cham: -----------------------------------------------------------')

    # get isochrones
    if parflag:
        # get isochrones in parallel
        isoc_list = Parallel(n_jobs=n_jobs, verbose=True)(delayed(cmd.get_one_isochrone)(
            grid_list_[0], grid_list_[1], model=model, phot=phot, **kwargs) for grid_list_ in grid_list)
    else:
        # get isochrones sequentially
        isoc_list = []
        for i in range(len(grid_list)):
            grid_list_ = grid_list[i]
            print('@Cham: sending request for isochrone (logt=%s, [Fe/H]=%s) (t=%s, Z=%s) [%s/%s]...'
                  % (np.log10(grid_list_[0]), np.log10(grid_list_[1]/Zsun), grid_list_[0], grid_list_[1], i+1, len(grid_list)))
            isoc_list.append(
                Table(cmd.get_one_isochrone(grid_list_[0], grid_list_[1], model=model, phot=phot, **kwargs).data))
    print('@Cham: got all requested isochrones!')
    print('@Cham: -----------------------------------------------------------')
    print('@Cham: colnames are:')
    print(isoc_list[0].colnames)
    print('@Cham: -----------------------------------------------------------')
    return vgrid_feh, vgrid_logt, isoc_list, grid_list
Example #48
0
def mab_eval(bandit, T, pol_cfg, N_trials=100, seed=None, parallel=False):
    if seed is not None:
        np.random.seed(seed)

    all_policies = extract_policies(**pol_cfg)
    policies = []
    for p in all_policies:
        if p.name in pol_cfg['names']:
            policies.append(p)
    names = [p.name for p in policies]

    arm_dists = [bandit.resample_arms() for _ in range(N_trials)]
    results = []
    print 'Evaluating Policies {}'.format(names)
    if parallel == 1:                   

        rc = ipp.Client(profile='ssh')

        dv = rc[:]
        n_clients = len(dv)
        with dv.sync_imports():
            import mab
        v = rc.load_balanced_view()
        
        results = v.map(eval_helper, arm_dists, [bandit.arm_prior] * N_trials, 
                        [T]*N_trials, [pol_cfg]*N_trials, [frozenset(names)] * N_trials,
                        [seed + inum for inum in range(N_trials)])

        start = time.time()        
        while rc.outstanding:
            try:
                rc.wait(rc.outstanding, 1e-1)
            except ipp.TimeoutError:
                # ignore timeouterrors
                pass
            n_complete = N_trials - len(rc.outstanding)
            if n_complete > 0:
                est_remaining = ((time.time() - start) / n_complete) * len(rc.outstanding)
            else:
                est_remaining = 'No Estimate'
            sys.stdout.write('\rFinished {} / {} jobs\tEstimated Time Remaining: {}'.format(n_complete, N_trials, est_remaining))
            sys.stdout.flush()
    elif parallel == 2:
        from joblib import Parallel, delayed
        results = Parallel(n_jobs=7, verbose=50)(delayed(_eval_helper)(
                ad, bandit.arm_prior, T, pol_cfg, names, seed + inum) for 
                inum, ad in enumerate(arm_dists))
    else:

        for inum, ad in enumerate(arm_dists):
            results.append(eval_helper(ad, bandit.arm_prior, T, pol_cfg, names, seed=seed+inum))
            sys.stdout.write("{} / {}\t".format(inum, N_trials))
            sys.stdout.flush()
    means = []
    variances = []
    avg_err = []
    discounted_mean = []
    for j in range(len(policies)):
        try:
            regrets, choices, discounted = results[0].get()
        except CompositeError, e:
            print e
            import IPython; IPython.embed()

        regrets = regrets[j]
        choices = choices[j]
        discounted = discounted[j]
        errors = np.array(choices != bandit.ibest, dtype=np.int)
        for i in range(1, N_trials):
            regrets_i, choices, discounted_i = results[i].get()
            regrets = np.c_[regrets, regrets_i[j]]
            errors += (choices[j] != bandit.ibest)
            discounted += discounted_i[j]
        discounted /= N_trials
        discounted_mean.append(discounted)
        means.append(np.mean(regrets, axis=1))
        variances.append(np.var(regrets, axis=1))
        avg_err.append(errors / N_trials)
Example #49
0
def glm(conditions, onsets, TR, Y, drifts=None, basis='3hrf', mode='r1glm',
        hrf_length=20, oversample=5, 
        rtol=1e-8, verbose=False, maxiter=500, callback=None,
        method='L-BFGS-B', n_jobs=1, hrfs=None,
        return_design_matrix=False):
    """
    Perform a GLM from BOLD signal, given the conditons, onset,
    TR (repetition time of the scanner) and the BOLD signal.

    This method is able to fit a variety of models, available
    through the `mode` keyword. These are:

        - glm: standard GLM
        - glms: GLM with separate designs
        - r1glm: Rank-1 GLM
        - r1glms: Rank-1 GLM with separate designs

    basis:

        - hrf: single element basis
        - 3hrf: basis with 3 elements
        - fir: basis with hrf_length elements (in multiples of TR)

    **Note** the output parameters are not normalized. 
    Rank-1 models are specified up to a constant 
    term between the betas and the HRF. This implies that some
    normalization must be done prior to interpreting the activation
    coefficients. Typically the HRF is normalized to 
    have unit amplitude and to correlate positively with a 
    reference HRF.


    Parameters
    ----------

    conditions: array-like, shape (n_trials)
        array of conditions

    onsets: array-like, shape (n_trials)
        array of onsets

    TR: float
        Repetition Time, the delay between two successive
        acquisitions of the same image.

    Y : array-like, shape (n_scans, n_voxels)
        Time-series vector.

    mode: {'r1glm', 'r1glms', 'glms', 'glm'}
        Different GLM models.

    rtol : float
        Relative tolerance for stopping criterion.

    maxiter : int
        maximum number of iterations

    verbose : {0, 1, 2}
        Different levels of verbosity

    n_jobs: int
        Number of CPUs to use. Use -1 to use all available CPUs.

    method: {'L-BFGS-B', 'TNC'}
        Different algorithmic solvers, only used for 'r1*' modes.
        All should yield the same result but their efficiency might vary.

    Returns
    -------
    U : array
        Estimated HRF. Will be of shape (basis_len, n_voxels) for rank-1
        methods and of (basis_len, n_conditions, n_voxels) for the other
        methods.

    V : array, shape (p, n_voxels)
        Estimated activation coefficients (beta-map).

    dmtx: array,
        Design matrix. Only returned if return_design_matrix=True

    """
    if not mode in ('glm', 'r1glm', 'r1glms', 'glms'):
        raise NotImplementedError
    conditions = np.asarray(conditions)
    onsets = np.asarray(onsets)
    if conditions.size != onsets.size:
        raise ValueError('array conditions and onsets should have the same size')
    Y = np.asarray(Y)
    n_scans = Y.shape[0]
    verbose = int(verbose)
    if verbose > 0:
        print('.. creating design matrix ..')
    if drifts is None:
        drifts = np.ones((n_scans, 1))

    X_design, Q = create_design_matrix(
        conditions, onsets, TR, n_scans, basis, oversample, hrf_length)
    if verbose > 0:
        print('.. done creating design matrix ..')

    if Y.ndim == 1:
        Y = Y.reshape((-1, 1))
    n_task = Y.shape[1]

    size_u = Q.shape[1]
    size_v = X_design.shape[1] // size_u

    if mode == 'glms':
        U, V = utils.glms_from_glm(
            X_design, Q, n_jobs, False, Y)
    elif mode == 'glm':
        U, V = utils.glm(
            X_design, Q, Y, convolve=False)
    elif mode in ('r1glm', 'r1glms'):
        U = np.zeros((size_u, n_task))
        V = np.zeros((size_v, n_task))
        if verbose > 0:
            print('.. computing initialization ..')
        X_design_canonical, Q_canonical = create_design_matrix(conditions, onsets, TR,
            n_scans, [hrf.spmt], oversample, hrf_length)
        X_design_canonical = np.concatenate(
            (X_design_canonical, drifts), axis=1)
        V_init = linalg.lstsq(X_design_canonical, Y)[0]
        U_init = np.tile(linalg.lstsq(Q, Q_canonical)[0], n_task)
        if mode == 'r1glm':
            W_init = np.concatenate((U_init, V_init))
        else:
            # XXX TODO intercept
            W_init = np.concatenate((U_init, V_init[:-1], V_init[:-1]))
        if verbose > 0:
            print('.. done initialization ..')

        if n_jobs == -1:
            n_jobs = cpu_count()
        Y_split = np.array_split(Y, n_jobs, axis=1)
        W_init_split = np.array_split(W_init, n_jobs, axis=1)
        X_design = sparse.csr_matrix(X_design)

        out = Parallel(n_jobs=n_jobs)(
            delayed(rank_one)(
                X_design, y_i, size_u, w_i, drifts=drifts, callback=callback, maxiter=maxiter,
                method=method, rtol=rtol, verbose=verbose, mode=mode, hrfs=hrfs, basis=basis)
            for y_i, w_i in zip(Y_split, W_init_split))

        counter = 0
        for tmp in out:
            u, v = tmp
            u = u.T
            v = v.T
            for i in range(len(u)):
                U[:, counter] = u[i]
                V[:, counter] = v[i]
                counter += 1

        raw_U = U.copy()
        # normalize
    if mode in ('r1glm',) and basis == '3hrf':
        xx = np.linspace(0, hrf_length * TR)
        generated_hrfs = U[0] * hrf.spmt(xx)[:, None] + \
            U[1] * hrf.dspmt(xx)[:, None] + U[2] * hrf.ddspmt(xx)[:, None]
        sign = np.sign(np.dot(generated_hrfs.T, hrf.spmt(xx)))
        norm = np.abs(generated_hrfs).max(0)
        U = U * sign / norm
        V = V * sign * norm
    elif mode in ('r1glm',) and basis == '2hrf':
        xx = np.linspace(0, hrf_length * TR)
        generated_hrfs = U[0] * hrf.spmt(xx)[:, None] + \
            U[1] * hrf.dspmt(xx)[:, None]
        sign = np.sign(np.dot(generated_hrfs.T, hrf.spmt(xx)))
        norm = np.abs(generated_hrfs).max(0)
        U = U * sign / norm
        V = V * sign * norm
    elif mode == 'r1glm' and basis == 'fir':
        xx =  np.arange(0, TR * hrf_length, TR)
        sign = np.sign(np.dot(U.T, hrf.spmt(xx)))
        norm = np.abs(U).max(0)
        U = U * sign / norm
        V = V * sign * norm
    out = [U, V]
    if return_design_matrix:
        out.append(X_design.toarray())
    return out
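The rank-1 branch above splits the voxel dimension into n_jobs blocks with np.array_split, fits each block in its own worker, and then reassembles the per-voxel coefficients. A minimal sketch of that split-and-recombine pattern; fit_block is a hypothetical stand-in for rank_one:

import numpy as np
from joblib import Parallel, delayed

def fit_block(block):
    # toy stand-in for rank_one(): one coefficient per column of the block
    return block.mean(axis=0)

n_jobs = 4
Y = np.random.rand(100, 37)             # n_scans x n_voxels
Y_split = np.array_split(Y, n_jobs, axis=1)

partial = Parallel(n_jobs=n_jobs)(delayed(fit_block)(b) for b in Y_split)
coefs = np.concatenate(partial)         # back to one value per voxel
assert coefs.shape == (Y.shape[1],)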
Example #50
0
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    success_compilation = compile_aligner(logger)
    if not success_compilation:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        return dict(zip(contigs_fpaths, [NucmerStatus.FAILED] * len(contigs_fpaths))), None

    if qconfig.draw_plots:
        compile_gnuplot(logger, only_clean=False)

    num_nf_errors = logger._num_nf_errors
    create_nucmer_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    threads = max(1, qconfig.max_threads // n_jobs)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if not qconfig.splitted_ref and not qconfig.memory_efficient:
        statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(delayed(align_and_analyze)(
        is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, threads=threads)
             for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)))
    else:
        if len(contigs_fpaths) >= len(qconfig.splitted_ref) and not qconfig.memory_efficient:
            statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(delayed(align_and_analyze)(
            is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, threads=threads)
                for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)))
        else:
            statuses_results_lengths_tuples = []
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)):
                statuses_results_lengths_tuples.append(align_and_analyze(
                is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath,
                parallel_by_chr=True, threads=qconfig.max_threads))

    # unzipping
    statuses, results, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs =\
        [[x[i] for x in statuses_results_lengths_tuples] for i in range(5)]
    reports = []

    nucmer_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))
    misc.contigs_aligned_lengths = dict(zip(contigs_fpaths, aligned_lengths_by_contigs))

    if NucmerStatus.OK in nucmer_statuses.values():
        if qconfig.is_combined_ref:
            save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger)

    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        if statuses[index] == NucmerStatus.OK:
            reports.append(save_result(results[index], report, fname, reference))
        elif statuses[index] == NucmerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    if NucmerStatus.OK in nucmer_statuses.values():
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        from . import plotter
        if qconfig.draw_plots:
            plotter.draw_misassemblies_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')
        if qconfig.draw_plots or qconfig.html_report:
            misassemblies_in_contigs = dict((contigs_fpaths[i], misassemblies_in_contigs[i]) for i in range(len(contigs_fpaths)))
            plotter.frc_plot(dirname(output_dir), reference, contigs_fpaths, misc.contigs_aligned_lengths, misassemblies_in_contigs,
                             join(output_dir, 'misassemblies_frcurve_plot'), 'misassemblies')

    oks = list(nucmer_statuses.values()).count(NucmerStatus.OK)
    not_aligned = list(nucmer_statuses.values()).count(NucmerStatus.NOT_ALIGNED)
    failed = list(nucmer_statuses.values()).count(NucmerStatus.FAILED)
    errors = list(nucmer_statuses.values()).count(NucmerStatus.ERROR)
    problems = not_aligned + failed + errors
    all = len(nucmer_statuses)

    logger._num_nf_errors = num_nf_errors + errors

    if oks == all:
        logger.main_info('Done.')
    if oks < all and problems < all:
        logger.main_info('Done for ' + str(all - problems) + ' out of ' + str(all) + '. For the rest, only basic stats are going to be evaluated.')
    if problems == all:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')

    return nucmer_statuses, aligned_lengths_per_fpath
Example #51
0
import pickle
import pdb
import numpy as np

pickle.dump(lis, open("test-windows.pkl", "wb"))
#lis = pickle.load(open("test-windows.pkl", "rb"))


##Finding-windows
k = len(lis) // 10  # chunk size, giving roughly 10 chunks/jobs
iterator = range(0, len(lis) - k, k)
from joblib import Parallel, delayed
from parr_test import myfunc

pdb.set_trace()
results = Parallel(n_jobs=-1)(delayed(myfunc)(lis[i:i+k]) for i in iterator)


if len(lis[iterator[-1]+k:]) >= 2:
    results.append(myfunc(lis[iterator[-1]+k:]))

detects = np.concatenate(results)

##Plotting result
windows=[]
for w in x_list:
    for h in y_list:
        windows.append((h,w))

ind = np.where(detects==1)[0]

ws1=[]
for i in ind:
    ws1.append(windows[i])
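The snippet above slices the list into fixed-size chunks, scores the chunks in parallel, and appends the leftover tail sequentially. A self-contained sketch of that chunking pattern; score_chunk and items are hypothetical stand-ins for myfunc and lis:

import numpy as np
from joblib import Parallel, delayed

def score_chunk(chunk):
    # toy stand-in for myfunc(): one flag per item in the chunk
    return np.array([len(str(x)) % 2 for x in chunk])

items = list(range(103))
k = max(1, len(items) // 10)            # chunk size (integer division)
starts = range(0, len(items) - k, k)

results = Parallel(n_jobs=-1)(delayed(score_chunk)(items[i:i + k]) for i in starts)
tail = items[starts[-1] + k:]           # handle the remainder sequentially
if len(tail) >= 2:
    results.append(score_chunk(tail))

detects = np.concatenate(results)
print(detects.shape)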
Example #52
0
def mab_eval(bandit, T, pol_cfg, N_trials=100, seed=None, parallel=False):
    if seed is not None:
        np.random.seed(seed)
        seed += 1

    all_policies = extract_policies(**pol_cfg)
    policies = []
    for p in all_policies:
        if p.name in pol_cfg['names']:
            policies.append(p)
    names = [p.name for p in policies]

    arm_dists = [bandit.resample_arms() for _ in range(N_trials)]
    # print [U for (ad, U) in arm_dists]
    results = []
    if parallel == 1:

        rc = ipp.Client(profile='ssh')

        dv = rc[:]
        n_clients = len(dv)
        with dv.sync_imports():
            import mab
        v = rc.load_balanced_view()
    
        print 'Evaluating Policies {}'.format(names)
        
        results = v.map(eval_helper, arm_dists, [bandit.theta_prior] * N_trials, 
                        [T]*N_trials, [pol_cfg]*N_trials, [frozenset(names)] * N_trials,
                        [(seed + inum) for inum in range(N_trials)])

        start = time.time()
        rate = 0
        n_complete = 0

        while rc.outstanding:
            try:
                rc.wait(rc.outstanding, 1e-1)
            except ipp.TimeoutError:
                # ignore timeouterrors
                pass
            if n_complete < N_trials - len(rc.outstanding):
                n_complete = N_trials - len(rc.outstanding)
                rate = ((time.time() - start) / n_complete)
            if n_complete > 0:
                est_remaining = rate * len(rc.outstanding)
            else:
                est_remaining = 'No Estimate'
            sys.stdout.write(
                '\rFinished {} / {} jobs\tEstimated Time Remaining: {:.4}'.format(
                    n_complete, N_trials, est_remaining))
            sys.stdout.flush()
    elif parallel == 2:
        from joblib import Parallel, delayed
        print 'Evaluating Policies {}'.format(names)
        results = Parallel(n_jobs=7, verbose=50)(delayed(_eval_helper)(
                ad, bandit.theta_prior, T, pol_cfg, names, seed + inum) for 
                inum, ad in enumerate(arm_dists))
    else:
        for inum, ad in enumerate(arm_dists):            
            results.append(
                eval_helper(
                    ad, bandit.theta_prior, T, pol_cfg, names, seed=seed+inum))
            sys.stdout.write("{} / {}\t".format(inum, N_trials))
            sys.stdout.flush()
    means = []
    variances = []
    avg_err = []
    discounted_mean = []
    try:
        if type(results[0]) == list:
            results = [x[0] for x in results]
    except CompositeError, e:
        print e
        import IPython; IPython.embed()
Example #53
0
def pmultiquery(corpus, 
    search,
    show = 'words',
    query = 'any', 
    sort_by = 'total', 
    quicksave = False,
    multiprocess = 'default', 
    function_filter = False,
    just_speakers = False,
    root = False,
    note = False,
    print_info = True,
    **kwargs):
    """Parallel process multiple queries or corpora.

    This function is used by interrogator() if:

        a) path is a list of paths
        b) query is a dict of named queries
        c) just speakers == 'each', or a list of speakers with len(list) > 1
    
    This function needs joblib 0.8.4 or above in order to run properly.
    There's no reason to call it yourself."""
    
    import collections
    import os
    import pandas as pd
    import collections
    from collections import namedtuple
    from time import strftime, localtime
    import corpkit
    from interrogator import interrogator
    from editor import editor
    from other import save
    from interrogation import Interrogation
    try:
        from joblib import Parallel, delayed
    except:
        pass
        #raise ValueError('joblib, the module used for multiprocessing, cannot be found. ' \
        #                 'Install with:\n\n        pip install joblib')
    import multiprocessing

    def best_num_parallel(num_cores, num_queries):
        import corpkit
        """decide how many parallel processes to run

        the idea, more or less, is to balance the load when possible"""
        if num_queries <= num_cores:
            return num_queries
        if num_queries > num_cores:
            if (num_queries / num_cores) == num_cores:
                return int(num_cores)
            if num_queries % num_cores == 0:
                try:
                    return max([int(num_queries / n) for n in range(2, num_cores) if int(num_queries / n) <= num_cores])   
                except ValueError:
                    return num_cores
            else:
                import math
                if (float(math.sqrt(num_queries))).is_integer():
                    square_root = math.sqrt(num_queries)
                    if square_root <= num_queries / num_cores: 
                        return int(square_root)    
        return num_cores

    num_cores = multiprocessing.cpu_count()

    # what is our iterable? ...
    multiple_option = False
    multiple_queries = False
    multiple_speakers = False
    multiple_corpora = False
    multiple_search = False
    mult_corp_are_subs = False
    denom = 1

    if hasattr(corpus, '__iter__'):
        multiple_corpora = True
        num_cores = best_num_parallel(num_cores, len(corpus))
        denom = len(corpus)
        if all(c.__class__ == corpkit.corpus.Subcorpus for c in corpus):
            mult_corp_are_subs = True
    elif (type(query) == list or type(query) == dict) and not hasattr(search, '__iter__'):
            multiple_queries = True
            num_cores = best_num_parallel(num_cores, len(query))
            denom = len(query)
    elif hasattr(search, '__iter__') and type(search) != dict:
        multiple_search = True
        num_cores = best_num_parallel(num_cores, len(list(search.keys())))
        denom = len(list(search.keys()))
    elif hasattr(function_filter, '__iter__'):
        multiple_option = True
        num_cores = best_num_parallel(num_cores, len(list(function_filter.keys())))
        denom = len(list(function_filter.keys()))
    elif just_speakers:
        from build import get_speaker_names_from_xml_corpus
        multiple_speakers = True
        if just_speakers == 'each' or just_speakers == ['each']:
            just_speakers = get_speaker_names_from_xml_corpus(corpus.path)
        if len(just_speakers) == 0:
            print('No speaker name data found.')
            return
        num_cores = best_num_parallel(num_cores, len(just_speakers))
        denom = len(just_speakers)
        
    if type(multiprocess) == int:
        num_cores = multiprocess
    if multiprocess is False:
        num_cores = 1

    # make sure quicksaves are right type
    if quicksave is True:
        raise ValueError('quicksave must be string when using pmultiquery.')
    
    # the options that don't change
    d = {
         #'paralleling': True,
         'function': 'interrogator',
         'root': root,
         'note': note,
         'denominator': denom}
    
    # add kwargs to query
    for k, v in list(kwargs.items()):
        d[k] = v

    # make a list of dicts to pass to interrogator,
    # with the iterable unique in every one
    ds = []
    if multiple_corpora:
        for index, p in enumerate(corpus):
            name = p.name
            a_dict = dict(d)
            a_dict['corpus'] = p
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name.replace('-parsed', '')
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_queries:
        for index, (name, q) in enumerate(query.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = q
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_option:
        for index, (name, q) in enumerate(function_filter.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['function_filter'] = q
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_speakers:
        for index, name in enumerate(just_speakers):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = [name]
            a_dict['function_filter'] = function_filter
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_search:
        for index, (name, val) in enumerate(search.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = val
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['function_filter'] = function_filter
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)

    # guard against do_concordancing being missing or oddly typed
    do_conc = kwargs.get('do_concordancing', False)
    if do_conc is True:
        message = 'Interrogating and concordancing'
    elif str(do_conc).lower() == 'only':
        message = 'Concordancing'
    else:
        message = 'Interrogating'
    time = strftime("%H:%M:%S", localtime())
    sformat = ''
    if isinstance(search, dict):
        for i, (k, v) in enumerate(list(search.items())):
            if type(v) == list:
                vformat = ', '.join(v[:5])
                if len(v) > 5:
                    vformat += ' ...'
            else:
                vformat = v
            sformat += '%s: %s' % (k, vformat)
            if i < len(search.keys()) - 1:
                sformat += '\n                  '
    else:
        sformat = str(search)

    if multiple_corpora and not multiple_option:
        corplist = "\n              ".join([i.name for i in corpus[:20]])
        if len(corpus) > 20:
            corplist += '\n ... and %d more ...\n' % (len(corpus) - 20)
        print(("\n%s: Beginning %d corpus interrogations (in %d parallel processes):\n              %s" \
           "\n          Query: '%s'\n          %s corpus ... \n"  % (time, len(corpus), num_cores, corplist, sformat, message)))

    elif multiple_queries:
        qnames = list(query.values()) if isinstance(query, dict) else list(query)
        print(("\n%s: Beginning %d corpus interrogations (in %d parallel processes): %s" \
           "\n          Queries: '%s'\n          %s corpus ... \n" % (time, len(query), num_cores, corpus.name, "', '".join(str(q) for q in qnames), message)))

    elif multiple_search:
        print(("\n%s: Beginning %d corpus interrogations (in %d parallel processes): %s" \
           "\n          Queries: '%s'\n          %s corpus ... \n" % (time, len(list(search.keys())), num_cores, corpus.name, str(list(search.values())), message)))

    elif multiple_option:
        print(("\n%s: Beginning %d parallel corpus interrogations (multiple options): %s" \
           "\n          Query: '%s'\n          %s corpus ... \n" % (time, num_cores, corpus.name, sformat, message) ))

    elif multiple_speakers:
        print(("\n%s: Beginning %d parallel corpus interrogations: %s" \
           "\n          Query: '%s'\n          %s corpus ... \n" % (time, num_cores, corpus.name, sformat, message) ))

    # run in parallel, get either a list of tuples (non-c option)
    # or a dataframe (c option)
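    # (each dict in ds is unpacked as keyword arguments for one interrogator()
    #  call, so every corpus/query/speaker becomes an independent job, and its
    #  'paralleling' index picks the terminal line used to report its progress)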
    #import sys
    #reload(sys)
    #stdout=sys.stdout
    failed = False
    terminal = False
    used_joblib = False
    #ds = ds[::-1]
    if not root:
        from blessings import Terminal
        terminal = Terminal()
        print('\n' * (len(ds) - 2))
        for dobj in ds:
            linenum = dobj['paralleling']
            # this try handles nosetest problems in sublime text
            try:
                with terminal.location(0, terminal.height - (linenum + 1)):
                    # this is a really bad idea.
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: QUEUED: %s' % (thetime, dobj['outname']))

            except:
                pass

    if not root and multiprocess:
        #res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
        try:
            #ds = sorted(ds, key=lambda k: k['paralleling'], reverse = True) 
            res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
            used_joblib = True
        except:
            failed = True
            print('Multiprocessing failed.')
            raise
        if not res:
            failed = True
    else:
        res = []
        for index, d in enumerate(ds):
            d['startnum'] = (100 / denom) * index
            res.append(interrogator(**d))
        try:
            res = sorted(res)
        except:
            pass

    # multiprocessing way
    #from multiprocessing import Process
    #from interrogator import interrogator
    #jobs = []
    ##for d in ds:
    ##    p = multiprocessing.Process(target=interrogator, kwargs=(**d,))
    ##    jobs.append(p)
    ##    p.start()
    ##    while p.is_alive():
    ##        import time
    ##        time.sleep(2)
    ##        if root:
    ##            root.update()
    #result_queue = multiprocessing.Queue()
    #
    #for d in ds:
    #funs = [interrogator(result_queue, **kwargs) for kwargs in ds]
    #jobs = [multiprocessing.Process(mc) for mc in funs]
    #for job in jobs: job.start()
    #for job in jobs: job.join()
    #results = [result_queue.get() for mc in funs]

    import corpkit
    from interrogation import Concordance
    if kwargs.get('do_concordancing') == 'only':
        concs = pd.concat([x for x in res])
        thetime = strftime("%H:%M:%S", localtime())
        print('\n\n%s: Finished! %d results.\n\n' % (thetime, len(concs.index)))
        return Concordance(concs)

    from collections import OrderedDict
    if not all(type(i.results) == pd.core.series.Series for i in res):
        out = OrderedDict()
        for interrog, d in zip(res, ds):
            for unpicklable in ['note', 'root']:
                interrog.query.pop(unpicklable, None)
            out[interrog.query['outname']] = interrog
    
        if quicksave:
            fullpath = os.path.join('saved_interrogations', quicksave)
            while os.path.isdir(fullpath):
                selection = input("\nSave error: %s already exists in %s.\n\nType 'o' to overwrite, or enter a new name: " % (quicksave, 'saved_interrogations'))
                if selection == 'o' or selection == 'O':
                    import shutil
                    shutil.rmtree(fullpath)
                else:
                    import os
                    fullpath = os.path.join('saved_interrogations', selection)

            for k, v in list(out.items()):
                save(v, k, savedir = fullpath, print_info = False)
        
            time = strftime("%H:%M:%S", localtime())
            print("\n%s: %d files saved to %s" % ( time, len(list(out.keys())), fullpath))

        time = strftime("%H:%M:%S", localtime())
        print("\n\n%s: Finished! Output is a dictionary with keys:\n\n         '%s'\n" % (time, "'\n         '".join(sorted(out.keys()))))
        from interrogation import Interrodict
        return Interrodict(out)
    # make query and total branch, save, return
    else:
        #print sers
        #print ds
        if multiple_corpora and not mult_corp_are_subs:
            sers = [i.results for i in res]
            out = pd.DataFrame(sers, index = [i.query['outname'] for i in res])
            out = out.reindex(columns=sorted(out.columns))  # sort cols
            out = out.fillna(0) # nan to zero
            out = out.astype(int) # float to int
            out = out.T            
        else:
            out = pd.concat([r.results for r in res], axis = 1)
            # format like normal
            out = out[sorted(list(out.columns))]
            out = out.T
            out = out.fillna(0) # nan to zero
            out = out.astype(int)
            if 'c' in show and mult_corp_are_subs:
                out = out.sum()
                out.index = sorted(list(out.index))

        # sort columns by their totals, highest first
        if type(out) == pd.core.frame.DataFrame:
            tot = out.sum()
            out = out[tot.sort_values(ascending=False).index]
        out = out.edit(sort_by = sort_by, print_info = False, keep_stats = False, \
                      df1_always_df = kwargs.get('df1_always_df'))
        if len(out.results.columns) == 1:
            out.results = out.results.sort_index()   
        if kwargs.get('do_concordancing') is True:
            concs = pd.concat([x.concordance for x in res], ignore_index = True)
            concs = concs.sort_values(by='c')
            concs = concs.reset_index(drop=True)
            out.concordance = Concordance(concs)
        thetime = strftime("%H:%M:%S", localtime())
        if terminal:
            with terminal.location(0, terminal.height):
                print('\n\n%s: Finished! %d unique results, %d total.%s' % (thetime, len(out.results.columns), out.totals.sum(), '\n'))
        else:
            print('\n\n%s: Finished! %d unique results, %d total.%s' % (thetime, len(out.results.columns), out.totals.sum(), '\n'))
        #if used_joblib:
            
        if quicksave:
            from other import save
            save(out, quicksave)
        print('\n')
        return out
Example #54
0
def pmultiquery(
    path,
    option="c",
    query="any",
    sort_by="total",
    quicksave=False,
    num_proc="default",
    function_filter=False,
    just_speakers=False,
    root=False,
    note=False,
    print_info=True,
    **kwargs
):
    """Parallel process multiple queries or corpora.

    This function is used by interrogator if:

        a) path is a list of paths
        b) query is a dict of named queries
        c) function_filter is iterable
        d) just_speakers == 'each'
    
    This function needs joblib 0.8.4 or above in order to run properly."""
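    # A rough usage sketch (the paths and the query string are made up for
    # illustration):
    #   pmultiquery(['data/2005-parsed', 'data/2006-parsed'],
    #               option='c', query=r'/NN.?/ >># NP', num_proc=2)
    # each path is interrogated in its own process; non-'c' options return a
    # dict of named results, while 'c' concatenates them and runs editor().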

    import collections
    import os
    import pandas as pd
    from collections import namedtuple
    from time import strftime, localtime
    from interrogator import interrogator
    from editor import editor
    from other import save_result

    try:
        from joblib import Parallel, delayed
    except ImportError:
        # joblib is only needed for the parallel branch below; if it is
        # missing, install it with:  pip install joblib
        pass
    import multiprocessing

    def best_num_parallel(num_cores, num_queries):
        """decide how many parallel processes to run

        the idea, more or less, is to balance the load when possible"""
        import corpkit
        if num_queries <= num_cores:
            return num_queries
        if num_queries > num_cores:
            if (num_queries / num_cores) == num_cores:
                return int(num_cores)
            if num_queries % num_cores == 0:
                try:
                    return max([int(num_queries / n) for n in range(2, num_cores) if int(num_queries / n) <= num_cores])
                except ValueError:
                    # no candidate fits (e.g. 36 queries on 4 cores): use all cores
                    return num_cores
            else:
                import math

                if (float(math.sqrt(num_queries))).is_integer():
                    square_root = math.sqrt(num_queries)
                    if square_root <= num_queries / num_cores:
                        return int(square_root)
        return num_cores
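
    # e.g. best_num_parallel(8, 24) returns 8: 24 divides evenly, and 24/3 = 8
    # is the largest chunk count that still fits on 8 cores; with no tidy
    # split (say 10 queries on 4 cores) it simply falls back to all 4 cores.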

    num_cores = multiprocessing.cpu_count()

    # are we processing multiple queries or corpora?
    # find out optimal number of cores to use.
    multiple_option = False
    multiple_queries = False
    multiple_speakers = False
    multiple_corpora = False

    denom = 1
    if hasattr(path, "__iter__"):
        multiple_corpora = True
        num_cores = best_num_parallel(num_cores, len(path))
        denom = len(path)
    elif hasattr(query, "__iter__"):
        multiple_queries = True
        num_cores = best_num_parallel(num_cores, len(query))
        denom = len(query)
    elif hasattr(function_filter, "__iter__"):
        multiple_option = True
        num_cores = best_num_parallel(num_cores, len(function_filter.keys()))
        denom = len(function_filter.keys())
    elif just_speakers:
        from corpkit.build import get_speaker_names_from_xml_corpus

        multiple_speakers = True
        if just_speakers == "each":
            just_speakers = get_speaker_names_from_xml_corpus(path)
        if len(just_speakers) == 0:
            print "No speaker name data found."
            return
        num_cores = best_num_parallel(num_cores, len(just_speakers))
        denom = len(just_speakers)

    if num_proc != "default":
        num_cores = num_proc

    # make sure quicksaves are right type
    if quicksave is True:
        raise ValueError("quicksave must be string when using pmultiquery.")

    # the options that don't change
    d = {
        "option": option,
        #'paralleling': True,
        "function": "interrogator",
        "root": root,
        "note": note,
        "denominator": denom,
    }
    # add kwargs to query
    for k, v in kwargs.items():
        d[k] = v

    # make a list of dicts to pass to interrogator,
    # with the iterable unique in every one
    ds = []
    if multiple_corpora:
        path = sorted(path)
        for index, p in enumerate(path):
            name = os.path.basename(p)
            a_dict = dict(d)
            a_dict["path"] = p
            a_dict["query"] = query
            a_dict["outname"] = name
            a_dict["just_speakers"] = just_speakers
            a_dict["paralleling"] = index
            a_dict["printstatus"] = False
            ds.append(a_dict)
    elif multiple_queries:
        for index, (name, q) in enumerate(query.items()):
            a_dict = dict(d)
            a_dict["path"] = path
            a_dict["query"] = q
            a_dict["outname"] = name
            a_dict["just_speakers"] = just_speakers
            a_dict["paralleling"] = index
            a_dict["printstatus"] = False
            ds.append(a_dict)
    elif multiple_option:
        for index, (name, q) in enumerate(function_filter.items()):
            a_dict = dict(d)
            a_dict["path"] = path
            a_dict["query"] = query
            a_dict["outname"] = name
            a_dict["just_speakers"] = just_speakers
            a_dict["paralleling"] = index
            a_dict["function_filter"] = q
            a_dict["printstatus"] = False
            ds.append(a_dict)
    elif multiple_speakers:
        for index, name in enumerate(just_speakers):
            a_dict = dict(d)
            a_dict["path"] = path
            a_dict["query"] = query
            a_dict["outname"] = name
            a_dict["just_speakers"] = [name]
            a_dict["function_filter"] = function_filter
            a_dict["paralleling"] = index
            a_dict["printstatus"] = False
            ds.append(a_dict)

    time = strftime("%H:%M:%S", localtime())
    if multiple_corpora and not multiple_option:
        print (
            "\n%s: Beginning %d parallel corpus interrogations:\n              %s"
            "\n\n          Query: '%s'"
            "\n          Interrogating corpus ... \n" % (time, num_cores, "\n              ".join(path), query)
        )

    elif multiple_queries:
        print (
            "\n%s: Beginning %d parallel corpus interrogations: %s"
            "\n          Queries: '%s'"
            "\n          Interrogating corpus ... \n"
            % (time, num_cores, os.path.basename(path), "', '".join(query.values()))
        )

    elif multiple_option:
        print (
            "\n%s: Beginning %d parallel corpus interrogations (multiple options): %s"
            "\n\n          Query: '%s'"
            "\n          Interrogating corpus ... \n" % (time, num_cores, os.path.basename(path), query)
        )

    elif multiple_speakers:
        print (
            "\n%s: Beginning %d parallel corpus interrogations: %s"
            "\n\n          Query: '%s'"
            "\n          Interrogating corpus ... \n" % (time, num_cores, os.path.basename(path), query)
        )

    # run in parallel, get either a list of tuples (non-c option)
    # or a dataframe (c option)
    # import sys
    # reload(sys)
    # stdout=sys.stdout
    failed = False
    # ds = ds[::-1]
    if not root:
        from blessings import Terminal

        terminal = Terminal()
        print "\n" * (len(ds) - 2)
        for dobj in ds:
            linenum = dobj["paralleling"]
            with terminal.location(0, terminal.height - (linenum + 1)):
                # this is a really bad idea.
                thetime = strftime("%H:%M:%S", localtime())
                print "%s: [                      0%% (%s)                            ]" % (thetime, dobj["outname"])

        # res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
        try:
            # ds = sorted(ds, key=lambda k: k['paralleling'], reverse = True)
            res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
            print "\n\n\n"
        except:
            failed = True
            print "Multiprocessing failed."
            raise
        try:
            res = sorted(res)
        except:
            failed = True
            pass
    elif root or failed:
        res = []
        for index, d in enumerate(ds):
            d["startnum"] = (100 / denom) * index
            res.append(interrogator(**d))
        try:
            res = sorted(res)
        except:
            pass

    # multiprocessing way
    # from multiprocessing import Process
    # from corpkit.interrogator import interrogator
    # jobs = []
    ##for d in ds:
    ##    p = multiprocessing.Process(target=interrogator, kwargs=(**d,))
    ##    jobs.append(p)
    ##    p.start()
    ##    while p.is_alive():
    ##        import time
    ##        time.sleep(2)
    ##        if root:
    ##            root.update()
    # result_queue = multiprocessing.Queue()
    #
    # for d in ds:
    # funs = [interrogator(result_queue, **kwargs) for kwargs in ds]
    # jobs = [multiprocessing.Process(mc) for mc in funs]
    # for job in jobs: job.start()
    # for job in jobs: job.join()
    # results = [result_queue.get() for mc in funs]

    # turn list into dict of results, make query and total branches,
    # save and return
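    # the non-'c' result is a dict of namedtuples keyed by corpus/query/speaker
    # name, e.g. (names illustrative):
    #   {'2005': interrogation(query={...}, results=DataFrame, totals=Series), ...}
    # (for the 'k' option the namedtuple has no totals field)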
    if not option.startswith("c"):
        out = {}
        # print ''
        for (name, data), d in zip(res, ds):
            for unpicklable in ["note", "root"]:
                try:
                    del d[unpicklable]
                except KeyError:
                    pass
            if not option.startswith("k"):
                outputnames = collections.namedtuple("interrogation", ["query", "results", "totals"])
                try:
                    stotal = data.sum(axis=1)
                    stotal.name = u"Total"
                except ValueError:
                    stotal = data.sum()
                output = outputnames(d, data, stotal)
            else:
                outputnames = collections.namedtuple("interrogation", ["query", "results"])
                output = outputnames(d, data)
            out[name] = output

        # could be wrong for unstructured corpora?
        if quicksave:
            fullpath = os.path.join("saved_interrogations", quicksave)
            while os.path.isdir(fullpath):
                selection = raw_input(
                    "\nSave error: %s already exists in %s.\n\nType 'o' to overwrite, or enter a new name: "
                    % (quicksave, "saved_interrogations")
                )
                if selection == "o" or selection == "O":
                    import shutil

                    shutil.rmtree(fullpath)
                else:
                    import os

                    fullpath = os.path.join("saved_interrogations", selection)

            for k, v in out.items():
                save_result(v, k, savedir=fullpath, print_info=False)

            time = strftime("%H:%M:%S", localtime())
            print "\n%s: %d files saved to %s" % (time, len(out.keys()), fullpath)

        time = strftime("%H:%M:%S", localtime())
        print "\n\n%s: Finished! Output is a dictionary with keys:\n\n         '%s'\n" % (
            time,
            "'\n         '".join(sorted(out.keys())),
        )

        return out
    # make query and total branch, save, return
    else:
        out = pd.concat(res, axis=1)
        out = editor(out, sort_by=sort_by, print_info=False, keep_stats=False)
        time = strftime("%H:%M:%S", localtime())
        print "\n\n%s: Finished! %d unique results, %d total." % (time, len(out.results.columns), out.totals.sum())
        if quicksave:
            from other import save_result

            save_result(out, quicksave)
        return out