def __init__(self):
    global transport
    variables = ['meridional_transport', 'psi']
    num_cores = 6
    data = np.ones((len(variables), len(scow.months), scow.latitude.shape[0], scow.longitude.shape[0]))*np.nan
    beta = c.beta.repeat(scow.longitude.shape[0]).reshape((c.beta.shape[0], scow.longitude.shape[0]))

    for i in xrange(scow.data.shape[1]):
        transport = scow.data[2, i, :, :]/beta
        psi = Parallel(n_jobs=num_cores)(delayed(integration)(lat) for lat in scow.latitude)
        psi = np.array(psi)
        D = np.array([transport.copy()/(c.rho*1.e+6), psi.copy()/(c.rho*1.e+6)])
        data[:, i, :, :] = D
        del transport

    # Here I could derive psi in y and get the zonal sverdrup transport (I think I won't need it)

    # "Isolating" the subtropical gyre
    ibad = (np.abs(scow.latitude) <= 5) | (np.abs(scow.latitude) >= 50)
    data[:, :, ibad, :] = np.nan

    self.latitude = scow.latitude
    self.longitude = scow.longitude
    self.variables = variables
    self.data = data
def __init__(self):
    global transport
    variables = ['zonal_transport', 'meridional_transport', 'psi']
    num_cores = 6
    data = np.ones((len(variables), len(scow.months), scow.latitude.shape[0], scow.longitude.shape[0]))*np.nan
    f = c.f.repeat(scow.longitude.shape[0]).reshape((c.f.shape[0], scow.longitude.shape[0]))

    for i in xrange(scow.data.shape[1]):
        zonal_transport = scow.data[1, i, :, :]/(f*c.rho)
        meridional_transport = -scow.data[0, i, :, :]/(f*c.rho)
        transport = meridional_transport.copy()
        psi = Parallel(n_jobs=num_cores)(delayed(integration)(lat) for lat in scow.latitude)
        psi = np.array(psi)
        D = np.array([zonal_transport.copy()/1.e+6, meridional_transport.copy()/1.e+6, psi.copy()/1.e+6])
        data[:, i, :, :] = D
        del transport

    # "Isolating" the subtropical gyre
    ibad = (np.abs(scow.latitude) <= 5) | (np.abs(scow.latitude) >= 50)
    data[:, :, ibad, :] = np.nan

    self.latitude = scow.latitude
    self.longitude = scow.longitude
    self.variables = variables
    self.data = data
def main():
    parser = argparse.ArgumentParser(description='Register & align images')
    parser.add_argument('filenames', nargs='+',
                        help='List of target files to register. Images are aligned to first in list.')
    parser.add_argument('-odir', metavar='outdir', required=True, type=str,
                        help='Output directory for files.')
    parser.add_argument('-m', metavar='method', choices=('point', 'extended'), default='extended',
                        help='Specify alignment method (point or extended); default=extended.')
    parser.add_argument('-xy', nargs=2, type=float, default=None,
                        help='Specify approximate "x y" pixel coordinate of object to centroid on. '
                             'Required for point mode; useful for extended mode (default=center of image).')
    parser.add_argument('-box', nargs=2, type=int, default=None,
                        help='Specify box size (w h) to restrict alignment search. '
                             'Useful for both point & extended modes (default=full size of array).')
    parser.add_argument('--c', action='store_true',
                        help='Clobber (overwrite) on output')
    parser.add_argument('-njobs', type=int, default=1,
                        help='Process images in parallel. "-1" is all CPUs (default=1).')

    args = parser.parse_args()

    if args.m == 'point' and args.xy is None:
        parser.error("-m point requires -xy coordinate")

    # create output directory
    if args.odir not in ['', '.']:
        makedirs(args.odir, exist_ok=True)

    # align all images to first filename
    ref = args.filenames[0]
    align = args.filenames[1:]

    imref = partial(register, ref=ref, outdir=args.odir,
                    method=args.m, center=args.xy, size=args.box,
                    overwrite=args.c)
    outfiles = Parallel(n_jobs=args.njobs, verbose=11)(delayed(imref)(toshift=a) for a in align)

    # Write ref to outdir
    refnew = os.path.join(args.odir, os.path.basename(ref))
    copy(ref, refnew)
    outfiles.append(refnew)

    print('Wrote %i files to %s' % (len(outfiles), args.odir))
def auto_choose(actionfile, new_xyz, nparallel=-1):
    """
    @param actionfile : h5py.File object
    @param new_xyz    : new rope point-cloud
    @param nparallel  : number of parallel jobs to run for tps cost calculation.
                        If -1, only 1 job is used (no parallelization).

    @return : the name of the segment with the lowest warping cost.
    """
    if not nparallel == -1:
        from joblib import Parallel, delayed
        nparallel = min(nparallel, 8)

    demo_data = actionfile.items()

    if nparallel != -1:
        before = time.time()
        redprint("auto choose parallel with njobs = %d" % nparallel)
        costs = Parallel(n_jobs=nparallel, verbose=0)(delayed(registration_cost)(ddata[1]['cloud_xyz'][:], new_xyz) for ddata in demo_data)
        after = time.time()
        print "Parallel registration time in seconds =", after - before
    else:
        costs = []
        redprint("auto choose sequential..")
        for i, ddata in enumerate(demo_data):
            costs.append(registration_cost(ddata[1]['cloud_xyz'][:], new_xyz))
            print(("tps-cost completed %i/%i" % (i + 1, len(demo_data))))

    ibest = np.argmin(costs)
    redprint("auto choose returning..")
    return demo_data[ibest][0]
def load_glm_inputs(study_dirs, hrf_model='canonical', drift_model='cosine',
                    img_ext='nii.gz', memory=Memory(None), n_jobs=1):
    """Returns data (almost) ready to be used for a GLM."""
    datasets, structural, functional, conditions, contrasts = \
        collect_openfmri(study_dirs, img_ext=img_ext, memory=memory, n_jobs=n_jobs)

    main = functional.merge(conditions)

    # computing design matrices
    print 'Computing models...'
    results = Parallel(n_jobs=n_jobs, pre_dispatch='n_jobs')(
        delayed(memory.cache(_make_design_matrix))(
            run_df, hrf_model, drift_model,
            orthogonalize=datasets[group_id[0]]['models'][group_id[2]]['orthogonalize'])
        for group_id, group_df in main.groupby(['study', 'subject', 'model'])
        for run_id, run_df in group_df.groupby(['task', 'run'])
    )

    # collect results
    print 'Collecting...'
    glm_inputs = {}
    for group_id, group_df in main.groupby(['study', 'subject', 'model']):
        study_id, subject_id, model_id = group_id

        for session_id, run_df in group_df.groupby(['task', 'run']):
            task_id, run_id = session_id
            bold_file, dm = results.pop(0)

            glm_inputs.setdefault(group_id, {}).setdefault('bold', []).append(bold_file)
            glm_inputs.setdefault(group_id, {}).setdefault('design', []).append(dm)

        glm_inputs.setdefault(group_id, {}).setdefault(
            model_id,
            _make_contrasts(datasets, study_id, model_id, hrf_model, group_df))
        glm_inputs.setdefault(group_id, {}).setdefault(
            '%s_per_run' % model_id,
            _make_contrasts(datasets, study_id, model_id, hrf_model, group_df, per_run=True))

    return glm_inputs
def find_closest_auto(demofile, new_xyz):
    if args.parallel:
        from joblib import Parallel, delayed
    demo_clouds = [asarray(seg["cloud_xyz"]) for seg in demofile.values()]
    keys = demofile.keys()
    if args.parallel:
        costs = Parallel(n_jobs=3, verbose=100)(delayed(registration_cost)(demo_cloud, new_xyz) for demo_cloud in demo_clouds)
    else:
        costs = []
        for (i, ds_cloud) in enumerate(demo_clouds):
            costs.append(registration_cost(ds_cloud, new_xyz))
            print "completed %i/%i" % (i+1, len(demo_clouds))

    print "costs\n", costs

    if args.show_neighbors:
        nshow = min(5, len(keys))
        import cv2, rapprentice.cv_plot_utils as cpu
        sortinds = np.argsort(costs)[:nshow]
        near_rgbs = [asarray(demofile[keys[i]]["rgb"]) for i in sortinds]
        bigimg = cpu.tile_images(near_rgbs, 1, nshow)
        cv2.imshow("neighbors", bigimg)
        print "press any key to continue"
        cv2.waitKey()

    ibest = np.argmin(costs)
    return keys[ibest]
def _update_filters(self, X):
    if self.verbose:
        last_score = self._bound(X)
        start_t = time.time()

    U = Parallel(n_jobs=self.n_jobs)(
        delayed(global_update_U)(
            X[:, j], self.U[:, j], self.gamma[j], self.alpha,
            self.nu, self.rho, self.EA, self.ElogA, self.verbose
        )
        for j in xrange(self.n_feats)
    )
    U = np.vstack(U).T
    self.U = U.copy()

    if self.verbose:
        score = self._bound(X)
        print_increment('U', last_score, score)
        last_score = score

    self._update_gamma(X)
    if self.verbose:
        score = self._bound(X)
        print_increment('gamma', last_score, score)
        last_score = score

    self._update_alpha(X)
    if self.verbose:
        score = self._bound(X)
        print_increment('alpha', last_score, score)

    if self.verbose:
        t = time.time() - start_t
        print('Update free parameters\ttime: %.2f' % t)
def analysis(self, permute=False):
    """
    Classify based on iteratively increasing the number of features (electrodes) included in the
    model. Starts with the single best electrode (N=1) and increases until N = the number of
    electrodes.

    Note: permute is not used in this analysis, but kept to match the same signature as super.
    """
    if self.subject_data is None:
        print('%s: compute or load data first with .load_data()!' % self.subject)

    # Get recalled or not labels
    if self.recall_filter_func is None:
        print('%s classifier: please provide a .recall_filter_func function.' % self.subject)
    y = self.recall_filter_func(self.subject_data)

    # zscore the data by session
    x = self.zscore_data()

    # create the classifier
    classifier = LogisticRegression(C=self.C, penalty=self.norm, solver='liblinear')

    # create .num_rand_splits of cv_dicts
    cv_dicts = [self._make_cross_val_labels() for _ in range(self.num_rand_splits)]

    # run permutations with joblib
    f = _par_compute_and_run_split
    if self.use_joblib:
        aucs = Parallel(n_jobs=12, verbose=5)(delayed(f)(cv, classifier, x, y) for cv in cv_dicts)
    else:
        aucs = []
        for cv in tqdm(cv_dicts):
            aucs.append(f(cv, classifier, x, y))

    # store results
    self.res['auc_x_n'] = np.stack(aucs)
def preprocess(file_in, file_out, test=False, n_jobs=6):
    """
    This function preprocesses a raw data file.
    For each row and for each feature it extracts aggregations over TimeToEnd:
        From feature TimeToEnd it extracts total time ("time") and number of observations ("n_obs")
        From feature DistanceToRadar it extracts aggregations ('min', '50% quantile', 'mean', 'max')
        For any other feature it calculates ('mean', 'std', 'min', '50% quantile', 'max')
    New feature names follow the pattern: <feature name>_<aggregation function>

    Parameters
    ----------
    :param file_in: str
        csv-file name for data to be preprocessed
    :param file_out: str
        csv-file name for output data
    :param test: bool
        indicator for test data (data without label)
    :return:
    """
    # Load data to pandas.DataFrame
    data_raw = pd.read_csv(file_in, na_filter=False, chunksize=5000)

    # Apply transformations to data chunks in parallel
    start = time.time()
    data = Parallel(n_jobs=n_jobs, verbose=11)(delayed(foo)(x, transform, axis=1, test=test)
                                               for i, x in enumerate(data_raw))
    print "Preprocessing time: ", round((time.time() - start) / 60, 3)
    print "Records: ", len(data)

    # Join data chunks and save result to csv
    data = pd.concat(data)
    data.to_csv(file_out, index=False)
    print "File", file_in, "preprocessed to", file_out
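# Hypothetical sketch of the `foo` helper assumed by preprocess() above: it is not
# defined in the source, so this is only an illustration. The assumption is that it
# applies the row-wise `transform` function to one DataFrame chunk so chunks can be
# processed independently by joblib workers, and that `transform` accepts a `test`
# keyword argument.
def foo(chunk, func, axis=1, test=False):
    # pandas forwards extra keyword arguments of DataFrame.apply to `func`
    return chunk.apply(func, axis=axis, test=test)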
def best_classifier(X, Y, Xvs, Yvs):
    parameters = {'C': [3, 13, 67, 330, 1636, 8103]}
    pg = ParameterGrid(parameters)
    clas = Parallel(n_jobs=4)(delayed(pfit)(p, X, Y, Xvs, Yvs) for p in pg)
    clas.sort(reverse=True)
    (sc, cla) = clas[0]
    print '-'*20
    print 'best is ', cla, sc
    print '-'*20
    return cla, sc
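# Hypothetical sketch of the `pfit` helper assumed by best_classifier() above: it is
# not defined in the source. The assumption is that it fits one candidate C value on
# the training split and returns a (validation score, fitted classifier) tuple, so the
# results can be sorted by score. The choice of LinearSVC is an assumption only.
def pfit(p, X, Y, Xvs, Yvs):
    from sklearn.svm import LinearSVC
    clf = LinearSVC(C=p['C'])
    clf.fit(X, Y)
    return (clf.score(Xvs, Yvs), clf)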
def retrieve_proposals(video_info, model, feature_filename, feat_size=16,
                       stride_intersection=0.1):
    """Retrieve proposals for a given video.

    Parameters
    ----------
    video_info : DataFrame
        DataFrame containing the 'video-name' and 'video-frames'.
    model : dict
        Dictionary containing the learned model. Keys:
        'D': 2darray containing the sparse dictionary.
        'cost': Cost function at the last iteration.
        'durations': 1darray containing typical durations (n-frames) in the training set.
        'type': Dictionary type.
    feature_filename : str
        String containing the path to the HDF5 file containing the features for each
        video. The HDF5 file must contain a group for each video where the id of the
        group is the name of the video; and each group must contain a dataset
        containing the features.
    feat_size : int, optional
        Size of the temporal extension of the features.
    stride_intersection : float, optional
        Percentage of intersection between temporal windows.
    """
    feat_obj = FeatHelper(feature_filename, t_stride=1)
    candidate_df = generate_candidate_proposals(video_info, model['durations'],
                                                feat_size, stride_intersection)
    D = model['D']
    params = model['params']
    feat_obj.open_instance()
    feat_stack = feat_obj.read_feat(video_info['video-name'])
    feat_obj.close_instance()
    n_feats = feat_stack.shape[0]
    candidate_df = candidate_df[
        (candidate_df['f-init'] + candidate_df['n-frames']) <= n_feats]
    candidate_df = candidate_df.reset_index(drop=True)
    proposal_df = Parallel(n_jobs=-1)(
        delayed(wrapper_score_proposals)(this_df, D, feat_stack, params, feat_size)
        for k, this_df in candidate_df.iterrows())
    proposal_df = pd.concat(proposal_df, axis=1).T
    proposal_df['score'] = (
        proposal_df['score'] - proposal_df['score'].min()) / (
        proposal_df['score'].max() - proposal_df['score'].min())
    proposal_df['score'] = np.abs(proposal_df['score'] - 1.0)
    proposal_df = proposal_df.loc[proposal_df['score'].argsort()[::-1]]
    proposal_df = proposal_df.rename(columns={'n-frames': 'f-end'})
    proposal_df['f-end'] = proposal_df['f-init'] + proposal_df['f-end'] - 1
    return proposal_df.reset_index(drop=True)
def basic_compute_loop(compute_function, looper, run_parallel=True, debug=False):
    """Canonical form of the basic compute loop."""
    start = time.time()
    if run_parallel:
        incoming = Parallel(n_jobs=8, verbose=10 if debug else 0)(
            delayed(compute_function, has_shareable_memory)(**looper[ll])
            for ll in framelooper(len(looper), start=start))
    else:
        incoming = []
        for ll in framelooper(len(looper)):
            incoming.append(compute_function(**looper[ll]))
    return incoming
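# A minimal usage sketch for basic_compute_loop() above, under the assumption that
# `looper` is a list of keyword-argument dicts, one per call. The worker function and
# the `trajectory` array are hypothetical and only illustrate the calling convention.
def _mean_square_displacement(frame=None, coords=None):
    # displacement of every particle at `frame` relative to the first frame
    return ((coords[frame] - coords[0])**2).sum()

# looper = [dict(frame=i, coords=trajectory) for i in range(n_frames)]
# msd = basic_compute_loop(_mean_square_displacement, looper, run_parallel=True)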
def auto_choose(actionfile, new_xyz, softmin_k=1, softmin_alpha=1, nparallel=-1):
    """
    @param actionfile    : h5py.File object
    @param new_xyz       : new rope point-cloud
    @param softmin_k     : use a softmin distribution over the first <softmin_k> demonstrations;
                           set to 1 for nearest neighbor
    @param softmin_alpha : temperature of the softmin distribution
    @param nparallel     : number of parallel jobs to run for tps cost calculation;
                           set to -1 for no parallelization

    @return : the name of the segment with the lowest warping cost.
    """
    if not nparallel == -1:
        from joblib import Parallel, delayed
        nparallel = min(nparallel, 8)

    demo_data = actionfile.items()

    if nparallel != -1:
        before = time.time()
        redprint("auto choose parallel with njobs = %d" % nparallel)
        costs = Parallel(n_jobs=nparallel, verbose=100)(delayed(registration_cost)(ddata[1]['cloud_xyz'][:], new_xyz) for ddata in demo_data)
        after = time.time()
        print "Parallel registration time in seconds =", after - before
    else:
        costs = []
        redprint("auto choose sequential..")
        for i, ddata in enumerate(demo_data):
            costs.append(registration_cost(ddata[1]['cloud_xyz'][:], new_xyz))
            print(("tps-cost completed %i/%i" % (i + 1, len(demo_data))))

    # use a random draw from the softmin distribution
    demo_costs = zip(costs, demo_data)
    if softmin_k == 1:
        ibest = np.argmin(costs)
        return demo_data[ibest][0]

    best_k_demos = sorted(demo_costs)[:softmin_k]
    best_k_costs = np.asarray([cost for cost, _ in best_k_demos])
    best_k_exps = np.exp(-1*softmin_alpha*best_k_costs)  # multiply by -1 b/c we're actually min-ing
    if len(best_k_exps) > 1:
        denom = sum(best_k_exps)
    else:
        denom = best_k_exps
    mass_fn = best_k_exps/denom

    draw = random.random()
    for i in range(len(best_k_demos)):
        if draw <= mass_fn[i]:
            ret_val = best_k_demos[i][1][0]
            break
        draw -= mass_fn[i]

    redprint("auto choose returning..")
    return ret_val
def extract_all_class_features(dataset, n_jobs=1, stride=5, patch_size=10):
    """Extract masked features from all dataset images, return features and labels"""
    cns = []
    labels = []
    for (label, cls) in enumerate(dataset.classes):
        print 'Extracting masked CNs from class {}'.format(cls)
        hists = Parallel(n_jobs=n_jobs)(delayed(extract_masked_cns)(imname, maskname)
                                        for (imname, maskname) in dataset.get_class_images(cls))
        hists = np.vstack(hists)
        labels.append(label * np.ones((len(hists),), dtype=np.float32))
        cns.append(hists.astype(np.float32))
    # Stack lists in numpy arrays.
    return (cns, labels)
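# A minimal usage sketch for extract_all_class_features() above, assuming `dataset`
# exposes `.classes` and `.get_class_images()` as used in the function. The returned
# per-class blocks are stacked into a single design matrix and label vector for a
# downstream classifier; the variable names are illustrative only.
# cns, labels = extract_all_class_features(dataset, n_jobs=4)
# X = np.vstack(cns)
# y = np.concatenate(labels)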
def train(self):
    regressors = []
    if self.parallel:
        regressors = Parallel(n_jobs=-1)(delayed(trainBin)(self.params[b], np.atleast_2d(self.ind).T,
                                                           self.dep[b], self.indWeights)
                                         for b in self.OD.bins)
    else:
        for b in self.OD.bins:
            regressors.append(trainBin(self.params[b], np.atleast_2d(self.ind).T,
                                       self.dep[b], self.indWeights))
            #self.svr[b] = SVR(cache_size=1000,kernel='rbf', C=self.params[b]['C'], gamma=self.params[b]['gamma'])
            #self.svr[b].fit(np.array([self.ind]).T,self.dep[b])
    for i, model in enumerate(regressors):
        self.svr[self.OD.bins[i]] = model
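# Hypothetical sketch of the `trainBin` helper assumed by train() above, mirroring the
# commented-out SVR call in that method: it fits one RBF support-vector regressor per
# bin and returns the fitted model. Passing `weights` as sample_weight is an
# assumption based on `indWeights` being handed in.
def trainBin(params, X, y, weights):
    from sklearn.svm import SVR
    svr = SVR(cache_size=1000, kernel='rbf', C=params['C'], gamma=params['gamma'])
    svr.fit(X, y, sample_weight=weights)
    return svr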
def create_training_data():
    num_cores = 8

    # getting total number of trips
    list_of_files = [[folder, f.replace('.csv', '')]
                     for folder in os.listdir('drivers') if 'DS_Store' not in folder
                     for f in os.listdir('drivers/'+folder) if '.csv' in f]

    raw_data = Parallel(n_jobs=num_cores)(delayed(create_attributes)(i) for i in list_of_files)
    raw_data = pd.DataFrame(raw_data)
    raw_data.columns = ['driver_trip', 'trip_time', 'total_distance', 'skyway_distance',
                        'avg_speed', 'std_speed', 'avg_speed_up', 'avg_speed_down',
                        'avg_acc', 'std_acc', 'avg_turn', 'std_turn',
                        'standing_time', 'standing_speed']

    # save to file for later training
    raw_data.to_csv('training_set.csv', index=False)
    return raw_data
def predict(self, test_set=True, location=None): Y, self.locations = self.data.get_y(location=location) t = self.data.observations['time'].values t = self._split_dataset(t, test_set=test_set) Y = self._split_dataset(Y, test_set=test_set) yhat_jobs = [] ytrue =[] yoccur_jobs = [] if not self.nearest_neighbor: X = self.data.get_X() X = self._split_dataset(X, test_set=test_set) if self.xtransform is not None: X = self.xtrans.transform(X) for j, row in self.locations.iterrows(): if self.nearest_neighbor: X = self.data.get_nearest_X(row[self.data.reanalysis_latdim], row[self.data.reanalysis_londim]) X = self._split_dataset(X, test_set=test_set) if self.xtransform is not None: X = self.xtrans[j].transform(X) if self.conditional is not None: yoccur_jobs += [delayed(worker_predict_prob)(self.occurance_models[j], copy.deepcopy(X))] yhat_jobs += [delayed(worker_predict)(self.models[j], copy.deepcopy(X))] ytrue += [Y[:, j]] yhat = Parallel(n_jobs=self.num_proc)(yhat_jobs) if self.ytransform is not None: transform_jobs = [delayed(worker_invtrans)(self.ytrans[j], yhat[j]) for j in range(len(yhat))] yhat = Parallel(n_jobs=self.num_proc)(transform_jobs) yhat = numpy.vstack(yhat).T ytrue = numpy.vstack(ytrue).T yhat = self.to_xarray(yhat, t).rename({"value": "projected"}) ytrue = self.to_xarray(ytrue, t).rename({"value": "ground_truth"}) if self.conditional is not None: yoccur = Parallel(n_jobs=self.num_proc)(yoccur_jobs) yoccur = numpy.vstack(yoccur).T > 0.5 yoccur = self.to_xarray(yoccur, t).rename({"value": "occurance"}) yhat['projected'] = yhat['projected']*yoccur['occurance'] yhat = yhat.merge(yoccur) out = yhat.merge(ytrue) out['error'] = out.projected - out.ground_truth return out
def run_all(cnf, samples, process_one, finalize_one, finalize_all):
    if len(samples) == 1:
        sample_name, sample_cnf = samples.items()[0]
        run_one(sample_cnf, process_one, finalize_one)
    else:
        results = []
        if cnf.get('parallel'):
            try:
                from joblib import Parallel, delayed
            except ImportError:
                critical(
                    '\nERROR: Joblib not found. You may want samples to be processed '
                    'in parallel; in this case, make sure the python joblib package is '
                    'installed (pip install joblib).')
            else:
                for sample_name, sample_cnf in samples.items():
                    sample_cnf['verbose'] = False

                results = Parallel(n_jobs=len(samples)) \
                    (delayed(run_one)(sample_cnf, process_one, finalize_one, multiple_samples=True)
                     for sample_name, sample_cnf in samples.items())
        else:
            results = []
            for sample_name, sample_cnf in samples.items():
                results.append(
                    run_one(sample_cnf, process_one, finalize_one, multiple_samples=True))

        if samples:
            info('')
            info('*' * 70)
            info('Results for each sample:')
            finalize_all(cnf, samples, results)

    # Cleaning
    for name, data in samples.items():
        work_dirpath = data['work_dir']
        tx_dirpath = join(work_dirpath, 'tx')

        if isdir(tx_dirpath):
            shutil.rmtree(tx_dirpath)

        if not data.get('keep_intermediate') \
                and isdir(work_dirpath):
            shutil.rmtree(work_dirpath)
def get_best_matches(img, ids, kps, des):
    number_of_results = 3

    src_kp, src_des = get_kp_desc(img)
    os.remove(img)

    matches = []
    matches = Parallel(n_jobs=-1)(delayed(match_gen)([kp_to_list(src_kp), src_des,
                                                      kp_to_list(kps[i]), des[i], ids[i]])
                                  for i in range(len(ids)))

    # remove product ids that have 0 matches
    for elem in matches[:]:
        if elem[1] == 0:
            matches.remove(elem)

    # sort by ids in order to remove duplicate ids for pics with less matches of the same product
    matches = sorted(matches, key=lambda tup: tup[0])

    # and remove possible product id duplicates
    # that may appear from the match-making algorithm applied
    # on different picture keypoints of the same product
    s = set()
    for elem in matches[:]:
        if elem[0] in s:
            matches.remove(elem)
        else:
            s.add(elem[0])

    # sort by number of matches
    matches = sorted(matches, key=lambda tup: tup[1])
    matches.reverse()

    # return the first number_of_results most matching
    return [i for i in matches[:number_of_results]]
def svm_ova_from_kernel(ktrain, train_labels,
                        ktest, test_labels,
                        C=DEFAULT_REGULARIZATION,
                        bkg_categories=None):

    def sighandler_svm(signum, frame):
        logger.info('Caught signal %i while training SVMs in parallel.' % signum)

    signal.signal(signal.SIGTERM, sighandler_svm)

    n_test = ktest.shape[0]
    categories = np.unique(train_labels)

    # -- remove background categories
    if bkg_categories is not None:
        categories = list(set(categories).difference(set(bkg_categories)))

    n_categories = len(categories)
    cat_index = {}
    predictions = np.empty((n_test, n_categories))

    # -- train OVA SVMs in parallel
    predictions = Parallel(n_jobs=-1)(delayed(one_svm)(ktrain, train_labels.reshape(-1),
                                                       ktest, cat, C)
                                      for cat in categories)
    predictions = np.array(predictions).T

    # -- iterates over categories
    for icat, cat in enumerate(categories):
        cat_index[cat] = icat

    gt = np.array([cat_index[e] for e in test_labels.reshape(-1)]).astype('int')
    pred = predictions.argmax(axis=1)
    acc = (pred == gt).sum() / float(n_test)

    return acc, predictions, gt
def auto_choose(demofile, new_xyz, only_original_segments): """ @param demofile: @param new_xyz: @param only_original_segments: if true, then only the original_segments will be registered with @return: """ import pprint """Return the segment with the lowest warping cost. Takes about 2 seconds.""" parallel = True if parallel: from joblib import Parallel, delayed items = demofile.items() if only_original_segments: #remove all derived segments from items print("Only registering with the original segments") items = [item for item in items if not "derived" in item[1].keys()] unzipped_items = zip(*items) keys = unzipped_items[0] values = unzipped_items[1] ds_clouds, shapes = get_downsampled_clouds(values) ds_new = clouds.downsample(new_xyz, 0.01 * DS_SIZE) #print 'ds_new_len shape', ds_new.shape if parallel: before = time.time() #TODO: change back n_jobs=12 ? costs = Parallel(n_jobs=8, verbose=0)(delayed(registration_cost)(ds_cloud, ds_new) for ds_cloud in ds_clouds) after = time.time() print "Parallel registration time in seconds =", after - before else: costs = [] for (i, ds_cloud) in enumerate(ds_clouds): costs.append(registration_cost(ds_cloud, ds_new)) print(("completed %i/%i" % (i + 1, len(ds_clouds)))) #print(("costs\n", costs)) ibest = np.argmin(costs) print "ibest = ", ibest #pprint.pprint(zip(keys, costs, shapes)) #print keys print "best key = ", keys[ibest] print "best cost = ", costs[ibest] return keys[ibest]
def findPeaks(imgdict, maplist, params, maptype="ccmaxmap", pikfile=True):
    peaktreelist = []
    count = 0

    thresh = float(params["thresh"])
    bin = int(params["bin"])
    diam = float(params["diam"])
    apix = float(params["apix"])
    olapmult = float(params["overlapmult"])
    maxpeaks = int(params["maxpeaks"])
    maxthresh = params["maxthresh"]
    maxsizemult = float(params["maxsize"])
    peaktype = params["peaktype"]
    msg = not params['background']
    pixdiam = diam/apix/float(bin)
    pixrad = diam/apix/2.0/float(bin)

    numpyVersion = float(numpy.version.version[:3])
    if numpyVersion > 1.7:
        peaktreelist = Parallel(n_jobs=params['nproc'])(delayed(runFindPeaks)(
            params, maplist, maptype, pikfile, thresh, pixdiam, count, olapmult,
            maxpeaks, maxsizemult, msg, bin, peaktype, pixrad, imgdict)
            for count in range(0, len(maplist)))
    else:
        ## backup for AttributeError: 'memmap' object has no attribute 'offset', bug #3322
        peaktreelist = []
        for count in range(0, len(maplist)):
            mappeaktree = runFindPeaks(params, maplist, maptype, pikfile, thresh, pixdiam,
                count, olapmult, maxpeaks, maxsizemult, msg, bin, peaktype, pixrad, imgdict)
            peaktreelist.append(mappeaktree)

    peaktree = mergePeakTrees(imgdict, peaktreelist, params, msg, pikfile)

    # max threshold
    if maxthresh is not None:
        precount = len(peaktree)
        peaktree = maxThreshPeaks(peaktree, maxthresh)
        postcount = len(peaktree)
        #if precount != postcount:
        apDisplay.printMsg("Filtered %d particles above threshold %.2f" % (precount-postcount, maxthresh))

    return peaktree
def main():
    """
    Main function.

    1. Setup logging
    2. Get arguments
    3. Get index
    4. Process files
    5. Write output
    """
    setup_logging()
    logger = logging.getLogger("stats." + __name__)

    args = get_args()
    index = get_index(args)
    logger.warning("Positions not in annotation will be ignored.")

    logger.info("Found " + str(len(args.inputs)) + " input file(s):")
    for input_file in sorted(args.inputs):
        logger.debug(input_file)

    if args.is_parallel:
        stats = Parallel(n_jobs=args.parallel, verbose=100, batch_size=1)(
            delayed(process_file)(input_file, args.type, index, args.is_parallel)
            for input_file in args.inputs)
    else:
        stats = []
        for input_file in args.inputs:
            output_table = process_file(input_file, args.type, index, args.is_parallel)
            stats.append(output_table)

    write_stats(args.out, stats)
def summary(self, count=32, out_table=None, prange=None, pjob=4):
    if self.ndims > 0:
        if_exists = "replace"
        if out_table == None:
            out_table = "%s_%s_summary" % (self.table, self.name)
            out_table = out_table.replace("[", "_").replace("]", "_")

        def query(i):
            self.logger.info("Processing column %d (of %d)" % (i, self.shape[0]))
            query = self[i].data.alias(name="col")
            q1 = sa.select([sa.text("madlib.fmsketch_dcount(col) as count"),
                            sa.text("madlib.mfvsketch_top_histogram(col, %s) as top" % count)
                            ]).select_from(query)
            return [q1, i]

        if prange == None:
            prange = range(1, self.shape[0] + 1)

        queries = [query(i) for i in prange]
        dfs = Parallel(n_jobs=pjob)(delayed(process_query)(q) for q in queries)
        dfs = pd.concat(dfs)
        dfs.index = prange
        dfs["table"] = self.table
        dfs["column"] = self.name
        return dfs
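# Hypothetical sketch of the `process_query` helper assumed by summary() above: it is
# not defined in the source. The assumption is that it executes one prepared
# [select, column-index] pair and returns the result as a DataFrame; the `engine`
# SQLAlchemy connection name is an assumption, not part of the source.
def process_query(q):
    stmt, i = q
    df = pd.read_sql(stmt, engine)
    df["column_index"] = i
    return df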
def find_TADs(self, data, gammalist=range(10, 110, 10), segmentation='potts', minlen=3, drop_gamma=False, n_jobs='auto'): ''' Finds TADs in data with a list of gammas. Returns a pandas DataFrame with columns 'Start', 'End' and 'Gamma'. Use genome_intervals_to_chr on the returned object to get coordinates in bed-style format and not in coordinates of concatenated genome. If *drop_gamma*, drops the 'Gamma' column (useful when using 1 gamma) ''' raise DeprecationWarning('Will be deprecated or rewritten to use'\ 'lavaburst: github.com/nezar-compbio/lavaburst') if n_jobs is 'auto': #Empirical values on my computer; with >8 Gb memory try increasing n_jobs if segmentation == 'potts': n_jobs = 3 elif segmentation == 'armatus': n_jobs = 6 if ~np.isfinite(data).any(): print 'Non-finite values in data, substituting them with zeroes' data[~np.isfinite(data)] = 0 Wcomm, Wnull, pass_mask, length = _precalculate_TADs_in_array(data) f = _calculate_TADs if n_jobs >= 1: from joblib import Parallel, delayed domains = Parallel(n_jobs=n_jobs, max_nbytes=1e6)( delayed(f)(Wcomm, Wnull, pass_mask, length, g, segmentation) for g in gammalist) elif n_jobs is None or n_jobs == False or n_jobs == 0: domains = [] for g in gammalist: domains_g = f(Wcomm, Wnull, pass_mask, length, g, segmentation) domains.append(domains_g) domains = pd.concat(domains, ignore_index=True) domains = domains.query('End-Start>='+str(minlen)).copy() domains = domains.sort(columns=['Gamma', 'Start', 'End']) domains.reset_index(drop=True, inplace=True) domains[['Start', 'End']] = domains[['Start', 'End']].astype(int) domains[['Start', 'End']] *= self.resolution domains = domains[['Start', 'End', 'Score', 'Gamma']] if drop_gamma: domains.drop('Gamma', axis=1, inplace=True) domains = self.genome_intervals_to_chr(domains).reset_index(drop=True) return domains
    out_scores[key] = scores
    out_scores['repeat_idx'] = repeat_idx
    out_predictions['repeat_idx'] = repeat_idx
    return out_scores, out_predictions


DEBUG = False
if DEBUG:
    N_JOBS = 1
    stacked_keys = {'MEG all': meg_powers + meg_cross_powers + meg_handcrafted}

drop_na_scenario = (False, 'local', 'global')
for drop_na in drop_na_scenario[:1 if DEBUG else len(drop_na_scenario)]:
    out = Parallel(n_jobs=N_JOBS)(delayed(run_stacked)(
        data.query(f"repeat == {ii}"), stacked_keys, ii, drop_na)
        for ii in range(N_REPEATS))
    out = zip(*out)

    out_scores_meg = next(out)
    out_scores_meg = pd.concat(out_scores_meg, axis=0)
    out_scores_meg.to_csv(
        SCORES.format('meg' + drop_na if drop_na else '_na_coded'), index=True)

    out_predictions_meg = next(out)
    out_predictions_meg = pd.concat(out_predictions_meg, axis=0)
    out_predictions_meg.to_csv(
        OUT_PREDICTIONS.format('meg' + drop_na if drop_na else '_na_coded'), index=True)
def optimze_func(start_pt):
    result_x, result_f, output = scipy.optimize.fmin_l_bfgs_b(
        func=negative_ei_func, x0=start_pt, fprime=None, args=(),
        approx_grad=True, bounds=obj_func_min._search_domain, m=10,
        factr=10.0, pgtol=1e-10, epsilon=1e-08, iprint=-1,
        maxfun=15000, maxiter=15000, disp=0, callback=None)
    print output
    return result_x, result_f

with Parallel(n_jobs=50) as parallel:
    parallel_results = parallel(
        delayed(optimze_func)(pt) for pt in start_points)

min_negative_ei = numpy.inf
for i in range(len(parallel_results)):
    if min_negative_ei > parallel_results[i][1]:
        min_negative_ei = parallel_results[i][1]
        best_pt = parallel_results[i][0]
parser.add_argument('--batch_size', type=int, default=1000)
args = parser.parse_args()

meta = pd.read_csv(args.meta_file)
object_ids = meta['object_id'].unique()

if not os.path.exists(args.temporary_directory):
    os.mkdir(args.temporary_directory)

object_id_batches = []
object_id_batch_count = int(
    math.ceil(len(object_ids) / float(args.batch_size)))
for batch in range(object_id_batch_count):
    batch_ids = object_ids[batch * args.batch_size:][:args.batch_size]
    object_id_batches.append(batch_ids)

signal_reader = SignalReader(args.signal_file)
fft_features_files = Parallel(n_jobs=args.process_count)(
    delayed(extract_df_features)
    (pickle.dumps(signal_reader.objects_signals(objects_ids)),
     os.path.join(args.temporary_directory, 'batch-{0}.csv'.format(batch)))
    for batch, objects_ids in enumerate(tqdm(object_id_batches)))
signal_reader.close()

assert len(fft_features_files) > 0
features = pd.read_csv(fft_features_files[0])
for filename in tqdm(fft_features_files[1:]):
    features = pd.concat([features, pd.read_csv(filename)], sort=True)
    os.remove(filename)

features.to_csv(args.target_file, index=None)
def compute_metrics(label, pred):
    res = Parallel(n_jobs=16)(delayed(corr)(pred, label, i) for i in range(label.shape[1]))
    return res
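# Hypothetical sketch of the `corr` helper assumed by compute_metrics() above: it is
# not defined in the source. The assumption is that it computes the Pearson
# correlation between one column of the predictions and the matching label column.
def corr(pred, label, i):
    import numpy as np
    return np.corrcoef(pred[:, i], label[:, i])[0, 1]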
def aTimes2CorrsParallel(data, listOfCorr, accuracy=50, taumax="auto", performCoarsening=True, split=10): """ Calculate correlations between several photon streams with arrival times stored in macrotimes, using parallel computing to speed up the process ========== =============================================================== Input Meaning ---------- --------------------------------------------------------------- data Object having fields det0, det1, ..., det24 which contain the macrotimes of the photon arrivals [in a.u.] listOfCorr List of correlations to calculate split Chunk size [s] ========== =============================================================== Output Meaning ---------- --------------------------------------------------------------- G [N x 2] matrix with tau and G values ========== =============================================================== """ if taumax == "auto": taumax = 1 / data.macrotime G = correlations() Ndet = 21 calcAv = False if 'av' in listOfCorr: # calculate the correlations of all channels and calculate average listOfCorr.remove('av') listOfCorr += list(range(Ndet)) calcAv = True for corr in listOfCorr: print("Calculating correlation " + str(corr)) # EXTRACT DATA if type(corr) == int: dataExtr = getattr(data, 'det' + str(corr)) t0 = dataExtr[:, 0] corrname = 'det' + str(corr) elif corr == "sum5" or corr == "sum3": print("Extracting and sorting photons") dataExtr = extractSpadPhotonStreams(data, corr) t0 = dataExtr[:, 0] corrname = corr # CALCULATE CORRELATIONS duration = t0[-1] * data.macrotime Nchunks = int(np.floor(duration / split)) # go over all filters for j in range(np.shape(dataExtr)[1] - 1): print(" Filter " + str(j)) if j == 0: Processed_list = Parallel(n_jobs=multiprocessing.cpu_count() - 1)(delayed(parallelG)(t0, [1], data.macrotime, j, split, accuracy, taumax, performCoarsening, chunk) for chunk in list(range(Nchunks))) else: w0 = dataExtr[:, j+1] Processed_list = Parallel(n_jobs=multiprocessing.cpu_count() - 1)(delayed(parallelG)(t0, w0, data.macrotime, j, split, accuracy, taumax, performCoarsening, chunk) for chunk in list(range(Nchunks))) for chunk in range(Nchunks): setattr(G, corrname + "F" + str(j) + '_chunk' + str(chunk), Processed_list[chunk]) # average over all chunks listOfFields = list(G.__dict__.keys()) listOfFields = [i for i in listOfFields if i.startswith(corrname + "F" + str(j) + "_chunk")] Gav = sum(getattr(G, i) for i in listOfFields) / len(listOfFields) setattr(G, corrname + "F" + str(j) + '_average', Gav) if calcAv: # calculate average correlation of all detector elements for f in range(np.shape(dataExtr)[1] - 1): # start with correlation of detector 20 (last one) Gav = getattr(G, 'det' + str(Ndet-1) + 'F' + str(f) + '_average') # add correlations detector elements 0-19 for det in range(Ndet - 1): Gav += getattr(G, 'det' + str(det) + 'F' + str(f) + '_average') # divide by the number of detector elements to get the average Gav = Gav / Ndet # store average in G setattr(G, 'F' + str(f) + '_average', Gav) return G
px = int((mx - gt[0]) / gt[1]) #x pixel py = int((my - gt[3]) / gt[5]) #y pixel ############### print "[ RASTER BAND COUNT ]: ", src_ds.RasterCount for band in range( src_ds.RasterCount ): band += 1 srcband = src_ds.GetRasterBand(band) structval = srcband.ReadRaster(px,py,1,1,buf_type=srcband.DataType ) bandtype = gdal.GetDataTypeName(srcband.DataType) intval = struct.unpack(fmttypes[bandtype] , structval) val=int(intval[0])*0.1 values.append(var+"_"+str(i)+"-"+str(band)+"_"+str(val)) return values num_cores = multiprocessing.cpu_count() results = Parallel(n_jobs=num_cores)(delayed(processInput)(i) for i in range (yi, yf + 1, 1)) print results # mx,my=float(lon), float(lat) #coord in map units # mx,my=-74.930451, 5.363346 #coord in map units # i=205 # values=[] # for var in varslist: # filetif= dirbase +"//"+var+"//"+var+"_"+str(i)+ ".tif" # if os.path.exists(filetif): # print filetif # src_filename = filetif # src_ds=gdal.Open(src_filename) # gt=src_ds.GetGeoTransform() # px = int((mx - gt[0]) / gt[1]) #x pixel
def mercat_main(): __args__, m_parser = parseargs() kmer = __args__.k num_cores = __args__.n m_inputfile = __args__.i m_inputfolder = __args__.f prune_kmer = __args__.c mflag_fastq = __args__.q mflag_prodigal = __args__.p mflag_trimmomatic = __args__.t mflag_protein = __args__.pro mfile_size_split = __args__.s kmerstring = str(kmer) + "-mers" if not mfile_size_split: mfile_size_split = 100 np_string = "nucleotide" if mflag_protein or mflag_prodigal: np_string = "protein" def_option = not __args__.p and not __args__.q and not __args__.pro all_ipfiles = [] if m_inputfolder: m_inputfolder = os.path.abspath(m_inputfolder) os.chdir(m_inputfolder) #Assume all have same ext for fname in os.listdir(m_inputfolder): mip = os.path.join(m_inputfolder, fname) if not os.path.isdir(mip): # skip directories all_ipfiles.append(mip) else: #m_inputfolder = os.getcwd() m_inputfolder = os.path.dirname(os.path.abspath(m_inputfile)) all_ipfiles.append(os.path.abspath(m_inputfile)) top10_all_samples = dict() for m_inputfile in all_ipfiles: os.chdir(m_inputfolder) check_args(m_inputfile,__args__,def_option,m_parser) m_inputfile = os.path.abspath(m_inputfile) sample_name = os.path.splitext(os.path.basename(m_inputfile))[0] basename_ipfile = os.path.splitext(os.path.basename(m_inputfile))[0] + "_" + np_string inputfile_size = os.stat(m_inputfile).st_size dir_runs = "mercat_results/" + basename_ipfile + "_run" if os.path.exists(dir_runs): shutil.rmtree(dir_runs) os.makedirs(dir_runs) all_chunks_ipfile = [] is_chunked = False if inputfile_size >= (mfile_size_split*1024*1024): #100MB print("Large input file provided: Splitting it into smaller files...\n") mercat_chunker(m_inputfile,dir_runs,str(mfile_size_split)+"M",">") os.chdir(dir_runs) all_chunks_ipfile = glob.glob("*") is_chunked=True else: os.chdir(dir_runs) all_chunks_ipfile.append(m_inputfile) #print all_chunks_ipfile #sys.exit(1) splitSummaryFiles = [] for inputfile in all_chunks_ipfile: bif = os.path.splitext(os.path.basename(inputfile))[0] + "_" + np_string '''trimmomatic SE -phred33 test.fq Out.fastq ILLUMINACLIP:TruSeq2-SE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:30 MINLEN:50''' if mflag_trimmomatic: swq = mflag_trimmomatic trimmed_file = bif+"_trimmed.fq" prod_cmd = "trimmomatic SE -phred33 %s %s ILLUMINACLIP:TruSeq2-SE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:%s MINLEN:50" %(inputfile,trimmed_file,swq) with open(os.devnull, 'w') as FNULL: subprocess.call(prod_cmd, stdout=FNULL, stderr=FNULL, shell=True) inputfile = trimmed_file "Run prodigal if specified" '''prodigal -i test_amino-acid.fa -o output.gff -a output.orf_pro.faa -f gff -p meta -d output.orf_nuc''' if mflag_prodigal: mflag_protein = True gen_protein_file = bif+"_pro.faa" prod_cmd = "prodigal -i %s -o %s -a %s -f gff -p meta -d %s" % ( inputfile, bif + ".gff", gen_protein_file, bif + "_nuc.ffn") if mflag_fastq and mflag_trimmomatic: trimfna = bif + "_trimmed.fna" sequences = OrderedDict() with open(inputfile, 'r') as f: seq = "" sname = "" for line in f: line = line.strip() if line.startswith("@"): sname = line[1:].split()[0] elif line.startswith("+"): if seq: sequences[sname] = seq seq = "" else: if sname not in sequences: seq = line fnastring = "" for sname in sequences: fnastring += ">"+sname+"\n" fnastring += sequences[sname]+"\n" with open(trimfna, 'w') as f: f.write(fnastring) gen_protein_file = bif + "_trimmed_pro.faa" prod_cmd = "prodigal -i %s -o %s -a %s -f gff -p meta -d %s" % ( trimfna, bif + ".gff", gen_protein_file, bif + "_nuc.ffn") print(prod_cmd) with 
open(os.devnull, 'w') as FNULL: subprocess.call(prod_cmd, stdout=FNULL, stderr=FNULL, shell=True) inputfile = gen_protein_file print("Running mercat using " + str(num_cores) + " cores") print("input file: " + inputfile) sequences = OrderedDict() is_fastq = False start_time = timeit.default_timer() with open(inputfile,'r') as f: for line in f: if line.startswith(">"): break elif line.startswith("@"): is_fastq = True break with open(inputfile,'r') as f: if not is_fastq: seq = "" sname = "" for line in f: line = line.strip() if line.startswith(">"): if sname: sequences[sname] = "" if seq: sequences[sname] = seq seq = "" sname = line[1:] sname = sname.split("#",1)[0].strip() else: line = line.replace("*","") seq += line #assert sname and seq sequences[sname] = seq else: #process fastq file seq = "" sname = "" for line in f: line = line.strip() if line.startswith("@"): sname = line[1:].split()[0] elif line.startswith("+"): if seq: sequences[sname] = seq seq = "" else: if sname not in sequences: seq = line #print sequences.keys()[0] + "="+ sequences.values()[0] print("Number of sequences in " + inputfile + " = "+ str(humanize.intword(len(sequences)))) # results = Parallel(n_jobs=num_cores)( # delayed(calculateKmerCount)(seq, sequences[seq], prune_kmer, kmer) for seq in sequences) results = Parallel(n_jobs=num_cores)( delayed(calculateKmerCount)(sequences[seq], kmer) for seq in sequences) kmerlist = dict() #kmerlist_all_seq = dict() for d in results: for k,v in list(d.items()): if k in kmerlist: kmerlist[k] += v else: kmerlist[k] = v # for d in results: # for seq,kdict in list(d[1].items()): # #assert seq not in kmerlist_all_seq # kmerlist_all_seq[seq] = kdict#.copy() print("Time to compute " + kmerstring + ": " + str(round(timeit.default_timer() - start_time,2)) + " secs") significant_kmers = [] for k in kmerlist: if kmerlist[k] >= prune_kmer: significant_kmers.append(k) print("Total number of " + kmerstring + " found: " + str(humanize.intword(len(kmerlist)))) print(kmerstring + " with count >= " + str(prune_kmer) + ": " + str(humanize.intword(len(significant_kmers)))) #df = df.ix[df[bif] >= prune_kmer] if mflag_protein: df = pd.DataFrame(0.0, index=significant_kmers, columns=['Count',"PI","MW","Hydro"]) for k in significant_kmers: df.set_value(k, 'Count', kmerlist[k]) df.set_value(k,"PI", predict_isoelectric_point_ProMoST(k)) df.set_value(k, "MW", calculate_MW(k)) df.set_value(k, "Hydro", calculate_hydro(k)) df.to_csv(bif + "_summary.csv", index_label=kmerstring, index=True) else: df = pd.DataFrame(0, index=significant_kmers, columns=['Count',"GC_Percent","AT_Percent"]) for k in significant_kmers: c_kmer = k df.set_value(k, 'Count', kmerlist[k]) len_cseq = float(len(c_kmer)) df.set_value(k, "GC_Percent", round(((c_kmer.count("G")+c_kmer.count("C")) / len_cseq) * 100.0)) df.set_value(k, "AT_Percent", round(((c_kmer.count("A")+c_kmer.count("T")) / len_cseq) * 100.0)) df.to_csv(bif + "_summary.csv", index_label=kmerstring, index=True) splitSummaryFiles.append(bif + "_summary.csv") # dfcol = significant_kmers # # # if not mflag_protein: # dfcol.extend(["length","GC_Percent","AT_Percent"]) # # df = pd.DataFrame(0,index=list(sequences.keys()),columns=dfcol) # # for seq in sequences: # cseq = sequences[seq] # len_cseq = float(len(cseq)) # df.set_value(seq, "length", int(len_cseq)) # df.set_value(seq, "GC_Percent", round(((cseq.count("G")+cseq.count("C")) / len_cseq) * 100.0)) # df.set_value(seq, "AT_Percent", round(((cseq.count("A")+cseq.count("T")) / len_cseq) * 100.0)) # for ss in 
kmerlist_all_seq[seq]: # df.set_value(seq, ss, kmerlist_all_seq[seq][ss]) # # #df = df.loc[:, df.max() >= prune_kmer] # df1 = df.ix[:,['length','GC_Percent','AT_Percent']] # del df['length'] # del df['GC_Percent'] # del df['AT_Percent'] # df = df.loc[:, df.max() >= prune_kmer] # df.loc[:, 'length'] = df1.ix[:,'length'] # df.loc[:, 'GC_Percent'] = df1.ix[:,'GC_Percent'] # df.loc[:, 'AT_Percent'] = df1.ix[:,'AT_Percent'] # # # else: # # dfcol.extend(["length", "PI", "MW","Hydro"]) # # df = pd.DataFrame(0, index=list(sequences.keys()), columns=dfcol) # # for seq in sequences: # cseq = sequences[seq] # cseq=cseq.replace('*','') # len_cseq = float(len(cseq)) # df.set_value(seq, "length", int(len_cseq)) # df.set_value(seq, "PI", predict_isoelectric_point_ProMoST(cseq)) # df.set_value(seq, "MW", calculate_MW(cseq)) # df.set_value(seq, "Hydro", calculate_hydro(cseq)) # for ss in kmerlist_all_seq[seq]: # df.set_value(seq, ss, kmerlist_all_seq[seq][ss]) # # #df = df.loc[:,df.max() >= prune_kmer] # df1 = df.ix[:,['length','PI','MW','Hydro']] # del df['length'] # del df['PI'] # del df['MW'] # del df['Hydro'] # df = df.loc[:, df.max() >= prune_kmer] # df.loc[:, 'length'] = df1.ix[:,'length'] # df.loc[:, 'PI'] = df1.ix[:,'PI'] # df.loc[:, 'MW'] = df1.ix[:,'MW'] # df.loc[:, 'Hydro'] = df1.ix[:, 'Hydro'] # # df.to_csv(bif+".csv",index_label='Sequence',index=True) print("Total time: " + str(round(timeit.default_timer() - start_time,2)) + " secs") num_chunks = len(all_chunks_ipfile) df = dd.read_csv(splitSummaryFiles) dfgb = df.groupby(kmerstring).sum() df10 = dfgb.nlargest(10,'Count').compute() dfsum = dfgb.sum(0).compute() dfgb.to_csv("./" + basename_ipfile + "_finalSummary*.csv", index_label=kmerstring, name_function=name) if mflag_protein: df10[['PI', 'MW', 'Hydro']] = df10[['PI', 'MW', 'Hydro']] / num_chunks else: df10[['GC_Percent', 'AT_Percent']] = df10[['GC_Percent', 'AT_Percent']] / num_chunks top10_all_samples[sample_name] = [df10,dfsum.Count] all_counts = dfgb.Count.values.compute().astype(int) mercat_compute_alpha_beta_diversity(all_counts,basename_ipfile) if is_chunked: for tempfile in all_chunks_ipfile: os.remove(tempfile) for sf in splitSummaryFiles: os.remove(sf) plots_dir = m_inputfolder+"/mercat_results/plots" if os.path.exists(plots_dir): shutil.rmtree(plots_dir) os.makedirs(plots_dir) os.chdir(plots_dir) for basename_ipfile in top10_all_samples: df10,_ = top10_all_samples[basename_ipfile] if mflag_protein: mercat_scatter_plots(basename_ipfile, 'PI', df10, kmerstring) mercat_scatter_plots(basename_ipfile, 'MW', df10, kmerstring) mercat_scatter_plots(basename_ipfile, 'Hydro', df10, kmerstring) else: mercat_scatter_plots(basename_ipfile, 'GC_Percent', df10, kmerstring) mercat_scatter_plots(basename_ipfile, 'AT_Percent', df10, kmerstring) sbname = os.path.basename(m_inputfolder) if len(all_ipfiles) == 1: sbname = os.path.basename(all_ipfiles[0]) mercat_stackedbar_plots(sbname,top10_all_samples, 'Count', kmerstring)
def anisotropic_smooth(inpd, fiber_distance_threshold, points_per_fiber=30, n_jobs=2, cluster_max = 10): """ Average nearby fibers. The pairwise fiber distance matrix is computed, then fibers are averaged with their neighbors until an edge (>max_fiber_distance) is encountered. """ # polydata to array conversion, fixed-length fiber representation current_fiber_array = fibers.FiberArray() current_fiber_array.points_per_fiber = points_per_fiber current_fiber_array.convert_from_polydata(inpd) original_number_of_fibers = current_fiber_array.number_of_fibers # fiber list data structure initialization for easy fiber averaging curr_count = list() curr_fibers = list() curr_indices = list() for lidx in range(0, current_fiber_array.number_of_fibers): curr_fibers.append(current_fiber_array.get_fiber(lidx)) curr_count.append(1) curr_indices.append(list([lidx])) converged = False iteration_count = 0 while not converged: print "<filter.py> ITERATION:", iteration_count, "SUM FIBER COUNTS:", numpy.sum(numpy.array(curr_count)) print "<filter.py> number indices", len(curr_indices) # fiber data structures for output of this iteration next_fibers = list() next_count = list() next_indices = list() # information for this iteration done = numpy.zeros(current_fiber_array.number_of_fibers) fiber_indices = range(0, current_fiber_array.number_of_fibers) # if the maximum number of fibers have been combined, stop averaging this fiber done[numpy.nonzero(numpy.array(curr_count) >= cluster_max)] = 1 # pairwise distance matrix if USE_PARALLEL: distances = Parallel(n_jobs=n_jobs, verbose=1)( delayed(similarity.fiber_distance)( current_fiber_array.get_fiber(lidx), current_fiber_array, 0, 'Hausdorff') for lidx in fiber_indices) distances = numpy.array(distances) else: distances = \ numpy.zeros( (current_fiber_array.number_of_fibers, current_fiber_array.number_of_fibers)) for lidx in fiber_indices: distances[lidx, :] = \ similarity.fiber_distance( current_fiber_array.get_fiber(lidx), current_fiber_array, 0, 'Hausdorff') # distances to self are not of interest for lidx in fiber_indices: distances[lidx,lidx] = numpy.inf # sort the pairwise distances. 
distances_flat = distances.flatten() pair_order = numpy.argsort(distances_flat) print "<filter.py> DISTANCE MIN:", distances_flat[pair_order[0]], \ "DISTANCE COUNT:", distances.shape # if the smallest distance is greater or equal to the # threshold, we have converged if distances_flat[pair_order[0]] >= fiber_distance_threshold: converged = True print "<filter.py> CONVERGED" break else: print "<filter.py> NOT CONVERGED" # loop variables idx = 0 pair_idx = pair_order[idx] number_of_fibers = distances.shape[0] number_averages = 0 # combine nearest neighbors unless done, until hit threshold while distances_flat[pair_idx] < fiber_distance_threshold: # find the fiber indices corresponding to this pairwise distance # use div and mod f_row = pair_idx / number_of_fibers f_col = pair_idx % number_of_fibers # check if this neighbor pair can be combined combine = (not done[f_row]) and (not done[f_col]) if combine : done[f_row] += 1 done[f_col] += 1 # weighted average of the fibers (depending on how many each one represents) next_fibers.append( (curr_fibers[f_row] * curr_count[f_row] + \ curr_fibers[f_col] *curr_count[f_col]) / \ (curr_count[f_row] + curr_count[f_col])) # this was the regular average #next_fibers.append((curr_fibers[f_row] + curr_fibers[f_col])/2) next_count.append(curr_count[f_row] + curr_count[f_col]) number_averages += 1 #next_indices.append(list([curr_indices[f_row], curr_indices[f_col]])) next_indices.append(list(curr_indices[f_row] + curr_indices[f_col])) # increment for the loop idx += 1 pair_idx = pair_order[idx] # copy through any unvisited (already converged) fibers unvisited = numpy.nonzero(done==0)[0] for fidx in unvisited: next_fibers.append(curr_fibers[fidx]) next_count.append(curr_count[fidx]) next_indices.append(curr_indices[fidx]) # set up for next iteration curr_fibers = next_fibers curr_count = next_count curr_indices = next_indices iteration_count += 1 # set up array for next iteration distance computation current_fiber_array = fibers.FiberArray() current_fiber_array.number_of_fibers = len(curr_fibers) current_fiber_array.points_per_fiber = points_per_fiber dims = [current_fiber_array.number_of_fibers, current_fiber_array.points_per_fiber] # fiber data current_fiber_array.fiber_array_r = numpy.zeros(dims) current_fiber_array.fiber_array_a = numpy.zeros(dims) current_fiber_array.fiber_array_s = numpy.zeros(dims) curr_fidx = 0 for curr_fib in curr_fibers: current_fiber_array.fiber_array_r[curr_fidx] = curr_fib.r current_fiber_array.fiber_array_a[curr_fidx] = curr_fib.a current_fiber_array.fiber_array_s[curr_fidx] = curr_fib.s curr_fidx += 1 print "<filter.py> SUM FIBER COUNTS:", numpy.sum(numpy.array(curr_count)), "SUM DONE FIBERS:", numpy.sum(done) print "<filter.py> MAX COUNT:" , numpy.max(numpy.array(curr_count)), "AVGS THIS ITER:", number_averages # when converged, convert output to polydata outpd = current_fiber_array.convert_to_polydata() # color output by the number of fibers that each output fiber corresponds to outcolors = vtk.vtkFloatArray() outcolors.SetName('FiberTotal') for count in curr_count: outcolors.InsertNextTuple1(count) outpd.GetCellData().SetScalars(outcolors) # also color the input pd by output cluster number cluster_numbers = numpy.zeros(original_number_of_fibers) cluster_count = numpy.zeros(original_number_of_fibers) cluster_idx = 0 for index_list in curr_indices: indices = numpy.array(index_list).astype(int) cluster_numbers[indices] = cluster_idx cluster_count[indices] = curr_count[cluster_idx] cluster_idx += 1 outclusters = 
vtk.vtkFloatArray() outclusters.SetName('ClusterNumber') for cluster in cluster_numbers: outclusters.InsertNextTuple1(cluster) inpd.GetCellData().AddArray(outclusters) inpd.GetCellData().SetActiveScalars('ClusterNumber') return outpd, numpy.array(curr_count), inpd, cluster_numbers, cluster_count
def strat_minpath(sample_id, strat_input, minpath_map, out_dir, pathway_db, gap_fill=True, per_sequence_contrib=False, print_opt=False, proc=1): '''Read in sample_id, gene family table, and out_dir, and run MinPath based on the gene family abundances. Returns both unstratified and stratified pathway abundances as dictionaries in a list. Will compute the simplistic "community-wide contributions" for stratitifed output unless per_sequence_contrib=True, which in contrast will cause MinPath to be run for each sequence. Also returns the coverage of each unstratified pathway as the a different dictionary in a list when per_sequence_contrib=True.''' # Get gene family abundances summed over all sequences for this sample. unstrat_input = strat_to_unstrat_counts(strat_input) pathways_present, reaction_abun = minpath_wrapper(sample_id, unstrat_input, minpath_map, out_dir, print_opt) # Initialize series and dataframe that will contain pathway abundances and # coverage scores. unstrat_abun = pd.Series() unstrat_cov = pd.Series() strat_abun = pd.Series() strat_cov = pd.Series() # Return empty series if no pathways are present. if len(pathways_present) == 0: return ([unstrat_abun, unstrat_cov, pd.Series(), pd.Series()]) # Get median reaction/gene family abundance for sample, which is used for # calculating coverage. median_abun = calc_median_reaction_abun(reaction_abun, pathways_present, pathway_db) # Loop through all pathways present and get abundance and coverage. for pathway in pathways_present: # Get ALL reactions in pathway (which could include optional ones). reactions = pathway_db.find_reactions(pathway) # Get abundances of all of these reactions. path_reaction_abun = { reaction_id: reaction_abun[reaction_id] for reaction_id in reactions } # Get pathway abundance and coverage pathway_abun, pathway_cov = pathway_abun_and_coverage( pathway, pathway_db, path_reaction_abun, median_abun) if pathway_abun == 0: continue # Add these values to each respective pandas Series. unstrat_abun[pathway] = pathway_abun unstrat_cov[pathway] = pathway_cov if not per_sequence_contrib: # If --per_sequence_contrib not set then get stratified pathway # abundances simply by weighting community-wide pathway abundances # by the abundances of all the predicted abundances of reactions in # these pathways contributed by each sequence (i.e. predicted # genome) strat_path_abun = path_abun_weighted_by_seq( strat_input, reactions, sum(list(path_reaction_abun.values())), unstrat_abun[pathway], pathway) strat_abun = pd.concat([strat_abun, strat_path_abun]) if per_sequence_contrib: # Loop over all sequences and get pathway abundances and coverages # for each sequence individually. This step will be run in parallel if # possible. strat_seq_out = Parallel(n_jobs=proc)(delayed(unstrat_minpath_for_seq)( seq, sample_id, strat_input[strat_input['sequence'] == seq].copy(), minpath_map, out_dir, pathway_db, gap_fill, print_opt, "_" + seq) for seq in set(strat_input['sequence'])) # Parse out the per-seq abundance and coverage outputs into different # lists. seq_strat_abun = [] seq_strat_cov = [] for seq_out in strat_seq_out: seq_strat_abun.append(seq_out[0]) seq_strat_cov.append(seq_out[1]) # Concatenate these per-sequence values to the stratified series. strat_abun = pd.concat(seq_strat_abun) strat_cov = pd.concat(seq_strat_cov) # Return unstratified and stratified abundances and coverage scores. return ([unstrat_abun, unstrat_cov, strat_abun, strat_cov])
test_acc_list = []
for i in range(iter):
    X_train = np.load('final_train_binarydata_' + str(i) + '.npy')
    Y_train = np.load('final_train_labels_' + str(i) + '.npy')
    X_train = X_train.astype('float')
    X_train = normalize(X_train)
    Y_train = Y_train.astype('float')
    Y_train = Y_train.astype(int)

    randomCombinations = random.sample(list(ParameterGrid(grid)), numSamples)

    print("parallel loop started")
    r = Parallel(n_jobs=-2, verbose=10)(
        delayed(Stratified_kfold)(X_train, Y_train, combination)
        for combination in randomCombinations)
    combination, score, train_acc, test_acc = zip(*r)

    combination_list = list(combination)
    score_list = list(score)
    trainacclist = list(train_acc)
    testacclist = list(test_acc)

    req_idx = score_list.index(max(score_list))
    train_acc_list.append(trainacclist[req_idx])
    test_acc_list.append(testacclist[req_idx])
    bestparamdict[str(i)] = combination_list[req_idx]

print('Train acc = ' + str(sum(train_acc_list) / iter))
def gen_data(lst_type, img_lst_patients, img_lst_candidates, gen_candidates_json, resample_lungs_json,
             n_candidates, crop_raw_scan_buffer, new_data_type, new_candidates_shape_zyx, new_spacing_zyx):
    n_threads = pipe.n_CPUs
    n_junks = int(np.ceil(len(img_lst_patients) / n_threads))
    pipe.log.info('processing ' + str(n_junks) + ' junks with ' + str(n_threads) + ' patients each')
    HU_tissue_range = pipe.load_json('params.json', 'resample_lungs')['HU_tissue_range']
    n_candidates_gen = pipe.load_json('params.json', 'gen_candidates')['n_candidates']
    cand_line_num = 0
    for junk_cnt in range(n_junks):
        junk = []
        for in_junk_cnt in range(n_threads):
            line_num = n_threads * junk_cnt + in_junk_cnt
            if line_num >= len(img_lst_patients):
                break
            junk.append(line_num)
        pipe.log.info('processing junk ' + str(junk_cnt))
        # heterogeneous spacing -> homogeneous spacing
        junk_lst = Parallel(n_jobs=min([n_threads, len(junk)]))(
            delayed(gen_patients_candidates)(
                line_num, img_lst_patients, gen_candidates_json, resample_lungs_json,
                n_candidates, crop_raw_scan_buffer, new_data_type,
                new_candidates_shape_zyx, new_spacing_zyx, HU_tissue_range)
            for line_num in junk)
        for junk_result in junk_lst:
            patient, patient_label, images, prob_maps = junk_result
            # take n_candidates or fewer
            images = np.array(images, dtype=np.int16)[:n_candidates]
            prob_maps = np.array(prob_maps, dtype=np.uint8)[:n_candidates]
            if new_data_type == 'uint8':
                images = (images / (float(HU_tissue_range[1] - HU_tissue_range[0])) * 255).astype(np.uint8)  # [0, 255]
            elif new_data_type == 'float32':
                images = (images / (float(HU_tissue_range[1] - HU_tissue_range[0])) - 0.25).astype(np.float32)  # [-0.25, 0.75]
                prob_maps = (prob_maps / 255).astype(np.float32)  # [0.0, 1.0]
            images_and_prob_maps = np.concatenate([images, prob_maps], axis=4).astype(new_data_type)
            path = pipe.save_array(patient + '.npy', images_and_prob_maps)
            with open(pipe.get_step_dir() + lst_type + '_patients.lst', 'a') as f:
                f.write('{}\t{}\t{}\n'.format(patient, patient_label, os.path.abspath(path)))
            if pipe.dataset_name == 'LUNA16':
                with open(pipe.get_step_dir() + lst_type + '_candidates.lst', 'a') as f:
                    for cnt in range(images.shape[0]):
                        cand = patient + '_' + str(cnt)
                        cand_label = img_lst_candidates[img_lst_candidates[0] == cand][1].values.tolist()
                        if len(cand_label) == 0:
                            cand_label = 0
                        else:
                            cand_label = cand_label[0]
                        if not cand.startswith(patient):
                            raise ValueError(cand + ' needs to start with ' + patient)
                        f.write('{}\t{}\t{}\n'.format(cand, cand_label, os.path.abspath(path)))
def main():
    if len(sys.argv) >= 3:
        # read in the top 100 ranks from txt file
        rank100file = sys.argv[1]
        saveDir = sys.argv[2]
        # baseDir = sys.argv[5]
        # if len(sys.argv) < 6:
        #     baseDir = None
        if len(sys.argv) < 5:
            jobName = ""
        else:
            jobName = sys.argv[4]
        if len(sys.argv) < 4:
            numRanks = 100
        else:
            numRanks = int(sys.argv[3])
        if jobName == "" or jobName == " ":
            jobName = "default"
        # Get the arguments that tell me what job number I am, and how many total jobs there are
        totalTasks = 1
        if len(sys.argv) > 5:
            totalTasks = int(sys.argv[5])
        taskNumber = 0
        if len(sys.argv) > 6:
            taskNumber = int(sys.argv[6])
        if taskNumber >= totalTasks:
            taskNumber = totalTasks - 1
        numCores = 1
        if len(sys.argv) > 7:
            numCores = int(sys.argv[7])
        dumpDir = "./featureDump"
        if len(sys.argv) > 8:
            dumpDir = sys.argv[8]
        SingleFileName = None
        if len(sys.argv) > 9:
            SingleFileName = sys.argv[9]
        saveDir_mapMask = os.path.join(saveDir, jobName + "_MapMasks")
        saveDir_mask = os.path.join(saveDir, jobName + "_Masks")
        saveDir_other = os.path.join(saveDir, jobName + "_otherData")
        # create the output folders, ignoring the error if they already exist
        try:
            os.makedirs(saveDir_mapMask)
        except OSError:
            pass
        try:
            os.makedirs(saveDir_mask)
        except OSError:
            pass
        try:
            os.makedirs(saveDir_other)
        except OSError:
            pass
        resultsDictionary = {}
        # Load all rank files (each row contains a probe and its top-N rank results)
        with open(rank100file) as f:
            content = f.readlines()
        content = [x.strip() for x in content]
        # determine the row indexes of which probe/top N result pairs I want to calculate
        myPartitionSize = int(len(content) / totalTasks)
        myPartitionRangeStart = taskNumber * myPartitionSize
        myPartitionRangeEnd = myPartitionRangeStart + myPartitionSize
        for c in content:
            carr = c.split(",")
            resultsDictionary[carr[0]] = carr[1:]
        sortedKeys = resultsDictionary.keys()
        # Sort keys so we know each partition is always working with the same ordered set to partition from
        sortedKeys = sorted(sortedKeys)
        # sortedKeys = ["/afs/crc.nd.edu/group/cvrl/scratch_18/medifor/evaluation/NC2017_evaluation_all/world200_9/f7894ef3d96c0767ba23783d66b1e298.jpg"]
        if numCores > 1:
            Parallel(n_jobs=numCores)(
                delayed(outer_generateMaskForProbe)(p, resultsDictionary, numRanks,
                                                    saveDir_mask, saveDir_mapMask,
                                                    saveDir_other)
                for p in sortedKeys[myPartitionRangeStart:myPartitionRangeEnd])
        else:
            for p in sortedKeys[myPartitionRangeStart:myPartitionRangeEnd]:
                # if os.path.basename(p) == "bbfb07e272b66a6be65ca87e20908e53.jpg":
                if os.path.basename(p) == "170303979309eebf5a92c492a84997f6.jpg":
                    outer_generateMaskForProbe(p, resultsDictionary, numRanks,
                                               saveDir_mask, saveDir_mapMask,
                                               saveDir_other)
        # for p in sortedKeys:
        #     # if os.path.basename(p) == "8b3c9021c7e6dda308cfe7c594dc79e4.jpg":#"c59a64fb6a8f26cdbc15f3408c43ed26.jpg" or True:#"173e754519ea142944dab8c686efa7b3.jpg":
        #     results = resultsDictionary[p]
        #     finalMapMask, finalMask = genMasksForProbe(p, results, numRanks)
        #     savePath_mask = os.path.join(saveDir_mask, os.path.basename(p))
        #     savePath_mapmask = os.path.join(saveDir_mapMask, os.path.basename(p))
        #     cv.imwrite(savePath_mapmask, finalMapMask)
        #     cv.imwrite(savePath_mask, finalMask)
    else:
        print("usage: BuildMasks.py <rankFile> <save Dir> <Number of Ranks=100> "
              "<jobname=default> <Total Number of Jobs=1> <Current Job Number=0> "
              "<number of cores=1> <dataDump Directory=./featureDump> "
              "<Single File Name=None>")
def indep_pairwise(X, window_size, step_size, threshold, verbose=True):
    r"""Determine pair-wise independent variants.

    Independent variants are defined via squared Pearson correlations between
    pairs of variants inside a sliding window.

    Parameters
    ----------
    X : array_like
        Sample by variants matrix.
    window_size : int
        Number of variants inside each window.
    step_size : int
        Number of variants the sliding window skips.
    threshold : float
        Squared Pearson correlation threshold for independence.
    verbose : bool
        `True` for progress information; `False` otherwise.

    Returns
    -------
    ok : boolean array defining independent variants

    Examples
    --------
    .. doctest::

        >>> from numpy.random import RandomState
        >>> from limix.qc import indep_pairwise
        >>>
        >>> random = RandomState(0)
        >>> X = random.randn(10, 20)
        >>>
        >>> indep_pairwise(X, 4, 2, 0.5, verbose=False)
        array([ True,  True, False,  True,  True,  True,  True,  True,  True,
                True,  True,  True,  True,  True,  True,  True,  True,  True,
                True,  True])
    """
    from joblib import Parallel, delayed
    from tqdm import tqdm

    from ..threads import get_max_nthreads
    from numpy import ascontiguousarray, logical_not, zeros

    left = 0
    excls = zeros(X.shape[1], dtype=bool)

    if step_size > window_size:
        raise ValueError("Step size has to be smaller than or equal to window size.")

    n = (X.shape[1] + step_size) // step_size

    steps = list(range(n))
    cc = get_max_nthreads()

    with tqdm(total=n, desc="Indep. pairwise", disable=not verbose) as pbar:

        while len(steps) > 0:
            i = 0
            right = 0
            delayeds = []
            while i < len(steps):

                step = steps[i]
                left = step * step_size
                if left < right:
                    i += 1
                    continue

                del steps[i]
                right = min(left + window_size, X.shape[1])
                x = ascontiguousarray(X[:, left:right].T)

                delayeds.append(delayed(_func)(x, excls[left:right], threshold))

                if len(delayeds) == cc:
                    Parallel(n_jobs=min(len(delayeds), cc), backend="threading")(
                        delayeds
                    )
                    pbar.update(len(delayeds))
                    delayeds = []

            if len(delayeds) == 0:
                continue

            Parallel(n_jobs=min(len(delayeds), cc), backend="threading")(delayeds)
            pbar.update(len(delayeds))

    return logical_not(excls)
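# Hedged sketch of the dispatch style used above: delayed() calls can be
# accumulated into a plain list and handed to Parallel in batches, here with
# the threading backend so the workers can write into a shared numpy buffer.
# The fill() helper is an illustrative stand-in for _func.
import numpy as np
from joblib import Parallel, delayed

def fill(out, value):
    out[:] = value  # writes are visible to the caller because threads share memory

buf = np.zeros(6)
batch = [delayed(fill)(buf[i:i + 2], i) for i in range(0, 6, 2)]
Parallel(n_jobs=2, backend="threading")(batch)
print(buf)  # [0. 0. 2. 2. 4. 4.]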
def run_minpath_pipeline(inputfile, mapfile, out_dir, proc=1,
                         regroup_mapfile=None, gap_fill=True,
                         per_sequence_contrib=False, print_cmds=False):
    '''Full pipeline for reading input files, making calls to functions to run
    MinPath, and calculating pathway abundances and coverages. Returns four
    output Pandas dataframes: (1) unstratified pathway abundances,
    (2) unstratified pathway coverages, (3) stratified pathway abundances, and
    (4) stratified pathway coverages (None unless per_sequence_contrib=True).'''

    # Read in table of gene family abundances and determine if in stratified
    # format or not.
    in_metagenome, strat_format = read_metagenome_input(inputfile)

    # Remove 'description' column if it exists.
    if "description" in in_metagenome.columns:
        in_metagenome.drop("description", axis=1, inplace=True)

    # Get list of sample ids.
    samples = [col for col in in_metagenome.columns
               if col not in ["function", "sequence"]]

    # Initialize reactions to be empty unless regroup mapfile given.
    reactions = []

    # Regroup functions in input table to different ids if regroup mapfile is
    # provided.
    if regroup_mapfile:
        reactions = read_reaction_names(regroup_mapfile)

        in_metagenome = regroup_func_ids(in_metagenome, strat_format,
                                         regroup_mapfile, proc)
        regrouped_outfile = path.join(out_dir, "regrouped_infile.tsv")
        in_metagenome.to_csv(path_or_buf=regrouped_outfile, sep="\t",
                             index=False)

    # Read in pathway structures.
    pathways_in = PathwaysDatabase(database=mapfile, reaction_names=reactions)

    # Write out mapfile with all structure removed.
    minpath_mapfile = path.join(out_dir, "parsed_mapfile.tsv")
    with open(minpath_mapfile, "w") as out_map:
        out_map.write(pathways_in.get_database())

    # Subset input table of reactions to only those found in pathway database.
    in_metagenome = in_metagenome[in_metagenome.function.isin(
        pathways_in.reaction_list())]

    # Run minpath wrapper on all samples if table is stratified. Note that
    # input stratified table is subsetted to required columns only.
    if strat_format:

        if per_sequence_contrib:
            # If running MinPath on each sequence individually then that will
            # be the step that is parallelized (so each sample will be looped
            # over one-by-one instead).
            path_abun_raw = []
            for sample_id in samples:
                path_abun_raw.append(strat_minpath(
                    sample_id,
                    in_metagenome[["function", "sequence", sample_id]],
                    minpath_mapfile, out_dir, pathways_in, gap_fill,
                    per_sequence_contrib, print_cmds, proc))

        else:
            # Parallelize this step if not going to run MinPath for each
            # sequence individually.
            path_abun_raw = Parallel(n_jobs=proc)(delayed(strat_minpath)(
                sample_id,
                in_metagenome[["function", "sequence", sample_id]],
                minpath_mapfile, out_dir, pathways_in, gap_fill,
                per_sequence_contrib, print_cmds, 1)
                for sample_id in samples)

        # Split the output into unstratified and stratified.
        path_raw_abun_unstrat = []
        path_raw_cov_unstrat = []
        path_raw_abun_strat = []
        path_raw_cov_strat = []

        for sample_output in path_abun_raw:
            path_raw_abun_unstrat += [sample_output[0]]
            path_raw_cov_unstrat += [sample_output[1]]
            path_raw_abun_strat += [sample_output[2]]
            path_raw_cov_strat += [sample_output[3]]

        # Prep output dfs.
        path_abun_unstrat = prep_pathway_df_out(path_raw_abun_unstrat)
        path_cov_unstrat = prep_pathway_df_out(path_raw_cov_unstrat,
                                               num_digits=10)
        path_abun_strat = prep_pathway_df_out(path_raw_abun_strat,
                                              strat_index=True)

        # Also parse stratified coverage table if --per_sequence_contrib set.
path_cov_strat = None if per_sequence_contrib: path_cov_strat = prep_pathway_df_out(path_raw_cov_strat, strat_index=True, num_digits=10) path_cov_strat.columns = ["pathway", "sequence"] + samples # Set column labels of unstratified dataframe to be sample names. path_abun_unstrat.columns = samples path_cov_unstrat.columns = samples path_abun_strat.columns = ["pathway", "sequence"] + samples return (path_abun_unstrat, path_cov_unstrat, path_abun_strat, path_cov_strat) # Otherwise the data is in unstratified format, which is more straight- # forward to process. else: path_raw_unstrat = Parallel(n_jobs=proc)(delayed(unstrat_minpath)( sample_id, in_metagenome[["function", sample_id]], minpath_mapfile, out_dir, pathways_in, gap_fill, print_cmds) for sample_id in samples) # Prep output df. path_raw_abun_unstrat = [] path_raw_cov_unstrat = [] for sample_output in path_raw_unstrat: path_raw_abun_unstrat += [sample_output[0]] path_raw_cov_unstrat += [sample_output[1]] path_abun_unstrat = prep_pathway_df_out(path_raw_abun_unstrat) path_cov_unstrat = prep_pathway_df_out(path_raw_cov_unstrat, num_digits=10) # Set column labels of unstratified dataframe to be sample names. path_abun_unstrat.columns = samples path_cov_unstrat.columns = samples return (path_abun_unstrat, path_cov_unstrat, None, None)
def cal_factors(self, start, end, n_jobs):
    net_profit_Q = self.influx.getDataMultiprocess(
        'FinancialReport_Gus', 'net_profit_Q', start, end,
        ['code', 'net_profit_Q', 'report_period'])
    net_profit_TTM = self.influx.getDataMultiprocess(
        'FinancialReport_Gus', 'net_profit_TTM', start, end,
        ['code', 'net_profit_TTM', 'report_period'])
    net_profit_ddt_TTM = self.influx.getDataMultiprocess(
        'FinancialReport_Gus', 'net_profit_ddt_TTM', start, end,
        ['code', 'net_profit_ddt_TTM', 'report_period'])
    market_cap = self.influx.getDataMultiprocess('DailyFactors_Gus', 'Size',
                                                 start, end,
                                                 ['code', 'market_cap'])
    net_profit_Q.index.names = ['date']
    net_profit_Q.reset_index(inplace=True)
    net_profit_TTM.index.names = ['date']
    net_profit_TTM.reset_index(inplace=True)
    net_profit_ddt_TTM.index.names = ['date']
    net_profit_ddt_TTM.reset_index(inplace=True)
    market_cap.index.names = ['date']
    market_cap.reset_index(inplace=True)
    # collect save failures across all three factor runs
    fail_list = []
    # ----------------------------------------------------------
    EP_Q = pd.merge(net_profit_Q, market_cap, on=['date', 'code'])
    EP_Q.set_index('date', inplace=True)
    EP_Q['EP_Q'] = EP_Q['net_profit_Q'] / EP_Q['market_cap'] / 10000
    EP_Q = EP_Q.loc[:, ['code', 'EP_Q', 'report_period']]
    EP_Q = EP_Q.dropna(subset=['EP_Q'])
    codes = EP_Q['code'].unique()
    split_codes = np.array_split(codes, n_jobs)
    with parallel_backend('multiprocessing', n_jobs=n_jobs):
        res = Parallel()(delayed(influxdbData.JOB_saveData)(
            EP_Q, 'code', codes, self.db, 'EP_Q') for codes in split_codes)
    print('EP_Q finish')
    print('-' * 30)
    for r in res:
        fail_list.extend(r)
    # ----------------------------------------------------------
    # market_cap is denominated in units of 10,000 (CNY)
    EP = pd.merge(net_profit_TTM, market_cap, on=['date', 'code'])
    EP.set_index('date', inplace=True)
    EP['EP_TTM'] = EP['net_profit_TTM'] / EP['market_cap'] / 10000
    EP = EP.loc[:, ['code', 'EP_TTM', 'report_period']]
    EP = EP.dropna(subset=['EP_TTM'])
    codes = EP['code'].unique()
    split_codes = np.array_split(codes, n_jobs)
    with parallel_backend('multiprocessing', n_jobs=n_jobs):
        res = Parallel()(delayed(influxdbData.JOB_saveData)(
            EP, 'code', codes, self.db, 'EP') for codes in split_codes)
    print('EP_TTM finish')
    print('-' * 30)
    for r in res:
        fail_list.extend(r)
    # ----------------------------------------------------------
    EPcut = pd.merge(net_profit_ddt_TTM, market_cap, on=['date', 'code'])
    EPcut.set_index('date', inplace=True)
    EPcut['EPcut_TTM'] = EPcut['net_profit_ddt_TTM'] / EPcut['market_cap'] / 10000
    EPcut = EPcut.loc[:, ['code', 'EPcut_TTM', 'report_period']]
    EPcut = EPcut.dropna(subset=['EPcut_TTM'])
    codes = EPcut['code'].unique()
    split_codes = np.array_split(codes, n_jobs)
    with parallel_backend('multiprocessing', n_jobs=n_jobs):
        res = Parallel()(delayed(influxdbData.JOB_saveData)(
            EPcut, 'code', codes, self.db, 'EPcut') for codes in split_codes)
    print('EPcut_TTM finish')
    print('-' * 30)
    for r in res:
        fail_list.extend(r)
    return fail_list
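# Hedged sketch of the sharding pattern above: split the list of codes into
# n_jobs chunks with np.array_split and save each chunk inside a
# parallel_backend context, so the bare Parallel() call picks up the backend
# and job count from the context. save_chunk() and the example codes are
# illustrative stand-ins for influxdbData.JOB_saveData and real tickers.
import numpy as np
from joblib import Parallel, delayed, parallel_backend

def save_chunk(chunk):
    # Pretend to persist a chunk; return the codes that "failed" (none here).
    return []

codes = np.array(['000001.SZ', '000002.SZ', '600000.SH', '600519.SH'])
split_codes = np.array_split(codes, 2)
with parallel_backend('multiprocessing', n_jobs=2):
    res = Parallel()(delayed(save_chunk)(chunk) for chunk in split_codes)
fail_list = [c for r in res for c in r]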
def nuscenes_gt_to_kitti(
    self,
    lyft_dataroot: str,
    table_folder: str,
    lidar_name: str = "LIDAR_TOP",
    get_all_detections: bool = False,
    parallel_n_jobs: int = 4,
    samples_count: Optional[int] = None,
) -> None:
    """Converts nuScenes GT formatted annotations to KITTI format.

    Args:
        lyft_dataroot: folder with tables (json files).
        table_folder: folder with tables (json files).
        lidar_name: Name of the lidar sensor. Only one lidar allowed at this moment.
        get_all_detections: If True, will write all bboxes in PointCloud and use only FrontCamera.
        parallel_n_jobs: Number of threads used for parallel processing.
        samples_count: Number of samples to convert.
    """
    self.lyft_dataroot = lyft_dataroot
    self.table_folder = table_folder
    self.lidar_name = lidar_name
    self.get_all_detections = get_all_detections
    self.samples_count = samples_count
    self.parallel_n_jobs = parallel_n_jobs

    # Select subset of the data to look at.
    self.lyft_ds = LyftDataset(self.lyft_dataroot, self.table_folder)

    self.kitti_to_nu_lidar = Quaternion(axis=(0, 0, 1), angle=np.pi)
    self.kitti_to_nu_lidar_inv = self.kitti_to_nu_lidar.inverse

    # Get assignment of scenes to splits.
    split_logs = [
        self.lyft_ds.get("log", scene["log_token"])["logfile"]
        for scene in self.lyft_ds.scene
    ]
    if self.get_all_detections:
        self.cams_to_see = ["CAM_FRONT"]
    else:
        self.cams_to_see = [
            "CAM_FRONT",
            "CAM_FRONT_LEFT",
            "CAM_FRONT_RIGHT",
            "CAM_BACK",
            "CAM_BACK_LEFT",
            "CAM_BACK_RIGHT",
        ]

    # Create output folders.
    self.label_folder = self.store_dir.joinpath("label_2")
    self.calib_folder = self.store_dir.joinpath("calib")
    self.image_folder = self.store_dir.joinpath("image_2")
    self.lidar_folder = self.store_dir.joinpath("velodyne")
    for folder in [
            self.label_folder, self.calib_folder, self.image_folder,
            self.lidar_folder
    ]:
        if not folder.is_dir():
            folder.mkdir(parents=True)

    # Use only the samples from the current split.
    sample_tokens = self._split_to_samples(split_logs)
    if self.samples_count is not None:
        sample_tokens = sample_tokens[:self.samples_count]

    with parallel_backend("threading", n_jobs=self.parallel_n_jobs):
        Parallel()(delayed(self.process_token_to_kitti)(sample_token)
                   for sample_token in tqdm(sample_tokens))
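# Hedged sketch of the call style above: parallel_backend("threading", n_jobs=N)
# sets the backend and worker count for the bare Parallel() call inside the
# context, and wrapping the token list in tqdm shows progress as items are
# dispatched. convert() and the token strings are illustrative only.
from joblib import Parallel, delayed, parallel_backend
from tqdm import tqdm

def convert(token):
    return token.upper()

sample_tokens = ["tok_a", "tok_b", "tok_c"]
with parallel_backend("threading", n_jobs=2):
    converted = Parallel()(delayed(convert)(t) for t in tqdm(sample_tokens))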
def run_stacked(data, stacked_keys, repeat_idx, drop_na):
    out_scores = pd.DataFrame()
    out_predictions = data.copy()
    for key, sel in stacked_keys.items():
        this_data = data[sel]
        if drop_na == 'local':
            mask = this_data.dropna().index
        elif drop_na == 'global':
            mask = data.dropna().index
        else:
            mask = this_data.index
        X = this_data.loc[mask].values
        y = data['age'].loc[mask].values
        fold_idx = data.loc[mask]['fold_idx'].values
        if drop_na is False:
            # encode missing values so the trees can learn from them.
            X_left = X.copy()
            X_left[this_data.isna().values] = -1000
            X_right = X.copy()
            X_right[this_data.isna().values] = 1000
            assert np.sum(np.isnan(X_left)) == 0
            assert np.sum(np.isnan(X_right)) == 0
            assert np.min(X_left) == -1000
            assert np.max(X_right) == 1000
            X = np.concatenate([X_left, X_right], axis=1)

        for column in sel:
            score = get_mae(data.loc[mask], column)
            if column not in out_scores:
                out_scores[column] = score
            elif out_scores[column].mean() < np.mean(score):
                out_scores[column] = score

        unstacked = out_scores[sel].values
        idx = unstacked.mean(axis=0).argmin()
        unstacked_mean = unstacked[:, idx].mean()
        unstacked_std = unstacked[:, idx].std()
        print(f'{key} | best unstacked MAE: {unstacked_mean} '
              f'(+/- {unstacked_std})')
        print('n =', len(X))

        param_grid = {'max_depth': [4, 6, 8, None]}
        if X.shape[1] > 10:
            param_grid['max_features'] = ['log2', 'sqrt', None]

        reg = GridSearchCV(RandomForestRegressor(n_estimators=1000,
                                                 random_state=42),
                           param_grid=param_grid,
                           scoring='neg_mean_absolute_error',
                           iid=False,
                           cv=5)
        if DEBUG:
            reg = RandomForestRegressor(n_estimators=1000,
                                        max_features='log2',
                                        max_depth=6,
                                        random_state=42)
        cv = LeaveOneGroupOut()
        out_cv = Parallel(n_jobs=1)(
            delayed(fit_predict_score)(
                estimator=reg, X=X, y=y, train=train, test=test,
                test_index=this_data.loc[mask].index[test])
            for train, test in cv.split(X, y, fold_idx))

        out_cv = zip(*out_cv)
        predictions = next(out_cv)
        out_predictions[f'stacked_{key}'] = np.nan
        for pred in predictions:
            assert np.all(out_predictions.loc[pred.index]['age'] == pred['y'])
            out_predictions.loc[pred.index,
                                f'stacked_{key}'] = pred['prediction'].values

        scores = np.array(next(out_cv))
        print(f'{key} | MAE : %0.3f (+/- %0.3f)' % (np.mean(scores),
                                                    np.std(scores)))
        out_scores[key] = scores
    out_scores['repeat_idx'] = repeat_idx
    out_predictions['repeat_idx'] = repeat_idx
    return out_scores, out_predictions
def fit(self, X, y, sample_weight=None): """Fit the estimators. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) Training vectors, where `n_samples` is the number of samples and `n_features` is the number of features. y : array-like of shape (n_samples,) Target values. sample_weight : array-like of shape (n_samples,) or default=None Sample weights. If None, then samples are equally weighted. Note that this is supported only if all underlying estimators support sample weights. .. versionchanged:: 0.23 when not None, `sample_weight` is passed to all underlying estimators Returns ------- self : object """ # all_estimators contains all estimators, the one to be fitted and the # 'drop' string. names, all_estimators = self._validate_estimators() self._validate_final_estimator() stack_method = [self.stack_method] * len(all_estimators) # Fit the base estimators on the whole training data. Those # base estimators will be used in transform, predict, and # predict_proba. They are exposed publicly. self.estimators_ = Parallel(n_jobs=self.n_jobs)( delayed(_fit_single_estimator)(clone(est), X, y, sample_weight) for est in all_estimators if est != 'drop') self.n_features_in_ = self.estimators_[0].n_features_in_ self.named_estimators_ = Bunch() est_fitted_idx = 0 for name_est, org_est in zip(names, all_estimators): if org_est != 'drop': self.named_estimators_[name_est] = self.estimators_[ est_fitted_idx] est_fitted_idx += 1 else: self.named_estimators_[name_est] = 'drop' # To train the meta-classifier using the most data as possible, we use # a cross-validation to obtain the output of the stacked estimators. # To ensure that the data provided to each estimator are the same, we # need to set the random state of the cv if there is one and we need to # take a copy. cv = check_cv(self.cv, y=y, classifier=is_classifier(self)) if hasattr(cv, 'random_state') and cv.random_state is None: cv.random_state = np.random.RandomState() self.stack_method_ = [ self._method_name(name, est, meth) for name, est, meth in zip(names, all_estimators, stack_method) ] fit_params = ({ "sample_weight": sample_weight } if sample_weight is not None else None) predictions = Parallel(n_jobs=self.n_jobs)( delayed(cross_val_predict)(clone(est), X, y, cv=deepcopy(cv), method=meth, n_jobs=self.n_jobs, fit_params=fit_params, verbose=self.verbose) for est, meth in zip(all_estimators, self.stack_method_) if est != 'drop') # Only not None or not 'drop' estimators will be used in transform. # Remove the None from the method as well. self.stack_method_ = [ meth for (meth, est) in zip(self.stack_method_, all_estimators) if est != 'drop' ] X_meta = self._concatenate_predictions(X, predictions) _fit_single_estimator(self.final_estimator_, X_meta, y, sample_weight=sample_weight) return self
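# Hedged sketch of the two parallel steps above: clone-and-fit each surviving
# base estimator, then collect their cross-validated predictions to build the
# meta-features. Estimator choices and the toy data are illustrative only.
import numpy as np
from joblib import Parallel, delayed
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.tree import DecisionTreeClassifier

def fit_clone(est, X, y):
    # Fit a fresh clone so the original (unfitted) estimator is left untouched.
    return clone(est).fit(X, y)

X = np.random.RandomState(0).randn(60, 4)
y = (X[:, 0] > 0).astype(int)
estimators = [LogisticRegression(), 'drop', DecisionTreeClassifier(max_depth=2)]

fitted = Parallel(n_jobs=2)(
    delayed(fit_clone)(est, X, y) for est in estimators if est != 'drop')
preds = Parallel(n_jobs=2)(
    delayed(cross_val_predict)(clone(est), X, y, cv=3, method='predict_proba')
    for est in estimators if est != 'drop')
X_meta = np.hstack(preds)  # meta-features for the final estimator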
def main(): args = get_args() helper.print_script_args_and_info(args) Parallel(n_jobs=args.n_jobs)( delayed(process_graph_cache_file)(graph_cache_file, args) for graph_cache_file in dataset_helper.get_all_cached_graph_datasets())
    return kernel.evaluate(np.vstack([values[0][j], values[1][j]]))[0]*fator

def recuperaArrayPDF(kernel, values, estados):
    lst = {}
    for j in range(tam):
        PDF = kernel.evaluate(np.vstack([values[0][j], values[1][j]]))[0]*fator
        nomeEstado = str(estados[j])
        try:
            lst[nomeEstado] = (PDF + lst[nomeEstado])
        except KeyError:
            lst[nomeEstado] = PDF
    return lst

if allowParallelKDEProcessing:
    # Parallel path: roughly one thread per 1000 evaluations
    if tam < 1000:
        numThreads = 1
    else:
        numThreads = tam/1000
    CDFs = Parallel(n_jobs=numThreads, backend="threading")(delayed(recuperaArrayPDFParalelo)(j) for j in limite)
else:
    # Sequential path
    CDFs = recuperaArrayPDF(kernel, values, uf)

if allowRedisCaching:
    # store the new query result in redis (cache)
    redis.set(chave, CDFs)

# send back the result
print CDFs
temp += [m[i]] if average_type == 'mean': temp = scipy.sparse.vstack(temp).mean(axis=0) elif average_type == 'gmean': temp = gmean(scipy.sparse.vstack(temp).todense() + 1e-15, axis=0) temp[temp < 1e-6] = 0 return file_to_md5[image_name], csr_matrix(temp) result_path = Path('data') / 'prediction' / 'global' result_path.mkdir(exist_ok=True, parents=True) result = Parallel(n_jobs=12)(delayed(get_probs)(i) for i in file_names.index) # # result = [get_probs(i) for i in tqdm(file_names.index)] print('[{}] Unzippping...'.format(str(datetime.datetime.now()))) pred_md5_list, probs = zip(*result) probs = vstack(probs) labels = pd.DataFrame({'md5': pred_md5_list}) print('[{}] Saving labels...'.format(str(datetime.datetime.now()))) labels.to_csv(str(result_path / (average_type + '_last_md5_list.csv')), index=False)
def fit(self, modality, ground_truth=None, cat=None):
    """Compute the Haralick feature images.

    Parameters
    ----------
    modality : object of type TemporalModality
        The modality object of interest.

    ground-truth : object of type GTModality or None
        The ground-truth of GTModality. If None, the whole data will be
        considered.

    cat : str or None
        String corresponding to the ground-truth of interest. Cannot be
        None if ground-truth is not None.

    Return
    ------
    self : object
        Return self.

    """
    super(HaralickExtraction, self).fit(modality=modality,
                                        ground_truth=ground_truth,
                                        cat=cat)

    # Get the data and rescale as integers within the given levels
    vol_haralick = ((modality.data_ - np.ndarray.min(modality.data_)) *
                    ((self.levels - 1) /
                     (np.ndarray.max(modality.data_) -
                      np.ndarray.min(modality.data_)))).astype(int)

    # Extract the set of patches from the modality data
    patches = extract_patches(vol_haralick, patch_shape=self.patch_size)

    # Allocate the haralick maps, one for each feature that will be computed
    nb_directions = 13
    nb_features = 13
    self.data_ = np.zeros((modality.data_.shape[0],
                           modality.data_.shape[1],
                           modality.data_.shape[2],
                           nb_directions,
                           nb_features))

    # WE NEED TO PARALLELIZE THIS CODE

    # Extract Haralick feature for each patch
    # Define the shift to apply
    if isinstance(self.patch_size, tuple):
        y_shift = int(np.ceil((self.patch_size[0] - 1) / 2.))
        x_shift = int(np.ceil((self.patch_size[1] - 1) / 2.))
        z_shift = int(np.ceil((self.patch_size[2] - 1) / 2.))
    elif isinstance(self.patch_size, int):
        y_shift = int(np.ceil((self.patch_size - 1) / 2.))
        x_shift = int(np.ceil((self.patch_size - 1) / 2.))
        z_shift = int(np.ceil((self.patch_size - 1) / 2.))

    # for y in range(patches.shape[0]):
    #     for x in range(patches.shape[1]):
    #         for z in range(patches.shape[2]):
    #             print 'Compute for the pixel at position {}{}{}'.format(y, x, z)
    #             # Compute the haralick features
    #             self.data_[y + y_shift,
    #                        x + x_shift,
    #                        z + z_shift, :] = haralick(patches[y, x, z, :],
    #                                                   distance=self.distance)

    # Create the list of indices to process
    yy, xx, zz = np.meshgrid(range(patches.shape[0]),
                             range(patches.shape[1]),
                             range(patches.shape[2]))
    # Linearize for fast processing
    yy = yy.reshape(-1)
    xx = xx.reshape(-1)
    zz = zz.reshape(-1)

    # Go for the parallel loop
    haralick_features = Parallel(n_jobs=-1)(delayed(
        _compute_haralick_features)(patches[y, x, z, :], self.distance)
        for y, x, z in zip(yy, xx, zz))

    # Convert to numpy array
    haralick_features = np.array(haralick_features)

    # Reshape the feature matrix
    haralick_features = haralick_features.reshape((patches.shape[0],
                                                   patches.shape[1],
                                                   patches.shape[2],
                                                   nb_directions,
                                                   nb_features))

    # Copy the feature into the object
    self.data_[y_shift:-y_shift,
               x_shift:-x_shift,
               z_shift:-z_shift] = haralick_features

    return self
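# Hedged sketch of the indexing scheme above: build a flat list of (y, x, z)
# patch coordinates, map a per-patch function over it in parallel, and fold
# the flat results back into the volume shape. patch_stat() stands in for the
# Haralick call; indexing='ij' keeps the flattened order aligned with the
# volume shape when reshaping back.
import numpy as np
from joblib import Parallel, delayed

def patch_stat(patch):
    return patch.mean()

patches = np.random.RandomState(0).rand(3, 4, 2, 5, 5, 5)  # (ny, nx, nz, ...)
yy, xx, zz = np.meshgrid(range(patches.shape[0]),
                         range(patches.shape[1]),
                         range(patches.shape[2]),
                         indexing='ij')
yy, xx, zz = yy.reshape(-1), xx.reshape(-1), zz.reshape(-1)

flat = Parallel(n_jobs=2)(
    delayed(patch_stat)(patches[y, x, z]) for y, x, z in zip(yy, xx, zz))
vol_stats = np.array(flat).reshape(patches.shape[:3])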
def run_glm(Y, X, noise_model='ar1', bins=100, n_jobs=1, verbose=0):
    """ GLM fit for an fMRI data matrix

    Parameters
    ----------
    Y : array of shape (n_time_points, n_voxels)
        The fMRI data.

    X : array of shape (n_time_points, n_regressors)
        The design matrix.

    noise_model : {'ar1', 'ols'}, optional
        The temporal variance model. Defaults to 'ar1'.

    bins : int, optional
        Maximum number of discrete bins for the AR(1) coef histogram.

    n_jobs : int, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.

    verbose : int, optional
        The verbosity level. Default is 0.

    Returns
    -------
    labels : array of shape (n_voxels,),
        A map of values on voxels used to identify the corresponding model.

    results : dict,
        Keys correspond to the different labels values
        values are RegressionResults instances corresponding to the voxels.
    """
    acceptable_noise_models = ['ar1', 'ols']
    if noise_model not in acceptable_noise_models:
        raise ValueError(
            "Acceptable noise models are {0}. You provided "
            "'noise_model={1}'".format(acceptable_noise_models, noise_model))

    if Y.shape[0] != X.shape[0]:
        raise ValueError('The number of rows of Y '
                         'should match the number of rows of X.'
                         ' You provided X with shape {0} '
                         'and Y with shape {1}'.
                         format(X.shape, Y.shape))

    # Create the model
    ols_result = OLSModel(X).fit(Y)

    if noise_model == 'ar1':
        # compute and discretize the AR1 coefs
        ar1 = ((ols_result.residuals[1:]
                * ols_result.residuals[:-1]).sum(axis=0)
               / (ols_result.residuals ** 2).sum(axis=0))
        del ols_result
        ar1 = (ar1 * bins).astype(np.int) * 1. / bins
        # Fit the AR model according to current AR(1) estimates
        results = {}
        labels = ar1
        # Parallelize by creating a job per ARModel
        vals = np.unique(ar1)
        ar_result = Parallel(n_jobs=n_jobs, verbose=verbose)(
            delayed(_ar_model_fit)(X, val, Y[:, labels == val])
            for val in vals)
        for val, result in zip(vals, ar_result):
            results[val] = result
        del vals
        del ar_result
    else:
        labels = np.zeros(Y.shape[1])
        results = {0.0: ols_result}

    return labels, results
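# Hedged sketch of the AR(1) bucketing above: discretize a per-voxel
# coefficient, then run one job per unique bucket over the columns that share
# it. fit_bucket() is an illustrative stand-in for _ar_model_fit.
import numpy as np
from joblib import Parallel, delayed

def fit_bucket(val, Y_subset):
    # Stand-in "model fit": return the bucket value and the column means.
    return val, Y_subset.mean(axis=0)

rng = np.random.RandomState(0)
Y = rng.randn(50, 8)          # (n_time_points, n_voxels)
ar1 = rng.rand(8)             # per-voxel coefficient
bins = 4
labels = (ar1 * bins).astype(int) * 1.0 / bins  # discretized coefficients

vals = np.unique(labels)
fits = Parallel(n_jobs=2)(
    delayed(fit_bucket)(val, Y[:, labels == val]) for val in vals)
results = {val: res for val, res in fits}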
clf = SVC(C=c, kernel='precomputed') clf.fit(X_features_trainm, Y[train_indices]) erfsvm.append(clf.score(X_features_testm, Y[test_indices])) testfile.write("RFSVM&%s pm%s & " % (floored_percentage(np.mean(erfsvm), 2), floored_percentage(np.std(erfsvm), 2)) + '\n') testfile.write("RFDIS &%s pm%s & " % (floored_percentage(np.mean(e8), 2), floored_percentage(np.std(e8), 2)) + '\n') testfile.write(" LATERF&%s pm%s &" % (floored_percentage(np.mean(elaterf), 2), floored_percentage(np.std(elaterf), 2)) + '\n') testfile.write(" LATERFDIS&%s pm%s & " % (floored_percentage(np.mean(elaterfdis), 2), floored_percentage(np.std(elaterfdis), 2)) + '\n') print(ss) print("RFSVM&%s pm%s & " % (floored_percentage(np.mean(erfsvm), 2), floored_percentage(np.std(erfsvm), 2)) + '\n') print("RFDIS &%s pm%s & " % (floored_percentage(np.mean(e8), 2), floored_percentage(np.std(e8), 2)) + '\n') print(" LATERF&%s pm%s &" % (floored_percentage(np.mean(elaterf), 2), floored_percentage(np.std(elaterf), 2)) + '\n') print(" LATERFDIS&%s pm%s & " % (floored_percentage(np.mean(elaterfdis), 2), floored_percentage(np.std(elaterfdis), 2)) + '\n') if __name__ == '__main__': Parallel(n_jobs=4)(delayed(mcode)(ite=i) for i in range(4))
# Read analysis info file and cd ai=functions.read_analysis_info_file(args.analysis_info_file) os.chdir(ai['project_location']) #Ncores ncores=ai['ncores'] # Read sample names text file sample_names_file=args.sample_names_file sampleNames = functions.read_sample_names(sample_names_file) # Set input and output directories if not 'rawReads/' in_dir= args.in_dir out_dir_report= args.out_dir_report readType=ai['readType'] suffix_name=args.suffix_name # Create tables files=functions.get_filepaths(in_dir) files = [files[y] for y, x in enumerate(files) if re.findall("fastqc_data.txt", x)] Parallel(n_jobs=8)(delayed(tables)(i) for i in files) print "Got data from fastqc output... \n" # Create plots functions.make_sure_path_exists(out_dir_report) Parallel(n_jobs=8)(delayed(plots)(i) for i in sampleNames) print "Made plots per sample... \n" os.system('/usr/bin/Rscript bin/create_fastqcPlots_allSamples.R ' + in_dir + ' ' + sample_names_file + ' ' + readType + ' ' + out_dir_report + ' ' + suffix_name + ' ' + args.plot_device) print "Made plots all samples... \n"
def aprun(joblib_args=None, **tq_args):
    # `joblib_args` previously came from an enclosing scope; take it explicitly
    # so the helper is self-contained.
    joblib_args = joblib_args or {}
    tqdm_f = lambda x, args: tqdm(x, **args)
    return lambda x: Parallel(**joblib_args)(tqdm_f(x, tq_args))
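# Hedged usage sketch for the self-contained aprun above: wraps a toy square()
# call so the dispatched iterable shows a tqdm progress bar. The function name
# square() and the argument values are illustrative only.
from joblib import Parallel, delayed
from tqdm import tqdm

def square(v):
    return v * v

run = aprun(joblib_args={"n_jobs": 2}, total=100)
results = run(delayed(square)(i) for i in range(100))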
# Do we filter by a base? if base=="None": gene_sets = gsea.groupby(["Gene_set"]).size().reset_index(name="count") print(gene_sets) gene_sets = gene_sets[gene_sets["count"] >= 100] #Changed back to 100!!! print(gene_sets) # gene_sets = gene_sets[gene_sets["count"] <= 150] # print(gene_sets) gene_sets = [i for i in itertools.combinations(list(gene_sets["Gene_set"]),2)] if both=="T": print("both one-tailed used") gene_r = [(i[1], i[0]) for i in gene_sets] gene_sets = gene_sets + gene_r else: base = pd.read_csv(in_folder + "GSEA_FILES/" + gsea_type + "_gsea_" + base + "_both_" + both + "_pvals", sep="\t") gene_sets = list(set(base["gs"])) gene_sets = [(i.split("$")[0], i.split("$")[1]) for i in gene_sets] print(gsea_type, len(gene_sets)) main_dict = Parallel(n_jobs=40)(delayed(mann_pval)(i) for i in gene_sets) print("Done calculating") # Write to file main_dict = pd.concat([pd.DataFrame(i) for i in main_dict]) file_out = in_folder + "GSEA_FILES/" + gsea_type + "_gsea_"+ exp_type + "_both_" + both + "_ext_gmv_" + ext_gmv + "_pvals" main_dict.to_csv(file_out, sep="\t", header=True, index=False) print ("Done writing")
first_page=page_number, data_dir="posts", verbose=verbose ) for page_number in page_range(last_page=last_page, first_page=first_page) ) if __name__ == "__main__": # save_posts(last_page=190, first_page=190, data_dir="posts") # save_posts_fake(last_page=190, first_page=190, data_dir="posts") #parallel_save_post(last_page=190, data_dir="posts", verbose=1) #soup = fetch_post_as_soup(13304) #soup_article = soup.find("article") #html = post_content_from_soup(soup_article).prettify() #save_content(html, "test3.html") from joblib import Parallel, delayed last_page=195 r = Parallel(n_jobs=10, verbose=10)( delayed(save_posts)( last_page=page_number, first_page=page_number, data_dir="posts" ) for page_number in page_range(last_page=last_page) )
filled['cc_{}'.format(sex)] = filled['ccpublic_{}'.format(sex)] + filled['ccprivate_{}'.format(sex)] filled['crime_{}'.format(sex)] = filled['crimepublic_{}'.format(sex)] + filled['crimeprivate_{}'.format(sex)] filled['health_{}'.format(sex)] = filled['health_private_{}'.format(sex)] + filled['health_public_{}'.format(sex)] filled['transfer_{}'.format(sex)] = filled['inc_trans_pub_{}'.format(sex)] + filled['diclaim_{}'.format(sex)] + filled['ssclaim_{}'.format(sex)] + filled['ssiclaim_{}'.format(sex)] components = ['inc_labor', 'inc_parent', 'transfer', 'edu', 'crime', 'costs', 'cc', 'health', 'qaly', 'm_ed'] factors = np.arange(0,3.1,0.25) combo = list(itertools.product(components, factors)) # vary factor: IRR # applying factor to benefits def irr_factors(part, f): irr_tmp = deepcopy(filled) for sex in ['m', 'f', 'p']: irr_tmp['{}_{}'.format(part, sex)] = irr_tmp['{}_{}'.format(part, sex)] * f output = irr_calc(irr_tmp, etype=etype, components=components) output['rate'] = f output['part'] = part print 'IRR for {} and factor {} calculated.'.format(part, f) return output irr_factors = Parallel(n_jobs=25)( delayed(irr_factors)(part, f) for part, f in combo) irr_factors = pd.concat(irr_factors, axis=0) irr_factors.sort_index(inplace=True) irr_factors.to_csv(os.path.join(plots, 'irr_factors.csv'), index=True)
def fit(self, x, grp=[], center=False, combine=False, grpas='single',
        grplen=[], display=True, n_jobs=-1):
    """Run the model on the matrix of features x

    Args:
        x: array-like
            The features. Dimension [n trials x n features]

    Kargs:
        grp: list of strings, optional, [def: []]
            Group features by using a list of strings. The length of grp
            must be the same as the number of features. If grp is not empty,
            the program will run the feature selection inside each group.

        center: optional, bool, [def: False]
            Normalize features to zero mean by subtracting and then dividing
            by the mean. The center parameter should be set to True if the
            classifier is a svm.

        combine: boolean, optional, [def: False]
            If a group of features is specified using the grp parameter,
            combine controls whether or not groups are combined. For example,
            if there are three unique groups, combining them will compute the
            mf model on each combination:
            [[1],[2],[3],[1,2],[1,3],[2,3],[1,2,3]]

        grpas: string, optional, [def: 'single']
            Specify how to consider features inside each group. If the
            parameter grpas ("group as") is:

                * 'single': inside each combination of groups, the features
                  are considered as independent.
                * 'group': inside each combination of groups, the features are
                  going to be associated. So instead of adding features one by
                  one, the mf model will add whole groups of features.

        grplen: list, optional, [def: []]
            Control the number of combinations by specifying the number of
            elements to associate. If there are three unique groups, all
            possible combinations are:
            [[1],[2],[3],[1,2],[1,3],[2,3],[1,2,3]]
            but if grplen is specified, for example grplen=[1,3], this will
            consider combinations of groups only with a length of 1 and 3 and
            remove combinations of 2 elements: [[1],[2],[3],[1,2,3]]

        display: boolean, optional, [def: True]
            Display information for each step of the mf selection. If n_jobs
            is -1, it is advisable to set display to False.

        n_jobs: integer, optional, [def: -1]
            Control the number of jobs used to compute the decoding accuracy.
            If n_jobs=-1, all the jobs are used.

    Returns:
        da: list
            The decoding accuracy (da) for each group with the selected number
            of repetitions, which by default is set to 10 (see : cvOut // rep)

        prob: list
            The appearance probability of each feature. The size of prob is
            the same as da.

        groupinfo: pandas Dataframe
            Dataframe summarizing the mf feature selection.
    """
    # - Check and get elements sizes:
    y = self._y
    if x.shape[0] != len(y):
        x = x.T
    y = np.ravel(y)
    ntrial, nfeat = x.shape

    # Normalize features :
    if center:
        x_m = np.tile(np.mean(x, 0), (x.shape[0], 1))
        x = (x - x_m) / x_m

    # Combine groups :
    grp_c = combineGroups(grp, nfeat, combine, grpas=grpas, grplen=grplen)
    grp_name, grp_idx = list(grp_c['name']), list(grp_c['idx'])
    ngrp = len(grp_name)

    # - Run the MF model for each combination:
    mfdata = Parallel(n_jobs=n_jobs)(
        delayed(_fit)(x, y, grp_c, k, combine, display, self)
        for k in range(len(grp_c)))

    # Get data & complete the Dataframe :
    da, prob, MFstr = zip(*mfdata)
    self.MFstr = MFstr[-1]
    grp_c['da'], grp_c['occurrence'] = [sum(k) / len(k) for k in da], prob

    return da, prob, grp_c
parser.add_argument("-d", "--dstdir", type=str, help="dst image folder") parser.add_argument("-n", "--n_jobs", type=int, default=2, help="parallel jobs") parser.add_argument("-p", "--parallel", action='store_true', default=False, help="if parallel") args = parser.parse_args() filelist = os.listdir(args.srcdir) def conv_npz_nrrd(filename): print(filename) sys.stdout.flush() npzpath = os.path.join(args.srcdir, filename) filebase = os.path.splitext(filename)[0] nrrdbasename = filebase + '.nrrd' nrrdpath = os.path.join(args.dstdir, nrrdbasename) npz = np.load(npzpath) nparr = npz['arr_0'] print(np.unique(nparr)) nparr = np.where(nparr == 0, 0, 1).astype(np.uint8) print(np.unique(nparr)) nrrd.write(nrrdpath, nparr) if args.parallel: Parallel(n_jobs=args.n_jobs, backend="multiprocessing")( delayed(conv_npz_nrrd)(filename) for filename in filelist) else: for filename in filelist: conv_npz_nrrd(filename)
def compute_train_features(data, ts_uid_columns, time_features, lags, window_functions, ignore_const_cols=True, n_jobs=1): """ Parameters ---------- data : pd.DataFrame Dataframe with (at least) columns: 'ds' and 'y'. ts_uid_columns: list List of columns names that are unique identifiers for time series. time_features: list Time attributes to include as features. lags: list List of integer lag values to include as features. window_functions: list List with the definition of the rolling window functions to compute. ignore_const_cols: bool Specify whether to ignore constant columns. n_jobs: int Number of jobs to run in parallel when computing the lag/rw features. Returns ---------- all_features: pd.Dataframe Dataframe containing all the features for the time series. """ # list with all the dataframes of features all_features_list = list() all_features_list.append(data.reset_index(drop=True)) # generating the time features if len(time_features) > 0: input_params = { "date_range": pd.DatetimeIndex(data.ds), "time_features": time_features, "ignore_const_cols": ignore_const_cols } calendar_features = compute_calendar_features(**input_params) all_features_list.append(calendar_features) # generating the lag & rolling window features if (len(lags) > 0) or (len(window_functions) > 0): lag_kwargs = [{"lag": lag} for lag in lags] rw_kwargs = [{ "func_name": window_func[0], "func_call": window_func[1], "window_shift": window_func[2], "window_size": window_func[3] } for window_func in window_functions] input_kwargs = lag_kwargs + rw_kwargs grouped = data.loc[:, ts_uid_columns + ["y"]].groupby(ts_uid_columns)["y"] with Parallel(n_jobs=n_jobs) as parallel: delayed_func = delayed(compute_lagged_train_feature) lagged_features = parallel( delayed_func(grouped, **kwargs) for kwargs in input_kwargs) lagged_features = pd.DataFrame( {feature.name: feature.values for feature in lagged_features}) all_features_list.append(lagged_features) # merging all features all_features = pd.concat(all_features_list, axis=1) all_features.set_index(data.index, inplace=True) return all_features
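# Hedged sketch of the reusable-executor pattern above: `with Parallel(...) as
# parallel:` keeps one worker pool alive for a whole batch of delayed calls
# instead of starting workers for every call. lag() and the toy series are
# illustrative only.
import pandas as pd
from joblib import Parallel, delayed

def lag(series, k):
    return series.shift(k).rename(f"lag{k}")

y = pd.Series([1.0, 2.0, 3.0, 4.0], name="y")
with Parallel(n_jobs=2) as parallel:
    lag_cols = parallel(delayed(lag)(y, k) for k in (1, 2))
features = pd.concat(lag_cols, axis=1)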
ch_p, ch_r, lytf, lypf = 0, 0, 0, 0 for ytf, ypf in zip(ex.steps_annotation, steps): ch_p += count_hit(ytf, ypf) ch_r += count_hit(ypf, ytf) lytf += len(ytf) lypf += len(ypf) if lytf == 0: print('Warning: No steps annotation for', ex.id) ch_r, lytf = 0, 1 if lypf == 0: ch_p, lypf = 0, 1 return [ch_r/lytf, ch_p/lypf] del_PR = delayed(PR) PRS = Parallel(n_jobs=-2)(del_PR(ex) for ex in range(N)) PRS = np.array([prs for prs in PRS if prs is not None]) t_batch = time()-t_start score = list(PRS.mean(axis=0)) score += list(PRS.std(axis=0)) res['results'] += [score] res['patterns'] += [patterns] results += [res] t_batch = time() - t_start print('-'*79) print('Batch : {:03}/{:03}'.format(simu, n_batch)) print('Time batch : {:.2f}s'.format(t_batch)) print('Train: {}, Test: {}'.format(c_train[0], c_test[0])) print('Score: {0:.2f}({2:.2f}), {1:.2f} ({3:.2f})'
if args.metrics: data.GenMetrics(msFile[:-4] + '_metrics.txt') print('\nDone processing ' + msFile + '!\n') data.Close() else: num_cores = multiprocessing.cpu_count() if int(args.parallel) <= num_cores: num_cores = int(args.parallel) elif int(args.parallel) > num_cores: # if user asks for more cores than exist, default to the maximum print( 'Specified number of cores for parallelization exceeds ' + 'available number of cores. Maximum will be used.') Parallel(n_jobs=num_cores)( delayed(func)(msFile=msFile, reagents=reagents, mgf=args.generate_mgf, interference=args.quantify_interference, impurities=impurities, metrics=args.metrics, boxcar=args.boxcar, isolationOffset=args.isolation_window_offset) for msFile in files)
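# Hedged sketch of the core-capping logic above: clamp a user-requested worker
# count to the machine's CPU count before handing it to Parallel. work() and
# the requested value are illustrative only.
import multiprocessing
from joblib import Parallel, delayed

def work(i):
    return i * i

requested = 64
num_cores = min(requested, multiprocessing.cpu_count())
out = Parallel(n_jobs=num_cores)(delayed(work)(i) for i in range(8))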