def __init__(self):
		global transport 

		variables = ['meridional_transport','psi']
		num_cores = 6

		data = np.ones((len(variables),len(scow.months),scow.latitude.shape[0],scow.longitude.shape[0]))*np.nan

		beta = c.beta.repeat(scow.longitude.shape[0]).reshape((c.beta.shape[0],scow.longitude.shape[0]))

		for i in xrange(scow.data.shape[1]):
			transport = scow.data[2,i,:,:]/beta

			psi = Parallel(n_jobs=num_cores)(delayed(integration)(lat) for lat in scow.latitude)
			psi = np.array(psi)

			D = np.array([transport.copy()/(c.rho*1.e+6),psi.copy()/(c.rho*1.e+6)])
			data[:,i,:,:] = D			

		del transport 

		# Here I could derive psi in y and get the zonal sverdrup transport (I think I won't need it)


		# "Isolating" the subtropical gyre 
		ibad = (np.abs(scow.latitude) <= 5) | (np.abs(scow.latitude) >= 50)
		data[:,:,ibad,:] = np.nan

		self.latitude = scow.latitude
		self.longitude = scow.longitude
		self.variables = variables
		self.data = data
	def __init__(self):
		global transport 

		variables = ['zonal_transport','meridional_transport','psi']
		num_cores = 6

		data = np.ones((len(variables),len(scow.months),scow.latitude.shape[0],scow.longitude.shape[0]))*np.nan

		f = c.f.repeat(scow.longitude.shape[0]).reshape((c.f.shape[0],scow.longitude.shape[0]))

		for i in xrange(scow.data.shape[1]):
			zonal_transport = scow.data[1,i,:,:]/(f*c.rho)
			meridional_transport = -scow.data[0,i,:,:]/(f*c.rho)

			transport = meridional_transport.copy()

			psi = Parallel(n_jobs=num_cores)(delayed(integration)(lat) for lat in scow.latitude)
			psi = np.array(psi)

			D = np.array([zonal_transport.copy()/1.e+6,meridional_transport.copy()/1.e+6,psi.copy()/1.e+6])
			data[:,i,:,:] = D			

		del transport

		# "Isolating" the subtropical gyre 
		ibad = (np.abs(scow.latitude) <= 5) | (np.abs(scow.latitude) >= 50)
		data[:,:,ibad,:] = np.nan

		self.latitude = scow.latitude
		self.longitude = scow.longitude
		self.variables = variables
		self.data = data
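
# A minimal, self-contained sketch of the zonal-integration step used in the two
# constructors above, with the transport field passed to the worker explicitly
# instead of through the module-level "transport" global. The field, grid spacing
# and integration convention here are stand-ins, not the original scow data.
import numpy as np
from joblib import Parallel, delayed

def integrate_zonally(transport_row, dx=1.0):
    # cumulative integral from the eastern boundary westward (hypothetical convention)
    return -np.nancumsum(transport_row[::-1])[::-1] * dx

example_transport = np.random.rand(180, 360)
psi_sketch = np.array(Parallel(n_jobs=2)(
    delayed(integrate_zonally)(example_transport[j, :])
    for j in range(example_transport.shape[0])))
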
Example #3
def main():
    parser = argparse.ArgumentParser(description='Register & align images')
    parser.add_argument('filenames',nargs='+',help='List of target files to register. Images are aligned to first in list.')
    parser.add_argument('-odir',metavar='outdir',required=True,type=str,help='Output directory for files.')
    parser.add_argument('-m',metavar='method',choices=('point','extended'),default='extended',help='Specify alignment method (point or extended); default=extended.')
    parser.add_argument('-xy',nargs=2,type=float,default=None,help='Specify approximate "x y" pixel coordinate of object to centroid on.  Required for point mode; useful for extended mode (default=center of image).')
    parser.add_argument('-box',nargs=2,type=int,default=None,help='Specify box size (w h) to restrict alignment search.  Useful for both point & extended modes (default=full size of array).')
    parser.add_argument('--c',action='store_true',help='Clobber (overwrite) on output')
    parser.add_argument('-njobs',type=int,default=1,help='Process images in parallel. "-1" is all CPUs (default=1).')
    
    args = parser.parse_args()

    if args.m == 'point' and args.xy is None:
        parser.error("-m point requires -xy coordinate")

    # create output directory
    if args.odir not in ['','.']:
        makedirs(args.odir,exist_ok=True)

    # align all images to first filename
    ref = args.filenames[0]
    align = args.filenames[1:]

    imref = partial(register,ref=ref,outdir=args.odir,
                    method=args.m,center=args.xy,size=args.box,
                    overwrite=args.c)
    
    outfiles = Parallel(n_jobs=args.njobs,verbose=11)(delayed(imref)(toshift=a) for a in align)

    # Write ref to outdir
    refnew = os.path.join(args.odir,os.path.basename(ref))
    copy(ref,refnew)

    outfiles.append(refnew)
    print('Wrote %i files to %s' % (len(outfiles), args.odir))
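
# Minimal sketch of the functools.partial + Parallel fan-out used in main() above;
# fake_register() is a stand-in that only reports what the real registration would do.
from functools import partial
from joblib import Parallel, delayed

def fake_register(toshift, ref, outdir, overwrite=False):
    # stand-in for the real image registration step
    return '%s aligned to %s -> %s' % (toshift, ref, outdir)

imref_sketch = partial(fake_register, ref='ref.fits', outdir='out', overwrite=True)
outfiles_sketch = Parallel(n_jobs=2)(delayed(imref_sketch)(toshift=f) for f in ['a.fits', 'b.fits'])
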
def auto_choose(actionfile, new_xyz, nparallel=-1):
    """
    @param demofile: h5py.File object
    @param new_xyz : new rope point-cloud
    @nparallel     : number of parallel jobs to run for tps cost calculaion.
                     If -1 only 1 job is used (no parallelization).
    
    @return          : return the name of the segment with the lowest warping cost.
    """
    if not nparallel == -1:
        from joblib import Parallel, delayed
        nparallel = min(nparallel, 8)

    demo_data = actionfile.items()

    if nparallel != -1:
        before = time.time()
        redprint("auto choose parallel with njobs = %d"%nparallel)
        costs  = Parallel(n_jobs=nparallel, verbose=0)(delayed(registration_cost)(ddata[1]['cloud_xyz'][:], new_xyz) for ddata in demo_data)
        after  = time.time()
        print "Parallel registration time in seconds =", after - before
    else:
        costs = []
        redprint("auto choose sequential..")
        for i, ddata in enumerate(demo_data):
            costs.append(registration_cost(ddata[1]['cloud_xyz'][:], new_xyz))
            print(("tps-cost completed %i/%i" % (i + 1, len(demo_data))))

    ibest = np.argmin(costs)
    redprint ("auto choose returning..")
    return demo_data[ibest][0]
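
# Sketch of the "lowest warping cost" selection pattern above, with a toy cost in
# place of registration_cost (which is defined elsewhere in the original project).
import numpy as np
from joblib import Parallel, delayed

def toy_cost(cloud, target):
    return float(np.abs(cloud.mean() - target.mean()))

target_cloud = np.random.rand(50, 3)
segments = {'seg%02d' % i: np.random.rand(50, 3) for i in range(8)}
names, clouds = zip(*segments.items())
costs_sketch = Parallel(n_jobs=2)(delayed(toy_cost)(c, target_cloud) for c in clouds)
best_segment = names[int(np.argmin(costs_sketch))]
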
Example #5
def load_glm_inputs(study_dirs, hrf_model='canonical', drift_model='cosine',
                    img_ext='nii.gz', memory=Memory(None), n_jobs=1):
    """Returns data (almost) ready to be used for a GLM.
    """
    datasets, structural, functional, conditions, contrasts = \
        collect_openfmri(study_dirs, img_ext=img_ext, memory=memory, n_jobs=n_jobs)

    main = functional.merge(conditions)

    # computing design matrices
    print 'Computing models...'
    results = Parallel(n_jobs=n_jobs, pre_dispatch='n_jobs')(
        delayed(memory.cache(_make_design_matrix))(
            run_df, hrf_model, drift_model, orthogonalize=datasets[group_id[0]]['models'][group_id[2]]['orthogonalize'])
        for group_id, group_df in main.groupby(['study', 'subject', 'model'])
        for run_id, run_df in group_df.groupby(['task', 'run'])
        )

    # collect results
    print 'Collecting...'
    glm_inputs = {}
    for group_id, group_df in main.groupby(['study', 'subject', 'model']):
        study_id, subject_id, model_id = group_id
        for session_id, run_df in group_df.groupby(['task', 'run']):
            task_id, run_id = session_id
            bold_file, dm = results.pop(0)        
            glm_inputs.setdefault(group_id, {}).setdefault('bold', []).append(bold_file)
            glm_inputs.setdefault(group_id, {}).setdefault('design', []).append(dm)
        glm_inputs.setdefault(group_id, {}).setdefault(
            model_id, _make_contrasts(datasets, study_id, model_id, hrf_model, group_df))
        glm_inputs.setdefault(group_id, {}).setdefault(
            '%s_per_run' % model_id, _make_contrasts(
                datasets, study_id, model_id, hrf_model, group_df, per_run=True))
    return glm_inputs
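
# Sketch of the memory.cache + Parallel combination used in load_glm_inputs above;
# the cached worker is a stand-in, and cached results are reused on later runs.
from joblib import Memory, Parallel, delayed

memory_sketch = Memory('./joblib_cache_demo', verbose=0)

def make_design_matrix_sketch(run_id):
    # stand-in for the real design-matrix construction
    return {'run': run_id, 'n_regressors': 10}

cached_make = memory_sketch.cache(make_design_matrix_sketch)
results_sketch = Parallel(n_jobs=2, pre_dispatch='n_jobs')(
    delayed(cached_make)(i) for i in range(6))
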
def find_closest_auto(demofile, new_xyz):
    if args.parallel:
        from joblib import Parallel, delayed
    demo_clouds = [asarray(seg["cloud_xyz"]) for seg in demofile.values()]
    keys = demofile.keys()
    if args.parallel:
        costs = Parallel(n_jobs=3,verbose=100)(delayed(registration_cost)(demo_cloud, new_xyz) for demo_cloud in demo_clouds)
    else:
        costs = []
        for (i,ds_cloud) in enumerate(demo_clouds):
            costs.append(registration_cost(ds_cloud, new_xyz))
            print "completed %i/%i"%(i+1, len(demo_clouds))
    
    print "costs\n",costs
    if args.show_neighbors:
        nshow = min(5, len(keys))
        import cv2, rapprentice.cv_plot_utils as cpu
        sortinds = np.argsort(costs)[:nshow]
        near_rgbs = [asarray(demofile[keys[i]]["rgb"]) for i in sortinds]
        bigimg = cpu.tile_images(near_rgbs, 1, nshow)
        cv2.imshow("neighbors", bigimg)
        print "press any key to continue"
        cv2.waitKey()
        
    ibest = np.argmin(costs)
    return keys[ibest]
Example #7
    def _update_filters(self, X):
        if self.verbose:
            last_score = self._bound(X)
            start_t = time.time()

        U = Parallel(n_jobs=self.n_jobs)(
            delayed(global_update_U)(
                X[:, j], self.U[:, j], self.gamma[j], self.alpha,
                self.nu, self.rho, self.EA, self.ElogA, self.verbose
            )
            for j in xrange(self.n_feats)
        )
        U = np.vstack(U).T
        self.U = U.copy()
        if self.verbose:
            score = self._bound(X)
            print_increment('U', last_score, score)
            last_score = score

        self._update_gamma(X)
        if self.verbose:
            score = self._bound(X)
            print_increment('gamma', last_score, score)
            last_score = score

        self._update_alpha(X)
        if self.verbose:
            score = self._bound(X)
            print_increment('alpha', last_score, score)

        if self.verbose:
            t = time.time() - start_t
            print('Update free parameters\ttime: %.2f' % t)
    def analysis(self, permute=False):
        """
        Classify based on an iteratively increasing number of features (electrodes) included in the model. Starts with
        the single best electrode (N=1) and increases until N = the number of electrodes.

        Note: permute is not used in this analysis, but kept to match the same signature as super.
        """
        if self.subject_data is None:
            print('%s: compute or load data first with .load_data()!' % self.subject)

        # Get recalled or not labels
        if self.recall_filter_func is None:
            print('%s classifier: please provide a .recall_filter_func function.' % self.subject)
        y = self.recall_filter_func(self.subject_data)

        # zscore the data by session
        x = self.zscore_data()

        # create the classifier
        classifier = LogisticRegression(C=self.C, penalty=self.norm, solver='liblinear')

        # create .num_rand_splits of cv_dicts
        cv_dicts = [self._make_cross_val_labels() for _ in range(self.num_rand_splits)]

        # run permutations with joblib
        f = _par_compute_and_run_split
        if self.use_joblib:
            aucs = Parallel(n_jobs=12, verbose=5)(delayed(f)(cv, classifier, x, y) for cv in cv_dicts)
        else:
            aucs = []
            for cv in tqdm(cv_dicts):
                aucs.append(f(cv, classifier, x, y))

        # store results
        self.res['auc_x_n'] = np.stack(aucs)
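
# Self-contained sketch of the per-split AUC computation above, using random
# stand-in data and hand-made train/test splits in place of the subject's features.
import numpy as np
from joblib import Parallel, delayed
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

rng = np.random.RandomState(0)
x_sketch, y_sketch = rng.randn(200, 10), rng.randint(0, 2, 200)

def score_split(train, test, clf, x, y):
    clf.fit(x[train], y[train])
    return roc_auc_score(y[test], clf.predict_proba(x[test])[:, 1])

splits = []
for _ in range(4):
    order = rng.permutation(200)
    splits.append((order[:150], order[150:]))

clf_sketch = LogisticRegression(C=1.0, penalty='l2', solver='liblinear')
aucs_sketch = np.stack(Parallel(n_jobs=2)(
    delayed(score_split)(tr, te, clf_sketch, x_sketch, y_sketch) for tr, te in splits))
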
Example #9
def preprocess(file_in, file_out, test=False, n_jobs=6):
    """
    This function preprocesses raw data file.
    For each row and for each feature it extracts aggregations over TimeToEnd:
        From feature TimeToEnd it extracts total time ("time") and number of observations ("n_obs")
        From feature DistanceToRadar it extracts aggregations ('min', '50% quantile', 'mean', 'max')
        For any other features it calculates ('mean', 'std', 'min', '50% quantile', 'max')

        New features names follow the pattern: <feature name>_<aggregation function>

    Parameters
    ----------
    :param file_in: str
        csv-file name for data to be preprocessed
    :param file_out: str
        csv-file name for output data
    :param test: bool
        indicator for test data (data without label)
    :param n_jobs: int
        number of parallel jobs used to transform the data chunks
    :return: None; the preprocessed data is written to file_out
    """
    # Load data to pandas.DataFrame
    data_raw = pd.read_csv(file_in, na_filter=False, chunksize=5000)

    # Apply transformations to data chunks in parallel
    start = time.time()
    data = Parallel(n_jobs=n_jobs, verbose=11)(delayed(foo)(x, transform, axis=1, test=test) for i, x in enumerate(data_raw))
    print "Preprocessing time: ", round((time.time() - start) / 60, 3)
    print "Records: ", len(data)

    # Join data chunks and save result to csv
    data = pd.concat(data)
    data.to_csv(file_out, index=False)

    print "File", file_in, "preprocessed to", file_out
Example #10
def best_classifier(X,Y,Xvs,Yvs):
    parameters = {'C':[3,13,67,330,1636,8103]}
    pg = ParameterGrid(parameters)
    clas = Parallel(n_jobs=4)(delayed(pfit)(p,X,Y,Xvs,Yvs) for p in pg)
    clas.sort(reverse=True)
    (sc,cla) = clas[0]
    print '-'*20
    print 'best is ',cla,sc
    print '-'*20
    return cla,sc
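
# Sketch of the parallel grid search above, using sklearn's ParameterGrid and a
# stand-in pfit() that fits and scores a linear SVM on toy data (pfit itself is
# not defined in the snippet above).
import numpy as np
from joblib import Parallel, delayed
from sklearn.model_selection import ParameterGrid
from sklearn.svm import LinearSVC

rng = np.random.RandomState(0)
X_tr, Y_tr = rng.randn(200, 5), rng.randint(0, 2, 200)
X_va, Y_va = rng.randn(80, 5), rng.randint(0, 2, 80)

def pfit_sketch(p, X, Y, Xvs, Yvs):
    clf = LinearSVC(C=p['C'], max_iter=5000).fit(X, Y)
    return clf.score(Xvs, Yvs), clf

pg_sketch = ParameterGrid({'C': [0.1, 1.0, 10.0]})
scored = Parallel(n_jobs=2)(delayed(pfit_sketch)(p, X_tr, Y_tr, X_va, Y_va) for p in pg_sketch)
best_score, best_clf = max(scored, key=lambda pair: pair[0])
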
Example #11
def retrieve_proposals(video_info, model, feature_filename,
                       feat_size=16, stride_intersection=0.1):
    """Retrieve proposals for a given video.
    
    Parameters
    ----------
    video_info : DataFrame
        DataFrame containing the 'video-name' and 'video-frames'.
    model : dict
        Dictionary containing the learned model.
        Keys: 
            'D': 2darray containing the sparse dictionary.
            'cost': Cost function at the last iteration.
            'durations': 1darray containing typical durations (n-frames)
                 in the training set.
            'type': Dictionary type.
    feature_filename : str
        String containing the path to the HDF5 file containing 
        the features for each video. The HDF5 file must contain 
        a group for each video where the id of the group is the name 
        of the video; and each group must contain a dataset containing
        the features.
    feat_size : int, optional
        Size of the temporal extension of the features.
    stride_intersection : float, optional
         Percentage of intersection between temporal windows.
    """
    feat_obj = FeatHelper(feature_filename, t_stride=1)
    candidate_df = generate_candidate_proposals(video_info, model['durations'],
                                                feat_size, stride_intersection)
    D = model['D']
    params = model['params']
    feat_obj.open_instance()
    feat_stack = feat_obj.read_feat(video_info['video-name'])
    feat_obj.close_instance()
    n_feats = feat_stack.shape[0]
    candidate_df = candidate_df[
        (candidate_df['f-init'] + candidate_df['n-frames']) <= n_feats]
    candidate_df = candidate_df.reset_index(drop=True)
    proposal_df = Parallel(n_jobs=-1)(delayed(wrapper_score_proposals)(this_df,
                                                                      D, 
                                                                     feat_stack,
                                                                       params,
                                                                     feat_size)
                                      for k, this_df in candidate_df.iterrows())
    proposal_df = pd.concat(proposal_df, axis=1).T
    proposal_df['score'] = (
        proposal_df['score'] - proposal_df['score'].min()) / (
            proposal_df['score'].max() - proposal_df['score'].min())
    proposal_df['score'] = np.abs(proposal_df['score'] - 1.0)
    proposal_df = proposal_df.loc[proposal_df['score'].argsort()[::-1]]
    proposal_df = proposal_df.rename(columns={'n-frames': 'f-end'})
    proposal_df['f-end'] = proposal_df['f-init'] + proposal_df['f-end'] - 1
    return proposal_df.reset_index(drop=True)
def basic_compute_loop(compute_function,looper,run_parallel=True,debug=False):
	"""Canonical form of the basic compute loop."""
	start = time.time()
	if run_parallel:
		incoming = Parallel(n_jobs=8,verbose=10 if debug else 0)(
			delayed(compute_function,has_shareable_memory)(**looper[ll]) 
			for ll in framelooper(len(looper),start=start))
	else: 
		incoming = []
		for ll in framelooper(len(looper)):
			incoming.append(compute_function(**looper[ll]))
	return incoming
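
# Minimal sketch of the same parallel/serial switch without the has_shareable_memory
# hint (a helper from older joblib versions); the compute function is a stand-in.
from joblib import Parallel, delayed

def compute_sketch(a, b):
    return a + b

looper_sketch = [dict(a=i, b=2 * i) for i in range(10)]

def simple_compute_loop(compute_function, looper, run_parallel=True):
    if run_parallel:
        return Parallel(n_jobs=2)(delayed(compute_function)(**kw) for kw in looper)
    return [compute_function(**kw) for kw in looper]

incoming_sketch = simple_compute_loop(compute_sketch, looper_sketch)
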
def auto_choose(actionfile, new_xyz, softmin_k = 1, softmin_alpha = 1, nparallel=-1):
    """
    @param demofile  : h5py.File object
    @param new_xyz   : new rope point-cloud
    @param softmin   : use softmin distribution over first <softmin> demonstrations
                       set to 1 for nearest neighbor
    @param nparallel : number of parallel jobs to run for tps cost calculaion
                       set to -1 for no parallelization
    
    @return          : return the name of the segment with the lowest warping cost.
    """
    if not nparallel == -1:
        from joblib import Parallel, delayed
        nparallel = min(nparallel, 8)

    demo_data = actionfile.items()

    if nparallel != -1:
        before = time.time()
        redprint("auto choose parallel with njobs = %d"%nparallel)
        costs  = Parallel(n_jobs=nparallel, verbose=100)(delayed(registration_cost)(ddata[1]['cloud_xyz'][:], new_xyz) for ddata in demo_data)
        after  = time.time()
        print "Parallel registration time in seconds =", after - before
    else:
        costs = []
        redprint("auto choose sequential..")
        for i, ddata in enumerate(demo_data):
            costs.append(registration_cost(ddata[1]['cloud_xyz'][:], new_xyz))
            print(("tps-cost completed %i/%i" % (i + 1, len(demo_data))))
    
    # use a random draw from the softmin distribution
    demo_costs = zip(costs, demo_data)
    if softmin_k == 1:
        ibest = np.argmin(costs)
        return demo_data[ibest][0]
    best_k_demos = np.asarray(sorted(demo_costs)[:softmin_k])
    best_k_exps = np.exp(-1*softmin_alpha*best_k_demos[:, 0].astype(float))  #multiply by -1 b/c we're actually min-ing
    if len(best_k_exps) > 1:
        denom = sum(best_k_exps)
    else:
        denom = best_k_exps
    mass_fn = best_k_exps/denom

    draw = random.random()
    for i in range(len(best_k_demos)):
        if draw <= mass_fn[i]:
            ret_val = demo_data[i][0]
            break
        draw -= mass_fn[i]
    
    redprint ("auto choose returning..")
    return ret_val
def extract_all_class_features(dataset, n_jobs=1, stride=5, patch_size=10):
    """Extract masked features from all dataset images, return features and labels"""
    cns = []
    labels = []
    for (label, cls) in enumerate(dataset.classes):
        print 'Extracting masked CNs from class {}'.format(cls)
        hists = Parallel(n_jobs=n_jobs)(delayed(extract_masked_cns)(imname, maskname) for (imname, maskname) in dataset.get_class_images(cls))
        hists = np.vstack(hists)
        labels.append(label * np.ones((len(hists),), dtype=np.float32))
        cns.append(hists.astype(np.float32))
    
    # Return per-class lists of feature and label arrays.
    return (cns, labels)
Example #15
	def train(self):
		regressors = []
		if self.parallel:
			regressors = Parallel(n_jobs=-1)(delayed(trainBin)(self.params[b], np.atleast_2d(self.ind).T, self.dep[b],self.indWeights) for b in self.OD.bins)
		else:
			for b in self.OD.bins:
				regressors.append(trainBin(self.params[b],np.atleast_2d(self.ind).T, self.dep[b],self.indWeights))
				#self.svr[b] = SVR(cache_size=1000,kernel='rbf', C=self.params[b]['C'], gamma=self.params[b]['gamma'])
				#self.svr[b].fit(np.array([self.ind]).T,self.dep[b])
				
		
		for i,model in enumerate(regressors):
			self.svr[self.OD.bins[i]] = model
def create_training_data():
  num_cores = 8

  # getting total number of trips
  list_of_files = [[folder, f.replace('.csv','')] for folder in os.listdir('drivers') if 'DS_Store' not in folder
                 for f in os.listdir('drivers/'+folder) if '.csv' in f]

  raw_data = Parallel( n_jobs=num_cores )(delayed(create_attributes)(i) for i in list_of_files)
  raw_data = pd.DataFrame(raw_data)
  raw_data.columns = ['driver_trip','trip_time','total_distance','skyway_distance','avg_speed','std_speed',
                      'avg_speed_up','avg_speed_down',
                      'avg_acc','std_acc','avg_turn','std_turn','standing_time','standing_speed']
  # save to file for later training
  raw_data.to_csv('training_set.csv', index=False)
  return raw_data
Example #17
    def predict(self, test_set=True, location=None):
        Y, self.locations = self.data.get_y(location=location)
        t = self.data.observations['time'].values
        t = self._split_dataset(t, test_set=test_set)
        Y = self._split_dataset(Y, test_set=test_set)
        yhat_jobs = []
        ytrue =[]
        yoccur_jobs = []
        if not self.nearest_neighbor:
            X = self.data.get_X()
            X = self._split_dataset(X, test_set=test_set) 
            if self.xtransform is not None:
                X = self.xtrans.transform(X)
        for j, row in self.locations.iterrows():
            if self.nearest_neighbor:
                X = self.data.get_nearest_X(row[self.data.reanalysis_latdim],
                                   row[self.data.reanalysis_londim])

                X = self._split_dataset(X, test_set=test_set) 
                if self.xtransform is not None:
                    X = self.xtrans[j].transform(X)
            if self.conditional is not None:
                yoccur_jobs += [delayed(worker_predict_prob)(self.occurance_models[j], copy.deepcopy(X))]

            yhat_jobs += [delayed(worker_predict)(self.models[j], copy.deepcopy(X))]
            ytrue += [Y[:, j]]

        yhat = Parallel(n_jobs=self.num_proc)(yhat_jobs)
        if self.ytransform is not None:
            transform_jobs = [delayed(worker_invtrans)(self.ytrans[j], yhat[j]) for j in
                                                       range(len(yhat))]
            yhat = Parallel(n_jobs=self.num_proc)(transform_jobs)

        yhat = numpy.vstack(yhat).T
        ytrue = numpy.vstack(ytrue).T
        yhat = self.to_xarray(yhat, t).rename({"value": "projected"})
        ytrue = self.to_xarray(ytrue, t).rename({"value": "ground_truth"})
        if self.conditional is not None:
            yoccur = Parallel(n_jobs=self.num_proc)(yoccur_jobs)
            yoccur = numpy.vstack(yoccur).T > 0.5
            yoccur = self.to_xarray(yoccur, t).rename({"value": "occurance"})
            yhat['projected'] = yhat['projected']*yoccur['occurance']
            yhat = yhat.merge(yoccur)

        out = yhat.merge(ytrue) 
        out['error'] = out.projected - out.ground_truth
        return out
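
# Sketch of the "accumulate delayed calls, execute once" pattern used in predict()
# above; worker_predict_sketch is a stand-in for the real per-location prediction.
from joblib import Parallel, delayed

def worker_predict_sketch(model, X):
    return [model * x for x in X]   # stand-in for model.predict(X)

yhat_jobs_sketch = []
for model in (1, 2, 3):
    yhat_jobs_sketch += [delayed(worker_predict_sketch)(model, [10, 20, 30])]
yhat_sketch = Parallel(n_jobs=2)(yhat_jobs_sketch)
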
Example #18
def run_all(cnf, samples, process_one, finalize_one, finalize_all):
    if len(samples) == 1:
        sample_name, sample_cnf = samples.items()[0]
        run_one(sample_cnf, process_one, finalize_one)
    else:
        results = []
        if cnf.get('parallel'):
            try:
                from joblib import Parallel, delayed
            except ImportError:
                critical(
                    '\nERROR: Joblib not found. You may want samples to be processed '
                    'in parallel; in this case, make sure the python joblib package is installed '
                    '(pip install joblib).')
            else:
                for sample_name, sample_cnf in samples.items():
                    sample_cnf['verbose'] = False

                results = Parallel(n_jobs=len(samples)) \
                    (delayed(run_one)(sample_cnf, process_one, finalize_one,
                                      multiple_samples=True)
                        for sample_name, sample_cnf in samples.items())
        else:
            results = []
            for sample_name, sample_cnf in samples.items():
                results.append(
                    run_one(sample_cnf, process_one, finalize_one,
                            multiple_samples=True))

        if samples:
            info('')
            info('*' * 70)
            info('Results for each sample:')
            finalize_all(cnf, samples, results)

    # Cleaning
    for name, data in samples.items():
        work_dirpath = data['work_dir']
        tx_dirpath = join(work_dirpath, 'tx')

        if isdir(tx_dirpath):
            shutil.rmtree(tx_dirpath)

        if not data.get('keep_intermediate') \
                and isdir(work_dirpath):
            shutil.rmtree(work_dirpath)
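
# Sketch of the lazy-import fallback used in run_all(): run through joblib when it
# is available, otherwise fall back to a plain sequential loop.
def run_many(func, items, n_jobs=4):
    try:
        from joblib import Parallel, delayed
    except ImportError:
        return [func(item) for item in items]
    return Parallel(n_jobs=n_jobs)(delayed(func)(item) for item in items)

def square_sketch(x):
    return x * x

squares_sketch = run_many(square_sketch, range(8))
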
def get_best_matches(img, ids, kps, des):
	number_of_results = 3

	src_kp, src_des = get_kp_desc(img)
	os.remove(img)
	matches = []

	matches = Parallel(n_jobs=-1)(delayed(match_gen)([kp_to_list(src_kp), src_des, kp_to_list(kps[i]), des[i], ids[i]]) for i in range(len(ids)))
	
	# remove product ids that have 0 matches
	for elem in matches[:]:
		if elem[1] == 0:
			matches.remove(elem)

	# sort by ids in order to remove duplicate ids for pics with less matches of the same product
	matches = sorted(matches, key=lambda tup: tup[0])

	# and remove possible product id duplicates
	# that may appear from the match-making algorithm applied 
	# on different picture keypoints of the same product
	s = set()
	for elem in matches[:]:
		if elem[0] in s:
			matches.remove(elem)
		else:
			s.add(elem[0])

	# sort by number of matches
	matches = sorted(matches, key=lambda tup: tup[1])
	matches.reverse()

	# return the first number_of_results most matching
	return [i for i in matches[:number_of_results]]
Example #20
def svm_ova_from_kernel(ktrain, train_labels,
                        ktest, test_labels,
                        C=DEFAULT_REGULARIZATION,
                        bkg_categories=None):

    def sighandler_svm(signum, frame):
        logger.info('Caught signal %i while training SVMs in parallel.'
                    % signum)

    signal.signal(signal.SIGTERM, sighandler_svm)

    n_test = ktest.shape[0]

    categories = np.unique(train_labels)

    # -- remove background categories
    if bkg_categories is not None:
        categories = list(set(categories).difference(set(bkg_categories)))

    n_categories = len(categories)

    cat_index = {}
    predictions = np.empty((n_test, n_categories))

    # -- train OVA SVMs in parallel
    predictions = Parallel(n_jobs=-1) (delayed(one_svm) (ktrain,
                                                  train_labels.reshape(-1),
                                                  ktest,
                                                  cat, C)
           for cat in categories)

    predictions = np.array(predictions).T

    # -- iterates over categories
    for icat, cat in enumerate(categories):
        cat_index[cat] = icat

    gt = np.array([cat_index[e]
                        for e in test_labels.reshape(-1)]).astype('int')
    pred = predictions.argmax(axis=1)
    acc = (pred == gt).sum() / float(n_test)

    return acc, predictions, gt
def auto_choose(demofile, new_xyz, only_original_segments):
    """
    @param demofile:
    @param new_xyz:
    @param only_original_segments: if true, then only the original_segments will be registered with
    @return:
    """
    import pprint

    """Return the segment with the lowest warping cost. Takes about 2 seconds."""
    parallel = True
    if parallel:
        from joblib import Parallel, delayed
    items = demofile.items()
    if only_original_segments:
        #remove all derived segments from items
        print("Only registering with the original segments")
        items = [item for item in items if not "derived" in item[1].keys()]
    unzipped_items = zip(*items)
    keys = unzipped_items[0]
    values = unzipped_items[1]
    ds_clouds, shapes = get_downsampled_clouds(values)
    ds_new = clouds.downsample(new_xyz, 0.01 * DS_SIZE)
    #print 'ds_new_len shape', ds_new.shape
    if parallel:
        before = time.time()
        #TODO: change back n_jobs=12 ?
        costs = Parallel(n_jobs=8, verbose=0)(delayed(registration_cost)(ds_cloud, ds_new) for ds_cloud in ds_clouds)
        after = time.time()
        print "Parallel registration time in seconds =", after - before
    else:
        costs = []
        for (i, ds_cloud) in enumerate(ds_clouds):
            costs.append(registration_cost(ds_cloud, ds_new))
            print(("completed %i/%i" % (i + 1, len(ds_clouds))))
            #print(("costs\n", costs))
    ibest = np.argmin(costs)
    print "ibest = ", ibest
    #pprint.pprint(zip(keys, costs, shapes))
    #print keys
    print "best key = ", keys[ibest]
    print "best cost = ", costs[ibest]
    return keys[ibest]
def findPeaks(imgdict, maplist, params, maptype="ccmaxmap", pikfile=True):
	peaktreelist = []
	count = 0

	thresh =    float(params["thresh"])
	bin =       int(params["bin"])
	diam =      float(params["diam"])
	apix =      float(params["apix"])
	olapmult =  float(params["overlapmult"])
	maxpeaks =  int(params["maxpeaks"])
	maxthresh = params["maxthresh"]
	maxsizemult = float(params["maxsize"])
	peaktype =  params["peaktype"]
	msg =       not params['background']
	pixdiam =   diam/apix/float(bin)
	pixrad =    diam/apix/2.0/float(bin)

	numpyVersion = float(numpy.version.version[:3])
	if numpyVersion > 1.7:
		peaktreelist = Parallel(n_jobs=params['nproc'])(delayed(runFindPeaks)(params,
			maplist,maptype,pikfile,thresh,pixdiam,count,olapmult,maxpeaks,maxsizemult,
			msg,bin,peaktype,pixrad,imgdict) for count in range(0,len(maplist)))
	else:
		## backup for AttributeError: 'memmap' object has no attribute 'offset', bug #3322
		peaktreelist = []
		for count in range(0,len(maplist)):
			mappeaktree = runFindPeaks(params,maplist,maptype,pikfile,thresh,pixdiam,count,olapmult,
				maxpeaks,maxsizemult,msg,bin,peaktype,pixrad,imgdict)
			peaktreelist.append(mappeaktree)

	peaktree = mergePeakTrees(imgdict, peaktreelist, params, msg, pikfile)

	#max threshold
	if maxthresh is not None:
		precount = len(peaktree)
		peaktree = maxThreshPeaks(peaktree, maxthresh)
		postcount = len(peaktree)
		#if precount != postcount:
		apDisplay.printMsg("Filtered %d particles above threshold %.2f"%(precount-postcount,maxthresh))

	return peaktree
Example #23
def main():
    """
    Main function.

    1. Setup logging
    2. Get arguments
    3. Get index
    4. Process files
    5. Write output
    """

    setup_logging()

    logger = logging.getLogger("stats." + __name__)

    args = get_args()

    index = get_index(args)

    logger.warning("Positions not in annotation will be ignored.")

    logger.info("Found " + str(len(args.inputs)) + " input file(s):")
    for input_file in sorted(args.inputs):
        logger.debug(input_file)

    if args.is_parallel:
        stats = Parallel(n_jobs=args.parallel,
                         verbose=100,
                         batch_size=1)(delayed(process_file)(input_file,
                                                             args.type,
                                                             index,
                                                             args.is_parallel)
                                       for input_file in args.inputs)
    else:
        stats = []
        for input_file in args.inputs:
            output_table = process_file(input_file, args.type, index,
                                        args.is_parallel)
            stats.append(output_table)

    write_stats(args.out, stats)
Example #24
 def summary(self, count=32, out_table=None, prange=None, pjob=4):
     if self.ndims > 0:
         if_exists = "replace"
         if out_table == None:
             out_table = "%s_%s_summary" %(self.table, self.name)
             out_table = out_table.replace("[", "_").replace("]", "_")
         def query(i):
             self.logger.info("Processing column %d (of %d)" %(i, self.shape[0]))
             query = self[i].data.alias(name="col")
             q1 = sa.select([sa.text("madlib.fmsketch_dcount(col) as count"),
                             sa.text("madlib.mfvsketch_top_histogram(col, %s) as top" %count)]).select_from(query)
             return [q1, i]
         if prange == None:
             prange = range(1, self.shape[0] + 1)
         queries = [query(i) for i in prange]
         dfs = Parallel(n_jobs=pjob)(delayed(process_query)(q) for q in queries)
         dfs = pd.concat(dfs)
         dfs.index = prange
         dfs["table"] = self.table
         dfs["column"] = self.name
         return dfs
Example #25
 def find_TADs(self, data, gammalist=range(10, 110, 10), segmentation='potts',
               minlen=3, drop_gamma=False, n_jobs='auto'):
     '''
     Finds TADs in data with a list of gammas. Returns a pandas DataFrame
     with columns 'Start', 'End' and 'Gamma'. Use genome_intervals_to_chr on
     the returned object to get coordinates in bed-style format and not in
     coordinates of concatenated genome.
     If *drop_gamma*, drops the 'Gamma' column (useful when using 1 gamma)
     '''
     raise DeprecationWarning('Will be deprecated or rewritten to use '
                              'lavaburst: github.com/nezar-compbio/lavaburst')
     if n_jobs == 'auto': #Empirical values on my computer; with >8 Gb memory try increasing n_jobs
         if segmentation == 'potts':
             n_jobs = 3
         elif segmentation == 'armatus':
             n_jobs = 6
     if (~np.isfinite(data)).any():
         print 'Non-finite values in data, substituting them with zeroes'
         data[~np.isfinite(data)] = 0
     Wcomm, Wnull, pass_mask, length = _precalculate_TADs_in_array(data)
     f = _calculate_TADs
     if n_jobs >= 1:
         from joblib import Parallel, delayed
         domains = Parallel(n_jobs=n_jobs, max_nbytes=1e6)(
                           delayed(f)(Wcomm, Wnull, pass_mask, length, g, segmentation)
                                                                    for g in gammalist)
     elif n_jobs is None or n_jobs == False or n_jobs == 0:
         domains = []
         for g in gammalist:
             domains_g = f(Wcomm, Wnull, pass_mask, length, g, segmentation)
             domains.append(domains_g)
     domains = pd.concat(domains, ignore_index=True)
     domains = domains.query('End-Start>='+str(minlen)).copy()
     domains = domains.sort(columns=['Gamma', 'Start', 'End'])
     domains.reset_index(drop=True, inplace=True)
     domains[['Start', 'End']] = domains[['Start', 'End']].astype(int)
     domains[['Start', 'End']] *= self.resolution
     domains = domains[['Start', 'End', 'Score', 'Gamma']]
     if drop_gamma:
         domains.drop('Gamma', axis=1, inplace=True)
     domains = self.genome_intervals_to_chr(domains).reset_index(drop=True)
     return domains
        out_scores[key] = scores
    out_scores['repeat_idx'] = repeat_idx
    out_predictions['repeat_idx'] = repeat_idx
    return out_scores, out_predictions


DEBUG = False
if DEBUG:
    N_JOBS = 1
    stacked_keys = {'MEG all': meg_powers + meg_cross_powers + meg_handcrafted}

drop_na_scenario = (False, 'local', 'global')
for drop_na in drop_na_scenario[:1 if DEBUG else len(drop_na_scenario)]:
    out = Parallel(n_jobs=N_JOBS)(delayed(run_stacked)(data.query(
        f"repeat == {ii}"), stacked_keys, ii, drop_na)
                                  for ii in range(N_REPEATS))
    out = zip(*out)

    out_scores_meg = next(out)
    out_scores_meg = pd.concat(out_scores_meg, axis=0)
    out_scores_meg.to_csv(SCORES.format('meg' +
                                        drop_na if drop_na else '_na_coded'),
                          index=True)

    out_predictions_meg = next(out)
    out_predictions_meg = pd.concat(out_predictions_meg, axis=0)
    out_predictions_meg.to_csv(
        OUT_PREDICTIONS.format('meg' + drop_na if drop_na else '_na_coded'),
        index=True)
def optimze_func(start_pt):
    result_x, result_f, output = scipy.optimize.fmin_l_bfgs_b(
        func=negative_ei_func,
        x0=start_pt,
        fprime=None,
        args=(),
        approx_grad=True,
        bounds=obj_func_min._search_domain,
        m=10,
        factr=10.0,
        pgtol=1e-10,
        epsilon=1e-08,
        iprint=-1,
        maxfun=15000,
        maxiter=15000,
        disp=0,
        callback=None)
    print output
    return result_x, result_f


with Parallel(n_jobs=50) as parallel:
    parallel_results = parallel(
        delayed(optimze_func)(pt) for pt in start_points)
min_negative_ei = numpy.inf
for i in range(len(parallel_results)):
    if min_negative_ei > parallel_results[i][1]:
        min_negative_ei = parallel_results[i][1]
        best_pt = parallel_results[i][0]
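
# Sketch of reusing one Parallel pool via the context manager (as above), so the
# workers are spawned once and shared across several batches of calls; the
# objective here is a stand-in for the negative-EI optimization target.
from joblib import Parallel, delayed

def objective_sketch(x):
    return (x - 3.0) ** 2   # stand-in objective, minimized at x = 3

start_points_sketch = [0.0, 1.0, 2.5, 4.0, 6.0]
with Parallel(n_jobs=2) as parallel_sketch:
    values = parallel_sketch(delayed(objective_sketch)(p) for p in start_points_sketch)
    refined = parallel_sketch(delayed(objective_sketch)(p + 0.1) for p in start_points_sketch)
best_pt_sketch = start_points_sketch[min(range(len(values)), key=values.__getitem__)]
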
    parser.add_argument('--batch_size', type=int, default=1000)
    args = parser.parse_args()

    meta = pd.read_csv(args.meta_file)
    object_ids = meta['object_id'].unique()

    if not os.path.exists(args.temporary_directory):
        os.mkdir(args.temporary_directory)

    object_id_batches = []
    object_id_batch_count = int(
        math.ceil(len(object_ids) / float(args.batch_size)))
    for batch in range(object_id_batch_count):
        batch_ids = object_ids[batch * args.batch_size:][:args.batch_size]
        object_id_batches.append(batch_ids)

    signal_reader = SignalReader(args.signal_file)
    fft_features_files = Parallel(n_jobs=args.process_count)(
        delayed(extract_df_features)
        (pickle.dumps(signal_reader.objects_signals(objects_ids)),
         os.path.join(args.temporary_directory, 'batch-{0}.csv'.format(batch)))
        for batch, objects_ids in enumerate(tqdm(object_id_batches)))
    signal_reader.close()

    assert len(fft_features_files) > 0
    features = pd.read_csv(fft_features_files[0])
    for filename in tqdm(fft_features_files[1:]):
        features = pd.concat([features, pd.read_csv(filename)], sort=True)
        os.remove(filename)
    features.to_csv(args.target_file, index=None)
Example #29
def compute_metrics(label, pred):
    res = Parallel(n_jobs=16)(delayed(corr)(pred, label, i)
                              for i in range(label.shape[1]))
    return res
def aTimes2CorrsParallel(data, listOfCorr, accuracy=50, taumax="auto", performCoarsening=True, split=10):
    """
    Calculate correlations between several photon streams with arrival times
    stored in macrotimes, using parallel computing to speed up the process
    ==========  ===============================================================
    Input       Meaning
    ----------  ---------------------------------------------------------------
    data        Object having fields det0, det1, ..., det24 which contain
                the macrotimes of the photon arrivals [in a.u.]
    listOfCorr  List of correlations to calculate
    split       Chunk size [s]
    ==========  ===============================================================
    Output      Meaning
    ----------  ---------------------------------------------------------------
    G           [N x 2] matrix with tau and G values
    ==========  ===============================================================
    """
    
    if taumax == "auto":
        taumax = 1 / data.macrotime
    
    G = correlations()
    
    Ndet = 21
    calcAv = False
    if 'av' in listOfCorr:
        # calculate the correlations of all channels and calculate average
        listOfCorr.remove('av')
        listOfCorr += list(range(Ndet))
        calcAv = True
    
    for corr in listOfCorr:
        print("Calculating correlation " + str(corr))
        
        # EXTRACT DATA
        if type(corr) == int:
            dataExtr = getattr(data, 'det' + str(corr))
            t0 = dataExtr[:, 0]
            corrname = 'det' + str(corr)
        elif corr == "sum5" or corr == "sum3":
            print("Extracting and sorting photons")
            dataExtr = extractSpadPhotonStreams(data, corr)
            t0 = dataExtr[:, 0]
            corrname = corr
        
        # CALCULATE CORRELATIONS
        duration = t0[-1] * data.macrotime
        Nchunks = int(np.floor(duration / split))
        # go over all filters
        for j in range(np.shape(dataExtr)[1] - 1):
            print("   Filter " + str(j))
            if j == 0:
                Processed_list = Parallel(n_jobs=multiprocessing.cpu_count() - 1)(delayed(parallelG)(t0, [1], data.macrotime, j, split, accuracy, taumax, performCoarsening, chunk) for chunk in list(range(Nchunks)))
            else:
                w0 = dataExtr[:, j+1]
                Processed_list = Parallel(n_jobs=multiprocessing.cpu_count() - 1)(delayed(parallelG)(t0, w0, data.macrotime, j, split, accuracy, taumax, performCoarsening, chunk) for chunk in list(range(Nchunks)))
            
            for chunk in range(Nchunks):
                setattr(G, corrname + "F" + str(j) + '_chunk' + str(chunk), Processed_list[chunk])
           
            # average over all chunks
            listOfFields = list(G.__dict__.keys())
            listOfFields = [i for i in listOfFields if i.startswith(corrname + "F" + str(j) + "_chunk")]
            Gav = sum(getattr(G, i) for i in listOfFields) / len(listOfFields)
            setattr(G, corrname + "F" + str(j) + '_average', Gav)
    
    if calcAv:
        # calculate average correlation of all detector elements
        for f in range(np.shape(dataExtr)[1] - 1):
            # start with correlation of detector 20 (last one)
            Gav = getattr(G, 'det' + str(Ndet-1) + 'F' + str(f) + '_average')
            # add correlations detector elements 0-19
            for det in range(Ndet - 1):
                Gav += getattr(G, 'det' + str(det) + 'F' + str(f) + '_average')
            # divide by the number of detector elements to get the average
            Gav = Gav / Ndet
            # store average in G
            setattr(G, 'F' + str(f) + '_average', Gav)
    
    return G
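
# Sketch of the chunked pattern above: split the stream into time chunks, process
# each chunk in parallel on all-but-one CPU, then average the per-chunk results
# (a plain autocorrelation stands in for parallelG here).
import multiprocessing
import numpy as np
from joblib import Parallel, delayed

def chunk_autocorr(signal, chunk, chunk_size):
    seg = signal[chunk * chunk_size:(chunk + 1) * chunk_size]
    return np.correlate(seg, seg, mode='full')

signal_sketch = np.random.rand(10000)
chunk_size_sketch = 1000
n_chunks_sketch = len(signal_sketch) // chunk_size_sketch
per_chunk = Parallel(n_jobs=max(1, multiprocessing.cpu_count() - 1))(
    delayed(chunk_autocorr)(signal_sketch, c, chunk_size_sketch) for c in range(n_chunks_sketch))
g_average_sketch = sum(per_chunk) / n_chunks_sketch
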
			px = int((mx - gt[0]) / gt[1]) #x pixel
			py = int((my - gt[3]) / gt[5]) #y pixel
			# print "[ RASTER BAND COUNT ]: ", src_ds.RasterCount
			
			for band in range( src_ds.RasterCount ):
				band += 1
				srcband = src_ds.GetRasterBand(band)
				structval = srcband.ReadRaster(px,py,1,1,buf_type=srcband.DataType )	
				bandtype = gdal.GetDataTypeName(srcband.DataType)
				intval = struct.unpack(fmttypes[bandtype] , structval)
				val=int(intval[0])*0.1
				values.append(var+"_"+str(i)+"-"+str(band)+"_"+str(val))
	return values 


num_cores = multiprocessing.cpu_count()
results = Parallel(n_jobs=num_cores)(delayed(processInput)(i) for i in range (yi, yf + 1, 1))
			
print results	


# mx,my=float(lon), float(lat)  #coord in map units # mx,my=-74.930451, 5.363346  #coord in map units
# i=205
# values=[]
# for var in varslist:	
	# filetif= dirbase +"//"+var+"//"+var+"_"+str(i)+ ".tif"
	# if os.path.exists(filetif):	
		# print filetif
		# src_filename = filetif
		# src_ds=gdal.Open(src_filename) 
		# gt=src_ds.GetGeoTransform()
		# px = int((mx - gt[0]) / gt[1]) #x pixel
Example #32
def main():
    parser = argparse.ArgumentParser(description='Register & align images')
    parser.add_argument(
        'filenames',
        nargs='+',
        help=
        'List of target files to register. Images are aligned to first in list.'
    )
    parser.add_argument('-odir',
                        metavar='outdir',
                        required=True,
                        type=str,
                        help='Output directory for files.')
    parser.add_argument(
        '-m',
        metavar='method',
        choices=('point', 'extended'),
        default='extended',
        help='Specify alignment method (point or extended); default=extended.')
    parser.add_argument(
        '-xy',
        nargs=2,
        type=float,
        default=None,
        help=
        'Specify approximate "x y" pixel coordinate of object to centroid on.  Required for point mode; useful for extended mode (default=center of image).'
    )
    parser.add_argument(
        '-box',
        nargs=2,
        type=int,
        default=None,
        help=
        'Specify box size (w h) to restrict alignment search.  Useful for both point & extended modes (default=full size of array).'
    )
    parser.add_argument('--c',
                        action='store_true',
                        help='Clobber (overwrite) on output')
    parser.add_argument(
        '-njobs',
        type=int,
        default=1,
        help='Process images in parallel. "-1" is all CPUs (default=1).')

    args = parser.parse_args()

    if args.m == 'point' and args.xy is None:
        parser.error("-m point requires -xy coordinate")

    # create output directory
    if args.odir not in ['', '.']:
        makedirs(args.odir, exist_ok=True)

    # align all images to first filename
    ref = args.filenames[0]
    align = args.filenames[1:]

    imref = partial(register,
                    ref=ref,
                    outdir=args.odir,
                    method=args.m,
                    center=args.xy,
                    size=args.box,
                    overwrite=args.c)

    outfiles = Parallel(n_jobs=args.njobs,
                        verbose=11)(delayed(imref)(toshift=a) for a in align)

    # Write ref to outdir
    refnew = os.path.join(args.odir, os.path.basename(ref))
    copy(ref, refnew)

    outfiles.append(refnew)
    print('Wrote %i files to %s' % (len(outfiles), args.odir))
Example #33
def mercat_main():
    __args__, m_parser = parseargs()

    kmer = __args__.k
    num_cores = __args__.n
    m_inputfile = __args__.i
    m_inputfolder = __args__.f
    prune_kmer = __args__.c
    mflag_fastq = __args__.q
    mflag_prodigal = __args__.p
    mflag_trimmomatic = __args__.t
    mflag_protein = __args__.pro
    mfile_size_split = __args__.s

    kmerstring = str(kmer) + "-mers"

    if not mfile_size_split:
        mfile_size_split = 100

    np_string = "nucleotide"
    if mflag_protein or mflag_prodigal: np_string = "protein"
    def_option =  not __args__.p and not __args__.q and not __args__.pro

    all_ipfiles = []
    if m_inputfolder:
        m_inputfolder = os.path.abspath(m_inputfolder)
        os.chdir(m_inputfolder)
        #Assume all have same ext
        for fname in os.listdir(m_inputfolder):
            mip = os.path.join(m_inputfolder, fname)
            if not os.path.isdir(mip):
                # skip directories
                all_ipfiles.append(mip)

    else:
        #m_inputfolder = os.getcwd()
        m_inputfolder = os.path.dirname(os.path.abspath(m_inputfile))
        all_ipfiles.append(os.path.abspath(m_inputfile))

    top10_all_samples = dict()
    for m_inputfile in all_ipfiles:

        os.chdir(m_inputfolder)
        check_args(m_inputfile,__args__,def_option,m_parser)

        m_inputfile = os.path.abspath(m_inputfile)

        sample_name = os.path.splitext(os.path.basename(m_inputfile))[0]
        basename_ipfile = os.path.splitext(os.path.basename(m_inputfile))[0] + "_" + np_string

        inputfile_size = os.stat(m_inputfile).st_size
        dir_runs = "mercat_results/" + basename_ipfile + "_run"

        if os.path.exists(dir_runs):
            shutil.rmtree(dir_runs)
        os.makedirs(dir_runs)

        all_chunks_ipfile = []
        is_chunked = False
        if inputfile_size >= (mfile_size_split*1024*1024): #100MB
            print("Large input file provided: Splitting it into smaller files...\n")
            mercat_chunker(m_inputfile,dir_runs,str(mfile_size_split)+"M",">")
            os.chdir(dir_runs)
            all_chunks_ipfile = glob.glob("*")
            is_chunked=True
        else:
            os.chdir(dir_runs)
            all_chunks_ipfile.append(m_inputfile)

        #print all_chunks_ipfile
        #sys.exit(1)


        splitSummaryFiles = []

        for inputfile in all_chunks_ipfile:

            bif = os.path.splitext(os.path.basename(inputfile))[0] + "_" + np_string

            '''trimmomatic SE -phred33 test.fq Out.fastq ILLUMINACLIP:TruSeq2-SE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:30 MINLEN:50'''
            if mflag_trimmomatic:
                swq = mflag_trimmomatic
                trimmed_file = bif+"_trimmed.fq"
                prod_cmd = "trimmomatic SE -phred33 %s %s ILLUMINACLIP:TruSeq2-SE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:%s MINLEN:50" %(inputfile,trimmed_file,swq)
                with open(os.devnull, 'w') as FNULL:
                    subprocess.call(prod_cmd, stdout=FNULL, stderr=FNULL, shell=True)
                inputfile = trimmed_file

            "Run prodigal if specified"
            '''prodigal -i test_amino-acid.fa -o output.gff -a output.orf_pro.faa  -f gff -p meta -d output.orf_nuc'''
            if mflag_prodigal:
                mflag_protein = True
                gen_protein_file = bif+"_pro.faa"
                prod_cmd = "prodigal -i %s -o %s -a %s -f gff -p meta -d %s" % (
                inputfile, bif + ".gff", gen_protein_file, bif + "_nuc.ffn")

                if mflag_fastq and mflag_trimmomatic:
                    trimfna = bif + "_trimmed.fna"
                    sequences = OrderedDict()
                    with open(inputfile, 'r') as f:
                        seq = ""
                        sname = ""
                        for line in f:
                            line = line.strip()
                            if line.startswith("@"):
                                sname = line[1:].split()[0]
                            elif line.startswith("+"):
                                if seq:
                                    sequences[sname] = seq
                                    seq = ""
                            else:
                                if sname not in sequences: seq = line

                    fnastring = ""

                    for sname in sequences:
                        fnastring += ">"+sname+"\n"
                        fnastring += sequences[sname]+"\n"

                    with open(trimfna, 'w') as f: f.write(fnastring)

                    gen_protein_file = bif + "_trimmed_pro.faa"
                    prod_cmd = "prodigal -i %s -o %s -a %s -f gff -p meta -d %s" % (
                    trimfna, bif + ".gff", gen_protein_file, bif + "_nuc.ffn")

                print(prod_cmd)
                with open(os.devnull, 'w') as FNULL:
                    subprocess.call(prod_cmd, stdout=FNULL, stderr=FNULL, shell=True)
                inputfile = gen_protein_file

            print("Running mercat using " + str(num_cores) + " cores")
            print("input file: " + inputfile)

            sequences = OrderedDict()
            is_fastq = False


            start_time = timeit.default_timer()


            with open(inputfile,'r') as f:
                for line in f:
                    if line.startswith(">"): break
                    elif line.startswith("@"):
                        is_fastq = True
                        break



            with open(inputfile,'r') as f:
                if not is_fastq:
                    seq = ""
                    sname = ""
                    for line in f:
                        line = line.strip()
                        if line.startswith(">"):
                            if sname: sequences[sname] = ""
                            if seq:
                                sequences[sname] = seq
                                seq = ""
                            sname = line[1:]
                            sname = sname.split("#",1)[0].strip()
                        else:
                            line = line.replace("*","")
                            seq += line

                    #assert sname and seq
                    sequences[sname] = seq

                else: #process fastq file
                    seq = ""
                    sname = ""
                    for line in f:
                        line = line.strip()
                        if line.startswith("@"):
                            sname = line[1:].split()[0]
                        elif line.startswith("+"):
                            if seq:
                                sequences[sname] = seq
                                seq = ""
                        else:
                            if sname not in sequences: seq = line


            #print sequences.keys()[0] + "="+ sequences.values()[0]

            print("Number of sequences in " + inputfile + " = "+ str(humanize.intword(len(sequences))))


            # results = Parallel(n_jobs=num_cores)(
            #     delayed(calculateKmerCount)(seq, sequences[seq], prune_kmer, kmer) for seq in sequences)
            results = Parallel(n_jobs=num_cores)(
                delayed(calculateKmerCount)(sequences[seq], kmer) for seq in sequences)


            kmerlist = dict()
            #kmerlist_all_seq = dict()

            for d in results:
                for k,v in list(d.items()):
                    if k in kmerlist:
                        kmerlist[k] += v
                    else: kmerlist[k] = v

            # for d in results:
            #     for seq,kdict in list(d[1].items()):
            #         #assert seq not in kmerlist_all_seq
            #         kmerlist_all_seq[seq] = kdict#.copy()

            print("Time to compute " + kmerstring +  ": " + str(round(timeit.default_timer() - start_time,2)) + " secs")

            significant_kmers = []
            for k in kmerlist:
                if kmerlist[k] >= prune_kmer: significant_kmers.append(k)

            print("Total number of " + kmerstring +  " found: " + str(humanize.intword(len(kmerlist))))
            print(kmerstring +  " with count >= " + str(prune_kmer) + ": " + str(humanize.intword(len(significant_kmers))))

            #df = df.ix[df[bif] >= prune_kmer]


            if mflag_protein:
                df = pd.DataFrame(0.0, index=significant_kmers, columns=['Count',"PI","MW","Hydro"])
                for k in significant_kmers:
                    df.set_value(k, 'Count', kmerlist[k])
                    df.set_value(k,"PI", predict_isoelectric_point_ProMoST(k))
                    df.set_value(k, "MW", calculate_MW(k))
                    df.set_value(k, "Hydro", calculate_hydro(k))

                df.to_csv(bif + "_summary.csv", index_label=kmerstring, index=True)
            else:
                df = pd.DataFrame(0, index=significant_kmers, columns=['Count',"GC_Percent","AT_Percent"])
                for k in significant_kmers:
                    c_kmer = k
                    df.set_value(k, 'Count', kmerlist[k])
                    len_cseq = float(len(c_kmer))
                    df.set_value(k, "GC_Percent", round(((c_kmer.count("G")+c_kmer.count("C")) / len_cseq) * 100.0))
                    df.set_value(k, "AT_Percent", round(((c_kmer.count("A")+c_kmer.count("T")) / len_cseq) * 100.0))

                df.to_csv(bif + "_summary.csv", index_label=kmerstring, index=True)

            splitSummaryFiles.append(bif + "_summary.csv")

            # dfcol = significant_kmers
            #
            #
            # if not mflag_protein:
            #     dfcol.extend(["length","GC_Percent","AT_Percent"])
            #
            #     df = pd.DataFrame(0,index=list(sequences.keys()),columns=dfcol)
            #
            #     for seq in sequences:
            #         cseq = sequences[seq]
            #         len_cseq = float(len(cseq))
            #         df.set_value(seq, "length", int(len_cseq))
            #         df.set_value(seq, "GC_Percent", round(((cseq.count("G")+cseq.count("C")) / len_cseq) * 100.0))
            #         df.set_value(seq, "AT_Percent", round(((cseq.count("A")+cseq.count("T")) / len_cseq) * 100.0))
            #         for ss in kmerlist_all_seq[seq]:
            #             df.set_value(seq, ss, kmerlist_all_seq[seq][ss])
            #
            #         #df = df.loc[:, df.max() >= prune_kmer]
            #         df1 = df.ix[:,['length','GC_Percent','AT_Percent']]
            #         del df['length']
            #         del df['GC_Percent']
            #         del df['AT_Percent']
            #         df = df.loc[:, df.max() >= prune_kmer]
            #         df.loc[:, 'length'] = df1.ix[:,'length']
            #         df.loc[:, 'GC_Percent'] = df1.ix[:,'GC_Percent']
            #         df.loc[:, 'AT_Percent'] = df1.ix[:,'AT_Percent']
            #
            #
            # else:
            #
            #     dfcol.extend(["length", "PI", "MW","Hydro"])
            #
            #     df = pd.DataFrame(0, index=list(sequences.keys()), columns=dfcol)
            #
            #     for seq in sequences:
            #         cseq = sequences[seq]
            #         cseq=cseq.replace('*','')
            #         len_cseq = float(len(cseq))
            #         df.set_value(seq, "length", int(len_cseq))
            #         df.set_value(seq, "PI", predict_isoelectric_point_ProMoST(cseq))
            #         df.set_value(seq, "MW", calculate_MW(cseq))
            #         df.set_value(seq, "Hydro", calculate_hydro(cseq))
            #         for ss in kmerlist_all_seq[seq]:
            #             df.set_value(seq, ss, kmerlist_all_seq[seq][ss])
            #
            #         #df = df.loc[:,df.max() >= prune_kmer]
            #         df1 = df.ix[:,['length','PI','MW','Hydro']]
            #         del df['length']
            #         del df['PI']
            #         del df['MW']
            #         del df['Hydro']
            #         df = df.loc[:, df.max() >= prune_kmer]
            #         df.loc[:, 'length'] = df1.ix[:,'length']
            #         df.loc[:, 'PI'] = df1.ix[:,'PI']
            #         df.loc[:, 'MW'] = df1.ix[:,'MW']
            #         df.loc[:, 'Hydro'] = df1.ix[:, 'Hydro']
            #
            # df.to_csv(bif+".csv",index_label='Sequence',index=True)

            print("Total time: " + str(round(timeit.default_timer() - start_time,2)) + " secs")


        num_chunks = len(all_chunks_ipfile)
        df = dd.read_csv(splitSummaryFiles)
        dfgb = df.groupby(kmerstring).sum()
        df10 = dfgb.nlargest(10,'Count').compute()
        dfsum = dfgb.sum(0).compute()

        dfgb.to_csv("./" + basename_ipfile + "_finalSummary*.csv", index_label=kmerstring, name_function=name)

        if mflag_protein:
            df10[['PI', 'MW', 'Hydro']] = df10[['PI', 'MW', 'Hydro']] / num_chunks
        else:
            df10[['GC_Percent', 'AT_Percent']] = df10[['GC_Percent', 'AT_Percent']] / num_chunks

        top10_all_samples[sample_name] = [df10,dfsum.Count]

        all_counts = dfgb.Count.values.compute().astype(int)
        mercat_compute_alpha_beta_diversity(all_counts,basename_ipfile)

        if is_chunked:
            for tempfile in all_chunks_ipfile:
                os.remove(tempfile)
            for sf in splitSummaryFiles:
                os.remove(sf)

    plots_dir = m_inputfolder+"/mercat_results/plots"
    if os.path.exists(plots_dir):
        shutil.rmtree(plots_dir)
    os.makedirs(plots_dir)
    os.chdir(plots_dir)

    for basename_ipfile in top10_all_samples:
        df10,_ = top10_all_samples[basename_ipfile]
        if mflag_protein:
            mercat_scatter_plots(basename_ipfile, 'PI', df10, kmerstring)
            mercat_scatter_plots(basename_ipfile, 'MW', df10, kmerstring)
            mercat_scatter_plots(basename_ipfile, 'Hydro', df10, kmerstring)
        else:
            mercat_scatter_plots(basename_ipfile, 'GC_Percent', df10, kmerstring)
            mercat_scatter_plots(basename_ipfile, 'AT_Percent', df10, kmerstring)

    sbname = os.path.basename(m_inputfolder)
    if len(all_ipfiles) == 1: sbname = os.path.basename(all_ipfiles[0])
    mercat_stackedbar_plots(sbname,top10_all_samples, 'Count', kmerstring)
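The chunk-merging step above relies on dask's lazy dataframes; as a rough illustration of the same pattern (the file paths and the "k-mers" column name below are made up), it boils down to:

import dask.dataframe as dd

# Read every per-chunk summary as one logical dataframe, aggregate lazily,
# and only materialise the small reductions with .compute().
df = dd.read_csv(["chunk_0_summary.csv", "chunk_1_summary.csv"])
totals = df.groupby("k-mers").sum()
top10 = totals.nlargest(10, "Count").compute()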
def anisotropic_smooth(inpd, fiber_distance_threshold, points_per_fiber=30, n_jobs=2, cluster_max = 10):
    """ Average nearby fibers.
    
    The pairwise fiber distance matrix is computed, then fibers
    are averaged with their neighbors until an edge (distance greater than
    fiber_distance_threshold) is encountered.

    """

    # polydata to array conversion, fixed-length fiber representation
    current_fiber_array = fibers.FiberArray()
    current_fiber_array.points_per_fiber = points_per_fiber
    current_fiber_array.convert_from_polydata(inpd)
    original_number_of_fibers = current_fiber_array.number_of_fibers
    
    # fiber list data structure initialization for easy fiber averaging
    curr_count = list()
    curr_fibers = list()
    curr_indices = list()
    for lidx in range(0, current_fiber_array.number_of_fibers):
        curr_fibers.append(current_fiber_array.get_fiber(lidx))
        curr_count.append(1)
        curr_indices.append(list([lidx]))
        
    converged = False
    iteration_count = 0
    
    while not converged:
        print "<filter.py> ITERATION:", iteration_count, "SUM FIBER COUNTS:", numpy.sum(numpy.array(curr_count))
        print "<filter.py> number indices", len(curr_indices)
        
        # fiber data structures for output of this iteration
        next_fibers = list()
        next_count = list()
        next_indices = list()
        
        # information for this iteration
        done = numpy.zeros(current_fiber_array.number_of_fibers)
        fiber_indices = range(0, current_fiber_array.number_of_fibers)

        # if the maximum number of fibers have been combined, stop averaging this fiber
        done[numpy.nonzero(numpy.array(curr_count) >= cluster_max)] = 1
        
        # pairwise distance matrix
        if USE_PARALLEL:
            distances = Parallel(n_jobs=n_jobs, verbose=1)(
                delayed(similarity.fiber_distance)(
                current_fiber_array.get_fiber(lidx),
                current_fiber_array,
                0, 'Hausdorff')
                for lidx in fiber_indices)
            distances = numpy.array(distances)
        else:
            distances = \
                numpy.zeros(
                (current_fiber_array.number_of_fibers,
                 current_fiber_array.number_of_fibers))
            for lidx in fiber_indices:
                distances[lidx, :] = \
                    similarity.fiber_distance(
                        current_fiber_array.get_fiber(lidx),
                        current_fiber_array, 0, 'Hausdorff')

        # distances to self are not of interest
        for lidx in fiber_indices:
            distances[lidx,lidx] = numpy.inf
        
        # sort the pairwise distances. 
        distances_flat = distances.flatten()
        pair_order = numpy.argsort(distances_flat)

        print "<filter.py> DISTANCE MIN:", distances_flat[pair_order[0]], \
            "DISTANCE COUNT:", distances.shape

        # if the smallest distance is greater or equal to the
        # threshold, we have converged
        if distances_flat[pair_order[0]] >= fiber_distance_threshold:
            converged = True
            print "<filter.py> CONVERGED"
            break
        else:
            print "<filter.py> NOT CONVERGED"
            
        # loop variables
        idx = 0
        pair_idx = pair_order[idx]
        number_of_fibers = distances.shape[0]
        number_averages = 0
        
        # combine nearest neighbors unless done, until hit threshold
        while distances_flat[pair_idx] < fiber_distance_threshold:
            # find the fiber indices corresponding to this pairwise distance
            # use integer (floor) division and mod to recover row/column
            f_row = pair_idx // number_of_fibers
            f_col = pair_idx % number_of_fibers

            # check if this neighbor pair can be combined
            combine = (not done[f_row]) and (not done[f_col])
            if combine :
                done[f_row] += 1
                done[f_col] += 1
                # weighted average of the fibers (depending on how many each one represents)
                next_fibers.append(
                    (curr_fibers[f_row] * curr_count[f_row] + \
                     curr_fibers[f_col] *curr_count[f_col]) / \
                    (curr_count[f_row] + curr_count[f_col]))
                # this was the regular average
                #next_fibers.append((curr_fibers[f_row] + curr_fibers[f_col])/2)
                next_count.append(curr_count[f_row] + curr_count[f_col])
                number_averages += 1
                #next_indices.append(list([curr_indices[f_row], curr_indices[f_col]]))
                next_indices.append(list(curr_indices[f_row] + curr_indices[f_col]))
                
            # increment for the loop
            idx += 1
            pair_idx = pair_order[idx]

        # copy through any unvisited (already converged) fibers
        unvisited = numpy.nonzero(done==0)[0]
        for fidx in unvisited:
            next_fibers.append(curr_fibers[fidx])
            next_count.append(curr_count[fidx])
            next_indices.append(curr_indices[fidx])
            
        # set up for next iteration
        curr_fibers = next_fibers
        curr_count = next_count
        curr_indices = next_indices
        iteration_count += 1

        # set up array for next iteration distance computation
        current_fiber_array = fibers.FiberArray()    
        current_fiber_array.number_of_fibers = len(curr_fibers)
        current_fiber_array.points_per_fiber = points_per_fiber
        dims = [current_fiber_array.number_of_fibers, current_fiber_array.points_per_fiber]
        # fiber data
        current_fiber_array.fiber_array_r = numpy.zeros(dims)
        current_fiber_array.fiber_array_a = numpy.zeros(dims)
        current_fiber_array.fiber_array_s = numpy.zeros(dims)
        curr_fidx = 0
        for curr_fib in curr_fibers:
            current_fiber_array.fiber_array_r[curr_fidx] = curr_fib.r
            current_fiber_array.fiber_array_a[curr_fidx] = curr_fib.a
            current_fiber_array.fiber_array_s[curr_fidx] = curr_fib.s
            curr_fidx += 1

        print "<filter.py> SUM FIBER COUNTS:", numpy.sum(numpy.array(curr_count)), "SUM DONE FIBERS:", numpy.sum(done)
        print "<filter.py> MAX COUNT:" , numpy.max(numpy.array(curr_count)), "AVGS THIS ITER:", number_averages

    # when converged, convert output to polydata
    outpd = current_fiber_array.convert_to_polydata()

    # color output by the number of fibers that each output fiber corresponds to
    outcolors = vtk.vtkFloatArray()
    outcolors.SetName('FiberTotal')
    for count in curr_count:
        outcolors.InsertNextTuple1(count)
    outpd.GetCellData().SetScalars(outcolors)

    # also color the input pd by output cluster number
    cluster_numbers = numpy.zeros(original_number_of_fibers)
    cluster_count = numpy.zeros(original_number_of_fibers)
    cluster_idx = 0
    for index_list in curr_indices:
        indices = numpy.array(index_list).astype(int)
        cluster_numbers[indices] = cluster_idx
        cluster_count[indices] = curr_count[cluster_idx]
        cluster_idx += 1
    outclusters =  vtk.vtkFloatArray()
    outclusters.SetName('ClusterNumber')
    for cluster in cluster_numbers:
        outclusters.InsertNextTuple1(cluster)
    inpd.GetCellData().AddArray(outclusters)
    inpd.GetCellData().SetActiveScalars('ClusterNumber')

    return outpd, numpy.array(curr_count), inpd, cluster_numbers, cluster_count
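For intuition, here is a minimal, self-contained numpy sketch of the same greedy neighbor-averaging idea on plain 2-D points; it mirrors the weighted merge above but is not the FiberArray/VTK pipeline:

import numpy as np

def greedy_average(points, threshold):
    # Repeatedly merge the closest pair (weighted by how many originals each
    # point already represents) until the smallest distance reaches threshold.
    pts = [p.astype(float) for p in points]
    counts = [1] * len(pts)
    while len(pts) > 1:
        arr = np.array(pts)
        d = np.linalg.norm(arr[:, None, :] - arr[None, :, :], axis=-1)
        np.fill_diagonal(d, np.inf)
        i, j = np.unravel_index(np.argmin(d), d.shape)
        if d[i, j] >= threshold:
            break
        merged = (pts[i] * counts[i] + pts[j] * counts[j]) / (counts[i] + counts[j])
        new_count = counts[i] + counts[j]
        for k in sorted((i, j), reverse=True):
            del pts[k], counts[k]
        pts.append(merged)
        counts.append(new_count)
    return np.array(pts), np.array(counts)

# pts, counts = greedy_average(np.random.rand(50, 2), threshold=0.1)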
Exemple #35
def strat_minpath(sample_id,
                  strat_input,
                  minpath_map,
                  out_dir,
                  pathway_db,
                  gap_fill=True,
                  per_sequence_contrib=False,
                  print_opt=False,
                  proc=1):
    '''Read in sample_id, gene family table, and out_dir, and run MinPath based
    on the gene family abundances. Returns both unstratified and stratified
    pathway abundances as dictionaries in a list. Will compute the simplistic
    "community-wide contributions" for stratitifed output unless 
    per_sequence_contrib=True, which in contrast will cause MinPath to be run
    for each sequence. Also returns the coverage of each unstratified pathway
    as the a different dictionary in a list when per_sequence_contrib=True.'''

    # Get gene family abundances summed over all sequences for this sample.
    unstrat_input = strat_to_unstrat_counts(strat_input)

    pathways_present, reaction_abun = minpath_wrapper(sample_id, unstrat_input,
                                                      minpath_map, out_dir,
                                                      print_opt)

    # Initialize series and dataframe that will contain pathway abundances and
    # coverage scores.
    unstrat_abun = pd.Series()
    unstrat_cov = pd.Series()
    strat_abun = pd.Series()
    strat_cov = pd.Series()

    # Return empty series if no pathways are present.
    if len(pathways_present) == 0:
        return ([unstrat_abun, unstrat_cov, pd.Series(), pd.Series()])

    # Get median reaction/gene family abundance for sample, which is used for
    # calculating coverage.
    median_abun = calc_median_reaction_abun(reaction_abun, pathways_present,
                                            pathway_db)

    # Loop through all pathways present and get abundance and coverage.
    for pathway in pathways_present:

        # Get ALL reactions in pathway (which could include optional ones).
        reactions = pathway_db.find_reactions(pathway)

        # Get abundances of all of these reactions.
        path_reaction_abun = {
            reaction_id: reaction_abun[reaction_id]
            for reaction_id in reactions
        }

        # Get pathway abundance and coverage
        pathway_abun, pathway_cov = pathway_abun_and_coverage(
            pathway, pathway_db, path_reaction_abun, median_abun)

        if pathway_abun == 0:
            continue

        # Add these values to each respective pandas Series.
        unstrat_abun[pathway] = pathway_abun
        unstrat_cov[pathway] = pathway_cov

        if not per_sequence_contrib:
            # If --per_sequence_contrib not set then get stratified pathway
            # abundances simply by weighting community-wide pathway abundances
            # by the abundances of all the predicted abundances of reactions in
            # these pathways contributed by each sequence (i.e. predicted
            # genome)

            strat_path_abun = path_abun_weighted_by_seq(
                strat_input, reactions, sum(list(path_reaction_abun.values())),
                unstrat_abun[pathway], pathway)

            strat_abun = pd.concat([strat_abun, strat_path_abun])

    if per_sequence_contrib:

        # Loop over all sequences and get pathway abundances and coverages
        # for each sequence individually. This step will be run in parallel if
        # possible.

        strat_seq_out = Parallel(n_jobs=proc)(delayed(unstrat_minpath_for_seq)(
            seq, sample_id, strat_input[strat_input['sequence'] == seq].copy(),
            minpath_map, out_dir, pathway_db, gap_fill, print_opt, "_" +
            seq) for seq in set(strat_input['sequence']))

        # Parse out the per-seq abundance and coverage outputs into different
        # lists.
        seq_strat_abun = []
        seq_strat_cov = []

        for seq_out in strat_seq_out:

            seq_strat_abun.append(seq_out[0])
            seq_strat_cov.append(seq_out[1])

        # Concatenate these per-sequence values to the stratified series.
        strat_abun = pd.concat(seq_strat_abun)

        strat_cov = pd.concat(seq_strat_cov)

    # Return unstratified and stratified abundances and coverage scores.
    return ([unstrat_abun, unstrat_cov, strat_abun, strat_cov])
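strat_to_unstrat_counts is not shown in this snippet; a plausible pandas sketch of what it is assumed to do (collapse the stratified table to per-function totals) is:

import pandas as pd

def strat_to_unstrat_counts_sketch(strat_input):
    # Sum gene-family abundances over the contributing sequences, keeping one
    # row per function. Column names follow the table layout used above.
    value_cols = [c for c in strat_input.columns
                  if c not in ("function", "sequence")]
    return strat_input.groupby("function")[value_cols].sum()

# Example: two sequences contributing to K1 are summed into a single row.
# strat = pd.DataFrame({"function": ["K1", "K1", "K2"],
#                       "sequence": ["seqA", "seqB", "seqA"],
#                       "S1": [2.0, 1.0, 5.0]})
# strat_to_unstrat_counts_sketch(strat)   # K1 -> 3.0, K2 -> 5.0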
test_acc_list = []
for i in range(iter):
    X_train = np.load('final_train_binarydata_' + str(i) + '.npy')
    Y_train = np.load('final_train_labels_' + str(i) + '.npy')

    X_train = X_train.astype('float')
    X_train = normalize(X_train)
    Y_train = Y_train.astype('float')
    Y_train = Y_train.astype(int)

    randomCombinations = random.sample(list(ParameterGrid(grid)), numSamples)

    print("parallel loop started")

    r = Parallel(n_jobs=-2, verbose=10)(
        delayed(Stratified_kfold)(X_train, Y_train, combination)
        for combination in randomCombinations)
    combination, score, train_acc, test_acc = zip(*r)

    combination_list = list(combination)

    score_list = list(score)
    trainacclist = list(train_acc)
    testacclist = list(test_acc)

    req_idx = score_list.index(max(score_list))
    train_acc_list.append(trainacclist[req_idx])
    test_acc_list.append(testacclist[req_idx])
    bestparamdict[str(i)] = combination_list[req_idx]

print('Train acc = ' + str(sum(train_acc_list) / iter))
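Stratified_kfold is an external helper in this script; a minimal sketch of what it is assumed to do (evaluate one hyper-parameter combination with stratified cross-validation and return accuracies), using an SVC purely for illustration:

import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

def Stratified_kfold_sketch(X, y, combination, n_splits=5):
    # Returns (combination, selection score, mean train acc, mean test acc),
    # matching how the results are unpacked with zip(*r) above.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    train_accs, test_accs = [], []
    for train_idx, test_idx in skf.split(X, y):
        clf = SVC(**combination)  # the actual estimator is an assumption
        clf.fit(X[train_idx], y[train_idx])
        train_accs.append(clf.score(X[train_idx], y[train_idx]))
        test_accs.append(clf.score(X[test_idx], y[test_idx]))
    return (combination, float(np.mean(test_accs)),
            float(np.mean(train_accs)), float(np.mean(test_accs)))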
def gen_data(lst_type, img_lst_patients, img_lst_candidates,
             gen_candidates_json, resample_lungs_json, n_candidates,
             crop_raw_scan_buffer, new_data_type, new_candidates_shape_zyx,
             new_spacing_zyx):
    n_threads = pipe.n_CPUs
    n_junks = int(np.ceil(len(img_lst_patients) / n_threads))
    pipe.log.info('processing ' + str(n_junks) + ' junks with ' +
                  str(n_threads) + ' patients each')
    HU_tissue_range = pipe.load_json('params.json',
                                     'resample_lungs')['HU_tissue_range']
    n_candidates_gen = pipe.load_json('params.json',
                                      'gen_candidates')['n_candidates']
    cand_line_num = 0
    for junk_cnt in range(n_junks):
        junk = []
        for in_junk_cnt in range(n_threads):
            line_num = n_threads * junk_cnt + in_junk_cnt
            if line_num >= len(img_lst_patients):
                break
            junk.append(line_num)
        pipe.log.info('processing junk ' + str(junk_cnt))
        # heterogenous spacing -> homogeneous spacing
        junk_lst = Parallel(n_jobs=min([n_threads, len(junk)]))(
            delayed(gen_patients_candidates)
            (line_num, img_lst_patients, gen_candidates_json,
             resample_lungs_json, n_candidates, crop_raw_scan_buffer,
             new_data_type, new_candidates_shape_zyx, new_spacing_zyx,
             HU_tissue_range) for line_num in junk)
        for junk_result in junk_lst:
            patient, patient_label, images, prob_maps = junk_result
            # take n_candidates or less
            images = np.array(images, dtype=np.int16)[:n_candidates]
            prob_maps = np.array(prob_maps, dtype=np.uint8)[:n_candidates]
            if new_data_type == 'uint8':
                images = (images /
                          (float(HU_tissue_range[1] - HU_tissue_range[0])) *
                          255).astype(np.uint8)  # [0, 255]
            elif new_data_type == 'float32':
                images = (images /
                          (float(HU_tissue_range[1] - HU_tissue_range[0])) -
                          0.25).astype(np.float32)  # [-0.25, 0.75]
                prob_maps = (prob_maps / 255).astype(np.float32)  # [0.0, 1.0]
            images_and_prob_maps = np.concatenate([images, prob_maps],
                                                  axis=4).astype(new_data_type)
            path = pipe.save_array(patient + '.npy', images_and_prob_maps)
            with open(pipe.get_step_dir() + lst_type + '_patients.lst',
                      'a') as f:
                f.write('{}\t{}\t{}\n'.format(patient, patient_label,
                                              os.path.abspath(path)))
            if pipe.dataset_name == 'LUNA16':
                with open(pipe.get_step_dir() + lst_type + '_candidates.lst',
                          'a') as f:
                    for cnt in range(images.shape[0]):
                        cand = patient + '_' + str(cnt)
                        cand_label = img_lst_candidates[
                            img_lst_candidates[0] == cand][1].values.tolist()
                        if len(cand_label) == 0:
                            cand_label = 0
                        else:
                            cand_label = cand_label[0]
                        if not cand.startswith(patient):
                            raise ValueError(cand + ' needs to start with ' +
                                             patient)
                        f.write('{}\t{}\t{}\n'.format(cand, cand_label,
                                                      os.path.abspath(path)))
Exemple #38
def main():
    if len(sys.argv) >= 3:
        # read in the top 100 ranks from txt file
        rank100file = sys.argv[1]
        saveDir = sys.argv[2]
        # baseDir = sys.argv[5]
        # if len(sys.argv) < 6:
        #     baseDir = None
        if len(sys.argv) < 5:
            jobName = ""
        else:
            jobName = sys.argv[4]
        if len(sys.argv) < 4:
            numRanks = 100
        else:
            numRanks = int(sys.argv[3])
        if jobName == "" or jobName == " ":
            jobName = "default"
        #     Get the arguments that tell me what job number I am, and how many total jobs there are
        totalTasks = 1
        if len(sys.argv) > 5:
            totalTasks = int(sys.argv[5])
        taskNumber = 0
        if len(sys.argv) > 6:
            taskNumber = int(sys.argv[6])
            if taskNumber >= totalTasks:
                taskNumber = totalTasks - 1
        numCores = 1
        if len(sys.argv) > 7:
            numCores = int(sys.argv[7])
        dumpDir = "./featureDump"
        if len(sys.argv) > 8:
            dumpDir = sys.argv[8]
        SingleFileName = None
        if len(sys.argv) > 9:
            SingleFileName = sys.argv[9]
        saveDir_mapMask = os.path.join(saveDir, jobName + "_MapMasks")
        saveDir_mask = os.path.join(saveDir, jobName + "_Masks")
        saveDir_other = os.path.join(saveDir, jobName + "_otherData")
        os.makedirs(saveDir_mapMask, exist_ok=True)
        os.makedirs(saveDir_mask, exist_ok=True)
        os.makedirs(saveDir_other, exist_ok=True)
        resultsDictionary = {}
        # Load all rank files (each row contains a probe and its top-N rank results)
        with open(rank100file) as f:
            content = f.readlines()
        content = [x.strip() for x in content]

        # determine the row indexes of which probe/top N result pairs I want to calculate
        myPartitionSize = int(len(content) / totalTasks)
        myPartitionRangeStart = taskNumber * myPartitionSize
        myPartitionRangeEnd = myPartitionRangeStart + myPartitionSize

        for c in content:
            carr = c.split(",")
            resultsDictionary[carr[0]] = carr[1:]
        sortedKeys = resultsDictionary.keys()
        # Sort keys so we know each partition is always working with the same ordered set to partition from
        sortedKeys = sorted(sortedKeys)
        # sortedKeys = ["/afs/crc.nd.edu/group/cvrl/scratch_18/medifor/evaluation/NC2017_evaluation_all/world200_9/f7894ef3d96c0767ba23783d66b1e298.jpg"]
        if numCores > 1:
            Parallel(n_jobs=numCores)(
                delayed(outer_generateMaskForProbe)
                (p, resultsDictionary, numRanks, saveDir_mask, saveDir_mapMask,
                 saveDir_other)
                for p in sortedKeys[myPartitionRangeStart:myPartitionRangeEnd])
        else:
            for p in sortedKeys[myPartitionRangeStart:myPartitionRangeEnd]:
                # if os.path.basename(p) == "bbfb07e272b66a6be65ca87e20908e53.jpg":
                if os.path.basename(
                        p) == "170303979309eebf5a92c492a84997f6.jpg":
                    outer_generateMaskForProbe(p, resultsDictionary, numRanks,
                                               saveDir_mask, saveDir_mapMask,
                                               saveDir_other)
        # for p in sortedKeys:
        #     # if os.path.basename(p) == "8b3c9021c7e6dda308cfe7c594dc79e4.jpg":#"c59a64fb6a8f26cdbc15f3408c43ed26.jpg" or True:#"173e754519ea142944dab8c686efa7b3.jpg":
        #     results = resultsDictionary[p]
        #     finalMapMask, finalMask = genMasksForProbe(p, results,numRanks)
        #     savePath_mask = os.path.join(saveDir_mask, os.path.basename(p))
        #     savePath_mapmask = os.path.join(saveDir_mapMask, os.path.basename(p))
        #     cv.imwrite(savePath_mapmask, finalMapMask)
        #     cv.imwrite(savePath_mask, finalMask)

    else:
        print(
            "usage: BuildMasks.py <rankFile> <save Dir> <Number of Ranks=100> <jobname=default> <Total Number of Jobs1=> <Current Job Number=0> <number of cores = 1> <dataDump Directory= ./datadump> "
        )
Exemple #39
def indep_pairwise(X, window_size, step_size, threshold, verbose=True):
    r"""Determine pair-wise independent variants.

    Independent variants are defined via squared Pearson correlations between
    pairs of variants inside a sliding window.

    Parameters
    ----------
    X : array_like
        Sample by variants matrix.
    window_size : int
        Number of variants inside each window.
    step_size : int
        Number of variants the sliding window skips.
    threshold : float
        Squared Pearson correlation threshold for independence.
    verbose : bool
        `True` for progress information; `False` otherwise.

    Returns
    -------
    ok : boolean array defining independent variants

    Examples
    --------
    .. doctest::

        >>> from numpy.random import RandomState
        >>> from limix.qc import indep_pairwise
        >>>
        >>> random = RandomState(0)
        >>> X = random.randn(10, 20)
        >>>
        >>> indep_pairwise(X, 4, 2, 0.5, verbose=False)
        array([ True,  True, False,  True,  True,  True,  True,  True,  True,
                True,  True,  True,  True,  True,  True,  True,  True,  True,
                True,  True])
    """
    from joblib import Parallel, delayed
    from tqdm import tqdm
    from ..threads import get_max_nthreads
    from numpy import ascontiguousarray, logical_not, zeros

    left = 0
    excls = zeros(X.shape[1], dtype=bool)

    if step_size > window_size:
        raise ValueError("Window size has to be smaller than step size.")

    n = (X.shape[1] + step_size) // step_size

    steps = list(range(n))
    cc = get_max_nthreads()

    with tqdm(total=n, desc="Indep. pairwise", disable=not verbose) as pbar:

        while len(steps) > 0:
            i = 0
            right = 0
            delayeds = []
            while i < len(steps):

                step = steps[i]
                left = step * step_size
                if left < right:
                    i += 1
                    continue

                del steps[i]
                right = min(left + window_size, X.shape[1])
                x = ascontiguousarray(X[:, left:right].T)

                delayeds.append(delayed(_func)(x, excls[left:right], threshold))
                if len(delayeds) == cc:
                    Parallel(n_jobs=min(len(delayeds), cc), backend="threading")(
                        delayeds
                    )
                    pbar.update(len(delayeds))
                    delayeds = []

            if len(delayeds) == 0:
                continue

            Parallel(n_jobs=min(len(delayeds), cc), backend="threading")(delayeds)
            pbar.update(len(delayeds))

    return logical_not(excls)
Exemple #40
def run_minpath_pipeline(inputfile,
                         mapfile,
                         out_dir,
                         proc=1,
                         regroup_mapfile=None,
                         gap_fill=True,
                         per_sequence_contrib=False,
                         print_cmds=False):
    '''Pipeline containing full pipeline for reading input files, making
    calls to functions to run MinPath and calculate pathway abundances and
    coverages. Will return 3 output Pandas dataframes: (1) unstratified pathway
    abundances, (2) unstratified pathway coverages, and (3) stratified pathway
    abundances.'''

    # Read in table of gene family abundances and determine if in stratified
    # format or not.
    in_metagenome, strat_format = read_metagenome_input(inputfile)

    # Remove 'description' column if it exists.
    if "description" in in_metagenome.columns:
        in_metagenome.drop("description", axis=1, inplace=True)

    # Get list of sample ids.
    samples = [
        col for col in in_metagenome.columns
        if col not in ["function", "sequence"]
    ]

    # Initialize reactions to be empty unless regroup mapfile given.
    reactions = []

    # Regroup functions in input table to different ids if regroup mapfile is
    # provided.
    if regroup_mapfile:
        reactions = read_reaction_names(regroup_mapfile)

        in_metagenome = regroup_func_ids(in_metagenome, strat_format,
                                         regroup_mapfile, proc)
        regrouped_outfile = path.join(out_dir, "regrouped_infile.tsv")
        in_metagenome.to_csv(path_or_buf=regrouped_outfile,
                             sep="\t",
                             index=False)

    # Read in pathway structures.
    pathways_in = PathwaysDatabase(database=mapfile, reaction_names=reactions)

    # Write out mapfile with all structure removed.
    minpath_mapfile = path.join(out_dir, "parsed_mapfile.tsv")
    with open(minpath_mapfile, "w") as out_map:
        out_map.write(pathways_in.get_database())

    # Subset input table of reactions to only those found in pathway database.
    in_metagenome = in_metagenome[in_metagenome.function.isin(
        pathways_in.reaction_list())]

    # Run minpath wrapper on all samples if table is stratified. Note that
    # input stratified table is subsetted to required columns only.
    if strat_format:

        if per_sequence_contrib:
            # If running MinPath on each sequence individually then that is the
            # step that gets parallelized (so each sample is looped over
            # one-by-one instead).
            path_abun_raw = []
            for sample_id in samples:
                path_abun_raw.append(
                    strat_minpath(
                        sample_id,
                        in_metagenome[["function", "sequence", sample_id]],
                        minpath_mapfile, out_dir, pathways_in, gap_fill,
                        per_sequence_contrib, print_cmds, proc))

        else:
            # Parallelize this step if not going to run MinPath for each
            # sequence individually.
            path_abun_raw = Parallel(n_jobs=proc)(delayed(strat_minpath)(
                sample_id, in_metagenome[["function", "sequence", sample_id
                                          ]], minpath_mapfile, out_dir,
                pathways_in, gap_fill, per_sequence_contrib, print_cmds, 1)
                                                  for sample_id in samples)

        # Split the output into unstratified and stratified.
        path_raw_abun_unstrat = []
        path_raw_cov_unstrat = []
        path_raw_abun_strat = []
        path_raw_cov_strat = []

        for sample_output in path_abun_raw:
            path_raw_abun_unstrat += [sample_output[0]]
            path_raw_cov_unstrat += [sample_output[1]]
            path_raw_abun_strat += [sample_output[2]]
            path_raw_cov_strat += [sample_output[3]]

        # Prep output dfs.
        path_abun_unstrat = prep_pathway_df_out(path_raw_abun_unstrat)
        path_cov_unstrat = prep_pathway_df_out(path_raw_cov_unstrat,
                                               num_digits=10)
        path_abun_strat = prep_pathway_df_out(path_raw_abun_strat,
                                              strat_index=True)

        # Also parse stratified coverage table if --per_sequence_contrib set.
        path_cov_strat = None
        if per_sequence_contrib:
            path_cov_strat = prep_pathway_df_out(path_raw_cov_strat,
                                                 strat_index=True,
                                                 num_digits=10)

            path_cov_strat.columns = ["pathway", "sequence"] + samples

        # Set column labels of unstratified dataframe to be sample names.
        path_abun_unstrat.columns = samples
        path_cov_unstrat.columns = samples
        path_abun_strat.columns = ["pathway", "sequence"] + samples

        return (path_abun_unstrat, path_cov_unstrat, path_abun_strat,
                path_cov_strat)

    # Otherwise the data is in unstratified format, which is more
    # straightforward to process.
    else:
        path_raw_unstrat = Parallel(n_jobs=proc)(delayed(unstrat_minpath)(
            sample_id, in_metagenome[["function", sample_id]], minpath_mapfile,
            out_dir, pathways_in, gap_fill, print_cmds)
                                                 for sample_id in samples)

        # Prep output df.
        path_raw_abun_unstrat = []
        path_raw_cov_unstrat = []

        for sample_output in path_raw_unstrat:
            path_raw_abun_unstrat += [sample_output[0]]
            path_raw_cov_unstrat += [sample_output[1]]

        path_abun_unstrat = prep_pathway_df_out(path_raw_abun_unstrat)
        path_cov_unstrat = prep_pathway_df_out(path_raw_cov_unstrat,
                                               num_digits=10)

        # Set column labels of unstratified dataframe to be sample names.
        path_abun_unstrat.columns = samples
        path_cov_unstrat.columns = samples

        return (path_abun_unstrat, path_cov_unstrat, None, None)
Exemple #41
 def cal_factors(self, start, end, n_jobs):
     net_profit_Q = self.influx.getDataMultiprocess(
         'FinancialReport_Gus', 'net_profit_Q', start, end,
         ['code', 'net_profit_Q', 'report_period'])
     net_profit_TTM = self.influx.getDataMultiprocess(
         'FinancialReport_Gus', 'net_profit_TTM', start, end,
         ['code', 'net_profit_TTM', 'report_period'])
     net_profit_ddt_TTM = self.influx.getDataMultiprocess(
         'FinancialReport_Gus', 'net_profit_ddt_TTM', start, end,
         ['code', 'net_profit_ddt_TTM', 'report_period'])
     market_cap = self.influx.getDataMultiprocess('DailyFactors_Gus',
                                                  'Size', start, end,
                                                  ['code', 'market_cap'])
     net_profit_Q.index.names = ['date']
     net_profit_Q.reset_index(inplace=True)
     net_profit_TTM.index.names = ['date']
     net_profit_TTM.reset_index(inplace=True)
     net_profit_ddt_TTM.index.names = ['date']
     net_profit_ddt_TTM.reset_index(inplace=True)
     market_cap.index.names = ['date']
     market_cap.reset_index(inplace=True)
     # ----------------------------------------------------------
     EP_Q = pd.merge(net_profit_Q, market_cap, on=['date', 'code'])
     EP_Q.set_index('date', inplace=True)
     EP_Q['EP_Q'] = EP_Q['net_profit_Q'] / EP_Q['market_cap'] / 10000
     EP_Q = EP_Q.loc[:, ['code', 'EP_Q', 'report_period']]
     EP_Q = EP_Q.dropna(subset=['EP_Q'])
     codes = EP_Q['code'].unique()
     split_codes = np.array_split(codes, n_jobs)
     with parallel_backend('multiprocessing', n_jobs=n_jobs):
         res = Parallel()(delayed(influxdbData.JOB_saveData)(
             EP_Q, 'code', codes, self.db, 'EP_Q') for codes in split_codes)
     print('EP_Q finish')
     print('-' * 30)
     fail_list = []
     for r in res:
         fail_list.extend(r)
     # ----------------------------------------------------------
     # market_cap is in units of 10,000 yuan (万元)
     EP = pd.merge(net_profit_TTM, market_cap, on=['date', 'code'])
     EP.set_index('date', inplace=True)
     EP['EP_TTM'] = EP['net_profit_TTM'] / EP['market_cap'] / 10000
     EP = EP.loc[:, ['code', 'EP_TTM', 'report_period']]
     EP = EP.dropna(subset=['EP_TTM'])
     codes = EP['code'].unique()
     split_codes = np.array_split(codes, n_jobs)
     with parallel_backend('multiprocessing', n_jobs=n_jobs):
         res = Parallel()(delayed(influxdbData.JOB_saveData)(
             EP, 'code', codes, self.db, 'EP') for codes in split_codes)
     print('EP_TTM finish')
     print('-' * 30)
     fail_list = []
     for r in res:
         fail_list.extend(r)
     # ----------------------------------------------------------
     EPcut = pd.merge(net_profit_ddt_TTM, market_cap, on=['date', 'code'])
     EPcut.set_index('date', inplace=True)
     EPcut['EPcut_TTM'] = EPcut['net_profit_ddt_TTM'] / EPcut[
         'market_cap'] / 10000
     EPcut = EPcut.loc[:, ['code', 'EPcut_TTM', 'report_period']]
     EPcut = EPcut.dropna(subset=['EPcut_TTM'])
     codes = EPcut['code'].unique()
     split_codes = np.array_split(codes, n_jobs)
     with parallel_backend('multiprocessing', n_jobs=n_jobs):
         res = Parallel()(delayed(influxdbData.JOB_saveData)(
             EPcut, 'code', codes, self.db, 'EPcut')
                          for codes in split_codes)
     print('EPcut_TTM finish')
     print('-' * 30)
     for r in res:
         fail_list.extend(r)
     return fail_list
    def nuscenes_gt_to_kitti(
        self,
        lyft_dataroot: str,
        table_folder: str,
        lidar_name: str = "LIDAR_TOP",
        get_all_detections: bool = False,
        parallel_n_jobs: int = 4,
        samples_count: Optional[int] = None,
    ) -> None:
        """Converts nuScenes GT formatted annotations to KITTI format.

        Args:
            lyft_dataroot: root folder of the Lyft dataset.
            table_folder: folder with tables (json files).
            lidar_name: Name of the lidar sensor.
                Only one lidar allowed at this moment.
            get_all_detections: If True, will write all
                bboxes in PointCloud and use only FrontCamera.
            parallel_n_jobs: Number of threads for parallel processing.
            samples_count: Number of samples to convert.

        """
        self.lyft_dataroot = lyft_dataroot
        self.table_folder = table_folder
        self.lidar_name = lidar_name
        self.get_all_detections = get_all_detections
        self.samples_count = samples_count
        self.parallel_n_jobs = parallel_n_jobs

        # Select subset of the data to look at.
        self.lyft_ds = LyftDataset(self.lyft_dataroot, self.table_folder)

        self.kitti_to_nu_lidar = Quaternion(axis=(0, 0, 1), angle=np.pi)
        self.kitti_to_nu_lidar_inv = self.kitti_to_nu_lidar.inverse

        # Get assignment of scenes to splits.
        split_logs = [
            self.lyft_ds.get("log", scene["log_token"])["logfile"]
            for scene in self.lyft_ds.scene
        ]
        if self.get_all_detections:
            self.cams_to_see = ["CAM_FRONT"]
        else:
            self.cams_to_see = [
                "CAM_FRONT",
                "CAM_FRONT_LEFT",
                "CAM_FRONT_RIGHT",
                "CAM_BACK",
                "CAM_BACK_LEFT",
                "CAM_BACK_RIGHT",
            ]

        # Create output folders.
        self.label_folder = self.store_dir.joinpath("label_2")
        self.calib_folder = self.store_dir.joinpath("calib")
        self.image_folder = self.store_dir.joinpath("image_2")
        self.lidar_folder = self.store_dir.joinpath("velodyne")
        for folder in [
                self.label_folder, self.calib_folder, self.image_folder,
                self.lidar_folder
        ]:
            if not folder.is_dir():
                folder.mkdir(parents=True)

        # Use only the samples from the current split.
        sample_tokens = self._split_to_samples(split_logs)
        if self.samples_count is not None:
            sample_tokens = sample_tokens[:self.samples_count]

        with parallel_backend("threading", n_jobs=self.parallel_n_jobs):
            Parallel()(delayed(self.process_token_to_kitti)(sample_token)
                       for sample_token in tqdm(sample_tokens))
def run_stacked(data, stacked_keys, repeat_idx, drop_na):
    out_scores = pd.DataFrame()
    out_predictions = data.copy()
    for key, sel in stacked_keys.items():
        this_data = data[sel]
        if drop_na == 'local':
            mask = this_data.dropna().index
        elif drop_na == 'global':
            mask = data.dropna().index
        else:
            mask = this_data.index
        X = this_data.loc[mask].values
        y = data['age'].loc[mask].values
        fold_idx = data.loc[mask]['fold_idx'].values

        if drop_na is False:
            # code missing values so the trees can learn from them.
            X_left = X.copy()
            X_left[this_data.isna().values] = -1000
            X_right = X.copy()
            X_right[this_data.isna().values] = 1000
            assert np.sum(np.isnan(X_left)) == 0
            assert np.sum(np.isnan(X_right)) == 0
            assert np.min(X_left) == -1000
            assert np.max(X_right) == 1000
            X = np.concatenate([X_left, X_right], axis=1)

        for column in sel:
            score = get_mae(data.loc[mask], column)
            if column not in out_scores:
                out_scores[column] = score
            elif out_scores[column].mean() < np.mean(score):
                out_scores[column] = score

        unstacked = out_scores[sel].values
        idx = unstacked.mean(axis=0).argmin()
        unstacked_mean = unstacked[:, idx].mean()
        unstacked_std = unstacked[:, idx].std()
        print(f'{key} | best unstacked MAE: {unstacked_mean} '
              f'(+/- {unstacked_std}')

        print('n =', len(X))

        param_grid = {'max_depth': [4, 6, 8, None]}
        if X.shape[1] > 10:
            param_grid['max_features'] = (['log2', 'sqrt', None])

        reg = GridSearchCV(RandomForestRegressor(n_estimators=1000,
                                                 random_state=42),
                           param_grid=param_grid,
                           scoring='neg_mean_absolute_error',
                           iid=False,
                           cv=5)
        if DEBUG:
            reg = RandomForestRegressor(n_estimators=1000,
                                        max_features='log2',
                                        max_depth=6,
                                        random_state=42)

        cv = LeaveOneGroupOut()
        out_cv = Parallel(n_jobs=1)(
            delayed(fit_predict_score)(
                estimator=reg,
                X=X,
                y=y,
                train=train,
                test=test,
                test_index=this_data.loc[mask].index[test])
            for train, test in cv.split(X, y, fold_idx))

        out_cv = zip(*out_cv)
        predictions = next(out_cv)
        out_predictions[f'stacked_{key}'] = np.nan
        for pred in predictions:
            assert np.all(out_predictions.loc[pred.index]['age'] == pred['y'])
            out_predictions.loc[pred.index,
                                f'stacked_{key}'] = pred['prediction'].values
        scores = np.array(next(out_cv))
        print(f'{key} | MAE : %0.3f (+/- %0.3f)' %
              (np.mean(scores), np.std(scores)))

        out_scores[key] = scores
    out_scores['repeat_idx'] = repeat_idx
    out_predictions['repeat_idx'] = repeat_idx
    return out_scores, out_predictions
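fit_predict_score is defined elsewhere; a minimal sketch consistent with how its output is consumed above (a predictions dataframe with 'y' and 'prediction' columns, plus a MAE score) could look like:

import pandas as pd
from sklearn.base import clone
from sklearn.metrics import mean_absolute_error

def fit_predict_score_sketch(estimator, X, y, train, test, test_index):
    # Fit a clone on the training fold, predict the held-out fold, and return
    # both the per-sample predictions and the fold's mean absolute error.
    est = clone(estimator)
    est.fit(X[train], y[train])
    pred = est.predict(X[test])
    out = pd.DataFrame({"y": y[test], "prediction": pred}, index=test_index)
    return out, mean_absolute_error(y[test], pred)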
Exemple #44
    def fit(self, X, y, sample_weight=None):
        """Fit the estimators.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vectors, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        y : array-like of shape (n_samples,)
            Target values.

        sample_weight : array-like of shape (n_samples,) or default=None
            Sample weights. If None, then samples are equally weighted.
            Note that this is supported only if all underlying estimators
            support sample weights.

            .. versionchanged:: 0.23
               when not None, `sample_weight` is passed to all underlying
               estimators

        Returns
        -------
        self : object
        """
        # all_estimators contains all estimators, the one to be fitted and the
        # 'drop' string.
        names, all_estimators = self._validate_estimators()
        self._validate_final_estimator()

        stack_method = [self.stack_method] * len(all_estimators)

        # Fit the base estimators on the whole training data. Those
        # base estimators will be used in transform, predict, and
        # predict_proba. They are exposed publicly.
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_single_estimator)(clone(est), X, y, sample_weight)
            for est in all_estimators if est != 'drop')
        self.n_features_in_ = self.estimators_[0].n_features_in_

        self.named_estimators_ = Bunch()
        est_fitted_idx = 0
        for name_est, org_est in zip(names, all_estimators):
            if org_est != 'drop':
                self.named_estimators_[name_est] = self.estimators_[
                    est_fitted_idx]
                est_fitted_idx += 1
            else:
                self.named_estimators_[name_est] = 'drop'

        # To train the meta-classifier using the most data as possible, we use
        # a cross-validation to obtain the output of the stacked estimators.

        # To ensure that the data provided to each estimator are the same, we
        # need to set the random state of the cv if there is one and we need to
        # take a copy.
        cv = check_cv(self.cv, y=y, classifier=is_classifier(self))
        if hasattr(cv, 'random_state') and cv.random_state is None:
            cv.random_state = np.random.RandomState()

        self.stack_method_ = [
            self._method_name(name, est, meth)
            for name, est, meth in zip(names, all_estimators, stack_method)
        ]
        fit_params = ({
            "sample_weight": sample_weight
        } if sample_weight is not None else None)
        predictions = Parallel(n_jobs=self.n_jobs)(
            delayed(cross_val_predict)(clone(est),
                                       X,
                                       y,
                                       cv=deepcopy(cv),
                                       method=meth,
                                       n_jobs=self.n_jobs,
                                       fit_params=fit_params,
                                       verbose=self.verbose)
            for est, meth in zip(all_estimators, self.stack_method_)
            if est != 'drop')

        # Only not None or not 'drop' estimators will be used in transform.
        # Remove the None from the method as well.
        self.stack_method_ = [
            meth for (meth, est) in zip(self.stack_method_, all_estimators)
            if est != 'drop'
        ]

        X_meta = self._concatenate_predictions(X, predictions)
        _fit_single_estimator(self.final_estimator_,
                              X_meta,
                              y,
                              sample_weight=sample_weight)

        return self
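The fit above is the internal machinery of scikit-learn's stacking estimators; typical user-facing usage (parameters chosen only for illustration) looks roughly like:

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

X, y = load_iris(return_X_y=True)
stack = StackingClassifier(
    estimators=[("rf", RandomForestClassifier(n_estimators=50, random_state=0)),
                ("svc", LinearSVC(random_state=0))],
    final_estimator=LogisticRegression(),
    n_jobs=2,  # base fits and cross_val_predict calls run through joblib
)
stack.fit(X, y)
print(stack.score(X, y))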
Exemple #45
def main():
    args = get_args()
    helper.print_script_args_and_info(args)
    Parallel(n_jobs=args.n_jobs)(
        delayed(process_graph_cache_file)(graph_cache_file, args)
        for graph_cache_file in dataset_helper.get_all_cached_graph_datasets())
    return kernel.evaluate(np.vstack([values[0][j], values[1][j]]))[0]*fator

def recuperaArrayPDF(kernel, values, estados):
	lst = {}
	for j in range(tam):
		PDF = kernel.evaluate(np.vstack([values[0][j], values[1][j]]))[0]*fator
		nomeEstado = str(estados[j])
		try:
			lst[nomeEstado] = (PDF+lst[nomeEstado])
		except KeyError:
			lst[nomeEstado] = PDF
	return lst

if allowParallelKDEProcessing:
    # Parallel
    if tam < 1000:
        numThreads = 1
    else:
        numThreads = tam // 1000
    CDFs = Parallel(n_jobs=numThreads, backend="threading")(delayed(recuperaArrayPDFParalelo)(j) for j in limite)
else:
    # Sequential
    CDFs = recuperaArrayPDF(kernel, values, uf)

if allowRedisCaching:
    # store the new query result in redis (cache)
    redis.set(chave, CDFs)

# output the result
print(CDFs)
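The kernel objects used above are assumed to be scipy gaussian_kde instances; a minimal sketch of building and evaluating one on 2-D data (the sample data is random, for illustration only):

import numpy as np
from scipy.stats import gaussian_kde

xs = np.random.randn(500)
ys = np.random.randn(500)
kernel = gaussian_kde(np.vstack([xs, ys]))  # fit a 2-D KDE on (x, y) samples
pdf_at_origin = kernel.evaluate(np.vstack([0.0, 0.0]))[0]  # same call pattern as above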
        temp += [m[i]]

    if average_type == 'mean':
        temp = scipy.sparse.vstack(temp).mean(axis=0)
    elif average_type == 'gmean':
        temp = gmean(scipy.sparse.vstack(temp).todense() + 1e-15, axis=0)

    temp[temp < 1e-6] = 0

    return file_to_md5[image_name], csr_matrix(temp)


result_path = Path('data') / 'prediction' / 'global'
result_path.mkdir(exist_ok=True, parents=True)

result = Parallel(n_jobs=12)(delayed(get_probs)(i) for i in file_names.index)
#
# result = [get_probs(i) for i in tqdm(file_names.index)]

print('[{}] Unzippping...'.format(str(datetime.datetime.now())))

pred_md5_list, probs = zip(*result)

probs = vstack(probs)

labels = pd.DataFrame({'md5': pred_md5_list})

print('[{}] Saving labels...'.format(str(datetime.datetime.now())))

labels.to_csv(str(result_path / (average_type + '_last_md5_list.csv')),
              index=False)
    def fit(self, modality, ground_truth=None, cat=None):
        """Compute the images images.

        Parameters
        ----------
        modality : object of type TemporalModality
            The modality object of interest.

        ground-truth : object of type GTModality or None
            The ground-truth of GTModality. If None, the whole data will be
            considered.

        cat : str or None
            String corresponding to the ground-truth of interest. Cannot be
            None if ground-truth is not None.

        Return
        ------
        self : object
             Return self.

        """
        super(HaralickExtraction, self).fit(modality=modality,
                                              ground_truth=ground_truth,
                                              cat=cat)

        # Get the data and rescale as integers within the given levels
        vol_haralick = ((modality.data_ - np.ndarray.min(modality.data_)) *
                        ((self.levels -1) /
                         (np.ndarray.max(modality.data_) -
                          np.ndarray.min(modality.data_)))).astype(int)

        # Extract the set of patches from the modality data
        patches = extract_patches(vol_haralick, patch_shape=self.patch_size)

        # Allocate the haralick maps, one for each feature that
        # will be computed
        nb_directions = 13
        nb_features = 13
        self.data_ = np.zeros((modality.data_.shape[0],
                               modality.data_.shape[1],
                               modality.data_.shape[2],
                               nb_directions,
                               nb_features))

        # WE NEED TO PARALLELIZE THIS CODE

        # # Extract Haralick feature for each patch
        # # Define the shift to apply
        if isinstance(self.patch_size, tuple):
            y_shift = int(np.ceil((self.patch_size[0] - 1) / 2.))
            x_shift = int(np.ceil((self.patch_size[1] - 1) / 2.))
            z_shift = int(np.ceil((self.patch_size[2] - 1) / 2.))
        elif isinstance(self.patch_size, int):
            y_shift = int(np.ceil((self.patch_size - 1) / 2.))
            x_shift = int(np.ceil((self.patch_size - 1) / 2.))
            z_shift = int(np.ceil((self.patch_size - 1) / 2.))

        # for y in range(patches.shape[0]):
        #     for x in range(patches.shape[1]):
        #         for z in range(patches.shape[2]):
        #             print 'Compute for the pixel at position {}{}{}'.format(
        #                 y, x, z)
        #             # Compute the haralick features
        #             self.data_[y + y_shift,
        #                        x + x_shift,
        #                        z + z_shift, :] = haralick(
        #                            patches[y, x, z, :],
        #                            distance=self.distance)

        # Create the list of indices to process
        yy, xx, zz = np.meshgrid(range(patches.shape[0]),
                                 range(patches.shape[1]),
                                 range(patches.shape[2]))
        # Linearize for fast processing
        yy = yy.reshape(-1)
        xx = xx.reshape(-1)
        zz = zz.reshape(-1)

        # Go for the parallel loop
        haralick_features = Parallel(n_jobs=-1)(delayed(
            _compute_haralick_features)(patches[y, x, z, :], self.distance)
                                                for y, x, z in zip(yy, xx, zz))

        # Convert to numpy array
        haralick_features = np.array(haralick_features)
        # Reshape the feature matrix
        haralick_features = haralick_features.reshape((patches.shape[0],
                                                       patches.shape[1],
                                                       patches.shape[2],
                                                       nb_directions,
                                                       nb_features))
        # Copy the feature into the object
        self.data_[y_shift : -y_shift,
                   x_shift : -x_shift,
                   z_shift : -z_shift] = haralick_features

        return self
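_compute_haralick_features is not shown here; given the commented-out haralick(...) call and the 13x13 output shape, it is presumably a thin wrapper around mahotas, roughly:

import numpy as np
import mahotas as mh

def _compute_haralick_features_sketch(patch, distance=1):
    # One 3-D patch in, a (13 directions x 13 features) Haralick matrix out.
    # mahotas expects an integer-valued image; the volume was rescaled above.
    return mh.features.haralick(patch.astype(np.int32), distance=distance)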
Exemple #49
def run_glm(Y, X, noise_model='ar1', bins=100, n_jobs=1, verbose=0):
    """ GLM fit for an fMRI data matrix

    Parameters
    ----------
    Y : array of shape (n_time_points, n_voxels)
        The fMRI data.

    X : array of shape (n_time_points, n_regressors)
        The design matrix.

    noise_model : {'ar1', 'ols'}, optional
        The temporal variance model. Defaults to 'ar1'.

    bins : int, optional
        Maximum number of discrete bins for the AR(1) coef histogram.

    n_jobs : int, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.

    verbose : int, optional
        The verbosity level. Default is 0.

    Returns
    -------
    labels : array of shape (n_voxels,),
        A map of values on voxels used to identify the corresponding model.

    results : dict,
        Keys correspond to the different labels values
        values are RegressionResults instances corresponding to the voxels.

    """
    acceptable_noise_models = ['ar1', 'ols']
    if noise_model not in acceptable_noise_models:
        raise ValueError(
            "Acceptable noise models are {0}. You provided "
            "'noise_model={1}'".format(acceptable_noise_models,
                                       noise_model)
        )
    if Y.shape[0] != X.shape[0]:
        raise ValueError('The number of rows of Y '
                         'should match the number of rows of X.'
                         ' You provided X with shape {0} '
                         'and Y with shape {1}'.
                         format(X.shape, Y.shape))

    # Create the model
    ols_result = OLSModel(X).fit(Y)

    if noise_model == 'ar1':
        # compute and discretize the AR1 coefs
        ar1 = (
            (ols_result.residuals[1:]
             * ols_result.residuals[:-1]).sum(axis=0)
            / (ols_result.residuals ** 2).sum(axis=0)
        )
        del ols_result
        ar1 = (ar1 * bins).astype(int) * 1. / bins
        # Fit the AR model according to the current AR(1) estimates
        results = {}
        labels = ar1
        # Parallelize by creating a job per ARModel
        vals = np.unique(ar1)
        ar_result = Parallel(n_jobs=n_jobs, verbose=verbose)(
            delayed(_ar_model_fit)(X, val, Y[:, labels == val])
            for val in vals)
        for val, result in zip(vals, ar_result):
            results[val] = result
        del vals
        del ar_result

    else:
        labels = np.zeros(Y.shape[1])
        results = {0.0: ols_result}

    return labels, results
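A hedged usage sketch of the function above, on random data with illustrative shapes:

import numpy as np

n_scans, n_voxels, n_regressors = 100, 50, 3
Y = np.random.randn(n_scans, n_voxels)      # fMRI data: time points x voxels
X = np.random.randn(n_scans, n_regressors)  # design matrix

labels, results = run_glm(Y, X, noise_model='ar1', n_jobs=2)
# 'labels' holds the discretized AR(1) coefficient per voxel; contrasts can
# then be computed from the RegressionResults stored in 'results'.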
Exemple #50
        clf = SVC(C=c, kernel='precomputed')
        clf.fit(X_features_trainm, Y[train_indices])
        erfsvm.append(clf.score(X_features_testm, Y[test_indices]))
    testfile.write("RFSVM&%s pm%s & " %
                   (floored_percentage(np.mean(erfsvm), 2),
                    floored_percentage(np.std(erfsvm), 2)) + '\n')
    testfile.write("RFDIS &%s pm%s & " % (floored_percentage(np.mean(e8), 2),
                                          floored_percentage(np.std(e8), 2)) +
                   '\n')
    testfile.write(" LATERF&%s pm%s &" %
                   (floored_percentage(np.mean(elaterf), 2),
                    floored_percentage(np.std(elaterf), 2)) + '\n')
    testfile.write(" LATERFDIS&%s pm%s & " %
                   (floored_percentage(np.mean(elaterfdis), 2),
                    floored_percentage(np.std(elaterfdis), 2)) + '\n')
    print(ss)
    print("RFSVM&%s pm%s & " % (floored_percentage(np.mean(erfsvm), 2),
                                floored_percentage(np.std(erfsvm), 2)) + '\n')
    print("RFDIS &%s pm%s & " % (floored_percentage(np.mean(e8), 2),
                                 floored_percentage(np.std(e8), 2)) + '\n')
    print(" LATERF&%s pm%s &" % (floored_percentage(np.mean(elaterf), 2),
                                 floored_percentage(np.std(elaterf), 2)) +
          '\n')
    print(" LATERFDIS&%s pm%s & " %
          (floored_percentage(np.mean(elaterfdis), 2),
           floored_percentage(np.std(elaterfdis), 2)) + '\n')


if __name__ == '__main__':
    Parallel(n_jobs=4)(delayed(mcode)(ite=i) for i in range(4))
Exemple #51
    # Read analysis info file and cd
    ai=functions.read_analysis_info_file(args.analysis_info_file)
    os.chdir(ai['project_location'])    

    #Ncores
    ncores=ai['ncores']

    # Read sample names text file
    sample_names_file=args.sample_names_file
    sampleNames = functions.read_sample_names(sample_names_file)

    # Set input and output directories if not 'rawReads/'
    in_dir= args.in_dir
    out_dir_report= args.out_dir_report
    readType=ai['readType']
    suffix_name=args.suffix_name
    
    # Create tables
    files=functions.get_filepaths(in_dir)
    files = [files[y] for y, x in enumerate(files) if re.findall("fastqc_data.txt", x)] 
    Parallel(n_jobs=8)(delayed(tables)(i) for i in files)
    print "Got data from fastqc output... \n"    

    # Create plots
    functions.make_sure_path_exists(out_dir_report)
    Parallel(n_jobs=8)(delayed(plots)(i) for i in sampleNames)
    print "Made plots per sample... \n"
    os.system('/usr/bin/Rscript bin/create_fastqcPlots_allSamples.R ' + in_dir + ' ' + sample_names_file + ' ' + readType + ' ' + out_dir_report + ' ' + suffix_name + ' ' + args.plot_device)
    print "Made plots all samples... \n"
 def aprun(**tq_args):
     tqdm_f = lambda x, args: tqdm(x, **args)
     return lambda x: Parallel(**joblib_args)(tqdm_f(x, tq_args))
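The closure above is only a fragment; the full helper it presumably belongs to is the common tqdm-plus-joblib wrapper, sketched here under that assumption:

from joblib import Parallel, delayed
from tqdm import tqdm

def ParallelExecutor(**joblib_args):
    # Factory: configure the joblib pool once, then attach a tqdm progress bar
    # to whatever iterable of delayed() calls is passed in.
    def aprun(**tq_args):
        def run(iterable):
            return Parallel(**joblib_args)(tqdm(iterable, **tq_args))
        return run
    return aprun

# aprun = ParallelExecutor(n_jobs=4)
# results = aprun(total=100)(delayed(pow)(i, 2) for i in range(100))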
Exemple #53
# Do we filter by a base?
if base=="None":
	gene_sets  = gsea.groupby(["Gene_set"]).size().reset_index(name="count")
	print(gene_sets)
	gene_sets  = gene_sets[gene_sets["count"] >= 100] #Changed back to 100!!!
	print(gene_sets)
	# gene_sets  = gene_sets[gene_sets["count"] <= 150]
	# print(gene_sets)
	gene_sets  = [i for i in itertools.combinations(list(gene_sets["Gene_set"]),2)]
	
	if both=="T":
		print("both one-tailed used")
		gene_r    = [(i[1], i[0]) for i in gene_sets]
		gene_sets = gene_sets + gene_r
else:
	base       = pd.read_csv(in_folder + "GSEA_FILES/" + gsea_type + "_gsea_" + base + "_both_" + both + "_pvals", sep="\t")
	gene_sets  = list(set(base["gs"]))
	gene_sets  = [(i.split("$")[0], i.split("$")[1]) for i in gene_sets]

print(gsea_type, len(gene_sets))

main_dict = Parallel(n_jobs=40)(delayed(mann_pval)(i) for i in gene_sets)

print("Done calculating")
# Write to file
main_dict = pd.concat([pd.DataFrame(i) for i in main_dict])
file_out  = in_folder + "GSEA_FILES/" + gsea_type + "_gsea_"+ exp_type + "_both_" + both + "_ext_gmv_" + ext_gmv + "_pvals"

main_dict.to_csv(file_out, sep="\t", header=True, index=False)

print("Done writing")
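# mann_pval is defined elsewhere in this script; a hedged sketch of a worker in the
# same spirit: one Mann-Whitney U test per gene-set pair, returning the "A$B" key
# that is split on "$" above. The score column name ("NES") is an assumption.
from scipy.stats import mannwhitneyu

def mann_pval_sketch(pair, df, score_col="NES"):
    a, b = pair
    x = df.loc[df["Gene_set"] == a, score_col]
    y = df.loc[df["Gene_set"] == b, score_col]
    stat, p = mannwhitneyu(x, y, alternative="two-sided")
    return {"gs": ["{}${}".format(a, b)], "U": [stat], "pval": [p]}

# e.g. Parallel(n_jobs=40)(delayed(mann_pval_sketch)(i, gsea) for i in gene_sets)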
                first_page=page_number,
                data_dir="posts",
                verbose=verbose
            )
            for page_number in page_range(last_page=last_page, first_page=first_page)
        )


if __name__ == "__main__":
    # save_posts(last_page=190, first_page=190, data_dir="posts")
    # save_posts_fake(last_page=190, first_page=190, data_dir="posts")

    #parallel_save_post(last_page=190, data_dir="posts", verbose=1)
    #soup = fetch_post_as_soup(13304)
    #soup_article = soup.find("article")
    #html = post_content_from_soup(soup_article).prettify()
    #save_content(html, "test3.html")

    from joblib import Parallel, delayed
    last_page=195

    r = Parallel(n_jobs=10, verbose=10)(
        delayed(save_posts)(
            last_page=page_number,
            first_page=page_number,
            data_dir="posts"
        )
        for page_number in page_range(last_page=last_page)
    )
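# page_range is not shown in this excerpt; one plausible reading, given the keyword
# arguments used above, is a plain inclusive range of page numbers. This is an
# assumption: the real helper may well iterate in the opposite order.
def page_range_sketch(last_page, first_page=1):
    return range(first_page, last_page + 1)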

    filled['cc_{}'.format(sex)] = filled['ccpublic_{}'.format(sex)] + filled['ccprivate_{}'.format(sex)]
    filled['crime_{}'.format(sex)] = filled['crimepublic_{}'.format(sex)] + filled['crimeprivate_{}'.format(sex)]
    filled['health_{}'.format(sex)] = filled['health_private_{}'.format(sex)] + filled['health_public_{}'.format(sex)] 
    filled['transfer_{}'.format(sex)] = filled['inc_trans_pub_{}'.format(sex)] + filled['diclaim_{}'.format(sex)] + filled['ssclaim_{}'.format(sex)] + filled['ssiclaim_{}'.format(sex)]

components = ['inc_labor', 'inc_parent', 'transfer', 'edu', 'crime', 'costs', 'cc', 'health', 'qaly', 'm_ed']
factors = np.arange(0,3.1,0.25)
combo = list(itertools.product(components, factors))

# vary factor: IRR
# applying factor to benefits        

def irr_factors(part, f):
    irr_tmp = deepcopy(filled)
    for sex in ['m', 'f', 'p']:
        irr_tmp['{}_{}'.format(part, sex)] = irr_tmp['{}_{}'.format(part, sex)] * f

    output = irr_calc(irr_tmp, etype=etype, components=components)        

    output['rate'] = f
    output['part'] = part
    
    print('IRR for {} and factor {} calculated.'.format(part, f))
    return output

# Collect the results under a new name so the irr_factors() helper is not shadowed.
irr_factor_results = Parallel(n_jobs=25)(
    delayed(irr_factors)(part, f) for part, f in combo)
irr_factor_results = pd.concat(irr_factor_results, axis=0)
irr_factor_results.sort_index(inplace=True)
irr_factor_results.to_csv(os.path.join(plots, 'irr_factors.csv'), index=True)
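# Quick sanity check of the job grid defined above: 10 benefit components times the
# 13 factors in np.arange(0, 3.1, 0.25) gives 130 (part, f) Parallel tasks.
assert len(components) == 10 and len(factors) == 13
assert len(combo) == 130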
Exemple #56
0
    def fit(self,
            x,
            grp=[],
            center=False,
            combine=False,
            grpas='single',
            grplen=[],
            display=True,
            n_jobs=-1):
        """Run the model on the matrix of features x

        Args:
            x: array-like
                The features. Dimension [n trials x n features]

        Kargs:
            grp: list of strings, optional, [def: []]
                Group features by using a list of strings. The length of grp must
                be the same as the number of features. If grp is not empty, the
                program will run the feature selection inside each group.

            center: optional, bool, [def: False]
                Normalize features to zero mean by subtracting, then dividing
                by, the feature mean. The center parameter should be set to True
                if the classifier is an svm.

            combine: boolean, optional, [def: False]
                If groups of features are specified using the grp parameter,
                combine controls whether the groups are combined. For example,
                if there are three unique groups, combining them will compute the
                mf model on each combination: [[1],[2],[3],[1,2],[1,3],[2,3],[1,2,3]]

            grpas: string, optional, [def: 'single']
                Specify how to treat features inside each group. If the
                parameter grpas ("group as") is:

                    * 'single': inside each combination of groups, the features are treated as independent.
                    * 'group': inside each combination of groups, the features are kept together, so the mf model still adds features step by step, but each step adds a whole group of features.

            grplen: list, optional, [def: []]
                Control the number of combinations by specifying the number of
                elements to associate. If there are three unique groups, all
                possible combinations are: [[1],[2],[3],[1,2],[1,3],[2,3],[1,2,3]],
                but if grplen is specified, for example grplen=[1,3], only
                combinations of length 1 and 3 are kept and combinations of 2
                elements are removed: [[1],[2],[3],[1,2,3]]

            display: boolean, optional, [def: True]
                Display information for each step of the mf selection. If n_jobs
                is -1, it is advised to set display to False.

            n_jobs: integer, optional, [def: -1]
                Control the number of jobs used to compute the decoding accuracy.
                If n_jobs=-1, all available cores are used.

        Returns:
            da: list
                The decoding accuracy (da) for each group with the selected number
                of repetitions, which by default is set to 10 (see : cvOut // rep)

            prob: list
                The appearance probability of each feature. The size of prob is the
                same as da.

            groupinfo: pandas Dataframe
                Dataframe to resume the mf feature selection.

        """
        # - Check inputs and get element sizes:
        y = self._y
        if x.shape[0] != len(y):
            x = x.T
        y = np.ravel(y)
        ntrial, nfeat = x.shape

        # Normalize features :
        if center:
            x_m = np.tile(np.mean(x, 0), (x.shape[0], 1))
            x = (x - x_m) / x_m

        # Combine groups :
        grp_c = combineGroups(grp, nfeat, combine, grpas=grpas, grplen=grplen)
        grp_name, grp_idx = list(grp_c['name']), list(grp_c['idx'])
        ngrp = len(grp_name)

        # - Run the MF model for each combination:
        mfdata = Parallel(n_jobs=n_jobs)(
            delayed(_fit)(x, y, grp_c, k, combine, display, self)
            for k in range(len(grp_c)))

        # Get data & complete the Dataframe :
        da, prob, MFstr = zip(*mfdata)
        self.MFstr = MFstr[-1]
        grp_c['da'], grp_c['occurrence'] = [sum(k) / len(k) for k in da], prob

        return da, prob, grp_c
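# Hedged usage sketch for fit() (names are illustrative, not from the source):
# x is a [n trials x n features] array and each feature carries a group label;
# combine=True then scores every combination of the three groups.
#
#   grp = ['alpha'] * 10 + ['beta'] * 10 + ['gamma'] * 10
#   da, prob, grp_c = model.fit(x, grp=grp, center=True, combine=True,
#                               grplen=[1, 3], display=False, n_jobs=-1)
#   print(grp_c[['name', 'da', 'occurrence']])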
parser.add_argument("-d", "--dstdir", type=str, help="dst image folder")
parser.add_argument("-n", "--n_jobs", type=int, default=2, help="parallel jobs")
parser.add_argument("-p", "--parallel", action='store_true', default=False, help="if parallel")
args = parser.parse_args()

filelist = os.listdir(args.srcdir)


def conv_npz_nrrd(filename):
    print(filename)
    sys.stdout.flush()
    npzpath = os.path.join(args.srcdir, filename)
    filebase = os.path.splitext(filename)[0]
    nrrdbasename = filebase + '.nrrd'
    nrrdpath = os.path.join(args.dstdir, nrrdbasename)

    npz = np.load(npzpath)
    nparr = npz['arr_0']
    print(np.unique(nparr))
    nparr = np.where(nparr == 0, 0, 1).astype(np.uint8)
    print(np.unique(nparr))
    nrrd.write(nrrdpath, nparr)


if args.parallel:
    Parallel(n_jobs=args.n_jobs, backend="multiprocessing")(
        delayed(conv_npz_nrrd)(filename) for filename in filelist)
else:
    for filename in filelist:
        conv_npz_nrrd(filename)
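# The conversion above binarizes the segmentation: background (0) stays 0 and every
# other label becomes 1 before the array is written to .nrrd. A tiny standalone
# illustration of that step:
import numpy as np

seg = np.array([[0, 2], [5, 0]])
print(np.where(seg == 0, 0, 1).astype(np.uint8))   # -> [[0 1]
                                                   #     [1 0]]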
Exemple #58
0
def compute_train_features(data,
                           ts_uid_columns,
                           time_features,
                           lags,
                           window_functions,
                           ignore_const_cols=True,
                           n_jobs=1):
    """
    Parameters
    ----------
    data : pd.DataFrame
        Dataframe with (at least) columns: 'ds' and 'y'.
    ts_uid_columns: list
        List of columns names that are unique identifiers for time series.
    time_features: list
        Time attributes to include as features.
    lags: list
        List of integer lag values to include as features.
    window_functions: list
        List of rolling window function definitions; each entry is a tuple
        (func_name, func_call, window_shift, window_size), matching the unpacking below.
    ignore_const_cols: bool
        Specify whether to ignore constant columns.
    n_jobs: int
        Number of jobs to run in parallel when computing the lag/rw features.
    Returns
    ----------
    all_features: pd.DataFrame
        Dataframe containing all the features for the time series.
    """
    # list with all the dataframes of features
    all_features_list = list()
    all_features_list.append(data.reset_index(drop=True))

    # generating the time features
    if len(time_features) > 0:
        input_params = {
            "date_range": pd.DatetimeIndex(data.ds),
            "time_features": time_features,
            "ignore_const_cols": ignore_const_cols
        }
        calendar_features = compute_calendar_features(**input_params)
        all_features_list.append(calendar_features)

    # generating the lag & rolling window features
    if (len(lags) > 0) or (len(window_functions) > 0):
        lag_kwargs = [{"lag": lag} for lag in lags]
        rw_kwargs = [{
            "func_name": window_func[0],
            "func_call": window_func[1],
            "window_shift": window_func[2],
            "window_size": window_func[3]
        } for window_func in window_functions]
        input_kwargs = lag_kwargs + rw_kwargs

        grouped = data.loc[:,
                           ts_uid_columns + ["y"]].groupby(ts_uid_columns)["y"]
        with Parallel(n_jobs=n_jobs) as parallel:
            delayed_func = delayed(compute_lagged_train_feature)
            lagged_features = parallel(
                delayed_func(grouped, **kwargs) for kwargs in input_kwargs)
            lagged_features = pd.DataFrame(
                {feature.name: feature.values
                 for feature in lagged_features})
            all_features_list.append(lagged_features)

    # merging all features
    all_features = pd.concat(all_features_list, axis=1)
    all_features.set_index(data.index, inplace=True)
    return all_features
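# Hedged usage sketch for compute_train_features: a tiny two-series panel with two
# lags and one rolling mean. The 4-tuples follow the unpacking above,
# (func_name, func_call, window_shift, window_size); func_call=None is assumed to
# mean "use the pandas builtin of that name", and "dayofweek" is assumed to be a
# calendar attribute understood by compute_calendar_features.
import numpy as np
import pandas as pd

panel = pd.DataFrame({
    "ds": list(pd.date_range("2021-01-01", periods=5)) * 2,
    "ts_uid": [1] * 5 + [2] * 5,
    "y": np.arange(10, dtype=float),
})
features = compute_train_features(
    data=panel,
    ts_uid_columns=["ts_uid"],
    time_features=["dayofweek"],
    lags=[1, 2],
    window_functions=[("mean", None, 1, 3)],
    n_jobs=1,
)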
Exemple #59
0
                ch_p, ch_r, lytf, lypf = 0, 0, 0, 0
                for ytf, ypf in zip(ex.steps_annotation, steps):
                    ch_p += count_hit(ytf, ypf)
                    ch_r += count_hit(ypf, ytf)
                    lytf += len(ytf)
                    lypf += len(ypf)
                if lytf == 0:
                    print('Warning: No steps annotation for', ex.id)
                    ch_r, lytf = 0, 1
                if lypf == 0:
                    ch_p, lypf = 0, 1
                return [ch_r/lytf, ch_p/lypf]

            del_PR = delayed(PR)

            PRS = Parallel(n_jobs=-2)(del_PR(ex) for ex in range(N))
            PRS = np.array([prs for prs in PRS if prs is not None])
            t_batch = time()-t_start
            score = list(PRS.mean(axis=0))
            score += list(PRS.std(axis=0))

            res['results'] += [score]
            res['patterns'] += [patterns]
        results += [res]
        t_batch = time() - t_start
        print('-'*79)
        print('Batch : {:03}/{:03}'.format(simu, n_batch))
        print('Time batch : {:.2f}s'.format(t_batch))

        print('Train: {}, Test: {}'.format(c_train[0], c_test[0]))
        print('Score: {0:.2f}({2:.2f}), {1:.2f} ({3:.2f})'
Exemple #60
0
                if args.metrics:
                    data.GenMetrics(msFile[:-4] + '_metrics.txt')

                print('\nDone processing ' + msFile + '!\n')

                data.Close()

        else:

            num_cores = multiprocessing.cpu_count()

            if int(args.parallel) <= num_cores:
                num_cores = int(args.parallel)

            elif int(args.parallel) > num_cores:
                # if user asks for more cores than exist, default to the maximum
                print(
                    'Specified number of cores for parallelization exceeds ' +
                    'available number of cores. Maximum will be used.')

            Parallel(n_jobs=num_cores)(
                delayed(func)(msFile=msFile,
                              reagents=reagents,
                              mgf=args.generate_mgf,
                              interference=args.quantify_interference,
                              impurities=impurities,
                              metrics=args.metrics,
                              boxcar=args.boxcar,
                              isolationOffset=args.isolation_window_offset)
                for msFile in files)