def main():
    total_timer = Timer()
    iteration_timer = Timer()
    short_timer = Timer()
    total_timer.start()
    logger.success(' --- Welcome to the Kwantum Transmission Device Inspector --- ')

    parser = Parser()
    pool = Pool(nodes=parser.config['n_cpus'])
    logger.info('Running calculations with ' + str(parser.config['n_cpus']) + ' workers.')

    ga = GA(parser, objective_function=objectiveFunction)
    structures = ga.generator.generateAll(
        pool=pool,
        seeds=np.random.randint(0, 2**32 - 1, parser.config['GA']['n_structures']))
    s = structures[0]
    s.visualizeSystem(args={'dpi': 600, 'file': 'system.png'})

    # fig, axes = plt.subplots(3, 2, figsize=(10, 15))
    import matplotlib.gridspec as gridspec
    fig = plt.figure(figsize=(40, 10))
    outer = gridspec.GridSpec(2, 1)
    top = gridspec.GridSpecFromSubplotSpec(1, 5, subplot_spec=outer[0], wspace=0.2, hspace=0.2)

    bs_axis = plt.Subplot(fig, top[0])
    ms, bs = s.getBandStructure(0)
    bs_axis.plot(ms, bs, c='k')
    bs_axis.set_xlabel('Wavenumber [\AA${}^{-1}$]')
    bs_axis.set_ylabel('Energy [eV]')
    fig.add_subplot(bs_axis)

    cs_axis = plt.Subplot(fig, top[1])
    es, cs = s.getConductance(0, 1)
    cs_axis.plot(es, cs, c='k')
    cs_axis.set_ylabel('Transmission Function')
    cs_axis.set_xlabel('Energy [eV]')
    fig.add_subplot(cs_axis)

    dos_axis = plt.Subplot(fig, top[2])
    es, ds = s.getDOS()
    dos_axis.plot(es, ds / np.sum(ds), c='k')
    dos_axis.set_ylabel('Density of States')
    dos_axis.set_xlabel('Energy [eV]')
    dos_axis.set_ylim([0.0, 0.1 * np.max(ds / np.sum(ds))])
    fig.add_subplot(dos_axis)

    vcs_axis = plt.Subplot(fig, top[3])
    cvs = [s.getValleyPolarizedConductance(energy, 0, 1) for energy in es]
    cvs = np.array(cvs)
    vcs_axis.plot(es, cvs[:, 0], 'k', label='$k\'$')
    vcs_axis.plot(es, cvs[:, 1], 'k--', label='$k$')
    vcs_axis.set_ylabel('Transmission Function')
    vcs_axis.set_xlabel('Energy [eV]')
    # vcs_axis.set_xlim([-0.5, 0.5])
    vcs_axis.legend()
    fig.add_subplot(vcs_axis)

    crs_axis = plt.Subplot(fig, top[4])
    biases = np.linspace(0.05, 0.5, 64)
    threeK = 0.00025851991  # k_B * T in eV for T = 3 K
    currents_3 = pool.map(s.getCurrent, [0] * biases.shape[0], [1] * biases.shape[0],
                          biases, [threeK] * biases.shape[0])
    vcs_3 = pool.map(s.getValleyPolarizedCurrent, [0] * biases.shape[0], [1] * biases.shape[0],
                     biases, [threeK] * biases.shape[0])
    vcs_3 = np.array(vcs_3)
    # scale k_B*T by 10 and 100 so the total currents match the 30 K and 300 K labels below
    currents_30 = pool.map(s.getCurrent, [0] * biases.shape[0], [1] * biases.shape[0],
                           biases, [threeK * 10] * biases.shape[0])
    vcs_30 = pool.map(s.getValleyPolarizedCurrent, [0] * biases.shape[0], [1] * biases.shape[0],
                      biases, [threeK * 10] * biases.shape[0])
    vcs_30 = np.array(vcs_30)
    currents_300 = pool.map(s.getCurrent, [0] * biases.shape[0], [1] * biases.shape[0],
                            biases, [threeK * 100] * biases.shape[0])
    vcs_300 = pool.map(s.getValleyPolarizedCurrent, [0] * biases.shape[0], [1] * biases.shape[0],
                       biases, [threeK * 100] * biases.shape[0])
    vcs_300 = np.array(vcs_300)
    crs_axis.semilogy(biases, currents_3, 'k', label='Total - 3 K')
    crs_axis.semilogy(biases, vcs_3[:, 0], 'r', label='$k\'$ - 3 K')
    crs_axis.semilogy(biases, vcs_3[:, 1], 'b', label='$k$ - 3 K')
    crs_axis.semilogy(biases, currents_30, 'k--', label='Total - 30 K')
    crs_axis.semilogy(biases, vcs_30[:, 0], 'r--', label='$k\'$ - 30 K')
    crs_axis.semilogy(biases, vcs_30[:, 1], 'b--', label='$k$ - 30 K')
    crs_axis.semilogy(biases, currents_300, 'k-.', label='Total - 300 K')
    crs_axis.semilogy(biases, vcs_300[:, 0], 'r-.', label='$k\'$ - 300 K')
    crs_axis.semilogy(biases, vcs_300[:, 1], 'b-.', label='$k$ - 300 K')
    crs_axis.set_xlabel('Bias [V]')
    crs_axis.set_ylabel('Current [$e / \pi \hbar$]')
    crs_axis.legend()
    fig.add_subplot(crs_axis)

    sys_axis = plt.Subplot(fig, outer[1])
    s.visualizeSystem(args={'ax': sys_axis})
    fig.add_subplot(sys_axis)

    for axis in fig.get_axes():
        axis.grid(linestyle='--', linewidth=0.5)
    plt.tight_layout()
    plt.savefig('inspection.pdf')
    plt.show()

    logger.success(' --- Elapsed time: %s ---' % (total_timer.stop()))
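# The block above leans on a pathos-style Pool (created with `nodes=...`) whose
# map() accepts several iterables and pairs them element-wise into each worker
# call, e.g. s.getCurrent(0, 1, biases[i], threeK). A minimal sketch of that
# pattern, assuming pathos is installed; `toy_current` and its arguments are
# hypothetical stand-ins, not the functions used above.
from pathos.multiprocessing import ProcessingPool as Pool
import numpy as np


def toy_current(lead_in, lead_out, bias, kT):
    # placeholder for an expensive per-bias transport calculation
    return (lead_out - lead_in) * bias * np.exp(-bias / kT)


if __name__ == '__main__':
    biases = np.linspace(0.05, 0.5, 8)
    n = biases.shape[0]
    pool = Pool(nodes=4)
    # one worker call per index i: toy_current(0, 1, biases[i], 0.026)
    currents = pool.map(toy_current, [0] * n, [1] * n, biases, [0.026] * n)
    print(np.array(currents))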
OpenDir = args.OpenDir
os.chdir(OpenDir)
au_path = os.path.join('all_aus', 'au_*.hdf')
if args.refresh:
    if not os.path.exists('all_aus'):
        os.mkdir('all_aus')
    PATIENT_DIRS = [
        x for x in glob.glob('*cropped') if 'hdfs' in os.listdir(x)
    ]
    dfs = []
    df = None
    patient_queue = multiprocessing.Manager().Queue()
    partial_patient_func = functools.partial(load_patient, patient_queue)
    with Pool() as p:
        max_ = len(PATIENT_DIRS)
        with tqdm(total=max_) as pbar:
            for i, _ in enumerate(
                    p.imap(partial_patient_func, PATIENT_DIRS[:max_], chunksize=100)):
                pbar.update()
    df = dd.concat(dump_queue(patient_queue), interleave_partitions=True)
    del patient_queue
    gc.collect()
    # print(dd.stack(dfs))
    # df = df.compute()
    df.to_hdf(au_path, '/data', format='table',
se.crosstalk(EXPFILE, '', **args)
nccd = len(CCD)
xfiles_list = glob.glob('*_xtalk.fits')
if len(xfiles_list) < nccd:
    print(" Possibly corrupted file: expected %d extensions but got only %d \n" % (
        nccd, len(xfiles_list)))
    sys.exit(-1)

# running pixelcorrect and bleedmask
se.link_from_Dcache(se.data_conf + 'default.psf')
instrings = []
for ccd in CCD:
    ccdstring = "%02d" % int(ccd)
    instrings.append(ccdstring)
pool = Pool(ncpu)
pars = [(se, ccdstring) for ccdstring in instrings]
pool.map(runL1P, pars)
se.fileclean('xtalk', '.fits')
se.fileclean('nullweight', '.fits')
se.combineFiles('D' + ("%08d" % int(EXPNUM)) + '**sextractor.fits',
                'Scamp_allCCD_r' + rRun + 'p' + pRun + '.fits')
try:
    se.sanityCheck('D' + ("%08d" % int(EXPNUM)) + '_' + FILTER + '_01' + '_r' + rRun +
                   'p' + pRun + '_sextractor.fits')
except ValueError as err:
    print(err.args)
    sys.exit(-1)
se.fileclean('bpm', '.fits')
se.fileclean('biascor', '.fits')
w = line["Word"] valenceList[w].append(line["V.Mean.Sum"]) arousalList[w].append(line["A.Mean.Sum"]) dominanceList[w].append(line["D.Mean.Sum"]) d = "@Ayerad no, well i hope not. He could ha hasnt been at school fer a wile but @koast08 doesnt believe he had cancer" a, b, c = generate_emotion_features(d) x_text, Y = data_helpers.load_data_and_y_labels("../data/MR/rt-polarity.pos", "../data/MR/rt-polarity.neg") valence_feature_list = [] arousal_feature_list = [] dominance_feature_list = [] p = Pool(4) for (valence_list, arousal_list, dominance_list) in p.map(generate_emotion_features, x_text): valence_feature_list.append(valence_list) arousal_feature_list.append(arousal_list) dominance_feature_list.append(dominance_list) valence_feature_list = numpy.expand_dims(valence_feature_list, axis=2) arousal_feature_list = numpy.expand_dims(arousal_feature_list, axis=2) dominance_feature_list = numpy.expand_dims(dominance_feature_list, axis=2) data_set = "MR" numpy.save("../dump/" + data_set + "/valence_feature_list", valence_feature_list) numpy.save("../dump/" + data_set + "/arousal_feature_list", arousal_feature_list)
        G = nk.nxadapter.nx2nk(G)
        G.removeSelfLoops()
        # o = nk.overview(G)
        measures = graph_measures(G)
        data.loc[t] = measures
        data.to_csv(outpath)
        print('Save to %s' % outpath)
    except:
        print('Skip %s' % s)
        pass


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", default='./summaries/graphs', type=str)
    parser.add_argument("--out_dir", default='./summaries/baselines', type=str)
    args = parser.parse_args()

    files = os.listdir(args.data_dir)
    paths = []
    for f in files:
        inpath = os.path.join(args.data_dir, f)
        outpath = os.path.join(args.out_dir, f)
        paths.append((inpath, outpath))

    with Pool() as P:
        P.map(analyze_graphs, paths)
def main(args): def function_(year, args=args): extensions = ['hn1', 'hn3', 'hsr2000', 'hsr2500', 'hsrel', 'hstop'] language = ['de', 'en'] file_type = ['gif', 'png'] origin_paths = [] url = "https://www.slf.ch/fileadmin/user_upload/import/lwdarchiv/public" urls = [] y = year for ext in extensions: for lan in language: for f_type in file_type: origin = os.path.join(*[args.maps_directory,y, ext, lan, f_type])#,"*."+f_type]) if(Path(origin).exists()): origin_paths.append(os.path.join(*[origin, "*."+f_type])) urls.append("/".join([url, y, ext, lan, f_type])) for i, origin in enumerate(origin_paths): for file_map in glob.glob(origin): basename = os.path.basename(file_map) #print("name: " + basename) filename = '{}.json'.format(os.path.splitext(basename)[0]) destination = os.path.join(args.out_path, filename) file_url = urls[i]+"/"+basename #print("url: "+ file_url) if Path(destination).exists() and not args.f: print('Skip {} because {} already exists'.format(file_map, destination)) continue img = Image.open(file_map) img = img.convert('RGB') img_arr = np.array(img) # load mask of this size try: binary_mask, landmarks_pix = open_mask(*img_arr.shape[:2]) except FileNotFoundError: print('Missing mask "{}x{}.gif" for file "{}"'.format(*img_arr.shape[:2], file_map), file=sys.stderr) continue #remove grey colors nogrey_img_arr = remove_colors(img_arr, shades_grey) #build colormap color_map = build_color_map(nogrey_img_arr) #map image colors to registered shades new_img_arr = replace_color(nogrey_img_arr, color_map=color_map) # keep useful colors regions_only = keep_colors(new_img_arr, shades_blue) # clip the binary mask to remove color key regions_only[~binary_mask] = 255 regions_only = Image.fromarray(regions_only).convert('RGB') smoothed = regions_only.filter(ImageFilter.MedianFilter(7)) pix = np.array(list(map(numpify, landmarks_pix.values()))) coord = np.array(list(map(numpify, landmarks_pix.keys()))) # add 1 bias raw pix_ext = np.vstack([np.ones((1,pix.shape[0])), pix.T]) coord_ext = np.vstack([np.ones((1,pix.shape[0])), coord.T]) T = np.linalg.lstsq(pix_ext.T, coord_ext.T)[0] def transform_pix2map(points): """n x 2 array""" points_ext = np.hstack([np.ones((points.shape[0], 1)), points]) points_map = points_ext.dot(T) return points_map[:, 1:] geo_json = { "type": "FeatureCollection", "features": [] } for snow_level, color in enumerate(shades_blue): for contour in color_contours(smoothed, color): contour_right = contour.copy() contour_right[:,0] = contour[:,1] contour_right[:,1] = contour[:,0] contour_right = transform_pix2map(contour_right) simplifier = vw.Simplifier(contour_right) contour_right = simplifier.simplify(threshold=SMOOTHING_THRESHOLD) geo_json['features'].append({ "type": "Feature", "properties": { "date": ".".join([basename[6:8], basename[4:6], basename[0:4]]), "snow_level": snow_level_legend[int(snow_level)], "url": file_url }, "geometry": { "type": "Polygon", "coordinates": [ list(reversed(contour_right.tolist())) ] } }) with open(destination, 'w') as f: print('{} -> {}'.format(file_map, destination)) json.dump(geo_json, f) with Pool(8) as p: p.map(function_, [str(i) for i in range(2002, 2018)])
def main(): args = parser.parse_args() fastadir = str(Path(args.fastadir).absolute()) hmmdir = str(Path(args.hmmdir).absolute()) outdir = str(args.outdir) threshold = float(args.evalue) threads = int(args.threads) already_scanned = args.already_scanned no_seqs = args.no_seqs p = Pool(threads) # Make output directory if not os.path.exists(outdir): os.system('mkdir ' + outdir) outdir = str(Path(outdir).absolute()) else: outdir = str(Path(outdir).absolute()) # Get list of paths of all fastas fastalist_wpath = list( map(lambda file: os.path.join(fastadir, file), os.listdir(fastadir))) # Get list of all fastas fastalist = list( map(lambda file: file.split('.faa')[0], os.listdir(fastadir))) # Get list of paths of all HMM files hmmlist_wpath = list( map(lambda file: os.path.join(hmmdir, file), os.listdir(hmmdir))) # Get list of all HMMs hmmlist = list(map(lambda file: file.split('.hmm')[0], os.listdir(hmmdir))) hmm_outfiles = [] def get_fastaheader_id(fasta): for rec in SeqIO.parse(fasta, 'fasta'): if '.peg' in rec.id: id = rec.id.split('.peg')[0] elif '|' in rec.id: id = rec.id.split('|')[0] else: print('Unrecognized header found. Aborting.') sys.exit() break return id #Get list of fasta header IDs by mapping to get_fastaheader_id fn fasta_header_ids = list(map(get_fastaheader_id, fastalist_wpath)) #Make fasta dictionary (hopefully deprecated, let's see; dec 18 3:58 mountain time) fastadict = dict(zip(fasta_header_ids, fastalist)) # For each fasta, run all hmms if not already_scanned: for fastafile in fastalist_wpath: fastaoutdir = outdir + '/' + fastafile.split('/')[-1].split( '.faa')[0] # Make outdir for HMMs if not os.path.exists(fastaoutdir): os.system('mkdir ' + outdir + '/' + fastafile.split('/')[-1].split('.faa')[0]) #Make symbolic link os.system('ln -s ' + fastafile + ' ' + fastaoutdir + '/') hmm_outfiles.append([]) # Run all HMMs for fastafile hmm_outfiles[-1] = list(p.map(lambda hmmfile: run_hmmsearch(fastafile, hmmfile, outdir, threshold), \ hmmlist_wpath)) # Move all outfiles to corresponding output directory for outfile in hmm_outfiles[-1]: os.system('mv ' + outdir + '/' + outfile + ' ' + fastaoutdir) # Make directory to store fastas if not os.path.exists(outdir + '/' + 'fastas'): os.system('mkdir ' + outdir + '/' + 'fastas') # Make matrix of zeros to store hits hits_by_hmm = [] #Declare function to get hits for each HMM def extract_all_hits(fastaname, hmm): fastadir = outdir + '/' + fastaname #Get name of appropriate hmmfile, path hmmhits_for_fasta = list( filter(lambda x: hmm in x, os.listdir(fastadir))) hits = extract_hits_by_outfile(fastadir, hmmhits_for_fasta) return hits for hmm in hmmlist: print("Extracting hits for: ", hmm) relevant_outfiles = [] hits_by_hmm.append([ list( p.map(lambda fastaname: extract_all_hits(fastaname, hmm), fastalist)), hmm ]) print("Making hits matrix...") hitstable = np.zeros((len(hmmlist), len(fastalist))) # Mark hits in table for hmm_idx, hmm in enumerate(hits_by_hmm): for genome_idx, genome_hits in enumerate(hmm[0]): if type(genome_hits) is list: hits = len(genome_hits) elif type(genome_hits) is str: hits = 1 if genome_hits is None: hitstable[hmm_idx][genome_idx] = 0 else: hitstable[hmm_idx][genome_idx] = hits hits = pd.DataFrame(hitstable).T hits.columns = hmmlist hits['id'] = fastalist cols = list(hits.columns.values) cols.pop(cols.index('id')) hits = hits[['id'] + cols] hits.to_csv(outdir + '/HITSTABLE.tsv', sep='\t', index=False) if not no_seqs: hmms_written = list( p.map( lambda hits: get_recs_for_hits(hits[0], hits[ 1], fastadict, 
fastalist_wpath, fastalist, outdir), hits_by_hmm)) for hmm in hmmlist: if hmm not in hmms_written: print(hmm) sys.exit() # recs_by_hmm = list(map(lambda hits: get_recs_for_hits(hits), hits_by_hmm)) print('boogie')
def surfsearchlight(surf, datafile, func, radius=3, openmp=True, mp=4,\ outprefix=None, intent=2005, verbose=False, method='3dsphere'): ''' def surfsearchlight(surf, datafile, func, radius=3, openmp=True, mp=4,\ outprefix=None, intent=2005): Perform surface-based search light analysis. Input: <surf>: a string, or path object indicate a surface file either in freesurfer format or .gii format. We read in this surface file to obtain geometry of surface vertices <datafile>: can be (1), .gii, .gii.gz surface data file, it should be in .gii format and the data should be in nVert x M matrix, nVert is the number of vertex, M columns are data (2), nVert x M matric that internally can be directly used <func>: The function object to calculate, it takes in data file and generate output. Note that func either output a single value or output a tuple for multiple results <radius>: in mm (default: 3), radius to include vertex <openmp>: boolean, whether to use parallal computing, (default=True) <mp>: how many cores to open, default:20 <outPrefix>: a string, we save to a .gii file, if you want to save to .gii file ,you must supply a .gii file for <datafile> <intent>: an int or a list of ints, intent number for each column of result array. This is necessary when saving results into a .gii file. check savegifti.py for more info <method>: (1) '3dsphere' (default), including vertex within a 3dsphere, typically run on a sphere or inflated surface (2) 'geodensic', using geodensic distance, which is more accurate but take a long time currently, this method seems problematic, I would not recommand this Output: We save a .gii file with the output 20190413 RZ add <method> 20190412 RZ created the file ''' from numpy import ndarray, vstack, hstack, array, arange, where from RZutilpy.system import unix_wrapper, Path, gettimestr from RZutilpy.mri import savegifti from numpy import ndarray, vstack, array, arange from nibabel import load from nibabel.freesurfer.io import read_geometry from time import time from sklearn.neighbors import NearestNeighbors from pathos.multiprocessing import Pool from surfdist import surfdist # first read the surf file surf = Path(surf) if ~isinstance(surf, Path) else surf if surf.suffix == '.gii': # .gii format vtrx, faces = load(surf.str).darrays[0].data, load( surf.str).darrays[1].data else: # freesurfer format vtrx, faces = read_geometry(surf.str) # and read the data file if not isinstance(datafile, ndarray) and isinstance(datafile, str): datafile = Path(datafile) assert datafile.suffix == '.gii', 'data file should be .gii format!' giftiobj = load(datafile.str) data = [i.data for i in giftiobj.darrays] data = vstack(data).T # now data is nVert x M columns data file else: data = datafile del datafile # assert same number of vertices in surface and data assert data.shape[0] == vtrx.shape[ 0], 'surface file and data have different number of vertices!' nVtrx = data.shape[0] index = range(nVtrx) # calculate neighbour if method == '3dsphere': neigh = NearestNeighbors(radius=radius, metric='euclidean', n_jobs=mp) neigh.fit(vtrx) # in this case we first fit to the x, y, z nbrs = neigh.radius_neighbors(vtrx, return_distance=False) elif method == 'geodesic': # slow... 
do not recommand # using surfdist def calcneighbors(i): print(i) dist = surfdist.dist_calc((vtrx, faces), index, i) return where(dist <= radius)[0] with Pool(mp) as p: # use imap, the returned results are in order #b = p.imap(calcneighbors, index, chunksize=2000) b = p.imap(calcneighbors, range(2000), chunksize=2000) nbrs = list(b) # note that nbrs is a ndarray, each element is an array since each element might have different # Define the wrapper function def runsearchlight(i): # get the index of neighbors idx = nbrs[i] # get the data of neighbors data_i = data[idx, :] if verbose: print(i) return func(data_i) # do it tstr = gettimestr('full') with Pool(mp) as p: # use imap, the returned results are in order b = p.imap(runsearchlight, arange(vtrx.shape[0]), chunksize=2000) #b = p.imap(runsearchlight, arange(2000), chunksize=2000) data2save = list(b) print(f'searchlight starts from {tstr}') print(f'searchlight ends at {gettimestr("full")}') # let's take about 1d or 2d nCol = len(data2save[0]) if isinstance(data2save[0], tuple) else 1 data2save = array(data2save) if nCol == 1 else vstack(data2save) # save the file if outprefix: assert 'giftiobj' in locals( ), 'You must input a .gii file for data if you want to save result to .gii' savegifti(data2save, outprefix, giftiobj, intent) return data2save
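# A hedged usage sketch for surfsearchlight: `meanfunc` below just averages the
# data of each searchlight neighborhood and returns one value per column; the
# surface/data file names in the commented call are placeholders, not files
# shipped with this code.
import numpy as np


def meanfunc(data_i):
    # data_i is nNeighborVertices x M; reduce to one value per data column
    return tuple(np.mean(data_i, axis=0))


# result = surfsearchlight('lh.inflated', 'lh.mydata.gii', meanfunc,
#                          radius=3, mp=4, outprefix='lh.searchlight_mean')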
NUM_SURR = 1000
NUM_WORKERS = 20

net = ScaleSpecificNetwork('%sair.mon.mean.levels.nc' % path_to_data, 'air',
                           date(1948, 1, 1), date(2015, 1, 1), None, None, 0,
                           dataset="NCEP", sampling='monthly', anom=False)
pool = Pool(NUM_WORKERS)
net.wavelet(1, 'y', pool=pool, cut=1)
net.get_continuous_phase(pool=pool)
net.get_phase_fluctuations(rewrite=True, pool=pool)
pool.close()
pool.join()

nao = DataField()
raw = np.loadtxt("%sWeMO.monthly.1821-2013.txt" % (path_to_data))
raw = raw[:, 1:]
nao.data = raw.reshape(-1)
nao.create_time_array(date_from=date(1821, 1, 1), sampling='m')
nao.select_date(date(1949, 1, 1), date(2014, 1, 1))
nao.anomalise()
jfm_index = nao.select_months([1, 2, 3], apply_to_data=False)
if len(graph.clusters().subgraph(y).vs) < 500: break g = graph.clusters().subgraph(y) g_edges = edges.loc[edges.id.isin(g.es()['id'])] g_nodes = nodes.loc[nodes.id.isin(g.vs()['id'])] g_edges, g_nodes = reset_ids(g_edges,g_nodes) feather.write_dataframe(g_edges,"/scistor/ivm/data_catalogue/open_street_map/percolation_networks/"+x+"_"+str(counter)+"-edges.feather") feather.write_dataframe(g_nodes,"/scistor/ivm/data_catalogue/open_street_map/percolation_networks/"+x+"_"+str(counter)+"-nodes.feather") g_df = metrics(g) g_df.to_csv("/scistor/ivm/data_catalogue/open_street_map/percolation_metrics/"+x+"_"+str(counter)+"_metrics.csv") counter += 1 print(x+' has finished!') except Exception as e: print(x+" failed because of {}".format(e)) if __name__ == '__main__': #countries = ['ABW', 'AFG', 'AGO', 'AIA', 'ALA', 'ALB', 'AND', 'ARE', 'ARG', 'ARM', 'ASM', 'ATG', 'AUS', 'AUT', 'AZE', 'BDI', 'BEL', 'BEN', 'BES', 'BFA', 'BGD', 'BGR', 'BHR', 'BHS', 'BIH', 'BLM', 'BLR', 'BLZ', 'BMU', 'BOL', 'BRA', 'BRB', 'BRN', 'BTN', 'BWA', 'CAF', 'CAN', 'CCK', 'CHE', 'CHL', 'CIV', 'CMR', 'COD', 'COG', 'C*K', 'COL', 'COM', 'CPV', 'CRI', 'CUB', 'CUW', 'CXR', 'CYM', 'CYP', 'CZE', 'DJI', 'DMA', 'DNK', 'DOM', 'DZA', 'ECU', 'EGY', 'ERI', 'ESH', 'ESP', 'EST', 'ETH', 'FIN', 'FJI', 'FLK', 'FRA', 'FRO', 'FSM', 'GAB', 'GBR', 'GEO', 'GGY', 'GHA', 'GIB', 'GIN', 'GLP', 'GMB', 'GNB', 'GNQ', 'GRC', 'GRD', 'GRL', 'GTM', 'GUF', 'GUM', 'GUY', 'HKG', 'HND', 'HRV', 'HTI', 'HUN', 'IDN', 'IMN', 'IND', 'IRL', 'IRN', 'IRQ', 'ISL', 'ISR', 'ITA', 'JAM', 'JEY', 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', 'KIR', 'KNA', 'KOR', 'KWT', 'LAO', 'LBN', 'LBR', 'LBY', 'LCA', 'LIE', 'LKA', 'LSO', 'LTU', 'LUX', 'LVA', 'MAC', 'MAF', 'MAR', 'MCO', 'MDA', 'MDG', 'MDV', 'MEX', 'MHL', 'MKD', 'MLI', 'MLT', 'MMR', 'MNE', 'MNG', 'MNP', 'MOZ', 'MRT', 'MSR', 'MTQ', 'MUS', 'MWI', 'MYS', 'MYT', 'NAM', 'NCL', 'NER', 'NFK', 'NGA', 'NIC', 'NIU', 'NLD', 'NOR', 'NPL', 'NRU', 'NZL', 'OMN', 'PAK', 'PAN', 'PER', 'PHL', 'PLW', 'PNG', 'POL', 'PRI', 'PRK', 'PRT', 'PRY', 'PSE', 'PYF', 'QAT', 'REU', 'ROU', 'RWA', 'SAU', 'SDN', 'SEN', 'SGP', 'SHN', 'SLB', 'SLE', 'SLV', 'SMR', 'SOM', 'SPM', 'SRB', 'SSD', 'STP', 'SUR', 'SVK', 'SVN', 'SWE', 'SWZ', 'SXM', 'SYC', 'SYR', 'TCA', 'TCD', 'TGO', 'THA', 'TJK', 'TKM', 'TLS', 'TON', 'TTO', 'TUN', 'TUR', 'TUV', 'TWN', 'TZA', 'UGA', 'UKR', 'URY', 'UZB', 'VAT', 'VCT', 'VEN', 'VGB', 'VIR', 'VNM', 'VUT', 'WLF', 'WSM', 'XAD', 'XCA', 'XKO', 'XNC', 'YEM', 'ZAF', 'ZMB', 'ZWE'] #countries is without CHN, DEU, RUS, USA countries = [y[:3] for y in os.listdir("/scistor/ivm/data_catalogue/open_street_map/road_networks/")] fin_countries = [y[:3] for y in os.listdir("/scistor/ivm/data_catalogue/open_street_map/percolation_metrics/")] left_countries = list(set(countries)-set(fin_countries)) left_countries = [x[:3] for x in left_countries] from random import shuffle #shuffle(left_countries) #left_countries = ['BRB', 'BTN', 'KNA', 'GUY', 'NFK', 'BLZ', 'WLF', 'SHN', 'WSM', 'KIR', 'MCO', 'VUT', 'TUV', 'XAD','ASM','FSM','MHL','PLW','VGB','MDV','SLB','VCT'] print(left_countries) with Pool(10) as pool: pool.map(split_record,left_countries,chunksize=1)
    :param patient_dirs: All directories run through OpenFace.
    :param patient: Patient to find annotated emotions for
    """
    tqdm_position, patient = patient
    curr_dirs = [x for x in patient_dirs if patient in x]
    for patient_dir in tqdm(curr_dirs, position=tqdm_position):
        find_scores(patient_dir, refresh)


if __name__ == '__main__':
    OPEN_DIR = sys.argv[sys.argv.index('-d') + 1]
    refresh = '--refresh' in sys.argv
    os.chdir(OPEN_DIR)
    # Directories have been previously cropped by CropAndOpenFace
    PATIENT_DIRS = [
        x for x in glob.glob('*cropped') if 'hdfs' in os.listdir(x)
    ]
    PATIENTS = get_patient_names(PATIENT_DIRS)
    # EYEBROW_DICT = process_eyebrows(OPEN_DIR,
    #                                 open(join(OPEN_DIR, 'eyebrows.txt')))
    PARTIAL_FIND_FUNC = functools.partial(find_one_patient_scores, PATIENT_DIRS, refresh)
    TUPLE_PATIENTS = [((i % 5), x) for i, x in enumerate(PATIENTS)]
    Pool(5).map(PARTIAL_FIND_FUNC, TUPLE_PATIENTS)
    # Pool().map(find_scores, PATIENTS)
    # for i, x in enumerate(PATIENTS):
    #     tuple_patient = (i % cpu_count(), x)
    #     find_one_patient_scores(PATIENT_DIRS, tuple_patient)
def generate_SVM_arrays_around_cds_start(list_input_transcripts, path_to_harringtonine_reads, path_to_harringtonine_psite, nthreads): """Generates positive and negative vectors for training SVM to predict start peaks using Harringtonine ribosome profiling data. (1) Loops through list of input transcripts, generating positive and negative example vectors from each transcript (a) Using the annotated_CDS_start for each transcript as a positive example, constructs the Ingolia vector (b) Constructs Ingolia vectors for 10 negative locations on each transcript as specified in 2011 cell paper (2) Concatenates positive and negative vectors into a single numpy array which is output (3) Returns a second array with labels for each vector (1=positive example, 0=negative example) --Input-- list_input_transcripts: iterable containing plastid transcript objects to be processed. All supplied transcripts are processed, so be sure you have filtered for highly expressed transcripts, and split into test/training groups beforehand. path_to_harringtonine_reads: path to .bam file containing harringtonine reads used to construct the arrays path_to_harringtonine_psite: path to file generated by plastid psite script giving psite offsets for harringtonine reads. nthreads: number cores to use for parallel processing. Please note that this uses pathos.multiprocessing ProcessingPool and it expects to find 'Pool' defined globally. You MUST run 'from pathos.multiprocessing import ProcessingPool as Pool' before trying this function! --Output-- Two np arrays: the first contains Ingolia vectors constructed from the input list. The second contains assignments (1=positive example, ie. vector constructed from annotated CDS_start locations. 0=negative example, constructed as specified in 2011 Cell paper). There are 10 negative examples per 1 positive example on each of the input transcripts passing basic QC filtering. """ def process_single_transcript_forSVM(input_transcript, path_to_harringtonine_reads, path_to_harringtonine_psite): print 'Working on ' + input_transcript.get_name() + '...' #Set up harringtonine reads harringtonine_reads = BAMGenomeArray(path_to_harringtonine_reads) harringtonine_reads.set_mapping( VariableFivePrimeMapFactory.from_file( open(path_to_harringtonine_psite))) #Set up vectors to append into positive_vectors = [] negative_vectors = [] #Ensure transcript competant to be a test example start_codon_nt = input_transcript.cds_start if not start_codon_nt - 25 > 0 or not start_codon_nt + 190 < input_transcript.get_length( ): #Ingolia says -18 on each side of initiation site scoring window. Noting that scoring window is -7 to 40 nt from given site, accounting for negative vector should be 150+40=190 on positive bound. return positive_vectors, negative_vectors #Create the vectors count_vector = input_transcript.get_counts(harringtonine_reads) positive_vectors.append( construct_Ingolia_vector(start_codon_nt, count_vector)) for z in [-6, -3, 3, 9, 18, 30, 60, 90, 120, 150]: negative_vectors.append( construct_Ingolia_vector(start_codon_nt + z, count_vector)) # print '...Done!' return positive_vectors, negative_vectors # output = Pool(nthreads).map(process_single_transcript_forSVM, list_input_transcripts, itertools.repeat(path_to_harringtonine_reads), itertools.repeat(path_to_harringtonine_psite)) #Unpack mapped output. 
Each iteration is output as a list, which contains two sub-lists with positive and negative vectors positive_vectors = [x[0] for x in output if len(x[0]) > 0] negative_vectors = [ x[1] for x in output if len(x[0]) > 0 ] #if x[0] = [], x[1] will also = []. This just removes iterations where the transcript didn't pass QC #Second level of unpacking. Take list of lists of vectors to just a list of vectors positive_vectors = list(itertools.chain.from_iterable(positive_vectors)) negative_vectors = list(itertools.chain.from_iterable(negative_vectors)) ingolia_vector_array = np.asarray(positive_vectors + negative_vectors) identities_for_array = np.asarray([1] * len(positive_vectors) + [0] * len(negative_vectors)) # return ingolia_vector_array, identities_for_array
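# Downstream, the two arrays returned above can be handed directly to a
# scikit-learn classifier; the analysis snippet that follows assumes a
# GridSearchCV object fit with refit=True. A hedged sketch (the C grid and
# kernel choice are assumptions, not the original training configuration):
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# X_vectors, y_labels = generate_SVM_arrays_around_cds_start(
#     training_transcripts, 'harringtonine.bam', 'harringtonine_psite.txt', nthreads=8)
# cv_SVM = GridSearchCV(SVC(kernel='linear', probability=True),
#                       param_grid={'C': [0.1, 1, 10]}, cv=5, refit=True)
# cv_SVM.fit(X_vectors, y_labels)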
#cv_SVM_human = dill.load(open('hg38_HEK_SVM.obj')) #Compile Regex for Ranges canonical_start_codon = re.compile('ATG', re.IGNORECASE) nearcanonical_start_codon = re.compile( '(CTG)|(GTG)|(TTG)|(ACG)|(AGG)|(ATC)|(ATT)|(AAG)|(ATA)', re.IGNORECASE) #figure out non-canonicals. stop_codon = re.compile('(TAA)|(TAG)|(TGA)', re.IGNORECASE) #Run Main Script. Note that because we have used refit=True with GridSearchCV we can use the SVM directly in the function (which just requires that the classifier has a .predict() method.) annotated_ORFs = Pool(23).map(assign_uORFs_from_harr_peaks, highly_translated, itertools.repeat('file.path'), itertools.repeat('file.path'), itertools.repeat(cv_SVM_human), itertools.repeat(True), itertools.repeat(scaler_grid), itertools.repeat(hg38_genome), itertools.repeat(canonical_start_codon), itertools.repeat(nearcanonical_start_codon), itertools.repeat(stop_codon), itertools.repeat(50)) #Reformat uORF annotations in a logical way. Start here if coming back from running a model with known settings. annotated_ORFs_final = [i for i in annotated_ORFs if len(i) > 0] annotated_ORFs_final = list( itertools.chain.from_iterable(annotated_ORFs_final)) #dill.dump(annotated_ORFs_final, open('HEK293T_uORFs.obj','wb')) #annotated_ORFs_final = dill.load(open('HEK293T_uORFs.obj')) #Have a look at the data first to see how we did. Looks fine. Definitely fewer 5'UTR reads and fewer uORFs. Cell culture probs? fout = open('HEK293T_uORFs.bed', 'w')
def climByAveragingPeriods(urls, # list of (daily) granule URLs for a long time period (e.g. a year) nEpochs, # compute a climatology for every N epochs (days) by 'averaging' nWindow, # number of epochs in window needed for averaging nNeighbors, # number of neighbors on EACH side in lat/lon directions to use in averaging variable, # name of primary variable in file mask, # name of mask variable coordinates, # names of coordinate arrays to read and pass on (e.g. 'lat' and 'lon') splitFn=splitModisSst, # split function to use to partition the input URL list maskFn=qcMask, # mask function to compute mask from mask variable averager='pixelMean', # averaging function to use, one of ['pixelMean', 'gaussInterp', 'spatialFilter'] averagingConfig={}, # dict of parameters to control the averaging function (e.g. gaussInterp) optimization='fortran', # optimization mode (fortran or cython) mode='sequential', # Map across time periods of N-days for concurrent work, executed by: # 'sequential' map, 'multicore' using pool.map(), 'cluster' using pathos pool.map(), # or 'spark' using PySpark numNodes=1, # number of cluster nodes to use nWorkers=4, # number of parallel workers per node averagingFunctions=AveragingFunctions, # dict of possible averaging functions legalModes=ExecutionModes, # list of possible execution modes cachePath=CachePath # directory to cache retrieved files in ): '''Compute a climatology every N days by applying a mask and averaging function. Writes the averaged variable grid, attributes of the primary variable, and the coordinate arrays in a dictionary. ***Assumption: This routine assumes that the N grids will fit in memory.*** ''' if averagingConfig['name'] == 'gaussInterp': averagingConfig['wlat'] = nNeighbors averagingConfig['wlon'] = nNeighbors try: averageFn = averagingFunctions[averager] except: print('climatology: Error, Averaging function must be one of: %s' % str(averagingFunctions), file=sys.stderr) sys.exit(1) urlSplits = [s for s in splitFn(urls, nEpochs)] def climsContoured(urls, plot=None, fillValue=default_fillvals['f4'], format='NETCDF4', cachePath=cachePath): n = len(urls) if VERBOSE: print(urls, file=sys.stderr) var = climByAveraging(urls, variable, mask, coordinates, maskFn, averageFn, averagingConfig, optimization, cachePath) fn = os.path.split(urls[0])[1] inFile = os.path.join(cachePath, fn) method = averagingConfig['name'] fn = os.path.splitext(fn)[0] day = fn[5:8] nDays = int(var['time'][0]) if 'wlat' in averagingConfig: wlat = averagingConfig['wlat'] else: wlat = 1 if int(wlat) == wlat: outFile = 'A%s.L3m_%dday_clim_sst_4km_%s_%dnbrs.nc' % (day, nDays, method, int(wlat)) # mark each file with first day in period else: outFile = 'A%s.L3m_%dday_clim_sst_4km_%s_%4.2fnbrs.nc' % (day, nDays, method, wlat) # mark each file with first day in period outFile = writeOutNetcdfVars(var, variable, mask, coordinates, inFile, outFile, fillValue, format) if plot == 'contour': figFile = contourMap(var, variable, coordinates, n, outFile) elif plot == 'histogram': # figFile = histogram(var, variable, n, outFile) figFile = None else: figFile = None return (outFile, figFile) if mode == 'sequential': results = list(map(climsContoured, urlSplits)) elif mode == 'multicore': pool = Pool(nWorkers) results = pool.map(climsContoured, urlSplits) elif mode == 'cluster': pass elif mode == 'spark': pass return results
            y = np.sqrt(1. - u[i]) * np.sin(theta[j])
            z = u[i]
            startpoints.append(radius * np.array([x, y, z]))
    return startpoints


def run_mfpts_from_bath(bathRad, numpoints, scalefactor, dt):
    np.random.seed()
    asympot3D = potentials.asym3Dpotential(scalefactor=scalefactor)
    p1 = mrd.particle(np.zeros(3), 1.0)
    sphereboundary = mrd.reflectiveSphere(bathRad)
    integrator = integrators.brownianDynamicsSp(asympot3D, sphereboundary, p1, dt, 1.0)
    sim = mrd.simulation(integrator)
    startpoints = get_startpoints(numpoints, bathRad - radiusThreshold)
    fpts = []
    for startpoint in startpoints:
        integrator.pa.position = startpoint
        integrator.clock = 0.
        fpts.append(sim.run_mfpt_points(np.array(minima), 0.2))
    print('bath to ' + str(bathRad))
    return np.array(fpts)


pool = Pool(processes=8)
FPT_list = pool.map(
    partial(run_mfpts_from_bath, scalefactor=2.0, numpoints=runs, dt=0.001),
    radii)
# write the pickled results in binary mode
dill.dump(FPT_list,
          open(path + 'fpts_on_' + str(runs * runs) + '_runs' + suffix, 'wb'))
def init_data(self, data_name, n_chunk=1024): print(f'Initializing {data_name} data...') def transform_triple_to_hrt(triple_idx): """ Transforms triple-idx (as a whole) to h/r/t format """ if triple_idx == -1: # for response_triple return NAF_TRIPLE triple = self.idx2triple[triple_idx] h, r, t = triple.split(', ') return [self.word2idx[h], self.rel2idx[r], self.word2idx[t]] def process_file(root, inp): start_i, filename = inp n_sample = line_count(filename) post = np.zeros((n_sample, self.args.max_sentence_len), dtype=np.int32) post_length = np.zeros( (n_sample), dtype=np.int32) # valid length (without pad) response = np.zeros((n_sample, self.args.max_sentence_len), dtype=np.int32) response_length = np.zeros((n_sample), dtype=np.int32) # post_triple = np.zeros((n_sample, self.args.max_sentence_len), dtype=np.int32) triple = np.zeros((n_sample, self.args.max_sentence_len, self.args.max_triple_len, 3), dtype=np.int32) entity = np.zeros((n_sample, self.args.max_sentence_len, self.args.max_triple_len), dtype=np.int32) response_triple = np.zeros( (n_sample, self.args.max_sentence_len, 3), dtype=np.int32) max_post_len, max_response_len, max_triple_len = 0, 0, 0 with jsonlines.open(filename) as df: for i, line in enumerate(df): pl, rl = len(line['post']) + 2, len(line['response']) + 2 post_length[i] = pl response_length[i] = rl max_post_len = max(pl, max_post_len) max_response_len = max(rl, max_response_len) max_triple_len = max([len(l) for l in line['all_triples']] + [max_triple_len]) all_triples = [ line['all_triples'][i - 1] if i > 0 else [-1] for i in line['post_triples'] ] post[i, :pl] = [SOS_IDX] + [ self.get_word_idx(p) for p in line['post'] ] + [EOS_IDX] response[i, :rl] = [SOS_IDX] + [ self.get_word_idx(r) for r in line['response'] ] + [EOS_IDX] # post_triple[i, 1:pl-1] = np.array(line['post_triples']) # [0, 0, 1, 0, 2...] 
response_triple[i, :rl] = [NAF_TRIPLE] + [ transform_triple_to_hrt(rt) for rt in line['response_triples'] ] + [NAF_TRIPLE] # put NAF_TRIPLE/entity at index 0 triple[i] = pad_2d( [[NAF_TRIPLE]] + [[transform_triple_to_hrt(t) for t in triples] for triples in all_triples] + [[NAF_TRIPLE]], length=(self.args.max_sentence_len, self.args.max_triple_len, 3)) entity[i] = pad_2d( [[NAF_IDX]] + [[self.entidx2wordidx[e] for e in entities] for entities in line['all_entities']] + [[NAF_IDX]], length=(self.args.max_sentence_len, self.args.max_triple_len)) # dump to zarr root['post'][start_i:start_i + n_sample] = post root['post_length'][start_i:start_i + n_sample] = post_length root['response'][start_i:start_i + n_sample] = response root['response_length'][start_i:start_i + n_sample] = response_length # root['post_triple'][start_i : start_i+n_sample] = post_triple root['triple'][start_i:start_i + n_sample] = triple root['entity'][start_i:start_i + n_sample] = entity root['response_triple'][start_i:start_i + n_sample] = response_triple return max_post_len, max_response_len, max_triple_len toread = [ f'{self.data_path}/{data_name}set_pieces/{piece}' for piece in os.listdir(f'{self.data_path}/{data_name}set_pieces') ] n_lines = sum([line_count(piece) for piece in toread]) init_n_lines = math.ceil( n_lines / n_chunk) * n_chunk # 마지막 조각 사이즈가 지정된 청크 사이즈보다 작아져서 나는 에러 방지 root = zarr.open(f'{self.data_path}/{data_name}set_new.zarr', mode='w') post = root.zeros('post', shape=(init_n_lines, self.args.max_sentence_len), chunks=(n_chunk, None), dtype='i4') post_length = root.zeros('post_length', shape=(init_n_lines, ), chunks=(n_chunk, ), dtype='i4') # valid length (without pad) response = root.zeros('response', shape=(init_n_lines, self.args.max_sentence_len), chunks=(n_chunk, None), dtype='i4') response_length = root.zeros('response_length', shape=(init_n_lines, ), chunks=(n_chunk, ), dtype='i4') post_triple = root.zeros('post_triple', shape=(init_n_lines, self.args.max_sentence_len), chunks=(n_chunk, None), dtype='i4') triple = root.zeros('triple', shape=(init_n_lines, self.args.max_sentence_len, self.args.max_triple_len, 3), chunks=(n_chunk, None, None, None), dtype='i4') entity = root.zeros('entity', shape=(init_n_lines, self.args.max_sentence_len, self.args.max_triple_len), chunks=(n_chunk, None, None), dtype='i4') response_triple = root.zeros('response_triple', shape=(init_n_lines, self.args.max_sentence_len, 3), chunks=(n_chunk, None, None), dtype='i4') pool = Pool(min(len(toread), mp.cpu_count())) func = functools.partial(process_file, root) iterinp = [(i * self.args.data_piece_size, filename) for i, filename in enumerate(toread)] max_post_lens, max_response_lens, max_triple_lens = zip( *tqdm(pool.imap(func, iterinp), total=len(iterinp))) max_post_len, max_response_len, max_triple_len = max( max_post_lens), max(max_response_lens), max(max_triple_lens) # trim remaining space post.resize(n_lines, max_post_len) post_length.resize(n_lines) response.resize(n_lines, max_response_len) response_length.resize(n_lines) post_triple.resize(n_lines, max_post_len) triple.resize(n_lines, max_post_len, max_triple_len, 3) entity.resize(n_lines, max_post_len, max_triple_len) response_triple.resize(n_lines, max_response_len, 3) print( f'Dumped {data_name} at: {self.data_path}/{data_name}set_new.zarr')
def ice_archive(d18Oice, pr_ann, tas_ann, psl_ann, nproc=8): ''' Accounts for diffusion and compaction in the firn. Args: d18Oice (1d array: year in int): annualizd d18O of ice [permil] pr_ann (1d array: year in int): precipitation rate [kg m-2 s-1] tas_ann (1d array: year in int): annualizd atomspheric temerature [K] psl_ann (1d array: year in int): annualizd sea level pressure [Pa] nproc (int): the number of processes for multiprocessing Returns: ice_diffused (1d array: year in int): archived ice d18O [permil] ''' # ====================================================================== # A.0: Initialization # ====================================================================== # accumulation rate [m/yr] # note that the unit of pr_ann is [kg m-2 s-1], so need to divide by density [kg m-3] and convert the time yr2sec_factor = 3600*24*365.25 accum = pr_ann/1000*yr2sec_factor # depth horizons (accumulation per year corresponding to depth moving down-core) bdown = accum[::-1] bmean = np.mean(bdown) depth = np.sum(bdown) depth_horizons = np.cumsum(bdown) dz = np.min(depth_horizons)/10. # step in depth [m] Tmean = np.mean(tas_ann) # unit in [K] Pmean = np.mean(psl_ann)*9.8692e-6 # unit in [Atm] # contants rho_s = 300. # kg/m^3, surface density rho_d = 822. # kg/m^2, density at which ice becomes impermeable to diffusion rho_i = 920. # kg/m^3, density of solid ice # ====================================================================== # A.1: Compaction Model # ====================================================================== z = np.arange(0, depth, dz) + dz # linear depth scale # set density profile by calling densification function rho, zieq, t = densification(Tmean, bmean, rho_s, z) rho = rho[:len(z)] # cutoff the end time_d = np.cumsum(dz/bmean*rho/rho_i) ts = time_d*yr2sec_factor # convert time in years to ts in seconds # integrate diffusivity along the density gradient to obtain diffusion length D = diffusivity(rho, Tmean, Pmean, rho_d, bmean) D = D[:-1] rho = rho[:-1] diffs = np.diff(z)/np.diff(time_d) diffs = diffs[:-1] # Integration using the trapezoidal method # IMPORTANT: once the ice reaches crtiical density (solid ice), there will no longer # be any diffusion. There is also numerical instability at that point. Set Sigma=1E-13 for all # points below that threshold. # Set to 915 to be safe. solidice = np.where(rho >= rho_d-5.0) diffusion = np.where(rho < rho_d-5.0) dt = np.diff(ts) sigma_sqrd_dummy = 2*np.power(rho, 2)*dt*D sigma_sqrd = integrate.cumtrapz(sigma_sqrd_dummy) diffusion_array = diffusion[0] diffusion_array = diffusion_array[diffusion_array < len(sigma_sqrd)] # fzhu: to avoid the boundary index error diffusion = np.array(diffusion_array) # rho=rho[0:-1] # modified by fzhu to fix inconsistency of array size # sigma=np.zeros((len(rho)+1)) # modified by fzhu to fix inconsistency of array size sigma = np.zeros((len(rho))) sigma[diffusion] = np.sqrt(1/np.power(rho[diffusion],2)*sigma_sqrd[diffusion]) # modified by fzhu to fix inconsistency of array size #sigma[solidice]=np.nanmax(sigma) #max diffusion length in base of core // set in a better way. max(sigma) sigma[solidice] = sigma[diffusion][-1] sigma = sigma[:-1] # ====================================================================== # A.2. Diffusion Profile # ====================================================================== # Load water isotope series del18 = np.flipud(d18Oice) # NOTE YOU MIGHT NOT NEED FLIP UD here. Our data goes forward in time. 
# interpolate over depths to get an array of dz values corresponding to isotope values for convolution/diffusion iso_interp = np.interp(z, depth_horizons, del18) # Return a warning if the kernel length is approaching 1/2 that of the timeseries. # This will result in spurious numerical effects. zp = np.arange(-100, 100, dz) if (len(zp) >= 0.5*len(z)): print("Warning: convolution kernel length (zp) is approaching that of half the length of timeseries. Kernel being clipped.") bound = 0.20*len(z)*dz zp = np.arange(-bound, bound, dz) # print('start for loop ...') # start_time = time.time() rm = np.nanmean(iso_interp) cdel = iso_interp-rm diffused_final = np.zeros(len(iso_interp)) if nproc == 1: for i in tqdm(range(len(sigma))): sig = sigma[i] part1 = 1./(sig*np.sqrt(2.*np.pi)) part2 = np.exp(-zp**2/(2*sig**2)) G = part1*part2 # diffused = np.convolve(G, cdel, mode='same')*dz # fzhu: this is way too slow diffused = signal.fftconvolve(cdel, G, mode='same')*dz # put cdel in the front to keep the same length as before diffused += rm # remove mean and then put back diffused_final[i] = diffused[i] else: # print('Multiprocessing: nproc = {}'.format(nproc)) def conv(sig, i): part1 = 1./(sig*np.sqrt(2.*np.pi)) part2 = np.exp(-zp**2/(2*sig**2)) G = part1*part2 diffused = signal.fftconvolve(cdel, G, mode='same')*dz diffused += rm # remove mean and then put back return diffused[i] res = Pool(nproc).map(conv, sigma, range(len(sigma))) diffused_final[:len(res)] = np.array(res) # print('for loop: {:0.2f} s'.format(time.time()-start_time)) # take off the first few and last few points used in convolution diffused_timeseries = diffused_final[0:-3] # Now we need to pack our data back into single year data units based on the depths and year interpolated data final_iso = np.interp(depth_horizons, z[0:-3], diffused_timeseries) ice_diffused = final_iso return ice_diffused
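# A quick synthetic-data sketch of calling ice_archive; all values below are
# invented placeholders chosen only to have plausible units (permil,
# kg m-2 s-1, K, Pa), not data from the source.
import numpy as np

# nyr = 200
# d18O_synth = -35.0 + np.random.randn(nyr)      # permil
# pr_synth = np.full(nyr, 1.0e-5)                # kg m-2 s-1
# tas_synth = np.full(nyr, 243.0)                # K
# psl_synth = np.full(nyr, 1.0e5)                # Pa
# diffused = ice_archive(d18O_synth, pr_synth, tas_synth, psl_synth, nproc=4)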
    time.sleep(.05)
    result = {'ID': x[0], 'sepArea': x[1] * x[2], 'petArea': x[3] * x[4]}
    if verbose:
        print(f'done with: {x[0]}')
    return result


data = pd.read_csv('https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv')
data.reset_index(inplace=True)  # Use idx as identifier

## Convert dataset to tuple with names
irises = tuple(data.itertuples(name='Iris', index=False))
tIris = [tuple(_) for _ in irises]

### Multiprocessing implementation
pool4 = Pool(4)
pool8 = Pool(8)
pool16 = Pool(16)

## Map function over
print('4\n')
start = time.time()
resParallel = pool4.map(getAreaSlowTuples, tIris)
end = time.time()
print(f'time: {end-start:.4f}s\n')

## 8
print('8\n')
start = time.time()
resParallel = pool8.map(getAreaSlowTuples, tIris)
end = time.time()
def reflection_loss(data=None, f_set=None, d_set=None, **kwargs): """ The reflection_loss (RL) function calculates the RL based on the mapping passed through as the grid variable, done either through multiprocessing or through the python built-in map() function. The RL function always uses the interpolation function, even though as the function passes through the points associated with the input data, solving for the function at the associated frequencies yields the data point. This is simply for simplicity. ref: https://doi.org/10.1016/j.jmat.2019.07.003 --------------------------------------- :: :param data: (data) Permittivity and Permeability data of Nx5 dimensions. Can be a string equivalent to the directory and file name of either a .csv or .xlsx of Nx5 dimensions. Text above and below data array will be automatically avoided by the program (most network analysis instruments report data which is compatible with the required format) --------------------------------------- :: :param f_set: (start, end, [step]) tuple for frequency values in GHz - if given as list of len 3, results are interpolated - if given as list of len 2, results are data-derived with the calculation bound by the given start and end frequencies - if f_set is None, frequency is bound to input data --------------------------------------- :: :param d_set: (start, end, step) tuple for thickness values in mm. - if d_set is of type list, then the thickness values calculated will only be of the values present in the list. --------------------------------------- :: :param kwargs: interp= ('cubic'); 'linear' Method for interpolation. Set to linear if user wants to linear interp instead of cubic spline. Default action uses cubic spline. --------------------------------------- :: :param kwargs: override= (None); 'chi zero', 'eps set' provides response simulation functionality within libRL, common for discerning which EM parameters are casual for reflection loss. 'chi zero' sets mu = (1 - j*0). 'eps set' sets epsilon = (avg(e1)-j*0). --------------------------------------- :: :param kwargs: multiprocessing= (False); True, 0, 1, 2, ... Method for activating multiprocessing functionality for faster run times. This kwarg takes integers and booleans. Set variable to True or 0 to use all available nodes. Pass an integer value to use (int) nodes. Will properly handle 'False' as an input though it's equivalent to not even designating the particular kwarg. NOTE: if you use the multiprocessing functionality herein while on a Windows computer you ***MUST MUST MUST MUST*** provide main module protection via the :code:`if __name__ == "__main__":` conditional so to negate infinite spawns. --------------------------------------- :: :param kwargs: quick_graph= (False); True, str() Saves a *.png graphical image to a specified location. If set to True, the quick_graph function saves the resulting graphical image to the location of the input data as defined by the data input (assuming that the data was input via a location string. If not, True throws an assertion error). The raw string of a file location can also be passed as the str() argument, if utilized then the function will save the graph at the specified location. --------------------------------------- :: :param kwargs: as_dataframe=: (False); True returns data in a pandas dataframe. This is particularly useful if multicolumn is also set to true. 
--------------------------------------- :: :param kwargs: multicolumn=: (False); True outputs data in multicolumn form with a numpy array of [RL, f, d] iterated over each of the three columns. - if as_dataframe is used, then return value will be a pandas dataframe with columns of name d and indexes of name f. --------------------------------------- :: :return: [RL, f, d] returns Nx3 data set of [RL, f, d] by default - if multicolumn=True, an NxM dataframe with N rows for the input frequency values and M columns for the input thickness values, with pandas dataframe headers/indexes of value f/d respectively. """ # data is refactored into a Nx5 numpy array by the file_refactor # function from 'refactoring.py' if 'quick_graph' in kwargs and kwargs['quick_graph'] is True: kwargs['quick_graph'] = refactoring.qgref(data) data = refactoring.file_refactor(data) # acquire the desired interpolating functions from 'refactoring.py' e1f, e2f, mu1f, mu2f = refactoring.interpolate(data, **kwargs) # refactor the data sets in accordance to refactoring protocols # in 'refactoring.py' f_set = refactoring.f_set_ref(f_set, data) d_set = refactoring.d_set_ref(d_set) # construct a data grid for mapping from refactored data sets # d *must* be first as list comprehension cycles through f_set # for each d value, and this is deterministic of the structure # of the resultant. grid = array([(m, n) for n in d_set for m in f_set], dtype=float64) # just a constant j = cmath.sqrt(-1) def gamma(grid): f = grid[0] d = grid[1] # I know, it's super ugly. y = (20 * cmath.log10( (abs(((1 * (cmath.sqrt((mu1f(f) - j * mu2f(f)) / (e1f(f) - cmath.sqrt(-1) * e2f(f)))) * (cmath.tanh(j * (2 * cmath.pi * (f * 10**9) * (d * 0.001) / 299792458) * cmath.sqrt( (mu1f(f) - j * mu2f(f)) * (e1f(f) - j * e2f(f)))))) - 1) / ((1 * (cmath.sqrt( (mu1f(f) - j * mu2f(f)) / (e1f(f) - j * e2f(f)))) * (cmath.tanh(j * (2 * cmath.pi * (f * 10**9) * (d * 0.001) / 299792458) * cmath.sqrt( (mu1f(f) - j * mu2f(f)) * (e1f(f) - j * e2f(f)))))) + 1))))) # return inputted data for documentation and return # the real portion of y to drop complex portion # of form j*0 return y.real, f, d # if multiprocessing is given as True or as # a zero integer, use all available nodes # if multiprocessing is given and is a non-zero # integer, use int value for number of nodes # if multiprocessing is given as False (for some # reason?), or anything else, ignore it. # returns res of Zx3 data where Z is the product # of len(f_set) and len(d_set) if 'multiprocessing' in kwargs and isinstance(kwargs['multiprocessing'], int) is True: if kwargs['multiprocessing'] is True or kwargs['multiprocessing'] is 0: res = array(Pool().map(gamma, grid)) elif kwargs['multiprocessing'] > 0: res = array(Pool(nodes=kwargs['multiprocessing']).map(gamma, grid)) else: res = array(list(map(gamma, grid))) else: res = array(list(map(gamma, grid))) # takes data derived from computation and the file directory string and # generates a graphical image at the at location. 
if 'quick_graph' in kwargs and isinstance(kwargs['quick_graph'], str) is True: quick_graphs.quick_graph_reflection_loss( results=res, location=kwargs['quick_graph']) # formatting option, sometimes professors # like 3 columns for each thickness value if 'multicolumn' in kwargs and kwargs['multicolumn'] is True: # get frequency values from grid so # to normalize the procedure due to the # various frequency input methods gridInt = int(grid.shape[0] / d_set.shape[0]) # zero-array of NxM where N is the frequency # values and M is 3 times the # number of thickness values MCres = zeros((gridInt, d_set.shape[0] * 3)) # map the Zx3 result array to the NxM array for i in arange(int(MCres.shape[1] / 3)): MCres[:, 3 * i:3 * i + 3] = res[i * gridInt:(i + 1) * gridInt, 0:3] # stick the MultiColumn Array in the place of the results array res = MCres if 'as_dataframe' in kwargs and kwargs['as_dataframe'] is True: if 'multicolumn' in kwargs and kwargs['multicolumn'] is True: res = DataFrame(res[:, ::3]) res.columns = list(d_set) res.index = list(f_set) else: res = DataFrame(res) res.columns = ['RL', 'f', 'd'] return res
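# An example call matching the kwargs documented in the docstring above; the
# file name and the frequency/thickness ranges are placeholders, not values
# from the source.
# results = reflection_loss(
#     data='measured_parameters.csv',
#     f_set=(1, 18, 0.1),      # GHz; len-3 tuple -> interpolated
#     d_set=(0.5, 5, 0.1),     # mm
#     multiprocessing=True,    # use all available nodes
#     multicolumn=True,
#     as_dataframe=True,
# )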
def germline_filter(processes, germline_path, cells_path, metadata_path, out_path): """ filter out common SNPs/indels between germline samples and samples of interest """ germline_path = Path(germline_path) cells_path = Path(cells_path) metadata_path = Path(metadata_path) out_path = Path(out_path) metadata_df = pd.read_csv(metadata_path) # Create a set of all patient IDs from the metadata file. all_patient_ids = set(metadata_df["patient_id"]) def process_patient(patient_id): # Find all non-tumor bulk VCF files for the patient ID. germline_wb_vcf_paths = list( germline_path.glob(patient_id + "_*_*.vcf")) # Fetch all cell IDs associated with the patient ID. cell_ids = metadata_df.loc[metadata_df["patient_id"] == patient_id]["cell_id"] # Use the cell IDs to create a list of all single-cell VCF files for the patient. cell_vcf_paths = [(cells_path / cell_id).with_suffix(".vcf") for cell_id in cell_ids] # Create a genome interval tree for the patient's germline bulk VCF # data. Only selects one germline VCF to avoid over-filtering for # patients with multiple germline VCFs. germline_tree = create_germline_genome_tree(germline_wb_vcf_paths[0:1]) def process_cell(cell_vcf_path): if not cell_vcf_path.exists(): return # If there were any germline VCFs for this patient, append `GF_` to # the file name to indicate that the output VCF was # germline-filtered, not just dbSNP-filtered. out_name_prefix = "" if len(germline_wb_vcf_paths) < 1 else "GF_" out_vcf_path = out_path / (out_name_prefix + cell_vcf_path.name) with open(cell_vcf_path, mode='r') as in_file: with open(out_vcf_path, mode='w') as out_file: write_filtered_vcf(in_file, germline_tree, out_file) # TODO: Maybe remove this in Python 3.8. # This thread pool max-worker count is from the implementation in # Python 3.8. Assuming that Pathos adopts the same semantics, # this can be removed. with ThreadPool(min(32, os.cpu_count() + 4)) as pool: pool.map(process_cell, cell_vcf_paths) print("Running germline filter...") if processes > 1: with Pool(processes) as pool: list( tqdm(pool.imap(process_patient, all_patient_ids), total=len(all_patient_ids), smoothing=0.01)) else: list(map(process_patient, tqdm(all_patient_ids, smoothing=0.1))) print("Done!")
def use_smac(emotion):
    scenario = Scenario({
        'run_obj': 'quality',
        'runcount-limit': 200,
        "cs": make_cs(),
        "deterministic": "true",
        "shared_model": True,
        "input_psmac_dirs": "smac3-output*",
        "seed": np.random.RandomState()
    })
    smac = SMAC(scenario=scenario,
                rng=np.random.RandomState(42),
                tae_runner=forest_from_cfg)
    incumbent = smac.optimize()
    # joblib.dump(RandomForestClassifier(**incumbent), '{0}_smac_optimized_random_forest.pkl'.format(emotion))
    inc_value = forest_from_cfg(incumbent, emotion)
    out_writer.write("Optimized Value for {0}: {1}".format(emotion, inc_value))
    out_writer.write('\n' + '\n')
    out_writer.write(str(incumbent))


if __name__ == '__main__':
    OpenDir = sys.argv[sys.argv.index('-d') + 1]
    os.chdir(OpenDir)
    print("Optimizing")
    out_file = 'smac.txt'
    # open the results file for writing; use_smac appends its results to it
    with open(out_file, 'w') as out_writer:
        Pool(len(emotion_list())).map(use_smac, emotion_list())
def _power_bandwidth_variance(spectral_data, l0, dl, w, ncores=8): ''' Calculated the power bandwidth optimization parameter for the given spectral data. Args: spectral_data : the spectrum to calculate Delta^op for, two columns, first column is wavelength, second column is normalized spectral data. Will calculate Delta^op over the whole range of the spectrum. Spectrum should be fairly free of noise, filter noisy data first. w : a numpy array containing the peak widths to calculate lstep : the resolution of lambda_0 in nm, default 2 nm dlmin : the minimum Delta lambda to calculate dlmax :the maximum Delta lambda to calculate dlN : the number of values to calculate Delta lambda for Returns: An array with parameters and calculated values in the form: [l0, dl, w, Delta^op] Where l0, dl, w are all parameters, and Delta^op is the power bandwidth optimization parameter respectively as numpy arrays with dl for rows, l0 for columns and w for the third axis (2D array is only one values of w is given) ''' sx = spectral_data[:, 0] sy = spectral_data[:, 1] cols = len(l0) rows = len(dl) N = len(w) ua = np.zeros((rows, cols, N)) ub = np.zeros((rows, cols, N)) du = np.zeros((rows, cols, N)) spectrum = interp1d(sx, sy, kind='cubic', bounds_error=False, fill_value=np.min(sy)) args_array = [] for i in range(rows): args_array.append([]) for j in range(cols): args_array[i].append([l0[j], dl[i]]) int = _integrator(w[0], spectrum) # Loop over values of w t0 = timer() for i in range(N): ts = timer() int.setw(w[i]) ua[:, :, i] = _multiprocess2D(int.ua_integral, args_array, ncores=ncores, display=False) ub[:, :, i] = _multiprocess2D(int.ub_integral, args_array, ncores=ncores, display=False) for ii in range(rows): for jj in range(cols): du[ii, jj, i] = np.abs(ua[ii, jj, i] - ub[ii, jj, i]) tf = timer() _print( str(i + 1) + '/' + str(N) + ' complete ' + str(datetime.timedelta(seconds=tf - ts))) Pool(nodes=ncores).clear( ) # Because pathos is designed to leave Pools running, and sometimes doesn't get rid of them after the caluculation is complete _print('Calculations Complete in ' + str(datetime.timedelta(seconds=tf - t0))) return [l0, dl, w, du]
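# A hedged usage sketch for _power_bandwidth_variance; the toy spectrum and the
# l0/dl/w grids below are invented placeholders, not parameters from the
# source.
# import numpy as np
# wl = np.linspace(400, 900, 2000)                  # wavelength [nm]
# spec = np.exp(-((wl - 650.0) / 60.0) ** 2)        # normalized toy spectrum
# spectral_data = np.column_stack([wl, spec])
# l0 = np.arange(500.0, 801.0, 2.0)                 # center wavelengths [nm]
# dl = np.linspace(5.0, 100.0, 40)                  # bandwidths [nm]
# w = np.array([1.0, 2.0])                          # peak widths
# l0_, dl_, w_, du = _power_bandwidth_variance(spectral_data, l0, dl, w, ncores=8)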
def fit(self, X, y): """Extract shapelets from the provided timeseries and labels. Parameters ---------- X : array-like, shape = [n_ts, ] The training input timeseries. Each timeseries must be an array, but the lengths can be variable y : array-like, shape = [n_samples] The target values. """ # If y is a 1D list, convert it to a 2D column np array if type(y) is list or len(y.shape) == 1: y = np.reshape(y, (-1, 1)) # Sci-kit learn checks check_array(X) check_array(y) # Determine the minimum and maximum shapelet length min_len = 4 max_len = min([len(x) for x in X]) # We will try to maximize the negative logloss of LR in CV. # In the case of ties, we pick the one with least number of shapelets weights = (1.0, -1.0) creator.create("FitnessMax", base.Fitness, weights=weights) # Individual are lists (of shapelets (list)) creator.create("Individual", list, fitness=creator.FitnessMax) def random_shapelet(n_shapelets): """Extract a random subseries from the training set""" shaps = [] for _ in range(n_shapelets): rand_row = np.random.randint(X.shape[0]) rand_length = np.random.randint(min_len, max_len) rand_col = np.random.randint(X.shape[1] - rand_length) shaps.append(X[rand_row, rand_col:rand_col + rand_length]) if n_shapelets > 1: return np.array(shaps) else: return np.array(shaps[0]) def motif(n_shapelets, n_draw=100): """Extract some motifs from sampled timeseries""" shaps = [] for _ in range(n_shapelets): rand_length = np.random.randint(min_len, max_len) subset_idx = np.random.choice(range(len(X)), size=n_draw, replace=True) ts = X[subset_idx, :].flatten() matrix_profile, _ = mstamp_stomp(ts, rand_length) motif_idx = matrix_profile[0, :].argsort()[-1] shaps.append(ts[motif_idx:motif_idx + rand_length]) if n_shapelets > 1: return np.array(shaps) else: return np.array(shaps[0]) def kmeans(n_shapelets, shp_len, n_draw=1000): """Sample subseries from the timeseries and apply K-Means on them""" # Sample `n_draw` subseries of length `shp_len` n_ts, sz = X.shape indices_ts = np.random.choice(n_ts, size=n_draw, replace=True) start_idx = np.random.choice(sz - shp_len + 1, size=n_draw, replace=True) end_idx = start_idx + shp_len subseries = np.zeros((n_draw, shp_len)) for i in range(n_draw): subseries[i] = X[indices_ts[i], start_idx[i]:end_idx[i]] tskm = TimeSeriesKMeans(n_clusters=n_shapelets, metric="euclidean", verbose=False) return tskm.fit(subseries).cluster_centers_[0] def create_individual(n_shapelets=None): """ Generate a random shapelet set """ if n_shapelets is None: n_shapelets = 1 rand = np.random.random() if rand < 1. / 3.: return [motif(n_shapelets)] elif 1. / 3. < rand < 2. 
/ 3.: return [ kmeans(n_shapelets, np.random.randint(min_len, max_len)) ] else: return [random_shapelet(n_shapelets)] def cost(shapelets): """ Calculate the fitness of an individual/shapelet set""" start = time.time() D = np.zeros((len(X), len(shapelets))) for k in range(len(X)): ts = X[k, :] for j in range(len(shapelets)): if self.normed: dist = util.sdist(shapelets[j].flatten(), ts) else: dist = util.sdist_no_norm(shapelets[j].flatten(), ts) D[k, j] = dist lr = LogisticRegression() skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=1337) preds = cross_val_predict(lr, D, y, method='predict_proba', cv=skf) cv_score = -log_loss(y, preds) return (cv_score, sum([len(x) for x in shapelets])) def add_noise(shapelets): """Add random noise to a random shapelet""" rand_shapelet = np.random.randint(len(shapelets)) tools.mutGaussian(shapelets[rand_shapelet], mu=0, sigma=0.1, indpb=0.15) return shapelets, def remove_shapelet(shapelets): """Remove a random shapelet from the individual""" if len(shapelets) > 1: rand_shapelet = np.random.randint(len(shapelets)) shapelets.pop(rand_shapelet) return shapelets, def merge_crossover(ind1, ind2): """ Merge shapelets from one set with shapelets from the other """ # Construct a pairwise similarity matrix using GAK _all = list(ind1) + list(ind2) similarity_matrix = cdist_gak(ind1, ind2, sigma=sigma_gak(_all)) # Iterate over shapelets in `ind1` and merge them with shapelets # from `ind2` for row_idx in range(similarity_matrix.shape[0]): # Remove all elements equal to 1.0 mask = similarity_matrix[row_idx, :] != 1.0 non_equals = similarity_matrix[row_idx, :][mask] if len(non_equals): # Get the timeseries most similar to the one at row_idx max_col_idx = np.argmax(non_equals) ts1 = list(ind1[row_idx]).copy() ts2 = list(ind2[max_col_idx]).copy() # Merge them and remove nans ind1[row_idx] = euclidean_barycenter([ts1, ts2]) ind1[row_idx] = ind1[row_idx][~np.isnan(ind1[row_idx])] # Apply the same for the elements in ind2 for col_idx in range(similarity_matrix.shape[1]): mask = similarity_matrix[:, col_idx] != 1.0 non_equals = similarity_matrix[:, col_idx][mask] if len(non_equals): max_row_idx = np.argmax(non_equals) ts1 = list(ind1[max_row_idx]).copy() ts2 = list(ind2[col_idx]).copy() ind2[col_idx] = euclidean_barycenter([ts1, ts2]) ind2[col_idx] = ind2[col_idx][~np.isnan(ind2[col_idx])] return ind1, ind2 def point_crossover(ind1, ind2): """ Apply one- or two-point crossover on the shapelet sets """ if len(ind1) > 1 and len(ind2) > 1: if np.random.random() < 0.5: ind1, ind2 = tools.cxOnePoint(list(ind1), list(ind2)) else: ind1, ind2 = tools.cxTwoPoint(list(ind1), list(ind2)) return ind1, ind2 # Register all operations in the toolbox toolbox = base.Toolbox() if self.n_jobs > 1: pool = Pool(self.n_jobs) toolbox.register("map", pool.map) else: toolbox.register("map", map) # Register all our operations to the DEAP toolbox toolbox.register("merge", merge_crossover) toolbox.register("cx", point_crossover) toolbox.register("mutate", add_noise) toolbox.register("remove", remove_shapelet) toolbox.register("individual", tools.initIterate, creator.Individual, create_individual) toolbox.register("population", tools.initRepeat, list, toolbox.individual) toolbox.register("evaluate", cost) # Small tournaments to ensure diversity toolbox.register("select", tools.selTournament, tournsize=3) # Set up the statistics. 
We will measure the mean, std dev and max stats = tools.Statistics(key=lambda ind: ind.fitness.values[0]) stats.register("avg", np.mean) stats.register("std", np.std) stats.register("max", np.max) # Initialize the population and calculate their initial fitness values pop = toolbox.population(n=self.population_size) fitnesses = list(map(toolbox.evaluate, pop)) for ind, fit in zip(pop, fitnesses): ind.fitness.values = fit # Keep track of the best iteration, in order to do stop after `wait` # generations without improvement it, best_it = 1, 1 best_ind = [] best_score = float('-inf') # Set up a matplotlib figure and set the axes height = int(np.ceil(self.population_size / 4)) if self.plot is not None and self.plot != 'notebook': if self.population_size <= 20: f, ax = plt.subplots(4, height, sharex=True) else: plt.figure(figsize=(15, 5)) plt.xlim([0, len(X[0])]) # The genetic algorithm starts here while it <= self.iterations and it - best_it < self.wait: gen_start = time.time() # Clone the population into offspring offspring = list(map(toolbox.clone, pop)) # Plot the fittest individual of our population if self.plot is not None: if self.population_size <= 20: if self.plot == 'notebook': f, ax = plt.subplots(4, height, sharex=True) for ix, ind in enumerate(offspring): ax[ix // height][ix % height].clear() for s in ind: ax[ix // height][ix % height].plot( range(len(s)), s) plt.pause(0.001) if self.plot == 'notebook': plt.show() else: plt.clf() for shap in best_ind: plt.plot(range(len(shap)), shap) plt.pause(0.001) # Iterate over all individuals and apply CX with certain prob for child1, child2 in zip(offspring[::2], offspring[1::2]): try: if np.random.random() < self.crossover_prob: toolbox.merge(child1, child2) del child1.fitness.values del child2.fitness.values if np.random.random() < self.crossover_prob: toolbox.cx(child1, child2) del child1.fitness.values del child2.fitness.values except: raise # Apply mutation to each individual for idx, indiv in enumerate(offspring): if np.random.random() < self.add_noise_prob: toolbox.mutate(indiv) del indiv.fitness.values if np.random.random() < self.remove_shapelet_prob: toolbox.remove(indiv) del indiv.fitness.values # Update the fitness values invalid_ind = [ind for ind in offspring if not ind.fitness.valid] fitnesses = toolbox.map(toolbox.evaluate, invalid_ind) for ind, fit in zip(invalid_ind, fitnesses): ind.fitness.values = fit # Replace population and update hall of fame & statistics new_pop = toolbox.select(offspring, self.population_size - 1) fittest_ind = tools.selBest(pop + offspring, 1) pop[:] = new_pop + fittest_ind it_stats = stats.compile(pop) # Print our statistics if self.verbose: if it == 1: print('it\t\tavg\t\tstd\t\tmax\t\ttime') print('{}\t\t{}\t\t{}\t\t{}\t{}'.format( it, np.around(it_stats['avg'], 4), np.around(it_stats['std'], 3), np.around(it_stats['max'], 6), np.around(time.time() - gen_start, 4), )) # Have we found a new best score? if it_stats['max'] > best_score: best_it = it best_score = it_stats['max'] best_ind = tools.selBest(pop + offspring, 1) it += 1 self.shapelets = np.array(best_ind[0])
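# Hedged usage sketch for the genetic shapelet search implemented in fit()
# above. `ShapeletExtractor` is a placeholder name for whichever class owns
# this method; the constructor arguments mirror the attributes fit() reads
# (population_size, iterations, wait, crossover_prob, add_noise_prob,
# remove_shapelet_prob, normed, n_jobs, plot, verbose), though the real class
# may name or default them differently.
import numpy as np

X = np.random.randn(50, 100)            # 50 toy timeseries of length 100
y = np.random.randint(0, 2, size=50)    # binary class labels

extractor = ShapeletExtractor(population_size=20, iterations=25, wait=10,
                              crossover_prob=0.5, add_noise_prob=0.3,
                              remove_shapelet_prob=0.3, normed=False,
                              n_jobs=1, plot=None, verbose=False)
extractor.fit(X, y)
print(extractor.shapelets)              # best shapelet set found by the GA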
# Make all of the folders for the tile for y in range(16): for x in range(16): os.makedirs(path + "/Tile" + str(tileNum) + "_" + str(y) + "_" + str(x), exist_ok=True) # Generate the new chunks for file in files: # Load image with 50px padding img = cv2.imread(file[0], -1) img = cv2.copyMakeBorder(img, 50, 50, 50, 50, cv2.BORDER_CONSTANT) for y in range(16): for x in range(16): i = (y * 50, x * 50) newImgPath = path + "/Tile" + str(tileNum) + "_" + str( y) + "_" + str(x) + "/" + file[1] split = img[i[0]:i[0] + NEWHEIGHT, i[1]:i[1] + NEWWIDTH] cv2.imwrite(newImgPath, split) if CREATE: p = Pool(NUMPROCESS) results = p.map(lambda a: processImg(a, "/dfc2021_dse_train/Train"), list(range(1, 61))) results = p.map(lambda a: processImg(a, "/dfc2021_dse_val/Val"), list(range(1, 20)))
def get_coin_buy_prices(api_urls: list):
    coins = list(get_coinnames())  # list changes infrequently, but may occasionally cause problems
    exchangerates = {}
    unfinished = []  # urls that weren't attempted due to a remote server disconnect
    blacklist = []
    results = []

    # TODO - Work on a proper add_coin_price progress algo
    def add_coin_price(url):
        exchangerate = ''  # declared up a scope!
        base = url[str(url).rfind('/') + 1:str(url).rfind('-')]  # rfinds start from 0 quirk to the rescue!
        try:
            print(datetime.datetime.now(), 'Progress ', 'url is', url)
            cachepath = 'cache/' + url[url.rfind('/') + 1:]
            with open(cachepath) as cache:
                print('Using Cache', cachepath)
                # Read the cache once; a second cache.read() would return an
                # empty string because the file pointer is already at the end.
                raw = cache.read()
                try:
                    exchangerate = json.loads(raw)
                except AttributeError:
                    exchangerate = json.loads(raw.decode())
                exchangerate = exchangerate['ticker']['buy']
                print('Base Is', base, 'Exchangerate is', exchangerate)
        except JSONDecodeError:
            exchangerate = 0.0
            return {'blacklist': url}
        # except (RemoteDisconnected, ProtocolError, ConnectionError):
        #     # urls not attempted because of a remote server disconnect
        #     exchangerate = 0.0
        #     return {'TODO': url}
        return {str(base): [{str(url): float(exchangerate)}]}

    with Pool() as p:
        # parse the urls for prices
        results = p.map(add_coin_price, api_urls)
    # exchangerates = dict(ChainMap(*results))
    print(results)

    # build the json file, by collating the "results" list of dicts into a json dict by key.
    def collate_by_coin(dict_key):
        flattened_dict = {dict_key: []}
        for result in results:
            try:
                if result[dict_key]:
                    # Each successful result is {base: [{url: rate}]}; pull out the
                    # url and rate (result.values is a method and cannot be indexed
                    # directly, as the original code tried to do).
                    url, rate = list(result[dict_key][0].items())[0]
                    # get the cross-exchange pair from the url
                    pair = url[url.rfind('-') + 1:url.rfind('.')]
                    flattened_dict[dict_key].append({pair: rate})
            except Exception:
                # key error .. continue to next iteration
                continue
        return dict(flattened_dict)

    def collate_by_unfinished(result: dict):
        try:
            # print('Unfinished Run Is', result)  # uncomment for debug
            if 'TODO' in result:
                print(result)
                return result['TODO']
        except Exception as e:
            print('unfinished result is', result)
            print(repr(e))

    def collate_by_blacklist(result: dict):
        try:
            # print('Unfinished Run Is', result)  # uncomment for debug
            if 'blacklist' in result:
                print(result)
                return result['blacklist']
        except Exception as e:
            print('blacklist result is', result)
            print(repr(e))

    # TODO - finish ccex.json
    with Pool() as p:
        # coins = list(dict(ChainMap(*results)).keys())
        exchangerateslist = p.map(collate_by_coin, coins)
    for exchangeratedict in exchangerateslist:
        exchangerates.update(exchangeratedict)

    # Collate By Unfinished
    with Pool() as p:
        unfinished = p.map(collate_by_unfinished, results)

    with Pool() as p:
        blacklist = p.map(collate_by_blacklist, results)

    return {
        'exchangerates': exchangerates,
        'unfinished': unfinished,
        'blacklist': blacklist
    }
def main():
    time0 = time.time()
    parser = ArgumentParser()
    parser.add_argument(
        '--years', dest='s_years', action='store', type=str,
        help='Give a list of years as a string, such as "1980,1981". Optional.')
    parser.add_argument(
        '--local', dest='do_local', action='store_true', default=False,
        help='Check for locally running plex server.')
    parser.add_argument(
        '--dirname', dest='dirname', action='store', type=str,
        default=os.getcwd(),
        help='Directory into which to store those plots. Default is %s.' % os.getcwd())
    parser.add_argument(
        '--noverify', dest='do_verify', action='store_false', default=True,
        help='If chosen, do not verify SSL connections.')
    args = parser.parse_args()

    # ## function to do the processing
    step = 0
    print('%d, started on %s' %
          (step, datetime.datetime.now().strftime('%B %d, %Y @ %I:%M:%S %p')))
    if args.s_years is not None:
        try:
            years = sorted(set(map(lambda tok: int(tok), args.s_years.split(','))))
        except:
            step += 1
            print('%d, did not give a valid set of years.' % step)
            years = []
    else:
        years = []

    # ## get plex server token
    dat = core.checkServerCredentials(doLocal=args.do_local, verify=args.do_verify)
    if dat is None:
        step += 1
        print('\n'.join([
            '%d, error, could not access local Plex server in %0.3f seconds. Exiting...'
            % (step, time.time() - time0),
            '%d, finished on %s.' %
            (step + 1, datetime.datetime.now().strftime('%B %d, %Y @ %I:%M:%S %p'))
        ]))
        return
    fullURL, token = dat

    # ## first find out which libraries are the TV show ones
    library_dict = core.get_libraries(token, fullURL=fullURL, do_full=True)
    if library_dict is None:
        step += 1
        print('\n'.join([
            '%d, error, could not access libraries in plex server in %0.3f seconds. Exiting...'
            % (step, time.time() - time0),
            '%d, finished on %s.' %
            (step + 1, datetime.datetime.now().strftime('%B %d, %Y @ %I:%M:%S %p'))
        ]))
        return
    #
    valid_keys = list(
        filter(lambda key: library_dict[key][-1] == 'show', library_dict))
    if len(valid_keys) == 0:
        step += 1
        print('\n'.join([
            # Arguments must match the format string: step first, then elapsed time.
            '%d, Error, could not find a TV show library in %0.3f seconds. Exiting...'
            % (step, time.time() - time0),
            '%d, finished on %s.' %
            (step + 1, datetime.datetime.now().strftime('%B %d, %Y @ %I:%M:%S %p'))
        ]))
        return
    tvlib_title = library_dict[max(valid_keys)][0]
    step += 1
    print('%d, found TV library: %s.' % (step, tvlib_title))

    # ## now get the TV shows
    tvdata = core.get_library_data(tvlib_title, token=token, fullURL=fullURL,
                                   num_threads=16)
    showsToExclude = tv.get_shows_to_exclude(tvdata)
    if len(showsToExclude) != 0:
        step += 1
        print('%d, excluding these TV shows: %s.' %
              (step, '; '.join(showsToExclude)))

    # ## now actual meat of the computation
    tvdata_date_dict = tv.get_tvdata_ordered_by_date(tvdata)
    min_year = min(tvdata_date_dict.keys()).year
    max_year = max(tvdata_date_dict.keys()).year
    possible_years_set = set(map(lambda date: date.year, tvdata_date_dict))
    step += 1
    if len(years) == 0:
        years = sorted(possible_years_set)
        print('%d, no years specified. We will use %s total: %s.' %
              (step, _print_years(len(years)),
               ', '.join(map(lambda year: '%d' % year, years))))
    else:
        cand_years = sorted(set(years) & possible_years_set)
        if len(cand_years) == 0:
            print('\n'.join([
                '%d, no intersection between the %s chosen (%s) and the %d years in the library.'
                % (step, _print_years(len(years)),
                   # The original was missing the map() call around this lambda.
                   ', '.join(map(lambda year: '%d' % year, years)),
                   len(possible_years_set)),
                'Instead, we will use %s total: %s.'
                % (_print_years(len(possible_years_set)),
                   ', '.join(map(lambda year: '%d' % year,
                                 sorted(possible_years_set))))
            ]))
            years = sorted(possible_years_set)
        else:
            print('%d, we found %s to use: %s.' %
                  (step, _print_years(len(cand_years)),
                   ', '.join(map(lambda year: '%d' % year, cand_years))))
            years = cand_years

    step += 1
    print('%d, started processing %s of TV shows after %0.3f seconds.' %
          (step, _print_years(len(years)), time.time() - time0))
    manager = Manager()
    shared_step = manager.Value('step', step)
    num_procced = manager.Value('nump', 0)
    lock = manager.RLock()
    pool = Pool(processes=cpu_count())

    def _process_year(year):
        tv.create_plot_year_tvdata(tvdata_date_dict, year, shouldPlot=True,
                                   dirname=args.dirname)
        lock.acquire()
        shared_step.value += 1
        num_procced.value += 1
        print('%d, finished processing year = %d (%02d / %02d) in %0.3f seconds.'
              % (shared_step.value, year, num_procced.value, len(years),
                 time.time() - time0))
        lock.release()

    _ = list(pool.map(_process_year, years))
    step = shared_step.value + 1
    print('\n'.join([
        '%d, processed all %s in %0.3f seconds.' %
        (step, _print_years(len(years)), time.time() - time0),
        '%d, finished everything on %s.' %
        (step + 1, datetime.datetime.now().strftime('%B %d, %Y @ %I:%M:%S %p'))
    ]))
    else:
        return None


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--scanner', dest='scanner', action='store', nargs=1,
                        default=None,
                        help='Run N4 correction on flair and t1 images from specific scanner (mg or uc)')
    args = parser.parse_args()
    # Guard against a missing --scanner argument: the default is None, which cannot be indexed.
    scanner = args.scanner[0] if args.scanner else None
    scan_keys = get_scans('subjects')
    if scanner == 'mg':
        commands = [bias_correct(scan_key + '/FLAIR-corrected.nii.gz', scanner, scan_key)
                    for scan_key in scan_keys]
        commands.extend([bias_correct(scan_key + '/MPRAGE-corrected-reorn.nii.gz', scanner, scan_key)
                         for scan_key in scan_keys])
    elif scanner == 'uc':
        commands = [bias_correct('/home/shengwei/work/vbm/uc/flair/raw/' + scan_key + '.nii.gz',
                                 scanner, scan_key)
                    for scan_key in scan_keys]
        commands.extend([bias_correct('/media/shengwei/BackupData/fmri/uc/150908/'
                                      + scan_key + '/t1-reorient.nii.gz', scanner, scan_key)
                         for scan_key in scan_keys])
    else:
        print('Type -h for usage, exiting...')
        exit(1)
    with Pool(cpu_count() - 2) as pool:
        for _ in tqdm(pool.imap(system, commands), total=len(commands)):
            pass
def ks_sampling_mem(X, seed=None, n_result=None, n_proc=4, n_batch=1000):
    """
    ks_sampling_mem(X, seed=None, n_result=None, n_proc=4, n_batch=1000)

    Kennard-Stone Full Sampling Program (with limited memory)

    If the user has enough memory, using `ks_sampling` instead of
    `ks_sampling_mem` is strongly recommended.

    This program can handle very large datasets. To keep the memory cost as
    low as possible, `n_batch` can be set to about sqrt(X.shape[0]) manually.
    However, if efficiency is the first priority, `n_batch` can be set as
    large as possible.

    NOTE! Only the Euclidean distance is available currently!

    Parameters
    ----------
    X: np.ndarray, shape: (n_sample, n_feature)
        Original data, need to be generated by user.
    seed: np.ndarray or list or None, shape: (n_seed, ), optional
        Initial selected seed. If set as `None`, the program will find the
        two samples which have the largest distance as the seed.
    n_result: int or None, optional
        Number of samples that should be selected. If set as `None`,
        `n_sample` will be used instead, i.e. select all data.
    n_proc: int, optional
        Number of Python's multiprocessing processors.
        NOTE! This variable only controls Python's code.
        NOTE! Only used in finding the maximum distance, not in KS sampling.
    n_batch: int, optional
        The dimension of the distance-matrix block evaluated in one processor.
    """
    X = np.asarray(X, dtype=float)
    n_sample = X.shape[0]
    if n_result is None:
        n_result = X.shape[0]
    # Find the most distant pair of samples if no seed is provided
    if seed is None or len(seed) == 0:
        t = np.einsum("ia, ia -> i", X, X)

        def get_dist_slice(sliceA, sliceB):
            distAB = t[sliceA, None] - 2 * X[sliceA] @ X[sliceB].T + t[None, sliceB]
            if sliceA == sliceB:
                np.fill_diagonal(distAB, 0)
            return np.sqrt(distAB)

        def get_maxloc_slice(slice_pair):
            dist_slice = get_dist_slice(slice_pair[0], slice_pair[1])
            max_indexes = np.unravel_index(np.argmax(dist_slice), dist_slice.shape)
            return (dist_slice[max_indexes],
                    max_indexes[0] + slice_pair[0].start,
                    max_indexes[1] + slice_pair[1].start)

        p = list(np.arange(0, n_sample, n_batch)) + [n_sample]
        slices = [slice(p[i], p[i + 1]) for i in range(len(p) - 1)]
        slice_pairs = [(slices[i], slices[j])
                       for i in range(len(slices))
                       for j in range(len(slices)) if i <= j]
        # Use a name distinct from the boundary list `p` above for clarity.
        with Pool(n_proc) as pool:
            maxloc_slice_list = pool.map(get_maxloc_slice, slice_pairs)
        max_indexes = maxloc_slice_list[np.argmax(
            [v[0] for v in maxloc_slice_list])][1:]
        seed = max_indexes
    seed = np.asarray(seed, dtype=np.uintp)
    return ks_sampling_mem_core(X, seed, n_result)
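# Hedged usage sketch for ks_sampling_mem above: select 100 Kennard-Stone
# samples from a random 10,000 x 8 dataset. The return value is whatever
# ks_sampling_mem_core (defined elsewhere in this module) produces for the
# chosen seed, typically the indices of the selected samples.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    X_demo = rng.standard_normal((10000, 8))
    selected = ks_sampling_mem(X_demo, n_result=100, n_proc=4, n_batch=1000)
    print(selected)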
def prop_exploit_scrobbles(fi):
    blocks = pd.read_pickle(fi)['block']
    cnts = pd.DataFrame({'n': blocks.value_counts().sort_index()})
    cnts['last-n'] = cnts['n'].shift(1)
    cnts['switch'] = cnts.apply(
        lambda row: 1 if ((row['last-n'] == 1) and (row['n'] > 1)) or
                         ((row['last-n'] > 1) and (row['n'] == 1)) else 0,
        axis=1)
    cnts['exp-idx'] = cnts['switch'].cumsum()
    result = cnts.groupby('exp-idx').apply(
        lambda grp: pd.Series({'n': len(grp), 'exploit': 0})
        if grp['n'].iloc[0] == 1
        else pd.Series({'n': grp['n'].sum(), 'exploit': 1}))
    exploit = result[result['exploit'] == 1]['n'].sum()
    explore = result[result['exploit'] == 0]['n'].sum()
    return (explore, exploit)


pool = Pool(cpu_count())

start = time.time()
result_patches = pool.map(prop_exploit_patches, files_patches)
# In Python 3, print() needs the full expression as its argument.
print((time.time() - start) / 60.)

start = time.time()
result_scrobbles = pool.map(prop_exploit_scrobbles, files_scrobbles)
print((time.time() - start) / 60.)

result_patches = np.vstack(result_patches)
result_scrobbles = np.vstack(result_scrobbles)

np.save('ee_count_patches.npy', result_patches)
np.save('ee_count_scrobbles.npy', result_scrobbles)
list_link = goodreads_link + list_name total_pages = get_last_page_num(list_link) p = 165 book_db_file = "goodreads_list_props.csv" #os.remove(book_ratings_file_name) if not os.path.exists(book_db_file): with open(book_db_file, 'w') as f: f.write( "book_name,author,rating,votes,description,book_type,no_of_pages,first_published,isbn13,genre,link\n" ) book_ratings_db = pd.read_csv(book_db_file, sep=",", quotechar="\"") for p in range(410, total_pages): page_id = '' if p == 0 else "?page=" + str(p + 1) current_link = list_link + page_id print(current_link) all_links = request_and_find_type(current_link, "a") all_books = list( set(search_for_text(all_links, "\"(/book/show/.*?)\""))) all_book_links = ["https://www.goodreads.com/" + x for x in all_books] # [process_book(x, book_ratings_db, book_db_file) for x in all_book_links] pool = Pool(4) list( pool.map(lambda x: process_book(x, book_ratings_db, book_db_file), all_book_links)) # list(map(lambda x: process_book(x, book_ratings_db, book_db_file), all_book_links)) current_book_link = search_string = "https://www.goodreads.com//book/show/20578795-meditation-as-a-way-of-life"