def multi_word_cut(self, sentences):
    """Tokenize every sentence in a worker pool and filter the tokens.

    For ``self.language == 'ch'`` each sentence is segmented with jieba;
    otherwise it is split on single spaces and lower-cased. Purely numeric
    tokens and tokens found in ``self.stop_words`` are dropped; the
    non-Chinese branch additionally drops tokens of length 1.

    Args:
        sentences: iterable of strings, one sentence per element.

    Returns:
        A list with one list of tokens per input sentence.
    """
    print('Multiprocessing Word cut ')
    if self.language == 'ch':
        # Initialize jieba once in the parent, or it will be initialized
        # again in each worker process.
        jieba.initialize()
        jieba.disable_parallel()

        def func(line):
            line = [i.strip() for i in jieba.cut(line, cut_all=False)]
            return [
                i for i in line
                if ((not i.isdigit()) and (i not in self.stop_words))
            ]
    else:

        def func(line):
            # NOTE(review): the stop-word test runs on the token *before* it
            # is lower-cased -- confirm that stop_words matches that casing.
            return [
                i.lower() for i in line.split(" ")
                if ((not i.isdigit()) and (i not in self.stop_words)
                    and (len(i) > 1))
            ]

    pool = Pool(nodes=5)
    t0 = time.time()
    try:
        word_cut = pool.map(func, sentences)
    finally:
        # Always release the workers, even when a worker raised; otherwise
        # the pool is left behind in a half-used state.
        pool.close()
        pool.join()
        pool.clear()
    print('MultiProcess time {:.0f}'.format(time.time() - t0))
    return word_cut
def GroupByParallelProcess(tweetsDF, cores, groupMethod):
    """Group by and aggregate on time via a parallel process.

    The frame is indexed by its integer ``label_date`` column, split with
    ``GetListOfSplitDFs`` and each piece is aggregated in a worker process.

    Note that ``label_date`` is cast to ``int`` on the frame that was passed
    in, so the caller's DataFrame is modified.

    Args:
        tweetsDF: DataFrame with a ``label_date`` column.
        cores: number of splits and number of worker processes.
        groupMethod: ``"weighted-average"``, ``"sum"`` or ``"mean"``.

    Returns:
        The list of per-split results, or ``None`` (after logging an error)
        when ``groupMethod`` is not one of the supported values.
    """
    tweetsDF.label_date = tweetsDF.label_date.astype(int)
    tweetsDF = tweetsDF.set_index("label_date")

    df_split = GetListOfSplitDFs(tweetsDF, cores)

    logging.info("Starting the grouping and aggregating process.")
    if groupMethod == "weighted-average":
        worker = PerformGroupbyAndAggregate
    elif groupMethod == "sum":
        worker = PerformSum
    elif groupMethod == "mean":
        worker = PerformMean
    else:
        # Validate before any worker process exists, so the early return
        # cannot leak a pool.
        logging.error("Choose correct group by method.")
        return None

    # Create the multiprocessing pool and map the worker over the splits.
    pool = Pool(cores)
    try:
        df_out = pool.map(worker, df_split)
    finally:
        # Close down the pool and join, also when a worker raised.
        pool.close()
        pool.join()
        pool.clear()
    logging.info("Ended the grouping and aggregating process.")
    return df_out
def Pool(cpus=cpu_count()) -> ProcessingPool:
    """Context manager for pathos ProcessingPool.

    Creates a pool with ``cpus`` processes, yields it, and closes, joins and
    clears it afterwards. The teardown runs in a ``finally`` so it also
    happens when the code using the pool raises.

    NOTE(review): this is a generator; it is presumably wrapped with
    ``contextlib.contextmanager`` where it is defined -- confirm.

    Args:
        cpus: number of worker processes; the default is evaluated once,
            when the function is defined.
    """
    # Creates a pool with processes
    p = ProcessingPool(cpus)
    try:
        yield p
    finally:
        # Need to clear due to:
        # https://github.com/uqfoundation/pathos/issues/111
        p.close()
        p.join()
        p.clear()
def parallelize_dataframe(df, func, num_partitions=num_cores, num_cores=num_cores):
    """Apply ``func`` to row-wise chunks of ``df`` in parallel.

    Args:
        df: DataFrame to process.
        func: callable applied to each chunk; its results are passed to
            ``pd.concat``.
        num_partitions: number of row-wise chunks to split ``df`` into.
        num_cores: number of worker processes.

    Returns:
        The concatenation of ``func``'s results, in chunk order.
    """
    df_split = np.array_split(df, num_partitions, axis=0)
    pool = Pool(num_cores)
    try:
        df = pd.concat(pool.map(func, df_split))
    finally:
        # Release the workers even if ``func`` or the concat fails.
        pool.close()
        pool.join()
        pool.clear()
    return df
def Pool(threads: int, multiplier: int, name: str):
    """Context manager for pathos ProcessingPool.

    Yields a pool and closes, joins and clears it afterwards. The teardown
    runs in a ``finally`` so it also happens when the code using the pool
    raises.

    NOTE(review): this is a generator; it is presumably wrapped with
    ``contextlib.contextmanager`` where it is defined -- confirm.

    Args:
        threads: number of worker processes; when falsy,
            ``cpu_count() * multiplier`` is used instead.
        multiplier: factor applied to ``cpu_count()`` when ``threads`` is
            falsy.
        name: label used only in the debug log message.
    """
    # Creates a pool with threads else cpu_count * multiplier
    p = ProcessingPool(threads if threads else cpu_count() * multiplier)
    logging.debug(f"Created {name} pool")
    try:
        yield p
    finally:
        # Need to clear due to:
        # https://github.com/uqfoundation/pathos/issues/111
        p.close()
        p.join()
        p.clear()
def parallel_apply(self, df, func):
    """Apply ``func`` to chunks of ``df`` in a process pool.

    The chunks are stored on ``self.df_split``; ``self.partitions`` and
    ``self.cores`` control the number of chunks and workers.

    Args:
        df: DataFrame to process.
        func: callable applied to each chunk; its results are concatenated
            along axis 0.

    Returns:
        A copy of the concatenated result, or ``None`` when
        ``self.preprocessing_checks(df, func)`` is falsy.
    """
    # TODO: add try statement re function not returning a DataFrame
    # NOTE(review): the original layout made it unclear whether a failed
    # check returns None or the untouched input; None is assumed here.
    if not self.preprocessing_checks(df, func):
        return None

    # split DataFrame into a list of smaller DataFrames
    self.df_split = np.array_split(df, self.partitions, axis=0)
    # create the multiprocessing pool
    pool = Pool(self.cores)
    try:
        # process the DataFrame by mapping function to each df across the pool
        df = pd.concat(pool.map(func, self.df_split), axis=0).copy()
    finally:
        # close down the pool and join, also when ``func`` raised
        pool.close()
        pool.join()
        pool.clear()
    return df
def make_query(self, size=1):
    """Pick the unlabeled samples with the lowest expected uncertainty.

    The model is trained on ``self.dataset``; for every unlabeled sample
    ``self._eer`` is evaluated in a process pool, the per-label results are
    weighted by the predicted label probabilities, and the ``size`` entries
    with the smallest weighted value are returned.

    Args:
        size: number of sample ids to return.

    Returns:
        Integer array of sample ids. When exactly one unlabeled sample is
        left, its id array is returned directly without training.
    """
    # Quit early if only one unlabeled sample is left.
    if self.dataset.len_unlabeled() == 1:
        return self.dataset.get_unlabeled_entries()[0].astype(int)

    # Set the possible labels
    self.possible_labels = list(set(self.dataset.get_labeled_entries()[1]))

    # Train the model
    self.model.train(self.dataset)

    # Get probabilities; pred.shape = (n_unlabeled, nr_of_labels)
    X_ids, X = self.dataset.get_unlabeled_entries()
    pred = self.model.predict_proba(X)

    # Setup pool for cpu parallelisation
    p = Pool(cpu_count(), maxtasksperchild=1000)
    try:
        # Get uncertainty after adding every sample with every label;
        # total.shape = (n_unlabeled, nr_of_labels)
        total = np.asarray(
            p.map(self._eer, X_ids,
                  len(X) * [self.dataset],
                  len(X) * [self.depth]))
    finally:
        # Close the pool again, also when a worker raised.
        p.close()
        p.join()
        p.clear()

    # Total uncertainty of one sample after adding a label, weighted by the
    # label's probability; total.shape = (n_unlabeled,)
    # NOTE(review): np.inner builds the full n x n matrix just to read its
    # diagonal; a row-wise sum of products would avoid that, but may differ
    # in the last floating-point bits, so it is left unchanged.
    total = np.inner(
        pred,
        total,
    ).diagonal()

    # Pair ids with scores and sort ascending by score.
    total = zipit(X_ids, total)
    results = sort_by_2nd(total, 'min')

    return results[:size, 0].astype(int)
def start(self, text_data_dir, res_dir, nprocs=8):
    '''
    Entry function: queue every matching archive for processing.

    Walks ``text_data_dir`` and selects files whose name contains "gz" but
    neither "md5" nor "copy". Each selected file is paired with an output
    path ``<res_dir>/<name up to the first dot>.csv.gz`` and handed to
    ``self.process_data`` through an asynchronous pool map. The async
    result object is stored on ``self.affildicts``.

    text_data_dir: folder of raw data
    res_dir: folder of output
    nprocs: number of cores in parallel
    '''
    p = PathosPool(nprocs)

    filepathsvec = []
    filenamesvec = []
    respaths = []
    for dirpath, _, filenames in os.walk(text_data_dir):
        for filename in filenames:
            is_candidate = (("gz" in filename) and ('md5' not in filename)
                            and ('copy' not in filename))
            if not is_candidate:
                continue
            filepath = os.path.join(dirpath, filename)
            print(filepath)
            res_name = filename.split(".")[0] + ".csv.gz"
            # Existing outputs are not skipped: every candidate is queued.
            filepathsvec.append(filepath)
            filenamesvec.append(filename)
            respaths.append(os.path.join(res_dir, res_name))

    worker = partial(self.process_data,
                     stop_paths=[
                         self.title_stop_path, self.affil_stop_path,
                         self.mesh_stop_path
                     ],
                     rm_stopwords=True,
                     affiliation_correction=True,
                     select_journals=self.select_journals)
    self.affildicts = p.amap(worker, filepathsvec, filenamesvec, respaths)

    p.close()
    p.join()  # Having an issue joining
    print("joined")
    p.clear()  # Delete the pool
def run(self,
        percent_attackers_list=[x / 100 for x in range(1, 92, 5)],
        managers=Manager.paper_managers,
        attackers=Attacker.paper_attackers,
        # Note that for range, last number is not included
        num_buckets=1,
        # Note that this is the users per bucket, not total users
        users_per_bucket=10,
        num_rounds=2,
        trials=2):
    """Runs every trial, in parallel unless debugging, and aggregates.

    One call to ``self.get_combo_data`` is made per trial. In debug mode or
    under pytest the trials run sequentially in this process and no worker
    pool is created; otherwise they are mapped over a process pool.

    Args:
        percent_attackers_list: attacker percentages passed to every trial
            (the default list is shared between calls and is not modified
            here).
        managers: managers passed to every trial.
        attackers: attackers passed to every trial.
        num_buckets: number of buckets.
        users_per_bucket: users per bucket, not total users.
        num_rounds: number of rounds per trial.
        trials: number of trials to run.

    Returns:
        Whatever ``self._aggregate_results`` returns for the trial results.
    """
    # One column per positional argument of get_combo_data, one row per
    # trial.
    full_args = [[percent_attackers_list] * trials,
                 [attackers] * trials,
                 [num_buckets] * trials,
                 [users_per_bucket] * trials,
                 [num_rounds] * trials,
                 [managers] * trials,
                 list(range(trials)),
                 [trials] * trials]

    # If we are debugging, no multiprocessing
    # https://stackoverflow.com/a/1987484/8903959
    # https://stackoverflow.com/a/58866220/8903959
    if self.debug or "PYTEST_CURRENT_TEST" in os.environ:
        results = []
        for trial_num in range(trials):
            args = [x[trial_num] for x in full_args]
            results.append(self.get_combo_data(*args))
    else:
        # Only spawn worker processes when they are actually used.
        p = ProcessingPool(nodes=cpu_count())
        try:
            # Doesn't make sense to do tqdm here since they finish all at once
            results = p.map(self.get_combo_data, *full_args)
        finally:
            p.close()
            p.join()
            p.clear()

    # Get rid of carriage returns
    print()
    return self._aggregate_results(results,
                                   managers,
                                   attackers,
                                   percent_attackers_list)
class PPool:
    """pathos multi-processing pool

    Thin wrapper that owns one pool, can be used as a context manager, and
    offers helpers for chunking parameters and mapping a function over them.
    """

    def __init__(self, processor_num: int = None, ):
        """Create the pool.

        Args:
            processor_num: requested worker count; capped at ``cpu_count()``.
                ``None`` means use ``cpu_count()``.
        """
        self.processor_num = cpu_count() if processor_num is None \
            else min(processor_num, cpu_count())
        LOGGER.debug('Building Pathos multi-processing pool with {} cores.'.format(self.processor_num))
        self._pool = Pool(self.processor_num)

    def flatten_params(self, params: List):
        """Split ``params`` into at most ``processor_num`` contiguous blocks.

        Blocks have equal size (ceiling division), except possibly the last.

        Args:
            params: List[*args, **kwargs]

        Returns:
            A list of slices of ``params``; an empty list for empty input.
        """
        if len(params) == 0:
            # Without this guard block_size is 0 and the block count
            # computation divides by zero.
            return []
        block_size = (len(params) + self.processor_num - 1) // self.processor_num
        block_num = (len(params) + block_size - 1) // block_size
        block_params = [params[i * block_size:(i + 1) * block_size] for i in range(block_num)]
        return block_params

    def close(self):
        """Close, join and clear the underlying pool."""
        self._pool.close()
        self._pool.join()
        self._pool.clear()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def feed(self, func: Any, params: List, one_params: bool = False) -> List[Any]:
        """Map ``func`` over ``params`` in the pool and wait for the results.

        Args:
            func: callable executed in the workers.
            params: with ``one_params=True`` a list of single arguments;
                otherwise a list of argument tuples, which is transposed so
                every tuple becomes one call ``func(*tuple)``.
            one_params: whether each element of ``params`` is one argument.

        Returns:
            The list of results, in input order.
        """
        if one_params:
            result = self._pool.amap(func, params).get()
        else:
            if len(params) == 0:
                # Transposing nothing yields no argument columns, and a map
                # without iterables is an error; there is simply no work.
                return []
            params = tuple(zip(*params))
            result = self._pool.amap(func, *params).get()
        return result
def run(self,
        ddos_sim_cls_list=None,
        managers=Manager.runnable_managers,
        attackers=Attacker.runnable_attackers,
        # Note that for range, last number is not included
        num_buckets_list=[1],
        # Note that this is the users per bucket, not total users
        users_per_bucket_list=[10 ** i for i in range(4, 6)],
        num_rounds_list=[10 ** i for i in range(3, 5)],
        trials=10):
    """Runs every possible scenario, in parallel unless debugging.

    One scenario is built for every combination of bucket count, users per
    bucket, round count, attacker (plus ``Worst_Case_Attacker``) and
    simulator class; ``self.get_graph_data`` is called once per scenario.
    At DEBUG stream level or under pytest the scenarios run sequentially in
    this process and no worker pool is created.

    Args:
        ddos_sim_cls_list: simulator classes; ``None`` selects the first
            entry of ``DDOS_Simulator.runnable_simulators``.
        managers: managers passed to every scenario.
        attackers: attackers to iterate over.
        num_buckets_list: bucket counts to iterate over.
        users_per_bucket_list: users per bucket (not total users).
        num_rounds_list: round counts to iterate over.
        trials: number of trials passed to every scenario.
    """
    if ddos_sim_cls_list is None:
        ddos_sim_cls_list =\
            [ddos_simulator.DDOS_Simulator.runnable_simulators[0]]

    # Initializes graph path
    self.make_graph_dir(destroy=True)

    # Total number of scenarios
    pbar_total = (len(ddos_sim_cls_list) *
                  len(num_buckets_list) *
                  len(users_per_bucket_list) *
                  len(num_rounds_list) *
                  (len(attackers) + 1))  # Add 1 to attacker for worst case

    # Parallel argument columns: one entry per scenario.
    _pathos_simulators_list = []
    _pathos_num_buckets_list = []
    _pathos_users_per_bucket = []
    _pathos_num_rounds = []
    for num_buckets in num_buckets_list:
        for users_per_bucket in users_per_bucket_list:
            for num_rounds in num_rounds_list:
                for attacker in attackers + [Worst_Case_Attacker]:
                    for sim_cls in ddos_sim_cls_list:
                        self.get_attacker_graph_dir(attacker)

                        _pathos_simulators_list.append(sim_cls)
                        _pathos_num_buckets_list.append(num_buckets)
                        _pathos_users_per_bucket.append(users_per_bucket)
                        _pathos_num_rounds.append(num_rounds)

    total = len(_pathos_num_rounds)
    full_args = [_pathos_simulators_list,
                 [attackers] * total,
                 _pathos_num_buckets_list,
                 _pathos_users_per_bucket,
                 _pathos_num_rounds,
                 [managers] * total,
                 [trials] * total,
                 list(range(total)),
                 list([pbar_total] * total)]

    # If we are debugging, no multiprocessing
    # https://stackoverflow.com/a/1987484/8903959
    if (self.stream_level == Log_Levels.DEBUG
        # https://stackoverflow.com/a/58866220/8903959
            or "PYTEST_CURRENT_TEST" in os.environ):

        for i in range(total):
            # Built outside the try so it is always bound for the report.
            current_args = [x[i] for x in full_args]
            try:
                self.get_graph_data(*current_args)
            except Exception:
                # Show which scenario failed, then re-raise with the
                # original traceback intact.
                from pprint import pprint
                pprint(current_args)
                raise
    else:
        # Only spawn worker processes when they are actually used.
        p = ProcessingPool(nodes=cpu_count())
        try:
            p.map(self.get_graph_data, *full_args)
        finally:
            p.close()
            p.join()
            p.clear()

    # Get rid of carriage returns
    print()
def create_sigmats_3_scales(dataset,
                            no_sensors_cols,
                            win_size_ls,
                            normalize_each_seq=False,
                            warm_up_time_points=''):
    """Build multi-scale signature matrices for every iteration in parallel.

    For each distinct value of ``dataset.iter`` and each ``update_step`` in
    it, one signature matrix (``seq_to_sig_matrix``) is computed per window
    size from the rows whose ``update_step`` lies in
    ``(update_step - win_size, update_step]``. The window sizes end up on the
    last axis of the per-iteration array (they act as channels).

    Args:
        dataset: DataFrame with at least the columns ``iter``,
            ``update_step``, ``drone`` and ``label``.
        no_sensors_cols (list): columns that do not represent sensors; they
            are dropped together with ``label`` before the transform.
        win_size_ls (list): window sizes to produce.
        normalize_each_seq: standard-scale every window before the transform.
        warm_up_time_points: not used by this function.

    Returns:
        Three lists with one entry per iteration: the signature-matrix
        arrays, the label arrays, and the key frames
        (``drone``, ``update_step``, ``iter``) for later identification.
    """
    iter_ls = dataset.iter.unique()

    def create_sigmats_of_one_iter(dataset, iteri):
        # get current iter
        dataset_iteri = dataset.loc[dataset['iter'] == iteri, :]
        # get list of update steps
        update_step_ls = dataset_iteri.update_step.to_list()
        step_sig_mat_ls = []
        for update_step in update_step_ls:
            win_sig_mat_ls = []
            for win_size in win_size_ls:
                # cut the df by current update step - win size
                current_seq = dataset_iteri.loc[
                    (dataset_iteri['update_step'] <= update_step)
                    & (dataset_iteri['update_step'] >
                       (update_step - win_size))]
                # drop irrelevant cols and convert to numpy; ``axis`` must be
                # passed by keyword, pandas 2.x rejects it positionally
                current_seq = current_seq.drop(no_sensors_cols + ['label'],
                                               axis=1).to_numpy()
                if normalize_each_seq:
                    current_seq = StandardScaler().fit_transform(current_seq)
                # convert to sig mat
                current_seq_sig_mat = seq_to_sig_matrix(current_seq)
                # one element per window size
                win_sig_mat_ls.append(current_seq_sig_mat)
            step_sig_mat_ls.append(win_sig_mat_ls)
        # (steps, windows, ...) -> move the window axis to the end
        iter_sig_mat_np = np.rollaxis(np.array(step_sig_mat_ls), 1, 4)
        return {
            'sig_mat': iter_sig_mat_np,
            'keys': dataset_iteri[['drone', 'update_step', 'iter']],
            'labels': dataset_iteri.label.to_numpy()
        }

    workers = multiprocessing.cpu_count()
    print('Number of workers: ', workers)
    pool = ProcessingPool(workers)
    try:
        list_of_iters_dict = pool.map(
            lambda iteri: create_sigmats_of_one_iter(dataset, iteri), iter_ls)
    finally:
        # Release the workers even when one iteration failed.
        pool.close()
        pool.join()
        pool.terminate()
        pool.clear()

    iters_sig_mat_ls = [
        iter_dict['sig_mat'] for iter_dict in list_of_iters_dict
    ]
    iters_lables_ls = [iter_dict['labels'] for iter_dict in list_of_iters_dict]
    iters_keys_ls = [iter_dict['keys'] for iter_dict in list_of_iters_dict]
    print(
        'shape of first iter X {} shape of first iter labels {} shape keys {}'.
        format(iters_sig_mat_ls[0].shape, iters_lables_ls[0].shape,
               iters_keys_ls[0].shape))
    return iters_sig_mat_ls, iters_lables_ls, iters_keys_ls
def run(self):
    """
    Main method of the Word2Vec class.

    Builds the one-hot vectors for the corpus in a process pool, initializes
    the weights and trains for up to ``self.n_epochs`` epochs, visiting the
    students in a fresh random order every epoch. Training stops early when
    the epoch loss is higher than the loss ``self.early_stop`` epochs before.

    :return: the final values of the weights W1, W2 and a history of the
        value of the loss function vs. epoch; on early stopping the weights
        of the lowest-loss epoch are returned instead
    :raises ValueError: if the corpus is empty
    """
    if len(self.corpus) == 0:
        raise ValueError('You need to specify a corpus of text.')

    print("Creating one-hot student answer vectors")
    cores = mp.cpu_count()
    stu_dict = {}
    df_split = np.array_split(self.corpus, cores, axis=0)
    # create the multiprocessing pool
    pool = Pool(cores)
    try:
        # process the DataFrame by mapping function to each df across the pool
        df_out = np.vstack(pool.map(self.onehotvecs, df_split))
    finally:
        # close down the pool and join, also when a worker raised
        pool.close()
        pool.join()
        pool.clear()

    print("Creating student answer dictionary")
    # Flatten the per-split results into one dict keyed by running row index.
    row_count = 0
    for i in range(0, cores):
        for j in range(0, len(df_out[i][0])):
            stu_dict[row_count] = df_out[i][0][j]
            row_count += 1

    # initialize weight matrices
    print("Initializing weights")
    V = stu_dict[0].shape[1]
    W1, W2 = initialize(V, self.N)
    # Make sure the early-stopping return can never hit unbound names.
    W1_best, W2_best = W1, W2

    loss_vs_epoch = []
    loss_low = np.inf
    print("Begining training")
    for e in trange((self.n_epochs), desc='Epochs'):
        loss = 0.0
        # shuffle data without replacement
        rand_student_order = np.random.choice(self.corpus.shape[0],
                                              self.corpus.shape[0],
                                              replace=False)
        for i in tqdm(rand_student_order, desc='Students', leave=False):
            for center, context in self.trainTargetDF(stu_dict[i]):
                W1, W2, loss = self.method(context, center, W1, W2, loss)
        loss_vs_epoch.append(loss)

        if loss < loss_low:
            loss_low = loss
            # NOTE(review): this keeps references, not copies -- if
            # self.method updates the arrays in place the "best" weights
            # follow along; confirm.
            W1_best, W2_best = W1, W2

        # Early stopping and returning best result
        if loss > loss_vs_epoch[max(0, e - self.early_stop)]:
            print("Training complete. Loss now increasing.")
            return W1_best, W2_best, loss_vs_epoch

    print("Training complete.")
    return W1, W2, loss_vs_epoch
def build_full_hamiltonian_parallel2(clustered_ham_in,
                                     ci_vector_in,
                                     iprint=1,
                                     nproc=None,
                                     opt_einsum=True,
                                     thresh=1e-14):
    """
    Build hamiltonian in basis in ci_vector

    parallelized over matrix elements: every worker computes one row of the
    upper triangle (columns ``>=`` the row index); the lower triangle is
    filled by symmetry.

    The inputs are published through the module-level globals ``clusters``,
    ``ci_vector`` and ``clustered_ham``, which the worker function reads.

    Args:
        clustered_ham_in: object providing ``clusters`` and ``terms``
            (indexed by the per-cluster fock-space difference).
        ci_vector_in: basis; iterated as ``(fock, conf, coeff)`` and by fock
            block.
        iprint: not used by this function.
        nproc: number of worker processes; ``None`` uses the pool default.
        opt_einsum: not used by this function.
        thresh: not used by this function (the threshold test is disabled).

    Returns:
        Dense symmetric matrix of shape ``(len(ci_vector), len(ci_vector))``.
    """
    # {{{
    global clusters
    global ci_vector
    global clustered_ham

    print(" In build_full_hamiltonian_parallel2. nproc=", nproc)

    clustered_ham = clustered_ham_in
    clusters = clustered_ham_in.clusters
    ci_vector = ci_vector_in

    H = np.zeros((len(ci_vector), len(ci_vector)))

    def do_parallel_work(v_curr):
        """Return ``[(column index, matrix element), ...]`` for one row."""
        fock_l = v_curr[0]
        conf_l = v_curr[1]
        idx_l = v_curr[2]

        out = []

        # Running column index across all fock blocks.
        idx_r = -1
        for fock_r in ci_vector.fblocks():
            confs_r = ci_vector[fock_r]
            delta_fock = tuple([(fock_l[ci][0] - fock_r[ci][0],
                                 fock_l[ci][1] - fock_r[ci][1])
                                for ci in range(len(clusters))])
            try:
                terms = clustered_ham.terms[delta_fock]
            except KeyError:
                # No coupling between these blocks: skip the whole block but
                # keep the column counter in step.
                idx_r += len(confs_r)
                continue

            for conf_r in confs_r:
                idx_r += 1

                # Upper triangle only.
                if idx_l > idx_r:
                    continue

                me = 0
                for term in terms:
                    me += term.matrix_element(fock_l, conf_l, fock_r, conf_r)
                #if abs(me) > thresh:
                out.append((idx_r, me))

        return out

    # One work item per basis configuration: (fock, conf, row index).
    rows = [(fock1, conf1, idx_row)
            for idx_row, (fock1, conf1, coeff1) in enumerate(ci_vector)]

    from pathos.multiprocessing import ProcessingPool as Pool

    if nproc is None:
        pool = Pool()
    else:
        pool = Pool(processes=nproc)

    try:
        Hrows = pool.map(do_parallel_work, rows)
    finally:
        # Release the workers even when a matrix element evaluation failed.
        pool.close()
        pool.join()
        pool.clear()

    for row_idx, row in enumerate(Hrows):
        for col_idx, term in row:
            assert (col_idx >= row_idx)
            H[row_idx, col_idx] = term
            H[col_idx, row_idx] = term

    return H
def compute_tr_cds_relative_coordinates( self ):
    """Compute and store the relative CDS start/stop of the transcripts.

    Transcripts that have both CDS start and stop positions are read from
    the database (only those still missing a relative coordinate unless
    ``self.force_overwrite`` is set), split into chunks of at most
    ``Constants.MAX_ENTRIES_PER_DATAFRAME`` rows and handed to
    ``self.compute_relative_coord_r`` in a process pool. The CSV files
    written by that step are then read back and the relative positions are
    saved on the matching ``Transcript`` entries.
    """

    Logger.get_instance().info( 'Starting the computation of relative CDS transcript start and stop' +
                                ' coordinates (registered in the Transcript table).')

    # Get all the transcript for which there are CDS
    # start and stop positions provided
    # NB: Query is performed using raw SQL statement for better efficiency
    transcript_info_sql_statement = 'SELECT Transcript.id, Transcript.transcript_id AS tr_id, \
                                     Transcript.gene_id, PROGene.chromosome, \
                                     Transcript.cds_start_pos AS start_pos, \
                                     Transcript.cds_stop_pos AS end_pos \
                                     FROM Transcript \
                                     INNER JOIN PROGene ON PROGene.gene_id = Transcript.gene_id \
                                     WHERE ( Transcript.cds_start_pos IS NOT NULL ) \
                                     AND ( Transcript.cds_stop_pos IS NOT NULL )'

    if ( not self.force_overwrite ):
        transcript_info_sql_statement += ' AND ( ( Transcript.rel_cds_start_pos IS NULL ) \
                                           OR ( Transcript.rel_cds_stop_pos IS NULL ) )'

    transcript_info_df = pd.read_sql( transcript_info_sql_statement, SQLManagerPRO.get_instance().get_engine() )
    SQLManagerPRO.get_instance().close_session()

    Logger.get_instance().debug( 'ComputeRelCoordStrategy.compute_tr_cds_relative_coordinates(): ' +
                                 str( transcript_info_df.shape[0] ) + ' Transcript entries are' +
                                 ' expected to be processed.')

    # As the conversion of coordinates in R may be highly time-consuming,
    # split the data frame into small data frames and multi-process the
    # computation.
    # Split the data frame into smaller data frames that can be processed
    # independently from each other
    subset_data_frames = [ transcript_info_df[ min_bound : min_bound + Constants.MAX_ENTRIES_PER_DATAFRAME ] \
                           for min_bound in xrange( 0, transcript_info_df.shape[ 0 ], Constants.MAX_ENTRIES_PER_DATAFRAME ) ]

    # Instantiate the list of tuple-embedded arguments necessary to
    # compute the relative coordinates; every chunk gets its own numeric
    # file suffix.
    args_to_run_r = []
    filename_prefix = self.TRANSCRIPT_CSV_FILE_PREFIX
    filename_suffix = 0
    for df in subset_data_frames:
        args_to_run_r.append( ( df,
                                self.species,
                                self.ensembl_release_version,
                                filename_prefix,
                                filename_suffix ) )
        filename_suffix += 1

    # Instantiate the pool of processes
    p = Pool( self.thread_nb )
    try:
        messages_to_log = p.map( self.compute_relative_coord_r, args_to_run_r )
    finally:
        # Wait for all processes to be completed and delete the pool
        # instance right away, so that a failure in any later step cannot
        # leave it behind.
        p.close()
        p.join()
        p.clear()

    # Log the messages generated by the processes
    for messages in messages_to_log:

        ( debug_messages_to_log, stdout, stderr ) = messages

        for message in debug_messages_to_log:
            Logger.get_instance().debug( message )

        if ( stdout != '' ):
            Logger.get_instance().debug( 'ComputeRelCoordStrategy.compute_relative_coord_r():' +
                                         ' The R script returned the following standard output: \n' +
                                         stdout )

        # NB: As the R function is susceptible to write not error-related
        #     messages in stderr, these messages are also logged at the
        #     debug level
        if ( stderr != '' ):
            Logger.get_instance().debug( 'ComputeRelCoordStrategy.compute_relative_coord_r():' +
                                         ' The R script returned the following error output: \n' +
                                         stderr )

    # Sequentially open CSV files to get the relative positions.
    # Instantiate a dictionary that associates to the entry ID
    # the relative start and stop positions.
    rel_positions_dict = {}
    for file_nb in range( filename_suffix ):

        df = pd.read_csv( os.path.join( ComputeRelCoordStrategy.RELATIVE_COORD_CSV_FOLDER,
                                        filename_prefix + str( file_nb ) + '.csv' ),
                          sep = ',',
                          encoding = 'utf-8' )

        for ( index, row ) in df.iterrows():
            rel_positions_dict[ row[ 'id' ] ] = ( row[ 'rel_start_pos' ], row[ 'rel_end_pos' ] )

    # Add the relative start and stop positions for all the entries
    all_transcripts = SQLManagerPRO.get_instance().get_session().query(
                                                                        Transcript
                                                                      ).filter(
                                                                                Transcript.id.in_( rel_positions_dict.keys() )
                                                                              ).all()
    for transcript in all_transcripts:

        # Get the start and stop positions
        positions = rel_positions_dict.get( transcript.id )
        rel_cds_start_pos = positions[ 0 ]
        rel_cds_stop_pos = positions[ 1 ]

        # Missing values are left untouched on the entry
        if not pd.isna( rel_cds_start_pos ):
            transcript.rel_cds_start_pos = int( rel_cds_start_pos )

        if not pd.isna( rel_cds_stop_pos ):
            transcript.rel_cds_stop_pos = int( rel_cds_stop_pos )

    # Commit the updates and close the session
    SQLManagerPRO.get_instance().commit()
    SQLManagerPRO.get_instance().close_session()
class tiledRasterReader:
    '''
    Reader for a GDAL raster that can return the whole raster, successive
    tiles (optionally with an overlap margin), or small windows around map
    coordinates. Band reads can be spread over a process pool, one band per
    worker.
    '''

    def __init__(self, srcRasterfile, xoff=0, yoff=0, xsize=None, ysize=None):
        '''
        Open the raster and cache its metadata.

        srcRasterfile: path of the raster (a name containing '.vrt' is
            treated as a VRT mosaic)
        xoff, yoff: initial tile offsets
        xsize, ysize: tile size; None selects the block size reported by
            GDAL. The process exits with status 1 when a size exceeds the
            raster dimensions.
        '''
        self.srcRasterfile = srcRasterfile
        gdal.SetCacheMax(2**30)  # 1 GB
        self.ds = gdal.Open(self.srcRasterfile, gdalconst.GA_ReadOnly)

        if '.vrt' in self.srcRasterfile:
            # First entry is the VRT itself; the rest are its source files.
            self.fileList = self.ds.GetFileList()[1:]
        else:
            # No component files to inspect for a plain raster.
            self.fileList = []

        # One measurement-level code per component file, chosen from the
        # file name.
        self.measurement_level_ints = []
        for fn in self.fileList:
            # default level of measurement
            msrlevel = conf.MSR_LEVEL_RATIO
            for keyword in conf.NOMINAL_KEYWORD_IN_FN:
                if keyword in fn:
                    msrlevel = conf.MSR_LEVEL_NOMINAL
                    break
            for key in conf.MSR_LEVELS:
                if conf.MSR_LEVELS[key] == msrlevel:
                    self.measurement_level_ints.append(int(key))
                    break
        self.measurement_level_ints = np.array(self.measurement_level_ints)

        self.nbands = self.ds.RasterCount
        self.nrows = self.ds.RasterYSize
        self.ncols = self.ds.RasterXSize
        self.geotransform = self.ds.GetGeoTransform()
        self.projection = self.ds.GetProjection()
        print('%s:\n\t%d rows %d columns' %
              (self.srcRasterfile, self.nrows, self.ncols))

        band = self.ds.GetRasterBand(1)
        self.nodata = band.GetNoDataValue()

        # each band may have a different nodata value
        nodatas = []
        for b in range(1, self.nbands + 1):
            nodatas.append(self.ds.GetRasterBand(b).GetNoDataValue())
        self.nodatas = np.array(nodatas)

        # NOTE(review): GDAL documents GetBlockSize() as [xsize, ysize], so
        # index 0 for the y base (and index 1 for the non-VRT x base) looks
        # swapped -- confirm before changing, tile sizes depend on it.
        self.block_ysize_base = band.GetBlockSize()[0]
        if '.vrt' in self.srcRasterfile:
            self.block_xsize_base = gdal.Open(
                self.fileList[0],
                gdalconst.GA_ReadOnly).GetRasterBand(1).GetBlockSize()[0]
        else:
            self.block_xsize_base = band.GetBlockSize()[1]

        self.__N_TilesRead = 0
        self.xoff, self.yoff = xoff, yoff

        if xsize is None:
            self.xsize = self.block_xsize_base
        elif xsize > self.ncols:
            print('tile xsize exceeds RasterXsize %d' % self.ncols)
            sys.exit(1)
        else:
            self.xsize = xsize

        if ysize is None:
            self.ysize = self.block_ysize_base
        elif ysize > self.nrows:
            print('tile ysize exceeds RasterYsize %d' % self.nrows)
            sys.exit(1)
        else:
            self.ysize = ysize

        # estimated data size (in MB)
        self.estimate_TotalSize_MB = self.estimateTileSize_MB(
            self.nrows, self.ncols)
        self.estimate_TileSize_MB = self.estimateTileSize_MB(
            self.xsize, self.ysize)

        # min, max, mean, stddev per band
        self.statistics = np.zeros((self.nbands, 4))
        for i in range(self.nbands):
            self.statistics[i] = self.ds.GetRasterBand(i + 1).GetStatistics(
                0, 1)

        # Created lazily by the first multi-process read.
        self.MP_pool = None

    def estimateTileSize_MB(self, xsize=None, ysize=None):
        '''
        Estimated size in MB of a float32 tile of xsize * ysize cells over
        all bands; the sizes default to the current tile size.
        '''
        if xsize is None:
            xsize = self.xsize
        if ysize is None:
            ysize = self.ysize
        return np.array([
            1.0
        ]).astype('float32').nbytes / 1024.0**2 * xsize * ysize * self.nbands

    def _overlap_window(self, xsize, ysize, overlap):
        ''' Read window [xoff, yoff, xsize, ysize] of the current tile grown
        by `overlap` cells. '''
        _xoff = max(0, self.xoff - overlap)
        _yoff = max(0, self.yoff - overlap)
        # NOTE(review): the width/height are clamped against the un-grown
        # offsets (self.xoff / self.yoff), not the grown ones -- kept as is,
        # confirm the intended behavior at the right/bottom edges.
        if _xoff == 0:
            _xsize = min(xsize + overlap, self.ncols - self.xoff)
        else:
            _xsize = min(xsize + 2 * overlap, self.ncols - self.xoff)
        if _yoff == 0:
            _ysize = min(ysize + overlap, self.nrows - self.yoff)
        else:
            _ysize = min(ysize + 2 * overlap, self.nrows - self.yoff)
        return [_xoff, _yoff, _xsize, _ysize]

    def readWholeRaster(self, multithread=conf.MULTITHREAD_READ):
        '''
        Read the full raster; with `multithread` the bands are read in
        separate worker processes and stacked along axis 0.
        '''
        data = None
        if multithread:

            def threadReadingByBand(i, rasterfile):
                ''' each worker reads a whole band
                using multiprocess pool '''
                # Imported here because the function runs in a worker.
                import gdal, gdalconst, psutil, conf
                import numpy as np
                ds = gdal.Open(rasterfile, gdalconst.GA_ReadOnly)
                data = ds.GetRasterBand(i).ReadAsArray()
                return data

            # one worker per band
            n_threads = self.nbands
            if self.MP_pool is None:
                self.MP_pool = Pool(n_threads)

            band_idx = range(1, n_threads + 1)
            fns = np.array([self.srcRasterfile]).repeat(n_threads)
            data = self.MP_pool.map(threadReadingByBand, band_idx, fns)
            data = np.stack(data, axis=0)
            self.MP_pool.clear()
        else:
            data = self.ds.ReadAsArray(xoff=0, yoff=0, xsize=None, ysize=None)
        return data

    def readNextTile(self,
                     xsize=None,
                     ysize=None,
                     multithread=conf.MULTITHREAD_READ):
        '''
        Read the next tile in row-major order.

        Specify xsize, ysize ONLY ONCE (when reading the first tile); they
        replace the stored tile size.

        Returns (data, xoff, yoff, xsize, ysize); data is None and the sizes
        are 0 once the raster is exhausted.
        '''
        if xsize is not None:
            self.xsize = xsize
        if ysize is not None:
            self.ysize = ysize

        # Locate the tile from the running tile counter.
        N_BLOCK_X = int(math.ceil(self.ncols * 1.0 / self.xsize))
        y = int(self.__N_TilesRead / N_BLOCK_X)
        x = self.__N_TilesRead - y * N_BLOCK_X

        self.xoff = min(x * self.xsize, self.ncols)
        xsize = min(self.xsize, self.ncols - self.xoff)
        self.yoff = min(y * self.ysize, self.nrows)
        ysize = min(self.ysize, self.nrows - self.yoff)

        if self.xoff == self.ncols or self.yoff == self.nrows:
            return (None, self.xoff, self.yoff, 0, 0)

        data = None
        if multithread:

            def threadReadingByBand(i, param, rasterfile):
                ''' each worker reads a band, with tile dimension spec in param
                using multiprocess pool '''
                import gdal, gdalconst
                import numpy as np
                ds = gdal.Open(rasterfile, gdalconst.GA_ReadOnly)
                data = ds.GetRasterBand(i).ReadAsArray(xoff=param[0],
                                                       yoff=param[1],
                                                       win_xsize=param[2],
                                                       win_ysize=param[3])
                return data

            # one worker per band
            n_threads = self.nbands
            if self.MP_pool is None:
                self.MP_pool = Pool(n_threads)

            params = [[self.xoff, self.yoff, xsize, ysize]
                      for _ in range(n_threads)]
            fns = np.array([self.srcRasterfile]).repeat(n_threads)
            band_idx = range(1, self.nbands + 1)

            data = self.MP_pool.map(threadReadingByBand, band_idx, params,
                                    fns)
            data = np.stack(data, axis=0)
            self.MP_pool.clear()
        else:
            # single-process read
            data = self.ds.ReadAsArray(xoff=self.xoff,
                                       yoff=self.yoff,
                                       xsize=xsize,
                                       ysize=ysize)

        self.__N_TilesRead += 1
        return (data, self.xoff, self.yoff, xsize, ysize)

    def setNTilesRead(self, N):
        ''' Set the running tile counter (index of the next tile). '''
        self.__N_TilesRead = N

    def readNextTileOverlap(self,
                            xsize=None,
                            ysize=None,
                            overlap=2,
                            multithread=conf.MULTITHREAD_READ):
        '''
        Read the next tile in row-major order, grown by `overlap` cells.

        Specify xsize, ysize ONLY ONCE (when reading the first tile); they
        replace the stored tile size.

        Returns (data, xoff, yoff, xsize, ysize, _xoff, _yoff): the nominal
        tile offsets/sizes plus the offsets of the grown window that was
        actually read. Once the raster is exhausted data is None, the sizes
        are 0 and the window offsets are -1.
        '''
        if xsize is not None:
            self.xsize = xsize
        if ysize is not None:
            self.ysize = ysize

        # Locate the tile from the running tile counter.
        N_BLOCK_X = int(math.ceil(self.ncols * 1.0 / self.xsize))
        y = int(self.__N_TilesRead / N_BLOCK_X)
        x = self.__N_TilesRead - y * N_BLOCK_X

        self.xoff = min(x * self.xsize, self.ncols)
        xsize = min(self.xsize, self.ncols - self.xoff)
        self.yoff = min(y * self.ysize, self.nrows)
        ysize = min(self.ysize, self.nrows - self.yoff)

        if self.xoff == self.ncols or self.yoff == self.nrows:
            return (None, self.xoff, self.yoff, 0, 0, -1, -1)

        # The grown window is the same for every band.
        window = self._overlap_window(xsize, ysize, overlap)
        _xoff, _yoff, _xsize, _ysize = window

        data = None
        if multithread:

            def threadReadingByBand(i, param, rasterfile):
                ''' each worker reads a band, with tile dimension spec in param
                using multiprocess pool '''
                import gdal, gdalconst
                import numpy as np
                ds = gdal.Open(rasterfile, gdalconst.GA_ReadOnly)
                data = ds.GetRasterBand(i).ReadAsArray(xoff=param[0],
                                                       yoff=param[1],
                                                       win_xsize=param[2],
                                                       win_ysize=param[3])
                return data

            # one worker per band
            n_threads = self.nbands
            if self.MP_pool is None:
                self.MP_pool = Pool(n_threads)

            params = [list(window) for _ in range(n_threads)]
            fns = np.array([self.srcRasterfile]).repeat(n_threads)
            band_idx = range(1, self.nbands + 1)

            data = self.MP_pool.map(threadReadingByBand, band_idx, params,
                                    fns)
            data = np.stack(data, axis=0)
            self.MP_pool.clear()
        else:
            # single-process read
            data = self.ds.ReadAsArray(xoff=_xoff,
                                       yoff=_yoff,
                                       xsize=_xsize,
                                       ysize=_ysize)

        self.__N_TilesRead += 1
        return (data, self.xoff, self.yoff, xsize, ysize, _xoff, _yoff)

    def reset(self):
        ''' reset after reading tiles '''
        self.xoff, self.yoff = 0, 0
        self.__N_TilesRead = 0

    def extractByXY(self, x, y, xsize=1, ysize=1):
        ''' Extract raster value by x, y coordinates '''
        xoff = int((x - self.geotransform[0]) / self.geotransform[1])
        yoff = int((y - self.geotransform[3]) / self.geotransform[5])
        return self.ds.ReadAsArray(xoff, yoff, xsize, ysize)

    def extractByNbrhd(self, centerX, centerY, nbrXsize=1, nbrYsize=1):
        ''' Extract the nbrXsize x nbrYsize window centered on the cell
        that contains the coordinates (centerX, centerY) '''
        xoff = int((centerX - self.geotransform[0]) / self.geotransform[1])
        yoff = int((centerY - self.geotransform[3]) / self.geotransform[5])
        return self.ds.ReadAsArray(xoff - int(nbrXsize / 2),
                                   yoff - int(nbrYsize / 2), nbrXsize,
                                   nbrYsize)

    def extractByNbrhd_batch(self, centerXs, centerYs, nbrXsize=1, nbrYsize=1):
        ''' Extract one neighborhood window per coordinate pair, stacked
        along a new first axis; None when no coordinates are given.
        centerXs / centerYs are expected to be numpy arrays. '''
        xoffs = ((centerXs - self.geotransform[0]) /
                 self.geotransform[1]).astype(int) - int(nbrXsize / 2)
        yoffs = ((centerYs - self.geotransform[3]) /
                 self.geotransform[5]).astype(int) - int(nbrYsize / 2)

        # Collect first and concatenate once instead of growing the array
        # window by window.
        windows = [
            np.expand_dims(
                self.ds.ReadAsArray(xoff.item(), yoff.item(), nbrXsize,
                                    nbrYsize),
                axis=0) for xoff, yoff in zip(xoffs, yoffs)
        ]
        if not windows:
            return None
        return np.concatenate(windows, axis=0)

    def extractByRC(self, c, r, xsize=1, ysize=1):
        '''Extract raster value by row, col '''
        return self.ds.ReadAsArray(c, r, xsize, ysize)

    def close(self):
        ''' Drop the dataset reference and clear the worker pool, if any. '''
        self.ds = None
        if self.MP_pool is not None:
            self.MP_pool.clear()
class RPKI_Validator_Wrapper:
    """Installs, runs and queries a local RPKI validator.

    Used as a context manager: entering starts the validator in a
    background process pool, exiting terminates it and closes the
    served input file.
    """

    __slots__ = ['total_prefix_origin_pairs', "_process", "_table_input",
                 "_rpki_file"]

    # Sorry for the crazy naming scheme, must be done to avoid
    # having install file names in multiple locations
    temp_install_path = "/tmp/temp_rpki_validator_install"
    rpki_package_path = RPKI_PACKAGE_PATH
    rpki_run_name = RPKI_RUN_NAME
    rpki_run_path = RPKI_PACKAGE_PATH + RPKI_RUN_NAME
    rpki_db_paths = [RPKI_PACKAGE_PATH + x for x in ["db/", "rsync/"]]
    port = 8080
    api_url = "http://[::1]:8080/api/"

    def __init__(self, **kwargs):
        """Configures logging and installs the validator if it is missing.

        Recognised keyword arguments: ``stream_level``, ``section`` and
        ``table_input`` (defaults to ``"mrt_rpki"``). All keyword
        arguments are forwarded to ``install`` when it is needed.
        """

        config_logging(kwargs.get("stream_level", logging.INFO),
                       kwargs.get("section"))
        self._table_input = kwargs.get("table_input", "mrt_rpki")
        if not os.path.exists(self.rpki_package_path):
            logging.warning("Looks like validator is not installed")
            logging.warning("Installing validator now")
            RPKI_Validator_Wrapper.install(**kwargs)

    #################################
    ### Context Manager Functions ###
    #################################

    def __enter__(self):
        """Runs the RPKI Validator"""

        utils.kill_port(self.port)
        # Must remove these to ensure a clean run
        utils.clean_paths(self.rpki_db_paths)
        cmds = [f"cd {self.rpki_package_path}",
                f"chown -R root:root {self.rpki_package_path}"]
        utils.run_cmds(cmds)
        # Writes validator file and serves it
        # Can't use context manager here since it returns it
        self._rpki_file = RPKI_File(self._table_input)
        self._rpki_file.spawn_process()
        self._process = ProcessingPool()
        self._process.apipe(self._start_validator)
        self.total_prefix_origin_pairs = self._rpki_file.total_lines
        return self

    def __exit__(self, type, value, traceback):
        """Closes RPKI Validator"""

        self._process.close()
        self._process.terminate()
        self._process.join()
        self._process.clear()
        utils.kill_port(self.port, wait=False)
        logging.debug("Closed rpki validator")
        self._rpki_file.close()

    def _start_validator(self):
        """Sends start cmd to RPKI Validator"""

        logging.info("Starting RPKI Validator")
        utils.run_cmds((f"cd {self.rpki_package_path} && "
                        f"./{self.rpki_run_name}"))

    #########################
    ### Wrapper Functions ###
    #########################

    def load_trust_anchors(self):
        """Loads all trust anchors

        Sleeps 60 seconds, then polls the validation status every
        10 seconds until it reports completion, then waits a further
        30 seconds.
        """

        utils.write_to_stdout(f"{datetime.now()}: Loading RPKI Validator\n",
                              logging.root.level)
        time.sleep(60)
        while self._get_validation_status() is False:
            time.sleep(10)
            utils.write_to_stdout(".", logging.root.level)
        utils.write_to_stdout("\n", logging.root.level)
        self._wait(30, "Waiting for upload to bgp preview")

    def make_query(self, api_endpoint: str, data=True) -> dict:
        """Makes query to api of rpki validator

        Returns the ``"data"`` entry of the JSON reply when ``data`` is
        truthy, otherwise the whole reply.
        """

        result = utils.get_json(os.path.join(self.api_url, api_endpoint),
                                RPKI_Validator_Wrapper.get_headers())
        return result["data"] if data else result

    def get_validity_data(self) -> dict:
        """Gets the data from ripe and formats it for csv insertions"""

        logging.info("Getting data from ripe")
        assert self.total_prefix_origin_pairs < 10000000, "page size too small"
        # Then we get the data from the ripe RPKI validator
        # Todo for later, change 10mil to be total count
        return self.make_query("bgp/?pageSize=10000000")

    ########################
    ### Helper Functions ###
    ########################

    def _wait(self, time_to_sleep: int, msg: str):
        """logs a message and waits ``time_to_sleep`` seconds

        A progress bar is shown only when the root logger is at INFO;
        at every other level the method sleeps silently for the same
        duration.
        """

        logging.debug(msg)
        if logging.root.level == logging.INFO:
            # Number of times per second to update tqdm
            divisor = 100
            for _ in trange(time_to_sleep * divisor, desc=msg):
                time.sleep(1 / divisor)
        else:
            # Previously nothing was slept here, so callers relying on
            # the delay did not actually wait at non-INFO log levels.
            time.sleep(time_to_sleep)

    def _get_validation_status(self) -> bool:
        """Returns True once every trust anchor has completed validation

        Returns False if any trust anchor is still validating, or if the
        validator cannot be reached (after waiting 60 seconds).
        """

        try:
            for x in self.make_query("trust-anchors/statuses"):
                if x["completedValidation"] is False:
                    # If anything has not been validated return false
                    return False
            # All are validated. Return true
            return True
        except urllib.error.URLError:
            self._wait(60, "Connection was refused")
            return False

    ######################
    ### Static methods ###
    ######################

    @staticmethod
    def get_validity_dict() -> dict:
        """Returns the validity dict for the RPKI Validator to decode results

        I could have this as a class attribute but too messy I think.
        """

        return {"VALID": ROA_Validity.VALID.value,
                "UNKNOWN": ROA_Validity.UNKNOWN.value,
                "INVALID_LENGTH": ROA_Validity.INVALID_BY_LENGTH.value,
                "INVALID_ASN": ROA_Validity.INVALID_BY_ORIGIN.value}

    @staticmethod
    def get_headers() -> dict:
        """Gets the headers for all url queries to the validator"""

        return {"Connection": "keep-alive",
                "Cache-Control": "max-age=0",
                "Upgrade-Insecure-Requests": 1,
                "User-Agent": ("Mozilla/5.0 (X11; Linux x86_64)"
                               " AppleWebKit/537.36 (KHTML, like Gecko) "
                               "Chrome/73.0.3683.86 Safari/537.36"),
                "Accept": ("text/html,application/xhtml+xml,"
                           "application/xml;q=0.9,image/webp,"
                           "image/apng,*/*;q=0.8,"
                           "application/signed-exchange;v=b3"),
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "en-US,en;q=0.9"}

    #########################
    ### Install Functions ###
    #########################

    @staticmethod
    def install(**kwargs):
        """Installs RPKI validator with our configs.

        This might break in the future, but we need to do it this way
        for now to be able to do what we want with our own prefix origin
        table.
        """

        config_logging(kwargs.get("stream_level", logging.DEBUG),
                       kwargs.get("section"))
        utils.delete_paths([RPKI_Validator_Wrapper.rpki_package_path,
                            RPKI_Validator_Wrapper.temp_install_path])
        RPKI_Validator_Wrapper._download_validator()
        RPKI_Validator_Wrapper._change_file_hosted_location()
        path = RPKI_Validator_Wrapper._change_server_address()
        RPKI_Validator_Wrapper._config_absolute_paths(path)

    @staticmethod
    def _download_validator():
        """Downloads validator into proper location"""

        rpki_url = ("https://ftp.ripe.net/tools/rpki/validator3/beta/generic/"
                    "rpki-validator-3-latest-dist.tar.gz")
        arin_tal = ("https://www.arin.net/resources/manage/rpki/"
                    "arin-ripevalidator.tal")
        # This is the java version they use so we will use it
        cmds = [f"mkdir {RPKI_Validator_Wrapper.temp_install_path}",
                f"cd {RPKI_Validator_Wrapper.temp_install_path}",
                "sudo apt-get -y install openjdk-8-jre",
                f"wget {rpki_url}",
                "tar -xvf rpki-validator-3-latest-dist.tar.gz",
                "rm -rf rpki-validator-3-latest-dist.tar.gz",
                f"mv rpki-validator* {RPKI_Validator_Wrapper.rpki_package_path}",
                f"cd {RPKI_Validator_Wrapper.rpki_package_path}",
                "cd preconfigured-tals",
                f"wget {arin_tal}"]
        utils.run_cmds(cmds)

    @staticmethod
    def _change_file_hosted_location():
        """Changes location of input ann for bgp preview file"""

        # Changes where the file is hosted
        path = (f"{RPKI_Validator_Wrapper.rpki_package_path}conf"
                "/application-defaults.properties")
        prepend = "rpki.validator.bgp.ris.dump.urls="
        replace = ("https://www.ris.ripe.net/dumps/riswhoisdump.IPv4.gz,"
                   "https://www.ris.ripe.net/dumps/riswhoisdump.IPv6.gz")
        replace_with = (f"http://localhost:{RPKI_File.port}"
                        f"/{RPKI_File.hosted_name}")
        utils.replace_line(path, prepend, replace, replace_with)

    @staticmethod
    def _change_server_address():
        """Prob because of a proxy, but on our server this is necessary

        Returns the path of the properties file that was edited.
        """

        # Changes the server address
        path = (f"{RPKI_Validator_Wrapper.rpki_package_path}conf"
                "/application.properties")
        prepend = "server.address="
        replace = "localhost"
        replace_with = "0.0.0.0"
        utils.replace_line(path, prepend, replace, replace_with)
        return path

    @staticmethod
    def _config_absolute_paths(path):
        """Configure rpki validator to run off absolute paths

        This is necessary due to script being called from elsewhere
        In other words not from inside the RPKI dir.
        """

        # Since I am calling the script from elsewhere these must be
        # absolute paths
        prepend = "rpki.validator.data.path="
        replace = "."
        # Must remove the trailing slash at the end
        replace_with = RPKI_Validator_Wrapper.rpki_package_path[:-1]
        utils.replace_line(path, prepend, replace, replace_with)

        prepend = "rpki.validator.preconfigured.trust.anchors.directory="
        replace = "./preconfigured-tals"
        replace_with = (f"{RPKI_Validator_Wrapper.rpki_package_path}"
                        "preconfigured-tals")
        utils.replace_line(path, prepend, replace, replace_with)

        prepend = "rpki.validator.rsync.local.storage.directory="
        replace = "./rsync"
        replace_with = f"{RPKI_Validator_Wrapper.rpki_package_path}rsync"
        utils.replace_line(path, prepend, replace, replace_with)
def calcQM(self, data, parallel=mult.cpu_count()):
    """Collect per-timepoint QM results from Gaussian log files.

    For every timepoint ``t`` of ``data['traj']`` the file
    ``<self.name>-<t>NMR.log`` inside ``self.gauDir + self.name`` is
    parsed for the shielding tensor, Mulliken charges, electrostatic
    potential / field / field gradient and ESP charges. Timepoints whose
    log file is missing or rejected are queued and computed through
    ``self.buildSubSystem`` on a pool of ``parallel`` workers.

    Args:
        data: mapping with a ``'traj'`` entry; its first axis is the
            timepoint and ``traj[t, :, :]`` is handed to
            ``self.buildSubSystem``.
        parallel: value passed to ``Pool``; the default is evaluated
            once, when the function is defined.

    Returns:
        ``None`` when calculations had to be launched (the method must
        then be called again to read the results); otherwise a dict with
        keys ``traj``, ``shield``, ``chargesMul``, ``chargesESP``,
        ``potential``, ``field`` and ``gradient``.
    """
    #Change our working directory to gauDir+name and then back at the end
    # NOTE(review): the directory is not restored if parsing raises.
    curPath = os.getcwd()
    os.chdir(self.gauDir + self.name)

    traj = data['traj']
    totT = np.shape(traj)[0]
    numNuc = len(self.molGamma)

    #runList keeps track of the timepoints that still need to be calculated
    #only add if the log file does not yet exist or is incomplete
    runList = []

    #Create the variables that we will read in from Gaussian
    shield = np.zeros([totT, numNuc, 3, 3], np.float32)  #Shield matrix
    chargesMul = np.zeros([totT, numNuc], np.float32)
    chargesESP = np.zeros([totT, numNuc], np.float32)
    potential = np.zeros([totT, numNuc], np.float32)
    field = np.zeros([totT, numNuc, 3], np.float32)  #X, Y, Z
    gradient = np.zeros([totT, numNuc, 6],
                        np.float32)  # XX, YY, ZZ, XY, XZ, YZ

    #Read in any log files that already exist
    for t in range(totT):
        logFile = self.name + '-' + str(t) + 'NMR.log'

        #If the file doesn't exist, add to runlist and move to next timepoint
        if not os.path.isfile(logFile):
            runList.append([t, traj[t, :, :]])
            continue

        #Look at log file and read in all variables
        inFile = open(logFile, 'r')
        lines = inFile.readlines()
        inFile.close()

        ##NMR properties - Find the start of the NMR calculation
        # A marker on the very first line (index 0) is treated as absent
        # by the "> 0" tests below.
        NMRLoc = -1
        for j, l in enumerate(lines):
            if l.find("SCF GIAO Magnetic shielding tensor (ppm):") > -1:
                NMRLoc = j
                break
        if NMRLoc > 0:
            NMRdata = lines[NMRLoc + 1:]
            for nuc in range(numNuc):
                #Skip to the position of the nucleus of interest
                # (five lines are consumed per nucleus)
                nucData = NMRdata[5 * nuc:5 * (nuc + 1)]
                #Get each 3 separately - nucData[0] is total chemical shift
                #Need to split on both whitespace and = as sometimes big
                # values don't have a whitespace.
                line = nucData[1].replace('=', ' ').split()  #XX YX ZX
                shield[t, nuc, 0, :] = [
                    float(line[1]),
                    float(line[3]),
                    float(line[5])
                ]
                line = nucData[2].replace('=', ' ').split()  #XY YY ZY
                shield[t, nuc, 1, :] = [
                    float(line[1]),
                    float(line[3]),
                    float(line[5])
                ]
                line = nucData[3].replace('=', ' ').split()  #XZ YZ ZZ
                shield[t, nuc, 2, :] = [
                    float(line[1]),
                    float(line[3]),
                    float(line[5])
                ]

        ##Mulliken charges
        MulLoc = -1
        for j, l in enumerate(lines):
            if l.find("Mulliken charges:") > -1:
                MulLoc = j
                break
        if MulLoc > 0:
            # Per-nucleus rows start two lines after the header; the
            # charge is the third whitespace-separated field.
            ChargeData = lines[MulLoc + 2:]
            for nuc in range(numNuc):
                chargesMul[t, nuc] = float(ChargeData[nuc].split()[2])

        ##Electrostatic properties - Find the start of the Prop calculation
        PropLoc = -1
        for j, l in enumerate(lines):
            if l.find("Electrostatic Properties (Atomic Units)") > -1:
                PropLoc = j
                break
        if PropLoc > 0:
            # Potential and field rows start six lines after the header.
            PotField = lines[PropLoc + 6:]
            for nuc in range(numNuc):
                line = PotField[nuc].split()
                potential[t, nuc] = float(line[2])
                field[t, nuc, :] = [
                    float(line[3]),
                    float(line[4]),
                    float(line[5])
                ]
            # First "Electric Field Gradient" table fills gradient[..., :3].
            # NOTE(review): GradLoc is not initialised before this search;
            # if the marker is absent, a value left over from an earlier
            # timepoint is reused (or a NameError is raised).
            for j, l in enumerate(PotField):
                if l.find("Electric Field Gradient") > -1:
                    GradLoc = j
                    break
            Grad = PotField[GradLoc + 3:]
            for nuc in range(numNuc):
                line = Grad[nuc].split()
                gradient[t, nuc, :3] = [
                    float(line[2]),
                    float(line[3]),
                    float(line[4])
                ]
            # The next "Electric Field Gradient" table after the first one
            # fills gradient[..., 3:].
            for j, l in enumerate(Grad):
                if l.find("Electric Field Gradient") > -1:
                    GradLoc = j
                    break
            Grad2 = Grad[GradLoc + 3:]
            for nuc in range(numNuc):
                line = Grad2[nuc].split()
                gradient[t, nuc, 3:] = [
                    float(line[2]),
                    float(line[3]),
                    float(line[4])
                ]

        ##ESP Charges:
        ESPLoc = -1
        for j, l in enumerate(lines):
            if l.find(" ESP charges:") > -1:
                ESPLoc = j
                break
        if ESPLoc > 0:
            ChargeData = lines[ESPLoc + 2:]
            for nuc in range(numNuc):
                chargesESP[t, nuc] = float(ChargeData[nuc].split()[2])
        #If the data is not found, rename the log file and recalculate
        # NOTE(review): the original indentation was not recoverable; this
        # `else` is paired with the ESP check directly above it — confirm
        # that this is the intended `if`.
        else:
            print(
                "Error! NMR/Electrostatic Calculation not found in file " +
                logFile)
            os.rename(logFile, logFile + '-fail.log')
            runList.append([t, traj[t, :, :]])

    #There are datapoints to calculate. Do this in parallel
    if len(runList) > 0:
        #Run the remaining calculations in parallel
        print("Calculating data for " + str(len(runList)) + " timepoints")
        p = Pool(parallel)
        p.map(self.buildSubSystem, runList)
        p.clear()
        # Nothing is parsed in this pass; the caller gets None and has to
        # invoke the method again to read the freshly written logs.
        print("Must run calcShield again to get results")
        os.chdir(curPath)
        return

    os.chdir(curPath)
    return {
        "traj": traj,
        "shield": shield,
        "chargesMul": chargesMul,
        "chargesESP": chargesESP,
        "potential": potential,
        "field": field,
        "gradient": gradient
    }
def build_hamiltonian_diagonal_parallel2(clustered_ham,
                                         ci_vector,
                                         nproc=None,
                                         batch_size=100):
    """
    Build hamiltonian diagonal in basis in ci_vector

    The configurations are split into batches of at most ``batch_size``
    entries and the batches are evaluated on a pathos process pool.

    Args:
        clustered_ham: Hamiltonian exposing ``clusters`` and ``terms``;
            only the terms stored under the all-zero fock-space shift
            are used.
        ci_vector: sized iterable yielding 3-tuples; the first two items
            are the fock space and the configuration, the third is
            ignored.
        nproc: number of worker processes, or None for the pool default.
        batch_size: maximum number of configurations per batch.

    Returns:
        1-D numpy array with one diagonal element per configuration, in
        the iteration order of ``ci_vector`` (empty for an empty
        ``ci_vector``).
    """
    # {{{
    print(" In build_hamiltonian_diagonal_parallel2. nproc=", nproc)

    # Workers read these module globals instead of receiving the
    # Hamiltonian with every batch.
    global _clustered_ham
    global _delta_fock
    _clustered_ham = clustered_ham
    _delta_fock = tuple([(0, 0) for ci in range(len(clustered_ham.clusters))])

    # Return before any pool exists: previously the pool was created
    # first and never closed on this path.
    if len(ci_vector) == 0:
        return np.array([])

    def do_parallel_work(v_batch):
        tmpout = []
        for v_curr in v_batch:
            tmp = 0
            fockspace = v_curr[0]
            config = v_curr[1]

            terms = _clustered_ham.terms[_delta_fock]

            ## add diagonal energies
            for term in terms:
                mats = []
                # state sign is always 1 here, since an even number of
                # creation/annihilation operators can only contribute to
                # the diagonal
                n_active = 0
                for oi, o in enumerate(term.ops):
                    if o == '':
                        continue
                    n_active += 1

                if n_active == 1:
                    ci = term.active[0]
                    tmp += term.clusters[ci].ops['H'][(
                        fockspace[ci], fockspace[ci])][config[ci], config[ci]]
                elif n_active > 0:
                    for oi, o in enumerate(term.ops):
                        if o == '':
                            continue
                        try:
                            do = term.clusters[oi].ops[o]
                        except KeyError:
                            print(" Couldn't find:", term)
                            # Re-raise rather than exit(): an exception
                            # is reported back through pool.map, while
                            # exiting the worker process is not.
                            raise
                        try:
                            d = do[(fockspace[oi],
                                    fockspace[oi])][config[oi],
                                                    config[oi]]  #D(I,J,:,:...)
                        except KeyError:
                            continue
                        mats.append(d)

                    # An operator without a block for this fock space
                    # means the term does not contribute.
                    if len(mats) < n_active:
                        continue

                    tmp += np.einsum(term.contract_string,
                                     *mats,
                                     term.ints,
                                     optimize=False)
            tmpout.append(tmp)
        print(".", end="", flush=True)
        return tmpout

    from pathos.multiprocessing import ProcessingPool as Pool

    if nproc is None:
        pool = Pool()
    else:
        pool = Pool(processes=nproc)

    print(" Using Pathos library for parallelization. Number of workers: ",
          pool.ncpus)

    # define batches
    conf_batches = []
    batch_size = min(batch_size, len(ci_vector))
    batch = []
    print(" Form batches. Max batch size: ", batch_size)
    for i, j, k in ci_vector:
        if len(batch) < batch_size:
            batch.append((i, j))
        else:
            conf_batches.append(batch)
            batch = []
            batch.append((i, j))
    if len(batch) > 0:
        conf_batches.append(batch)
        batch = []

    print(" Number of configs: ", len(ci_vector))
    print(" Number of batches: ", len(conf_batches))
    print(" Batches complete : ")
    try:
        out = pool.map(do_parallel_work, conf_batches)
    finally:
        # Release the workers even if a batch raised.
        pool.close()
        pool.join()
        pool.clear()
    print()

    # Flatten the per-batch lists back into one vector.
    Hdv = np.zeros((len(ci_vector), ))
    count = 0
    for oi in out:
        for oij in oi:
            Hdv[count] = oij
            count += 1

    return Hdv
def build_hamiltonian_diagonal_parallel1(clustered_ham_in,
                                         ci_vector,
                                         nproc=None):
    """
    Build hamiltonian diagonal in basis in ci_vector

    One job per configuration is mapped over a pathos process pool.

    Args:
        clustered_ham_in: Hamiltonian exposing ``clusters`` and
            ``terms``; only the terms stored under the all-zero
            fock-space shift are used.
        ci_vector: sized iterable whose items hold the fock space at
            index 0 and the configuration at index 1.
        nproc: number of worker processes, or None for the pool default.

    Returns:
        numpy array with one diagonal element per configuration, in the
        iteration order of ``ci_vector`` (empty for an empty
        ``ci_vector``).
    """
    # {{{
    # Workers read these module globals rather than receiving the
    # Hamiltonian with every job.
    global clusters
    global clustered_ham
    global delta_fock

    print(" In build_hamiltonian_diagonal_parallel1. nproc=", nproc)

    clustered_ham = clustered_ham_in
    clusters = clustered_ham_in.clusters
    delta_fock = tuple([(0, 0) for ci in range(len(clusters))])

    # Return before any pool exists: previously the pool was created
    # first and never closed on this path.
    if len(ci_vector) == 0:
        return np.array([])

    def do_parallel_work(v_curr):
        fockspace = v_curr[0]
        config = v_curr[1]

        terms = clustered_ham.terms[delta_fock]

        ## add diagonal energies
        tmp = 0
        for term in terms:
            tmp += term.diag_matrix_element(fockspace,
                                            config,
                                            opt_einsum=False)
        return tmp

    from pathos.multiprocessing import ProcessingPool as Pool

    if nproc is None:
        pool = Pool()
    else:
        pool = Pool(processes=nproc)

    print(" Using Pathos library for parallelization. Number of workers: ",
          pool.ncpus)

    try:
        out = pool.map(do_parallel_work, ci_vector)
    finally:
        # Release the workers even if a job raised.
        pool.close()
        pool.join()
        pool.clear()

    Hdv = np.array(out)
    return Hdv
def grow_hamiltonian_parallel(h_old,
                              clustered_ham,
                              ci_vector,
                              ci_vector_old,
                              iprint=1,
                              nproc=None,
                              opt_einsum=True,
                              thresh=1e-14):
    """
    Grow the Hamiltonian matrix by building only the new matrix elements
    for the new space indicated by ci_vector

    parallelized over matrix elements

    The block belonging to ``ci_vector_old`` is copied from ``h_old``;
    rows coupling any configuration to the newly added ones are computed
    on a pathos process pool. Both CI vectors have their empty fock
    spaces pruned in place. ``iprint``, ``opt_einsum`` and ``thresh`` are
    accepted but not used.

    Returns:
        Square numpy array of dimension ``len(ci_vector)``.
    """
    # {{{
    global _h
    global _new_basis

    print(" In grow_hamiltonian_parallel. nproc=", nproc)
    start = time.time()

    ci_vector_old.prune_empty_fock_spaces()
    ci_vector.prune_empty_fock_spaces()

    # Three index maps: every coefficient is replaced by a running index.
    old_basis = ci_vector_old.copy()
    new_basis = ci_vector.copy()
    full_basis = ci_vector.copy()
    for basis in (old_basis, new_basis, full_basis):
        basis.set_vector(np.array(range(len(basis))))

    # The new basis keeps only configurations absent from the old one.
    for fock, conf, _ in old_basis:
        del new_basis[fock][conf]
    new_basis.prune_empty_fock_spaces()

    print(" Size of old space:", len(old_basis))
    print(" Size of new space:", len(new_basis))
    print(" Size of all space:", len(full_basis))
    assert (len(full_basis) == len(old_basis) + len(new_basis))

    dim = len(ci_vector)
    H = np.zeros((dim, dim))

    # find locations of old basis in full basis
    t1 = time.time()
    full_inds = np.zeros((len(old_basis)), dtype=int)
    position = 0
    for fock, _ in old_basis.items():
        for conf, _ in old_basis[fock].items():
            full_inds[position] = full_basis[fock][conf]
            position += 1

    # Scatter the rows of the old matrix into their new positions.
    for old_row, full_row in enumerate(full_inds):
        H[full_row, full_inds] = h_old[old_row, :]
    print(" updating matrix:", time.time() - t1, flush=True)

    if len(new_basis) == 0:
        return H

    for fock, conf, _ in new_basis:
        assert (new_basis[fock][conf] == full_basis[fock][conf])
    # Re-label the old basis with full-space indices.
    for fock, conf, _ in old_basis:
        old_basis[fock][conf] = full_basis[fock][conf]
        if fock in new_basis:
            assert (conf not in new_basis[fock])

    # Module globals read by the workers.
    _h = clustered_ham
    _new_basis = new_basis

    debug = 0
    if debug:
        try:
            assert (np.amax(np.abs(H - H.T)) < 1e-14)
        except AssertionError:
            for f1, c1, i1 in full_basis:
                for f2, c2, i2 in full_basis:
                    if abs(H[i1, i2] - H[i2, i1]) > 1e-14:
                        print(f1, c1, i1)
                        print(f2, c2, i2)
                        print(H[i1, i2] - H[i2, i1])
            raise AssertionError

    def do_parallel_work(inp):
        fock_l, conf_l, idx_l = inp[0], inp[1], inp[2]
        # which subspace is _l in? this is 0 for old and 1 for new
        new = inp[3]

        out = []
        for fock_r in _new_basis.fblocks():
            confs_r = _new_basis[fock_r]
            delta_fock = tuple([(fock_l[ci][0] - fock_r[ci][0],
                                 fock_l[ci][1] - fock_r[ci][1])
                                for ci in range(len(_h.clusters))])
            if delta_fock not in _h.terms:
                continue
            for conf_r in confs_r:
                idx_r = _new_basis[fock_r][conf_r]
                # Within the new-new block only the upper triangle is
                # evaluated; the caller mirrors it.
                if new and not idx_l <= idx_r:
                    continue
                me = 0
                for term in _h.terms[delta_fock]:
                    me += term.matrix_element(fock_l, conf_l, fock_r, conf_r)
                out.append((idx_r, me))
        print(".", end='', flush=True)
        return ([idx_l, out])

    from pathos.multiprocessing import ProcessingPool as Pool

    pool = Pool() if nproc is None else Pool(processes=nproc)

    jobs = [(entry[0], entry[1], entry[2], 1) for entry in new_basis]
    jobs.extend([(entry[0], entry[1], entry[2], 0) for entry in old_basis])

    stop = time.time()
    print(" Time spent finding new subspace:", stop - start)
    start = time.time()

    print(" Number of jobs to do:", len(jobs), flush=True)
    results = pool.map(do_parallel_work, jobs)
    print("")
    stop = time.time()
    print(" Time spent building new subspace:", stop - start)

    pool.close()
    pool.join()
    pool.clear()

    # Write every computed element symmetrically; each target entry must
    # still be untouched.
    for row_idx, row in results:
        for col_idx, value in row:
            assert (abs(H[row_idx, col_idx]) < 1e-16)
            assert (abs(H[col_idx, row_idx]) < 1e-16)
            H[row_idx, col_idx] = value
            H[col_idx, row_idx] = value

    return H
def main():
    """Parse the command line and run one policy-gradient training per seed.

    Each experiment ``e`` uses the seed ``args.seed + 10 * e`` and logs
    into its own sub-directory of a time-stamped run directory below
    ``--logdir``. Every training runs in a fresh single-worker process
    pool.
    """
    import argparse

    parser = argparse.ArgumentParser()
    add = parser.add_argument
    add('env_name', type=str)
    add('--exp_name', type=str, default='vpg')
    add('--render', action='store_true')
    add('--logdir', '-dir', type=str, default='data')
    add('--discount', type=float, default=1.0)
    add('--n_iter', '-n', type=int, default=100)
    add('--batch_size', '-b', type=int, default=1000)
    add('--ep_len', '-ep', type=float, default=-1.)
    add('--learning_rate', '-lr', type=float, default=5e-3)
    add('--reward_to_go', '-rtg', action='store_true')
    add('--dont_normalize_advantages', '-dna', action='store_true')
    add('--nn_baseline', '-bl', action='store_true')
    add('--seed', type=int, default=1)
    add('--n_experiments', '-e', type=int, default=1)
    add('--n_layers', '-l', type=int, default=1)
    add('--size', '-s', type=int, default=32)
    add('--gae', '-gae', action='store_true')
    add('--lambd', '-ld', type=float, default=1.0)
    add('--threads', '-th', type=int, default=1)
    add('--max_threads_pool', '-max_tp', type=int, default=16)
    add('--thread_timeout', '-th_to', type=int, default=None)
    add('--offpol', '-ofp', action='store_true')
    add('--n_iter_pol', '-np', type=int, default=1)
    add('--n_iter_pol_sched', '-nps', type=str, default='const',
        choices=['const', 'exp_dec'])
    add('--n_iter_pol_exp_base', '-npexpb', type=int, default=5)
    add('--n_iter_pol_exp_decay', '-npexpd', type=float, default=0.95)
    add('--weight_importance_samp', '-wis', action='store_true')
    add('--record', '-rec', type=int, default=None)
    args = parser.parse_args()

    # Optional schedule for the number of policy iterations; only set for
    # off-policy runs with the exponential-decay schedule.
    it_pol_fn = None
    if args.offpol and args.n_iter_pol_sched == 'exp_dec':
        it_pol_fn = lambda it: \
            int(np.ceil(args.n_iter_pol * pow(args.n_iter_pol_exp_decay,it / args.n_iter_pol_exp_base)))

    if not (os.path.exists(args.logdir)):
        os.makedirs(args.logdir)
    run_name = '_'.join(
        [args.exp_name, args.env_name,
         time.strftime("%d-%m-%Y_%H-%M-%S")])
    logdir = os.path.join(args.logdir, run_name)
    if not (os.path.exists(logdir)):
        os.makedirs(logdir)

    # A non-positive episode length means "no limit".
    max_path_length = None
    if args.ep_len > 0:
        max_path_length = args.ep_len

    start = time.time()

    for e in range(args.n_experiments):
        seed = args.seed + 10 * e
        print('Running experiment with seed %d' % seed)

        train_kwargs = dict(
            exp_name=args.exp_name,
            env_name=args.env_name,
            n_iter=args.n_iter,
            gamma=args.discount,
            min_timesteps_per_batch=args.batch_size,
            max_path_length=max_path_length,
            learning_rate=args.learning_rate,
            reward_to_go=args.reward_to_go,
            animate=args.render,
            logdir=os.path.join(logdir, '%d' % seed),
            normalize_advantages=not (args.dont_normalize_advantages),
            nn_baseline=args.nn_baseline,
            seed=seed,
            n_layers=args.n_layers,
            size=args.size,
            gae=args.gae,
            lambd=args.lambd,
            threads=args.threads,
            max_threads_pool=args.max_threads_pool,
            thread_timeout=args.thread_timeout,
            offpol=args.offpol,
            n_it_pol=args.n_iter_pol,
            n_it_pol_fn=it_pol_fn,
            wis=args.weight_importance_samp,
            record=args.record,
        )

        def train_func():
            train_PG(**train_kwargs)

        # Awkward hacky process runs, because Tensorflow does not like
        # repeatedly calling train_PG in the same thread.
        worker = ProcessingPool(1)
        worker.apipe(train_func).get()
        worker.clear()

    print('All training took: {:.3f}s'.format(time.time() - start))
class tiledRasterReader:
    '''
    Read a multi-band GDAL raster either as a whole or tile by tile.

    Tiles are visited row-major (left to right, then top to bottom);
    reads can optionally be spread over a process pool, one band per job.
    '''

    def __init__(self, srcRasterfile, xoff=0, yoff=0, xsize=None, ysize=None):
        '''
        Open srcRasterfile read-only and cache its basic metadata.

        xoff, yoff   -- initial tile offset
        xsize, ysize -- tile dimensions; default to the block sizes read
                        from the dataset. The process exits with status 1
                        when a requested size exceeds the raster.
        '''
        self.srcRasterfile = srcRasterfile
        gdal.SetCacheMax(2**30)  # 1 GB
        self.ds = gdal.Open(self.srcRasterfile, gdalconst.GA_ReadOnly)

        # Everything after the first entry of the file list is taken as
        # the per-band source files.
        self.fileList = self.ds.GetFileList()[1:]
        self.measurement_level_ints = []
        for fn in self.fileList:
            # default level of measurement
            msrlevel = conf.MSR_LEVEL_RATIO
            for keyword in conf.NOMINAL_KEYWORD_IN_FN:
                if keyword in fn:
                    msrlevel = conf.MSR_LEVEL_NOMINAL
                    break
            for key in conf.MSR_LEVELS:
                if conf.MSR_LEVELS[key] == msrlevel:
                    self.measurement_level_ints.append(int(key))
                    break
        self.measurement_level_ints = np.array(self.measurement_level_ints)

        self.nbands = self.ds.RasterCount
        self.nrows = self.ds.RasterYSize
        self.ncols = self.ds.RasterXSize
        # "geotransfrom" is the historical (misspelled) attribute name and
        # is kept for existing users; "geotransform" is an alias.
        self.geotransfrom = self.ds.GetGeoTransform()
        self.geotransform = self.geotransfrom
        self.projection = self.ds.GetProjection()

        band = self.ds.GetRasterBand(1)
        self.nodata = band.GetNoDataValue()

        self.block_ysize_base = band.GetBlockSize()[0]
        self.block_xsize_base = gdal.Open(
            self.fileList[0],
            gdalconst.GA_ReadOnly).GetRasterBand(1).GetBlockSize()[0]

        self.__N_TilesRead = 0
        self.xoff, self.yoff = xoff, yoff

        # Single-string prints behave the same under Python 2 and 3.
        if xsize is None:
            self.xsize = self.block_xsize_base
        elif xsize > self.ncols:
            print('tile xsize exceeds RasterXsize %s' % self.ncols)
            sys.exit(1)
        else:
            self.xsize = xsize

        if ysize is None:
            self.ysize = self.block_ysize_base
        elif ysize > self.nrows:
            # The message used to say "xsize" for this ysize check.
            print('tile ysize exceeds RasterYsize %s' % self.nrows)
            sys.exit(1)
        else:
            self.ysize = ysize

        ## estimated data size (in MB)
        self.estimate_TotalSize_MB = self.estimateTileSize_MB(
            self.nrows, self.ncols)
        self.estimate_TileSize_MB = self.estimateTileSize_MB(
            self.xsize, self.ysize)

        # One row of GetStatistics(0, 1) output per band.
        self.statistics = np.zeros((self.nbands, 4))
        for i in range(self.nbands):
            self.statistics[i] = self.ds.GetRasterBand(i + 1).GetStatistics(
                0, 1)

        self.MP_pool = None

    def estimateTileSize_MB(self, xsize=None, ysize=None):
        '''
        Estimate the size in MB of an xsize-by-ysize tile over all bands,
        assuming float32 cells. Missing dimensions default to the
        configured tile size.
        '''
        if xsize is None:
            xsize = self.xsize
        if ysize is None:
            ysize = self.ysize
        return np.array([
            1.0
        ]).astype('float32').nbytes / 1024.0**2 * xsize * ysize * self.nbands

    def __maskNodata(self, data):
        ''' clamp values below the nodata value to nodata, in place '''
        # GetNoDataValue() yields None when the band defines no nodata
        # value; there is nothing to clamp against in that case.
        if self.nodata is not None:
            data[data < self.nodata] = self.nodata
        return data

    def readWholeRaster(self, multithread=conf.MULTITHREAD_READ):
        '''
        Read the complete raster and return it as an array with values
        below nodata clamped to nodata. With multithread set, each band
        is read by its own pool job and the bands are stacked on axis 0.
        '''
        data = None
        if multithread:

            def threadReadingByBand(i, rasterfile):
                ''' each thread reads a whole band
                    using multiprocess pool
                '''
                import gdal, gdalconst, psutil, conf
                import numpy as np
                ds = gdal.Open(rasterfile, gdalconst.GA_ReadOnly)
                data = ds.GetRasterBand(i).ReadAsArray()
                return data

            # optimal for multi-thread reading by band
            n_threads = self.nbands
            if self.MP_pool is None:
                self.MP_pool = Pool(n_threads)
            ## multi-thread reading by band (GDAL band indices start at 1)
            band_idx = range(1, n_threads + 1)
            fns = np.array([self.srcRasterfile]).repeat(n_threads)
            data = self.MP_pool.map(threadReadingByBand, band_idx, fns)
            data = np.stack(data, axis=0)
            self.MP_pool.clear()
        else:
            data = self.ds.ReadAsArray(xoff=0, yoff=0, xsize=None, ysize=None)

        ## nodatavalues
        return self.__maskNodata(data)

    def readNextTile(self,
                     xsize=None,
                     ysize=None,
                     multithread=conf.MULTITHREAD_READ):
        '''
        Read the next tile in row-major order.

        Returns (data, xoff, yoff, xsize, ysize); once every tile has
        been read the result is (None, xoff, yoff, 0, 0).
        '''
        ## update xsize and ysize if needed
        ## PLEASE specify xsize, ysize ONLY ONCE (when reading the first tile)
        if xsize is not None:
            self.xsize = xsize
        if ysize is not None:
            self.ysize = ysize

        # Convert the running tile counter into a (column, row) tile index.
        N_BLOCK_X = int(math.ceil(self.ncols * 1.0 / self.xsize))
        y = int(self.__N_TilesRead / N_BLOCK_X)
        x = self.__N_TilesRead - y * N_BLOCK_X

        # Edge tiles are clipped to the raster extent.
        self.xoff = min(x * self.xsize, self.ncols)
        xsize = min(self.xsize, self.ncols - self.xoff)
        self.yoff = min(y * self.ysize, self.nrows)
        ysize = min(self.ysize, self.nrows - self.yoff)

        if self.xoff == self.ncols or self.yoff == self.nrows:
            return (None, self.xoff, self.yoff, 0, 0)

        data = None
        if multithread:  ## multi-thread read

            def threadReadingByBand(i, param, rasterfile):
                ''' each thread reads a band, with tile dimension spec in param
                    using multiprocess pool
                '''
                import gdal, gdalconst
                import numpy as np
                ds = gdal.Open(rasterfile, gdalconst.GA_ReadOnly)
                data = ds.GetRasterBand(i).ReadAsArray(xoff=param[0],
                                                       yoff=param[1],
                                                       win_xsize=param[2],
                                                       win_ysize=param[3])
                return data

            # optimal for multi-thread reading by band
            n_threads = self.nbands  # - 1
            if self.MP_pool is None:
                self.MP_pool = Pool(n_threads)
            ## multi-thread reading by band
            params = []
            for i in range(n_threads):
                params.append([self.xoff, self.yoff, xsize, ysize])
            fns = np.array([self.srcRasterfile]).repeat(n_threads)
            band_idx = range(1, self.nbands + 1)
            data = self.MP_pool.map(threadReadingByBand, band_idx, params, fns)
            data = np.stack(data, axis=0)
            self.MP_pool.clear()
        else:  ## single-thread read
            data = self.ds.ReadAsArray(xoff=self.xoff,
                                       yoff=self.yoff,
                                       xsize=xsize,
                                       ysize=ysize)

        ## nodatavalues
        self.__maskNodata(data)
        self.__N_TilesRead += 1
        return (data, self.xoff, self.yoff, xsize, ysize)

    def reset(self):
        ''' reset after reading tiles '''
        self.xoff, self.yoff = 0, 0
        self.__N_TilesRead = 0

    def extractByXY(self, x, y):
        ''' Extract raster value by x, y coordinates '''
        # Offsets are derived from geotransform items 0/1 (x) and 3/5 (y).
        xoff = int((x - self.geotransfrom[0]) / self.geotransfrom[1])
        yoff = int((y - self.geotransfrom[3]) / self.geotransfrom[5])
        return self.ds.ReadAsArray(xoff, yoff, 1, 1)

    def extractByRC(self, c, r):
        '''Extract raster value by row, col '''
        # "self" was missing from the signature, so the method could not
        # be called on an instance.
        return self.ds.ReadAsArray(c, r, 1, 1)

    def close(self):
        ''' drop the dataset handle and clear the pool, if any '''
        self.ds = None
        if self.MP_pool is not None:
            self.MP_pool.clear()
def compute_DTW_to_each_drone(drones_df_ls,
                              win_size,
                              no_sensors_cols,
                              per_series=False,
                              process_gps=True,
                              use_scaler=True):
    """Compute pairwise DTW distances between drones over a sliding window.

    The frames are concatenated and, for every ``iter`` value and every
    ``update_step``, the rows of the last ``win_size`` update steps are
    compared between every pair of drones. Each distance is stored twice,
    once per direction. Iterations are processed in parallel.

    Args:
        drones_df_ls: list of DataFrames with at least the columns
            ``iter``, ``update_step``, ``drone`` and ``label``.
        win_size: window length in update steps.
        no_sensors_cols: columns removed (together with ``label``)
            before the distance is computed.
        per_series: if true, one DTW distance per remaining column is
            computed and stored as an array; otherwise one multivariate
            distance.
        process_gps: accepted for compatibility; not used.
        use_scaler: standard-scale each drone's window before comparing.

    Returns:
        DataFrame with columns ``iter``, ``update_step``, ``drone``,
        ``comparison_drone`` and ``DTW_dist``, sorted by the first three.
    """
    print('Start compute DTW')
    dataset = pd.concat(drones_df_ls)
    dataset = dataset.sort_values(['iter', 'update_step',
                                   'drone']).reset_index(drop=True)
    drones = dataset.drone.unique()
    numOfDrones = len(drones)
    start = time.time()
    iters = dataset.iter.unique()

    result_keys = ['iter', 'update_step', 'drone', 'comparison_drone',
                   'DTW_dist']
    # Columns that are not sensor readings.
    drop_cols = list(no_sensors_cols) + ['label']

    def sensor_array(window, drone):
        # Sensor columns of one drone within the window, optionally scaled.
        # drop(columns=...) replaces the positional axis argument, which
        # pandas no longer accepts.
        values = window.loc[window.drone == drone, :].drop(
            columns=drop_cols).to_numpy()
        if use_scaler:
            return StandardScaler().fit_transform(values)
        return values

    def compute_DTW_on_iter(dataset, iter_id, numOfDrones, drones,
                            per_series=True):
        print('iter: ', iter_id)
        dtw_results_dict = {key: [] for key in result_keys}
        dataset_iter = dataset.loc[dataset['iter'] == iter_id, :]
        update_step_ls = dataset_iter.update_step.unique()
        # num of features (all columns - no sensor columns and label)
        num_of_features = dataset_iter.shape[1] - len(drop_cols)

        # iterate over time steps
        for update_step in update_step_ls:
            # rows of the last win_size update steps up to this one
            current_seq = dataset_iter.loc[
                (dataset_iter['update_step'] <= update_step)
                & (dataset_iter['update_step'] > (update_step - win_size))]

            # Prepare every drone's window once instead of once per pair.
            prepared = [sensor_array(current_seq, drone) for drone in drones]

            for droneIidx in range(numOfDrones):
                currentDrone = drones[droneIidx]
                # j starts after i: a drone is never compared to itself
                # and each pair is evaluated once.
                for droneJidx in range(droneIidx + 1, numOfDrones):
                    otherDrone = drones[droneJidx]
                    if per_series:
                        # one distance per feature column
                        dist = np.array([
                            dtw_path(prepared[droneIidx][:, i],
                                     prepared[droneJidx][:, i])[1]
                            for i in range(num_of_features)
                        ])
                    else:
                        dist = dtw(prepared[droneIidx],
                                   prepared[droneJidx],
                                   window_type="sakoechiba",
                                   window_args={
                                       'window_size': 60
                                   }).distance

                    # store the distance for both directions of the pair
                    for first, second in ((currentDrone, otherDrone),
                                          (otherDrone, currentDrone)):
                        dtw_results_dict['iter'].append(iter_id)
                        dtw_results_dict['update_step'].append(update_step)
                        dtw_results_dict['drone'].append(first)
                        dtw_results_dict['comparison_drone'].append(second)
                        dtw_results_dict['DTW_dist'].append(dist)
        print('iter done: ', iter_id)
        return dtw_results_dict

    workers = multiprocessing.cpu_count()
    print('Number of workers: ', workers)
    # Never more workers than iterations, but at least one so that an
    # empty dataset does not request a zero-sized pool.
    workers = max(1, min(workers, len(iters)))
    pool = ProcessingPool(workers)
    try:
        list_of_iters_dict = list(
            pool.map(
                lambda iter_id: compute_DTW_on_iter(
                    dataset, iter_id, numOfDrones, drones, per_series),
                iters))
    finally:
        pool.close()
        pool.join()
        pool.terminate()
        pool.clear()

    # from list of dicts to one dict
    dtw_results_dict = {key: [] for key in result_keys}
    for iter_results in list_of_iters_dict:
        for result_key, values in iter_results.items():
            dtw_results_dict[result_key].extend(values)

    print('time took: ', time.time() - start)
    dtw_results_df = pd.DataFrame.from_dict(dtw_results_dict)
    dtw_results_df = dtw_results_df.sort_values(
        ['iter', 'update_step', 'drone']).reset_index(drop=True)
    return dtw_results_df
import numpy as np
import pandas as pd
import multiprocessing as mp
from pathos.multiprocessing import ProcessingPool as Pool

# Demo input: 500 rows x 2 columns of random integers drawn from [3, 10).
df = pd.DataFrame(np.random.randint(3, 10, size=[500, 2]))


def func(df):
    """Return the ``(rows, columns)`` shape of one DataFrame chunk."""
    return df.shape


# cores = mp.cpu_count()
cores = 8

# Split the frame row-wise into `cores` chunks, one per worker.
df_split = np.array_split(df, cores, axis=0)

# create the multiprocessing pool
pool = Pool(cores)
try:
    # process the DataFrame by mapping function to each df across the pool;
    # every worker returns a shape tuple, stacked into a (cores, 2) array
    df_out = np.vstack(pool.map(func, df_split))
finally:
    # close down the pool and join -- done in `finally` so the worker
    # processes are released even when the map raises
    pool.close()
    pool.join()
    pool.clear()
def __simLocs2Samples(self, X, parallel = True, nprocess = conf.N_PROCESS):
    ''' Compute the similarity between locations to predict and the soil samples.

        X:        2-D array-like, one row per location and one column per
                  environmental variable; the number of columns must equal
                  the number of environmental rasters held by this object
        parallel: when True the rows of X are dispatched to a process pool,
                  otherwise they are processed one by one in this process
        nprocess: number of worker processes used when parallel is True

        return: a matrix of similarity values, each row is a location,
                each column is a sample

        The process is terminated through sys.exit(1) when the number of
        columns of X does not match the number of environmental rasters.
    '''
    ## this import is necessary [on Windows]:
    # http://stackoverflow.com/questions/28445373/python-import-numpy-as-np-from-outer-code-gets-lost-within-my-own-user-defined
    import numpy as np
    import raster, points, util, conf

    def simLoc2SamplesV0(loc_ev, datapkg):
        # Naive worker: similarity of ONE location to every sample.
        # Kept as a nested function because it is needed for parallel
        # computing using multiprocessing.
        import conf
        # unpack data in datapkg
        t0 = time.time()
        sample_evs = datapkg[0]
        evs = datapkg[1]
        SD_evs = datapkg[2]
        conf.TIME_KEEPING_DICT['parts']['data_transfer'].append(time.time()-t0)

        # number of environmental variables
        M = SD_evs.size
        # number of samples
        N = np.shape(sample_evs)[0]

        sim = np.zeros(N)
        t0 = time.time()
        for i in range(N): # for each sample
            sim0 = np.zeros(M)
            sample_ev = sample_evs[i]
            for j in range(M): # for each environmental variable
                evi = loc_ev[j]
                evj = sample_ev[j]
                msrlevel = self.__envrasters[j].getMsrLevel()
                if msrlevel == conf.MSR_LEVEL_NOMINAL or msrlevel == conf.MSR_LEVEL_ORDINAL:
                    # categorical variable: exact match or nothing
                    if evi == evj:
                        sim_i = 1.0
                    else:
                        sim_i = 0.0
                else:
                    SD_ev = SD_evs[j]
                    ev = evs[:,j]
                    # root mean squared difference between every value of
                    # variable j and the sample value
                    SD_evj = np.sqrt(np.mean((ev - evj) ** 2))
                    sim_i = np.exp(-0.5 * (evi - evj) ** 2 / (SD_ev ** 2 / SD_evj) ** 2)
                sim0[j] = sim_i
            sim[i] = np.min(sim0) ## limiting factor
        conf.TIME_KEEPING_DICT['parts']['compute'].append(time.time()-t0)
        return sim

    def simLoc2Samples(loc_ev, datapkg):
        # Optimized worker: similarity of ONE location to every sample, using
        # precomputed per-variable statistics instead of the full raster data.
        # Kept as a nested function because it is needed for parallel
        # computing using multiprocessing.
        import conf ## IMPORTANT - makes **conf.MSR_LEVELS** visible
        # unpack data in datapkg
        t0 = time.time()
        sample_evs = datapkg[0]
        REVS = datapkg[1]
        SD_evs = datapkg[2]
        AVG_evs = datapkg[3]
        SUM_DIF_SQ_AVG = datapkg[4]
        # Guiming 3/31/2019
        MSRLEVES = datapkg[5]
        conf.TIME_KEEPING_DICT['parts']['data_transfer'].append(time.time()-t0)

        # number of environmental variables
        M = SD_evs.size
        # number of samples
        N = np.shape(sample_evs)[0]

        sim = np.zeros(N)
        t0 = time.time()
        for i in range(N): # for each sample
            sim0 = np.zeros(M)
            sample_ev = sample_evs[i]
            for j in range(M): # for each environmental variable
                evi = loc_ev[j]
                evj = sample_ev[j]
                # Guiming 3/31/2019 - SAVES MEM, NO NEED TO DISPATCH self.__envrasters TO EACH THREAD
                msrlevel = MSRLEVES[j]
                ## this line below does not work without ** import conf ** at the begining of this function
                if msrlevel == conf.MSR_LEVEL_NOMINAL or msrlevel == conf.MSR_LEVEL_ORDINAL:
                    # categorical variable: exact match or nothing
                    if evi == evj:
                        sim_i = 1.0
                    else:
                        sim_i = 0.0
                else:
                    SD_ev = SD_evs[j]
                    # deviation around the sample value, derived from the
                    # precomputed sum of squares around the raster mean
                    delta = sample_ev[j] - AVG_evs[j]
                    tmp = SUM_DIF_SQ_AVG[j] + REVS * delta**2
                    SD_evj = np.sqrt(tmp/REVS)
                    sim_i = np.exp(-0.5 * (evi - evj) ** 2 / (SD_ev ** 2 / SD_evj) ** 2)
                sim0[j] = sim_i
            sim[i] = np.min(sim0) ## limiting factor
        conf.TIME_KEEPING_DICT['parts']['compute'].append(time.time()-t0)
        return sim

    # do dimension match check here
    if np.shape(X)[1] != len(self.__envrasters):
        # parenthesised single argument: prints the same text on Python 2 and 3
        print('dimension mismatch in computing similarity in iPSM')
        sys.exit(1)

    msr_levels = []
    if conf.NAIVE:
        # the naive implementation needs every raster value, one column per raster
        evs = np.zeros((self.__envrasters[0].getData().size, len(self.__envrasters)))
    SD_evs = np.zeros(len(self.__envrasters))
    AVG_evs = np.zeros(len(self.__envrasters))
    for i in range(len(self.__envrasters)):
        if conf.NAIVE:
            evs[:, i] = self.__envrasters[i].getData().T
        msr_levels.append(self.__envrasters[i].getMsrLevel())
        SD_evs[i] = self.__envrasters[i].std
        AVG_evs[i] = self.__envrasters[i].mean

    NROWS = np.shape(X)[0]
    REVS = self.__envrasters[0].getData().size
    SUM_DIF_SQ_AVG = REVS * SD_evs**2

    samples_evs = util.extractCovariatesAtPoints(self.__envrasters, self.__soilsamples)
    samples_evs = np.array(samples_evs).T

    if not parallel:
        # NOTE(review): the serial path calls the class-level methods
        # __simLoc2SamplesV0 / __simLoc2Samples, which are defined outside
        # this function -- confirm they exist with these signatures.
        sim = np.zeros((NROWS, self.__soilsamples.size))
        for i in range(NROWS):
            if conf.NAIVE: ## naive implementaton
                sim[i,:] = self.__simLoc2SamplesV0(X[i], evs, SD_evs)
            else: ## with optimizations
                sim[i,:] = self.__simLoc2Samples(X[i], samples_evs, REVS, SD_evs, AVG_evs, SUM_DIF_SQ_AVG)
        return sim

    # Every location receives the same data package, so build it once and
    # repeat the reference instead of rebuilding it for each row.
    if conf.NAIVE: ## naive implementaton
        pkg = [samples_evs, evs, SD_evs]
        worker = simLoc2SamplesV0
    else:
        # Guiming 3/31/2019
        pkg = [samples_evs, REVS, SD_evs, AVG_evs, SUM_DIF_SQ_AVG, msr_levels]
        worker = simLoc2Samples
    datapkg = [pkg] * NROWS

    pool = Pool(nprocess)
    try:
        t0 = time.time()
        sim = np.array(pool.map(worker, X, datapkg))
        conf.TIME_KEEPING_DICT['parts']['compute'].append(time.time()-t0)
    finally:
        # release the pool even when a worker raises, so that a failed run
        # does not leave it behind for the next call
        pool.clear()
    return sim
def compute_ota_relative_coordinates( self ):
    """
    Compute and store the relative start and stop coordinates of the ORFs
    for the entries of the ORFTranscriptAsso table.

    The entries to process are queried from the database, split into data
    frames of at most Constants.MAX_ENTRIES_PER_DATAFRAME rows, and each
    subset is handed to self.compute_relative_coord_r in a pool of
    self.thread_nb processes. The CSV files named after
    self.OTA_CSV_FILE_PREFIX are then read back from
    ComputeRelCoordStrategy.RELATIVE_COORD_CSV_FOLDER and the
    rel_start_pos / rel_stop_pos attributes of the matching
    ORFTranscriptAsso entries are updated and committed.

    Unless self.force_overwrite is set, only the entries for which one of
    the relative coordinates is NULL are processed.
    """

    Logger.get_instance().info( 'Starting the computation of relative ORF start and stop coordinates' +
                                ' (registered in the ORFTranscriptAsso table).')

    # Get information related to the ORF
    # Query the database in order to get, for each unique entry of the ORFTranscriptAsso table:
    # - Its unique ID in the database
    # - The ID of its ORF-related entry, as well as the chromosome,
    #   start and stop positions of the ORF
    # NB: Query is performed using raw SQL statement for better efficiency
    orf_info_sql_statement = 'SELECT ORFTranscriptAsso.id, ORFTranscriptAsso.orf_id,\
                                     ORF.chromosome, ORF.start_pos, ORF.stop_pos AS end_pos \
                              FROM ORF \
                              INNER JOIN ORFTranscriptAsso ON ORFTranscriptAsso.orf_id = ORF.id'
    if ( not self.force_overwrite ):
        orf_info_sql_statement += ' WHERE ( ORFTranscriptAsso.rel_start_pos IS NULL ) \
                                          OR ( ORFTranscriptAsso.rel_stop_pos IS NULL)'
    orf_info_df = pd.read_sql( orf_info_sql_statement, SQLManagerPRO.get_instance().get_engine() )
    SQLManagerPRO.get_instance().close_session()

    # Get information related to the transcript
    # Query the database in order to get, for each unique entry of the ORFTranscriptAsso table:
    # - Its unique ID in the database
    # - The ID of its Transcript-related entry
    # NB: All "UNKNOWN_TRANSCRIPT" entries are excluded as an official ID is needed to perform
    #     the conversion.
    # NB: Query is performed using raw SQL statement for better efficiency
    transcript_info_sql_statement = "SELECT ORFTranscriptAsso.id, ORFTranscriptAsso.transcript_id, \
                                            Transcript.transcript_id AS tr_id \
                                     FROM Transcript \
                                     INNER JOIN ORFTranscriptAsso ON ORFTranscriptAsso.transcript_id = Transcript.id \
                                     WHERE Transcript.transcript_id != '" + Constants.UNKNOWN_TRANSCRIPT + "'"
    transcript_info_df = pd.read_sql( transcript_info_sql_statement, SQLManagerPRO.get_instance().get_engine() )
    SQLManagerPRO.get_instance().close_session()

    # Merge information from the two data frames in order to get
    # a data frame with the following columns:
    # - id: The ORFTranscriptAsso unique ID
    # - orf_id: The ORF unique ID
    # - chromosome: The ORF chromosome name
    # - start_pos: The ORF start position
    # - end_pos: The ORF stop position
    # - transcript_id: The Transcript unique ID
    # - tr_id: The transcript official ID (e.g. Ensembl ID)
    ota_info_df = orf_info_df.merge( transcript_info_df,
                                     on='id',
                                     how = 'inner',
                                     validate = 'one_to_one' )

    Logger.get_instance().debug( 'ComputeRelCoordStrategy.compute_ota_relative_coordinates(): ' +
                                 str( ota_info_df.shape[0] ) + ' ORFTranscriptAsso entries are' +
                                 ' expected to be processed.')

    # As the conversion of coordinates in R may be highly time-consuming,
    # split the data frame into small data frames and multi-process the
    # computation
    # Split the data frame into smaller data frames that can be processed
    # independently from each other
    subset_data_frames = [ ota_info_df[ min_bound : min_bound + Constants.MAX_ENTRIES_PER_DATAFRAME ] \
                           for min_bound in xrange( 0, ota_info_df.shape[ 0 ], Constants.MAX_ENTRIES_PER_DATAFRAME ) ]

    # For each of the subset data frame, process it with R in order
    # to build a dataset containing the start and stop relative
    # coordinates.
    # Instantiate the list of tuple-embedded arguments necessary to
    # compute the relative coordinates; the position of each subset in
    # the list is used as the suffix of the CSV file it produces
    filename_prefix = self.OTA_CSV_FILE_PREFIX
    args_to_run_r = [ ( df, self.species, self.ensembl_release_version, filename_prefix, file_nb )
                      for ( file_nb, df ) in enumerate( subset_data_frames ) ]
    file_count = len( args_to_run_r )

    # Instantiate the pool of processes
    p = Pool( self.thread_nb )
    try:
        messages_to_log = p.map( self.compute_relative_coord_r, args_to_run_r )
    finally:
        # Wait for all processes to be completed and delete the pool
        # instance. This is done right after the computation, and even if
        # it failed, so that an error raised later on (CSV parsing,
        # database update) cannot leave the pool behind.
        p.close()
        p.join()
        p.clear()

    # Log the messages generated by the processes
    for messages in messages_to_log:
        ( debug_messages_to_log, stdout, stderr ) = messages
        for message in debug_messages_to_log:
            Logger.get_instance().debug( message )
        if ( stdout != '' ):
            Logger.get_instance().debug( 'ComputeRelCoordStrategy.compute_relative_coord_r():' +
                                         ' The R script returned the following standard output: \n' +
                                         stdout )
        # NB: As the R function is susceptible to write not error-related
        #     messages in stderr, these messages are also logged at the
        #     debug level
        if ( stderr != '' ):
            Logger.get_instance().debug( 'ComputeRelCoordStrategy.compute_relative_coord_r():' +
                                         ' The R script returned the following error output: \n' +
                                         stderr )

    # Sequentially open CSV files to get the relative positions
    # Instantiate a dictionary that associate to the ORFTranscriptAsso ID
    # the relative start and stop positions of the ORF
    rel_positions_dict = {}
    for file_nb in range( file_count ):
        csv_path = os.path.join( ComputeRelCoordStrategy.RELATIVE_COORD_CSV_FOLDER,
                                 filename_prefix + str( file_nb ) + '.csv' )
        df = pd.read_csv( csv_path, sep = ',', encoding = 'utf-8' )
        for ( _, row ) in df.iterrows():
            rel_positions_dict[ row[ 'id' ] ] = ( row[ 'rel_start_pos' ], row[ 'rel_end_pos' ] )

    # Add the relative start and stop positions for all the ORFTranscriptAsso entries
    all_ota = SQLManagerPRO.get_instance().get_session().query(
                  ORFTranscriptAsso ).filter(
                      ORFTranscriptAsso.id.in_( rel_positions_dict.keys() ) ).all()
    for ota in all_ota:
        # Get the start and stop positions
        ( rel_start_pos, rel_stop_pos ) = rel_positions_dict.get( ota.id )

        # Missing values are left untouched in the database
        if not pd.isna( rel_start_pos ):
            ota.rel_start_pos = int( rel_start_pos )
        if not pd.isna( rel_stop_pos ):
            ota.rel_stop_pos = int( rel_stop_pos )

    # Commit the updates and close the session
    SQLManagerPRO.get_instance().commit()
    SQLManagerPRO.get_instance().close_session()
class RPKI_File:
    """Dumps a table to a gzipped CSV in /tmp and serves it over HTTP.

    Used as a context manager: entering starts a background process that
    serves the directory holding the file, exiting stops that process and
    removes the gzipped file.
    """

    __slots__ = ["path", "total_lines", "_process"]

    _dir = "/tmp/"
    hosted_name = "upo_csv_path.csv.gz"
    port = 8000

    def __init__(self, table_input):
        """Fills the unique prefix origins table, copies it to a csv, gzips it

        The line count of the uncompressed csv is kept in total_lines."""

        csv_name = self.hosted_name.replace(".gz", "")
        self.path = self._dir + csv_name
        with Unique_Prefix_Origins_Table(clear=True) as _db:
            _db.fill_table(table_input)
            _db.copy_table(self.path)
            self.total_lines = utils.get_lines_in_file(self.path)
        self._gzip_file()

#################################
### Context Manager Functions ###
#################################

    def __enter__(self):
        """Starts the file serving process and returns this instance"""

        self.spawn_process()
        return self

    def __exit__(self, type, value, traceback):
        """Stops the file serving process and cleans up the file"""

        self.close()

############################
### Serve File Functions ###
############################

    def spawn_process(self):
        """Frees the port, then serves the file from a pool process"""

        utils.kill_port(self.port)
        serving_pool = ProcessingPool()
        self._process = serving_pool
        # asynchronous call: the server loop runs in a worker of the pool
        serving_pool.apipe(self._serve_file)
        logging.debug("Served RPKI File")

    def close(self):
        """Frees the port, shuts the pool down and deletes the gzipped file"""

        utils.kill_port(self.port, wait=False)
        serving_pool = self._process
        serving_pool.close()
        serving_pool.terminate()
        serving_pool.join()
        serving_pool.clear()
        # changed to absolute path
        hosted_path = os.path.join(self._dir, self.hosted_name)
        utils.delete_paths(hosted_path)
        logging.debug("Closed RPKI File")

########################
### Helper Functions ###
########################

    def _gzip_file(self):
        """Writes a gzipped copy of the csv, then deletes the plain csv"""

        hosted_path = os.path.join(self._dir, self.hosted_name)
        with open(self.path, 'rb') as plain_csv:
            with gzip.open(hosted_path, 'wb') as gzipped_csv:
                for line in plain_csv:
                    gzipped_csv.write(line)
        utils.delete_paths(self.path)

    def _serve_file(self):
        """Serves the contents of _dir with a simple http server, forever"""

        class Handler(http.server.SimpleHTTPRequestHandler):
            def __init__(self, *args, **kwargs):
                super().__init__(*args, **kwargs)

        # The handler serves the current working directory, so move to _dir
        os.chdir(self._dir)
        # Serve the file on the class level port
        server = socketserver.TCPServer(("", RPKI_File.port), Handler)
        server.serve_forever()
def make_patches(data_root, patches_root, patch_size, outline_filled=None, remove_filled=False,
                 min_widths=('def',), mirror=True, rotations=(0,), translations=((0, 0),),
                 distinguishability_threshold=.5, num_workers=0, random_samples=None,
                 leave_width_percentile=None):
    """Split every SVG found under `data_root` into patches saved under `patches_root`.

    Each image is processed once for every combination of minimal width,
    orientation, rotation and translation; each non-empty patch is simplified
    and written to a path that encodes that combination.

    :param data_root: directory searched recursively for ``*.svg`` files
    :param patches_root: directory the patches are written to
    :param patch_size: pair forwarded to ``split_to_patches`` and used in the output path
    :param outline_filled: if not None, forwarded to ``leave_only_contours``
    :param remove_filled: if True, ``remove_filled`` is called on every image
    :param min_widths: widths to scale to; ``'def'`` keeps the image unscaled
    :param mirror: if True, a mirrored orientation is processed as well
    :param rotations: rotation values forwarded to ``rotated``
    :param translations: translation pairs forwarded to ``translated``
    :param distinguishability_threshold: forwarded to ``simplify_segments``
    :param num_workers: 0 runs everything in this process; -1 uses all cores but one;
        any other value is the number of worker processes
    :param random_samples: if not None, at most this many randomly chosen values of each
        augmentation parameter are used, redrawn for every file
    :param leave_width_percentile: if not None, forwarded to ``leave_width_percentile``
    """

    def path(basename, origin, width='def', ori='def', rot=0, t=(0, 0)):
        """Build the output path of the patch at `origin` for one augmentation combination."""
        return os.path.join(patches_root, basename, '{}x{}'.format(*patch_size),
                            'width_{}'.format(width), 'orientation_{}'.format(ori),
                            'rotated_deg_{}'.format(rot), 'translated_{}_{}'.format(*t),
                            '{}_{}.svg'.format(*origin))

    def pick_random(options):
        """Draw up to `random_samples` distinct entries of `options`.

        Indices are sampled rather than values, so the entries keep their
        original types (numpy would turn a mix of 'def' and numbers into
        strings) and plain tuples or lists can be sampled too.
        """
        count = min(random_samples, len(options))
        chosen = np.random.choice(len(options), size=count, replace=False)
        return [options[i] for i in chosen]

    orientations = ['def']
    if mirror:
        orientations.append('mir')

    if random_samples is not None:
        # keep the full sets of options: the loop variables are redrawn for every file
        min_widths_all = min_widths
        orientations_all = orientations
        rotations_all = rotations
        translations_all = translations

    workers_pool = 0  # value handed to split_to_patches when running without a pool
    process_pool = None
    savers_pool = None
    saving_patches_in_bg = None

    try:
        if num_workers != 0:
            from pathos.multiprocessing import cpu_count, ProcessingPool
            from pathos.threading import ThreadPool

            # never ask for fewer than one worker, e.g. on a single-core machine
            worker_count = max(1, cpu_count() - 1) if num_workers == -1 else num_workers
            process_pool = ProcessingPool(worker_count)
            workers_pool = process_pool
            print(f'Workers pool: {workers_pool}')
            savers_pool = ThreadPool(1)
            # empty job, so that the first "wait for previous batch" has something to wait on
            saving_patches_in_bg = savers_pool.amap(lambda a: None, [])

        source_images = glob(os.path.join(data_root, '**', '*.svg'), recursive=True)
        for file in source_images:
            print('Processing file {}'.format(file))
            # path relative to data_root without the extension; unlike slicing by
            # len(data_root) this also works when data_root ends with a separator
            basename = os.path.splitext(os.path.relpath(file, data_root))[0]

            vector_image = VectorImage.from_svg(file)
            if remove_filled:
                vector_image.remove_filled()
            if outline_filled is not None:
                vector_image.leave_only_contours(outline_filled)
            if leave_width_percentile is not None:
                vector_image.leave_width_percentile(leave_width_percentile)

            if random_samples is not None:
                min_widths = pick_random(min_widths_all)
                orientations = pick_random(orientations_all)
                rotations = pick_random(rotations_all)
                translations = pick_random(translations_all)

            for width in min_widths:
                print('\twidth {}'.format(width))
                if width == 'def':
                    vector_image_scaled = vector_image
                else:
                    vector_image_scaled = vector_image.copy()
                    vector_image_scaled.scale_to_width('min', width)

                for orientation in orientations:
                    print('\t\torientation {}'.format(orientation))
                    if orientation == 'def':
                        vector_image_reoriented = vector_image_scaled
                    else:
                        vector_image_reoriented = vector_image_scaled.mirrored()

                    for rotation in rotations:
                        print('\t\t\trotation {}'.format(rotation))
                        vector_image_rotated = vector_image_reoriented.rotated(rotation, adjust_view=True)

                        for translation in translations:
                            print('\t\t\t\ttranslation {}'.format(translation))
                            vector_image_translated = vector_image_rotated.translated(translation,
                                                                                      adjust_view=True)
                            vector_patches = vector_image_translated.split_to_patches(patch_size,
                                                                                      workers=workers_pool)

                            if num_workers != 0:
                                print('\t\t\t\t\twaiting for previous batch to be saved')
                                saving_patches_in_bg.get()

                            # the defaults freeze the current loop values, because the
                            # function may run in the background after the loop has moved on
                            def simplify_and_save(vector_patch, basename=basename, width=width,
                                                  orientation=orientation, rotation=rotation,
                                                  translation=translation):
                                vector_patch.simplify_segments(
                                    distinguishability_threshold=distinguishability_threshold)
                                if len(vector_patch.paths) == 0:
                                    return
                                save_path = path(basename,
                                                 (int(vector_patch.x.as_pixels()),
                                                  int(vector_patch.y.as_pixels())),
                                                 width, orientation, rotation, translation)
                                os.makedirs(os.path.dirname(save_path), exist_ok=True)
                                vector_patch.save(save_path)

                            print('\t\t\t\t\tsaving patches')
                            if num_workers == 0:
                                for vector_path in vector_patches.reshape(-1):
                                    simplify_and_save(vector_path)
                            else:
                                saving_patches_in_bg = savers_pool.amap(simplify_and_save,
                                                                        vector_patches.reshape(-1))

        if saving_patches_in_bg is not None:
            # collect the last batch as well, otherwise an error raised while
            # saving it would never be reported
            saving_patches_in_bg.get()
    finally:
        # shut the pools down even if processing failed
        if process_pool is not None:
            process_pool.close()
            process_pool.join()
            process_pool.clear()
        if savers_pool is not None:
            savers_pool.close()
            savers_pool.join()
            savers_pool.clear()
def build_full_hamiltonian_parallel1(clustered_ham_in, ci_vector_in, iprint=1, nproc=None, opt_einsum=True):
    """
    Build hamiltonian in basis in ci_vector

    parallelized over fock space blocks -- inefficient

    Every pair of fock space blocks (left index <= right index) is handed to
    a process pool; the resulting sub-matrices are then copied into the full
    matrix, together with their transpose for the off-diagonal pairs.

    clustered_ham_in: hamiltonian object; its `clusters` and `terms`
                      attributes are used
    ci_vector_in:     basis; iterated through `fblocks()` and `data`, and
                      indexed by fock space block
    iprint:           not used by this function
    nproc:            number of processes; None lets the pool choose
    opt_einsum:       not used by this function

    Returns the matrix as a dense square numpy array of dimension
    len(ci_vector_in).

    Raises RuntimeError when an operator of a term is not defined for its
    cluster.
    """
    # {{{
    # The inputs are published as module globals, which is how the block
    # worker below reads them.
    global clusters
    global ci_vector
    global clustered_ham

    print(" In build_full_hamiltonian_parallel1. nproc=", nproc)

    clustered_ham = clustered_ham_in
    ci_vector = ci_vector_in
    clusters = clustered_ham_in.clusters

    H = np.zeros((len(ci_vector), len(ci_vector)))
    n_clusters = len(clusters)

    def compute_parallel_block(f):
        """Return the sub-matrix of one (left, right) fock block pair, or None if it has no terms."""
        fock_l, fock_li, fock_r, fock_ri = f
        if fock_li > fock_ri:
            return None
        diagonal = fock_l == fock_r

        configs_l = ci_vector[fock_l]
        configs_r = ci_vector[fock_r]
        Hblock = np.zeros((len(configs_l), len(configs_r)))

        delta_fock = tuple([(fock_l[ci][0] - fock_r[ci][0], fock_l[ci][1] - fock_r[ci][1])
                            for ci in range(len(clusters))])
        try:
            terms = clustered_ham.terms[delta_fock]
        except KeyError:
            return None

        for term in terms:
            # Check to make sure each cluster is allowed to make the requested transition
            term_exists = True
            for oi, o in enumerate(term.ops):
                if o == '':
                    continue
                try:
                    do = clusters[oi].ops[o]
                except Exception as err:
                    # Raise instead of calling exit(): SystemExit inside a pool
                    # worker ends the worker process without reporting the
                    # failure to the caller.
                    raise RuntimeError(" Couldn't find: {}".format(term)) from err
                try:
                    do[(fock_l[oi], fock_r[oi])]
                except Exception:
                    term_exists = False
            if not term_exists:
                continue

            for idx_l, config_l in enumerate(configs_l):
                for idx_r, config_r in enumerate(configs_r):
                    # diagonal blocks are symmetric: compute the upper triangle only
                    if diagonal and idx_r < idx_l:
                        continue

                    # Check to make sure each cluster is diagonal except if active
                    if any(config_l[ci] != config_r[ci] and ci not in term.active
                           for ci in range(n_clusters)):
                        continue

                    # the value returned by matrix_element is accumulated as is
                    me = term.matrix_element(fock_l, config_l, fock_r, config_r)
                    Hblock[idx_l, idx_r] += me
                    if diagonal and idx_r > idx_l:
                        Hblock[idx_r, idx_l] += me
        return Hblock

    # offset of every fock space block along the dimension of H
    fock_space_shifts = [0]
    for f in ci_vector.fblocks():
        fock_space_shifts.append(fock_space_shifts[-1] + len(ci_vector[f]))

    # every pair of blocks with left index <= right index
    fock_space_blocks = [(fock_l, fock_li, fock_r, fock_ri)
                         for fock_li, fock_l in enumerate(ci_vector.data)
                         for fock_ri, fock_r in enumerate(ci_vector.data)
                         if fock_li <= fock_ri]

    from pathos.multiprocessing import ProcessingPool as Pool

    pool = Pool() if nproc is None else Pool(processes=nproc)
    try:
        Hblocks = pool.map(compute_parallel_block, fock_space_blocks)
    finally:
        # release the workers even when a block fails
        pool.close()
        pool.join()
        pool.clear()

    for (fock_l, fock_li, fock_r, fock_ri), Hblock in zip(fock_space_blocks, Hblocks):
        if Hblock is None:
            # no hamiltonian term connects these two blocks
            continue
        start_l = fock_space_shifts[fock_li]
        stop_l = fock_space_shifts[fock_li + 1]
        start_r = fock_space_shifts[fock_ri]
        stop_r = fock_space_shifts[fock_ri + 1]

        H[start_l:stop_l, start_r:stop_r] = Hblock
        if fock_l != fock_r:
            # only one of the two off-diagonal blocks was computed
            H[start_r:stop_r, start_l:stop_l] = Hblock.T

    return H