Code example #1
    def multi_word_cut(self, sentences):
        print('Multiprocessing Word cut ')
        if self.language == 'ch':
            jieba.initialize(
            )  # initialize first, or it will initialize in each process
            jieba.disable_parallel()

            def func(line):
                line = [i.strip() for i in jieba.cut(line, cut_all=False)]
                return [
                    i for i in line
                    if ((not i.isdigit()) and (i not in self.stop_words))
                ]
        else:

            def func(line):
                return [i.lower() for i in line.split(" ") if ((not i.isdigit()) and \
                                                       (i not in self.stop_words) and \
                                                       (len(i) >1 ) )]

        pool = Pool(nodes=5)
        t0 = time.time()
        word_cut = pool.map(func, sentences)
        pool.close()
        pool.join()
        pool.clear()
        print('MultiProcess  time {:.0f}'.format(time.time() - t0))
        return word_cut
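Most of the snippets on this page tear the pool down the same way: close() to stop accepting work, join() to wait for the workers, then clear() to drop the pool object that pathos caches internally (see https://github.com/uqfoundation/pathos/issues/111). A minimal, self-contained sketch of that pattern, independent of any of the projects listed here:

from pathos.multiprocessing import ProcessingPool as Pool

def square(x):
    return x * x

if __name__ == "__main__":
    pool = Pool(nodes=4)      # spawn 4 worker processes
    results = pool.map(square, range(10))
    pool.close()              # no more tasks will be submitted
    pool.join()               # wait for the workers to finish
    pool.clear()              # drop the cached pool so the next Pool() starts clean
    print(results)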
Code example #2
def GroupByParallelProcess(tweetsDF, cores, groupMethod):
    """
    Group by and aggregate on time via a parallel process
    """

    tweetsDF.label_date = tweetsDF.label_date.astype(int)
    tweetsDF = tweetsDF.set_index("label_date")
    # Parallelizing using Pool.apply()
    df_split = GetListOfSplitDFs(tweetsDF, cores)
    # create the multiprocessing pool
    pool = Pool(cores)
    # process the DataFrame by mapping function to each df across the pool
    logging.info("Starting the grouping and aggregating process.")
    if groupMethod == "weighted-average":
        df_out = pool.map(PerformGroupbyAndAggregate, df_split)
    elif groupMethod == "sum":
        df_out = pool.map(PerformSum, df_split)
    elif groupMethod == "mean":
        df_out = pool.map(PerformMean, df_split)
    else:
        logging.error("Choose correct group by method.")
        return None

    # close down the pool and join
    pool.close()
    pool.join()
    pool.clear()

    logging.info("Ended the grouping and aggregating process.")

    return df_out
Code example #3
File: helper_funcs.py Project: jfuruness/lib_utils
@contextmanager  # from contextlib; lets this generator be used in a ``with`` block
def Pool(cpus=cpu_count()) -> ProcessingPool:
    """Context manager for pathos ProcessingPool"""

    # Creates a pool with processes
    p = ProcessingPool(cpus)
    yield p
    # Need to clear due to:
    # https://github.com/uqfoundation/pathos/issues/111
    p.close()
    p.join()
    p.clear()
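A brief usage sketch for the wrapper above; it assumes the generator is wrapped with contextlib.contextmanager (as its docstring implies) so that it can drive a with-block, and the workload here is made up for illustration:

# hypothetical caller of the Pool() context manager defined above
with Pool(cpus=4) as p:
    doubled = p.map(lambda x: 2 * x, range(8))
# by the time the with-block exits, close(), join() and clear() have run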
Code example #4
def parallelize_dataframe(df,
                          func,
                          num_partitions=num_cores,
                          num_cores=num_cores):
    df_split = np.array_split(df, num_partitions, axis=0)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    pool.clear()
    return df
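A short usage sketch for parallelize_dataframe; add_total is a made-up per-chunk function, and both pool sizes are passed explicitly so the sketch does not rely on the module-level num_cores default:

import numpy as np
import pandas as pd

def add_total(chunk):
    # hypothetical per-chunk transformation applied in each worker
    chunk = chunk.copy()
    chunk["total"] = chunk.sum(axis=1, numeric_only=True)
    return chunk

df = pd.DataFrame(np.random.rand(1000, 3), columns=list("abc"))
df = parallelize_dataframe(df, add_total, num_partitions=4, num_cores=4)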
Code example #5
File: utils.py Project: jfuruness/lib_bgp_data
@contextmanager
def Pool(threads: int, multiplier: int, name: str):
    """Context manager for pathos ProcessingPool"""

    # Creates a pool with threads else cpu_count * multiplier
    p = ProcessingPool(threads if threads else cpu_count() * multiplier)
    logging.debug(f"Created {name} pool")
    yield p
    # Need to clear due to:
    # https://github.com/uqfoundation/pathos/issues/111
    p.close()
    p.join()
    p.clear()
Code example #6
File: multiprocessing.py Project: notsoprocoder/mupa
 def parallel_apply(self, df, func):
     # add try statement re function not returning a DataFrame
     if self.preprocessing_checks(df, func):
         # split DataFrame into a list of smaller DataFrames
         self.df_split = np.array_split(df, self.partitions, axis=0)
         # create the multiprocessing pool
         pool = Pool(self.cores)
         # process the DataFrame by mapping function to each df across the pool
         df = pd.concat(pool.map(func, self.df_split), axis=0).copy()
         # close down the pool and join
         pool.close()
         pool.join()
         pool.clear()
         return df
Code example #7
    def make_query(self, size=1):

        ## quit if nr_unlabeled_samples = 1
        if self.dataset.len_unlabeled() == 1:
            return self.dataset.get_unlabeled_entries()[0].astype(int)

        ## Set the possible labels
        self.possible_labels = list(set(self.dataset.get_labeled_entries()[1]))

        ## Train the model
        self.model.train(self.dataset)

        ## Get probabilities
        X_ids, X = self.dataset.get_unlabeled_entries()
        pred = self.model.predict_proba(
            X)  # pred.shape = (n_unlabeled, nr_of_labels)

        ## Setup pool for cpu parallelisation
        p = Pool(cpu_count(), maxtasksperchild=1000)

        ## nr of unlabeled samples -> len(X)

        ## Get uncertainty after adding every sample with every label
        total = np.asarray(
            p.map(self._eer, X_ids,
                  len(X) * [self.dataset],
                  len(X) * [self.depth]))
        # total.shape = (n_unlabeled, nr_of_labels)

        ## Close the Pool again
        p.close()
        p.join()
        p.clear()

        ## Get the total uncertainty of one sample after adding a label weighted by the labels probability
        total = np.inner(
            pred,
            total,
        ).diagonal()  # total.shape = (n_unlabeled,)

        ## Zip it
        total = zipit(X_ids, total)

        ## Sort it
        results = sort_by_2nd(total, 'min')

        return results[:size, 0].astype(int)
Code example #8
    def start(self, text_data_dir, res_dir, nprocs=8):
        '''
        entry function

        text_data_dir: folder of raw data
        res_dir: folder of output
        nprocs: number of cores in parallel
        '''
        p = PathosPool(nprocs)

        filepathsvec, filenamesvec, respaths = list(), list(), list()
        for dirpath, _, filenames in os.walk(text_data_dir):
            for filename in filenames:
                if (("gz" in filename) and ('md5' not in filename)
                        and ('copy' not in filename)):
                    filepath = os.path.join(dirpath, filename)
                    print(filepath)
                    res_name = filename.split(".")[0] + ".csv.gz"
                    respath = os.path.join(res_dir, res_name)
                    #if os.path.exists(respath):
                    # pass
                    #else:
                    if True:
                        filepathsvec.append(filepath)
                        filenamesvec.append(filename)
                        respaths.append(respath)
                        #p.apply_async(process_data, args = (filepath,filename,
                        # respath, True,
                        # [title_stop_path,
                        #  affil_stop_path,
                        #  mesh_stop_path]))
        self.affildicts = p.amap(
            partial(self.process_data,
                    stop_paths=[
                        self.title_stop_path, self.affil_stop_path,
                        self.mesh_stop_path
                    ],
                    rm_stopwords=True,
                    affiliation_correction=True,
                    select_journals=self.select_journals), filepathsvec,
            filenamesvec, respaths)

        p.close()
        p.join()  # Having an issue joining
        print("joined")
        p.clear()  # Delete the pool
Code example #9
    def run(
            self,
            percent_attackers_list=[x / 100 for x in range(1, 92, 5)],
            managers=Manager.paper_managers,
            attackers=Attacker.paper_attackers,
            # Note that for range, last number is not included
            num_buckets=1,
            # Note that this is the users per bucket, not total users
            users_per_bucket=10,
            num_rounds=2,
            trials=2):
        """Runs in parallel every possible scenario

        Looks complicated, but no real way to simplify it
        so deal with it"""

        p = ProcessingPool(nodes=cpu_count())
        full_args = [[percent_attackers_list] * trials, [attackers] * trials,
                     [num_buckets] * trials, [users_per_bucket] * trials,
                     [num_rounds] * trials, [managers] * trials,
                     list(range(trials)), [trials] * trials]

        # If we are debugging, no multiprocessing
        # https://stackoverflow.com/a/1987484/8903959
        # https://stackoverflow.com/a/58866220/8903959
        if self.debug or "PYTEST_CURRENT_TEST" in os.environ:
            results = []
            for trial_num in range(trials):
                args = [x[trial_num] for x in full_args]
                results.append(self.get_combo_data(*args))
        else:
            # Doesn't make sense to do tqdm here since they finish all at once
            results = p.map(self.get_combo_data, *full_args)
            p.close()
            p.join()
            p.clear()
        # Get rid of carriage returns
        print()
        return self._aggregate_results(results, managers, attackers,
                                       percent_attackers_list)
Code example #10
File: pathos_pool.py Project: CGCL-codes/naturalcc
class PPool:
    """pathos multi-processing pool"""

    def __init__(self, processor_num: int = None, ):
        self.processor_num = cpu_count() if processor_num is None \
            else min(processor_num, cpu_count())
        LOGGER.debug('Building Pathos multi-processing pool with {} cores.'.format(self.processor_num))
        self._pool = Pool(self.processor_num)

    def flatten_params(self, params: List):
        """params: List[*args, **kwargs]"""
        # block_size = int(math.ceil(len(params) / self.processor_num))
        # block_num = int(math.ceil(len(params) / block_size))
        block_size = (len(params) + self.processor_num - 1) // self.processor_num
        block_num = (len(params) + block_size - 1) // block_size
        block_params = [params[i * block_size:(i + 1) * block_size] for i in range(block_num)]
        return block_params

    def close(self):
        self._pool.close()
        self._pool.join()
        self._pool.clear()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def feed(self, func: Any, params: List, one_params: bool = False) -> List[Any]:
        if one_params:
            result = self._pool.amap(func, params).get()
        else:
            params = tuple(zip(*params))
            result = self._pool.amap(func, *params).get()
        return result
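A usage sketch for PPool, assuming the module context above (LOGGER, Pool, cpu_count); add_pair and the parameter list are made up to show how feed() zips positional arguments across workers:

def add_pair(a, b):
    # hypothetical worker taking two positional arguments
    return a + b

params = [(i, i * 10) for i in range(100)]   # list of *args tuples
with PPool(processor_num=4) as pool:
    sums = pool.feed(add_pair, params)       # zip(*params) feeds the two columns to amap
# __exit__ has already called close(), join() and clear()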
Code example #11
    def run(self,
            ddos_sim_cls_list=None,
            managers=Manager.runnable_managers,
            attackers=Attacker.runnable_attackers,
            # Note that for range, last number is not included
            num_buckets_list=[1],
            # Note that this is the users per bucket, not total users
            users_per_bucket_list=[10 ** i for i in range(4, 6)],
            num_rounds_list=[10 ** i for i in range(3, 5)],
            trials=10):
        """Runs in parallel every possible scenario

        Looks complicated, but no real way to simplify it
        so deal with it"""

        if ddos_sim_cls_list is None:
            ddos_sim_cls_list =\
                [ddos_simulator.DDOS_Simulator.runnable_simulators[0]]

        # Initializes graph path
        self.make_graph_dir(destroy=True)

        # Total number of scenarios
        pbar_total = (len(ddos_sim_cls_list) *
                      len(num_buckets_list) *
                      len(users_per_bucket_list) *
                      len(num_rounds_list) *
                      (len(attackers) + 1))  # Add 1 to attacker for worst case

        _pathos_simulators_list = []
        _pathos_num_buckets_list = []
        _pathos_users_per_bucket = []
        _pathos_num_rounds = []
        for num_buckets in num_buckets_list:
            for users_per_bucket in users_per_bucket_list:
                for num_rounds in num_rounds_list:
                    for attacker in attackers + [Worst_Case_Attacker]:
                        for sim_cls in ddos_sim_cls_list:
                            self.get_attacker_graph_dir(attacker)

                        _pathos_simulators_list.append(sim_cls)
                        _pathos_num_buckets_list.append(num_buckets)
                        _pathos_users_per_bucket.append(users_per_bucket)
                        _pathos_num_rounds.append(num_rounds)

        p = ProcessingPool(nodes=cpu_count())
        total = len(_pathos_num_rounds)
        full_args = [_pathos_simulators_list,
                     [attackers] * total,
                     _pathos_num_buckets_list,
                     _pathos_users_per_bucket,
                     _pathos_num_rounds,
                     [managers] * total,
                     [trials] * total,
                     list(range(total)),
                     list([pbar_total] * total)]

        # If we are debugging, no multiprocessing
        # https://stackoverflow.com/a/1987484/8903959
        if (self.stream_level == Log_Levels.DEBUG
            # https://stackoverflow.com/a/58866220/8903959
            or "PYTEST_CURRENT_TEST" in os.environ):

            for i in range(total):
                try:
                    current_args = [x[i] for x in full_args]
                    self.get_graph_data(*current_args)
                except Exception as e:
                    from pprint import pprint
                    pprint(current_args)
                    raise e
        else:
            p.map(self.get_graph_data, *full_args)
            p.close()
            p.join()
            p.clear()
        # Get rid of carriage returns
        print()
Code example #12
def create_sigmats_3_scales(dataset,
                            no_sensors_cols,
                            win_size_ls,
                            normalize_each_seq=False,
                            warm_up_time_points=''):
    """Receives a df of the data,
     no_sensors_cols (ls): the columns that do not represent sensors
     win_size_ls (ls): win sizes to produce (each one will be a channel in reverse order)
     warm_up_time_points

    returns a list of representations (sigmat) with n dims (number of channels) for each scale, for each iter
    - X (PADDED TO THE MAX LENGTH) shape = (num of seqs, length of seq, num of sensors/features)
    - y and
    - keys ('drone', 'update_step', 'iter') for later identification """

    # compute y - if one of the records is an anomaly, the whole sequence is classified as an anomaly
    iter_ls = dataset.iter.unique()

    def create_sigmats_of_one_iter(dataset, iteri):
        # get current iter
        dataset_iteri = dataset.loc[dataset['iter'] == iteri, :]
        # get list of update steps
        update_step_ls = dataset_iteri.update_step.to_list()
        step_sig_mat_ls = []

        for update_step in update_step_ls:
            # print('iter: ',iteri,'step: ', update_step)

            win_sig_mat_ls = []
            for win_size in win_size_ls:
                # cut the df by current update step-win size
                current_seq = dataset_iteri.loc[
                    (dataset_iteri['update_step'] <= update_step) &
                    (dataset_iteri['update_step'] > (update_step - win_size))]
                # drop irrelevant cols and convert to numpy
                current_seq = current_seq.drop(no_sensors_cols + ['label'],
                                               1).to_numpy()
                if normalize_each_seq:
                    current_seq = StandardScaler().fit_transform(current_seq)
                # convert to sig mat
                current_seq_sig_mat = seq_to_sig_matrix(current_seq)
                # add to the ls - each element with a different win size
                win_sig_mat_ls.append(current_seq_sig_mat)

            # stack the 3 win sizes (scales) together as channels
            # stacked_mats_different_scale = np.stack(win_sig_mat_ls)
            # add to step ls
            step_sig_mat_ls.append(win_sig_mat_ls)
        # stack all steps
        # stacked_mats_of_iter = np.stack(step_sig_mat_ls)
        # add to iter ls
        iter_sig_mat_np = np.array(step_sig_mat_ls)
        iter_sig_mat_np = np.rollaxis(np.array(iter_sig_mat_np), 1, 4)
        return {
            'sig_mat': iter_sig_mat_np,
            'keys': dataset_iteri[['drone', 'update_step', 'iter']],
            'labels': dataset_iteri.label.to_numpy()
        }

    workers = multiprocessing.cpu_count()
    print('Number of workers: ', workers)
    pool = ProcessingPool(workers)
    list_of_iters_dict = pool.map(
        lambda iter: create_sigmats_of_one_iter(dataset, iter), iter_ls)
    pool.close()
    pool.join()
    pool.terminate()
    pool.clear()

    iters_sig_mat_ls = [
        iter_dict['sig_mat'] for iter_dict in list_of_iters_dict
    ]
    iters_lables_ls = [iter_dict['labels'] for iter_dict in list_of_iters_dict]
    iters_keys_ls = [iter_dict['keys'] for iter_dict in list_of_iters_dict]

    print(
        'shape of first iter X {} shape of first iter labels {} shape keys {}'.
        format(iters_sig_mat_ls[0].shape, iters_lables_ls[0].shape,
               iters_keys_ls[0].shape))

    return iters_sig_mat_ls, iters_lables_ls, iters_keys_ls
Code example #13
File: Thought2Vec.py Project: adairaar/Thought2Vec
    def run(self):
        """
        Main method of the Word2Vec class.
        :return: the final values of the weights W1, W2 and a history of the value of the loss function vs. epoch
        """
        if len(self.corpus) == 0:
            raise ValueError('You need to specify a corpus of text.')

        print("Creating one-hot student answer vectors")
        cores = mp.cpu_count()
        stu_dict = {}
        df_split = np.array_split(self.corpus, cores, axis=0)

        # create the multiprocessing pool
        pool = Pool(cores)

        # process the DataFrame by mapping function to each df across the pool
        df_out = np.vstack(pool.map(self.onehotvecs, df_split))

        # close down the pool and join
        pool.close()
        pool.join()
        pool.clear()

        print("Creating student answer dictionary")
        row_count = 0
        for i in range(0, cores):
            for j in range(0, len(df_out[i][0])):
                stu_dict[row_count] = df_out[i][0][j]
                row_count += 1

        # initialize weight matrices
        print("Initializing weights")
        V = stu_dict[0].shape[1]
        W1, W2 = initialize(V, self.N)

        loss_vs_epoch = []
        loss_low = np.inf

        print("Beginning training")
        for e in trange((self.n_epochs), desc='Epochs'):
            loss = 0.0
            rand_student_order = np.random.choice(self.corpus.shape[0],
                                                  self.corpus.shape[0],
                                                  replace=False)
            # shuffle data without replacement
            for i in tqdm(rand_student_order, desc='Students', leave=False):
                for center, context in self.trainTargetDF(stu_dict[i]):
                    W1, W2, loss = self.method(context, center, W1, W2, loss)
            loss_vs_epoch.append(loss)

            if loss < loss_low:
                loss_low = loss
                W1_best, W2_best = W1, W2

            # Early stopping and returning best result
            if loss > loss_vs_epoch[max(0, e - self.early_stop)]:
                print("Training complete. Loss now increasing.")
                return W1_best, W2_best, loss_vs_epoch

        print("Training complete.")
        return W1, W2, loss_vs_epoch
Code example #14
def build_full_hamiltonian_parallel2(clustered_ham_in,
                                     ci_vector_in,
                                     iprint=1,
                                     nproc=None,
                                     opt_einsum=True,
                                     thresh=1e-14):
    """
    Build hamiltonian in basis in ci_vector

    parallelized over matrix elements
    """
    # {{{
    global clusters
    global ci_vector
    global clustered_ham

    print(" In build_full_hamiltonian_parallel2. nproc=", nproc)

    clustered_ham = clustered_ham_in
    clusters = clustered_ham_in.clusters
    ci_vector = ci_vector_in

    H = np.zeros((len(ci_vector), len(ci_vector)))
    n_clusters = len(clusters)

    def do_parallel_work(v_curr):
        fock_l = v_curr[0]
        conf_l = v_curr[1]
        idx_l = v_curr[2]

        out = []

        idx_r = -1
        for fock_r in ci_vector.fblocks():
            confs_r = ci_vector[fock_r]
            delta_fock = tuple([(fock_l[ci][0] - fock_r[ci][0],
                                 fock_l[ci][1] - fock_r[ci][1])
                                for ci in range(len(clusters))])
            try:
                terms = clustered_ham.terms[delta_fock]

            except KeyError:
                idx_r += len(confs_r)
                continue

            for conf_r in confs_r:
                idx_r += 1

                if idx_l > idx_r:
                    continue

                me = 0
                for term in terms:
                    me += term.matrix_element(fock_l, conf_l, fock_r, conf_r)

                #if abs(me) > thresh:
                out.append((idx_r, me))

        return out
#    def parallel_work(inp):
#        fock_l = inp[0]
#        fock_r = inp[1]
#        conf_l = inp[2]
#        conf_r = inp[3]
#        idx_l  = inp[4]
#        idx_r  = inp[5]
#        out = [idx_l, idx_r, None]
#
#        delta_fock= tuple([(fock_l[ci][0]-fock_r[ci][0], fock_l[ci][1]-fock_r[ci][1]) for ci in range(len(clusters))])
#        try:
#            terms = clustered_ham.terms[delta_fock]
#
#                for config_ri, config_r in enumerate(configs_r):
#                    idx_r = shift_r + config_ri
#                    if idx_r<idx_l:
#                        continue
#
#                    for term in terms:
#                        me = term.matrix_element(fock_l,config_l,fock_r,config_r)
#                        H[idx_l,idx_r] += me
#                        if idx_r>idx_l:
#                            H[idx_r,idx_l] += me
#                        #print(" %4i %4i = %12.8f"%(idx_l,idx_r,me),"  :  ",config_l,config_r, " :: ", term)
#
#        except KeyError:
#            continue

    rows = []
    idx_row = 0
    for fock1, conf1, coeff1 in ci_vector:
        rows.append((fock1, conf1, idx_row))
        idx_row += 1

    import multiprocessing as mp
    from pathos.multiprocessing import ProcessingPool as Pool

    if nproc is None:
        pool = Pool()
    else:
        pool = Pool(processes=nproc)

    Hrows = pool.map(do_parallel_work, rows)

    pool.close()
    pool.join()
    pool.clear()

    for row_idx, row in enumerate(Hrows):
        for col_idx, term in row:
            assert (col_idx >= row_idx)
            H[row_idx, col_idx] = term
            H[col_idx, row_idx] = term

    return H
Code example #15
 def compute_tr_cds_relative_coordinates( self ):
     
     Logger.get_instance().info( 'Starting the computation of relative CDS transcript start and stop' +
                                 ' coordinates (registered in the Transcript table).')
     
     # Get all the transcript for which there are CDS 
     # start and stop positions provided 
     # NB: Query is performed using raw SQL statement for better efficiency
     transcript_info_sql_statement = 'SELECT Transcript.id, Transcript.transcript_id AS tr_id, \
                                             Transcript.gene_id, PROGene.chromosome, \
                                             Transcript.cds_start_pos AS start_pos, \
                                             Transcript.cds_stop_pos AS end_pos \
                                      FROM Transcript \
                                      INNER JOIN PROGene ON PROGene.gene_id = Transcript.gene_id \
                                      WHERE ( Transcript.cds_start_pos IS NOT NULL ) \
                                            AND ( Transcript.cds_stop_pos IS NOT NULL )'
     if ( not self.force_overwrite ):
         transcript_info_sql_statement += ' AND ( ( Transcript.rel_cds_start_pos IS NULL ) \
                                                  OR ( Transcript.rel_cds_stop_pos IS NULL ) )'
                                             
     transcript_info_df = pd.read_sql( transcript_info_sql_statement, SQLManagerPRO.get_instance().get_engine() )
     SQLManagerPRO.get_instance().close_session()
     
     Logger.get_instance().debug( 'ComputeRelCoordStrategy.compute_tr_cds_relative_coordinates(): ' +
                                  str( transcript_info_df.shape[0] ) + ' Transcript entries are' +
                                  ' expected to be processed.')
             
     # As the conversion of coordinates in R may be highly time-consuming,
     # split the data frame into small data frames and multi-process the 
     # computation
     # Split the data frame into smaller data frames that can be processed
     # independently from each other  
     subset_data_frames = [ transcript_info_df[ min_bound : min_bound + Constants.MAX_ENTRIES_PER_DATAFRAME ] \
                            for min_bound in xrange( 0,
                                                     transcript_info_df.shape[ 0 ],
                                                     Constants.MAX_ENTRIES_PER_DATAFRAME ) ]
     
     
     # For each of the subset data frame, process it with R in order
     # to build a dataset containing the start and stop relative
     # coordinates.
     # Instantiate the list of tuple-embedded arguments necessary to
     # compute the relative coordinates
     args_to_run_r = []
     filename_prefix = self.TRANSCRIPT_CSV_FILE_PREFIX
     filename_suffix = 0
     for df in subset_data_frames:
         args_to_run_r.append( ( df,
                                 self.species, 
                                 self.ensembl_release_version, 
                                 filename_prefix,
                                 filename_suffix ) )
         filename_suffix += 1
         
     # Instantiate the pool of processes
     p = Pool( self.thread_nb )
     messages_to_log = p.map( self.compute_relative_coord_r, args_to_run_r )
     p.close()
     # Wait for all processes to be completed
     p.join()
     
     # Log the messages generated by the processes
     for messages in messages_to_log:
         
         ( debug_messages_to_log,
           stdout,
           stderr ) = messages
           
         for message in debug_messages_to_log:
             Logger.get_instance().debug( message )
         
         if ( stdout != '' ):
             Logger.get_instance().debug( 'ComputeRelCoordStrategy.compute_relative_coord_r():' +
                                          ' The R script returned the following standard output: \n' + 
                                          stdout )
         
         # NB: As the R function is susceptible to write not error-related 
         #     messages in stderr, these messages are also logged at the 
         #     debug level
         if ( stderr != '' ):
             Logger.get_instance().debug( 'ComputeRelCoordStrategy.compute_relative_coord_r():' +
                                          ' The R script returned the following error output: \n' + 
                                          stderr )
     
     # Sequentially open CSV files to get the relative positions
     # Instantiate a dictionary that associate to the ORFTranscriptAsso ID
     # the relative start and stop positions of the ORF
     rel_positions_dict = {}
     for file_nb in range( filename_suffix ):
         
         df = pd.read_csv( os.path.join( ComputeRelCoordStrategy.RELATIVE_COORD_CSV_FOLDER,
                                         filename_prefix + str( file_nb ) + '.csv' ),
                           sep = ',',
                           encoding = 'utf-8' )
         
         for ( index, row ) in df.iterrows():
             rel_positions_dict[ row[ 'id' ] ] = ( row[ 'rel_start_pos' ], row[ 'rel_end_pos' ] )
     
     
     # Add the relative start and stop positions for all the ORFTranscriptAsso entries 
     all_transcripts = SQLManagerPRO.get_instance().get_session().query( 
                                                                             Transcript 
                                                                         ).filter( 
                                                                                     Transcript.id.in_( rel_positions_dict.keys() ) 
                                                                                 ).all()
                                                                                 
     for transcript in all_transcripts:
         
         # Get the start and stop positions
         positions = rel_positions_dict.get( transcript.id )
         rel_cds_start_pos = positions[ 0 ] 
         rel_cds_stop_pos = positions[ 1 ] 
         
         if not pd.isna( rel_cds_start_pos ):
             transcript.rel_cds_start_pos = int( rel_cds_start_pos )
         
         if not pd.isna( rel_cds_stop_pos ):
             transcript.rel_cds_stop_pos = int( rel_cds_stop_pos )
     
     # Commit the updates and close the session
     SQLManagerPRO.get_instance().commit()
     SQLManagerPRO.get_instance().close_session()
     
     # Delete the pool instance
     p.clear()
Code example #16
File: gdalwrapper.py Project: Guiming/DL4SDM
class tiledRasterReader:
    '''
    '''
    def __init__(self, srcRasterfile, xoff=0, yoff=0, xsize=None, ysize=None):
        '''
        '''
        #print('Initializing reader...')
        self.srcRasterfile = srcRasterfile
        gdal.SetCacheMax(2**30)  # 1 GB
        self.ds = gdal.Open(self.srcRasterfile, gdalconst.GA_ReadOnly)
        #print('self.ds: ', self.ds)

        if '.vrt' in self.srcRasterfile:
            self.fileList = self.ds.GetFileList()[1:]
            #print('self.fileList: ', self.fileList)
            self.measurement_level_ints = []
            for fn in self.fileList:
                # default level of measurement
                msrlevel = conf.MSR_LEVEL_RATIO
                for keyword in conf.NOMINAL_KEYWORD_IN_FN:
                    if keyword in fn:
                        msrlevel = conf.MSR_LEVEL_NOMINAL
                        break
                for key in conf.MSR_LEVELS:
                    if conf.MSR_LEVELS[key] == msrlevel:
                        self.measurement_level_ints.append(int(key))
                        break
            self.measurement_level_ints = np.array(self.measurement_level_ints)

        self.nbands = self.ds.RasterCount
        self.nrows = self.ds.RasterYSize
        self.ncols = self.ds.RasterXSize
        self.geotransform = self.ds.GetGeoTransform()
        self.projection = self.ds.GetProjection()
        print('%s:\n\t%d rows %d columns' %
              (self.srcRasterfile, self.nrows, self.ncols))

        band = self.ds.GetRasterBand(1)
        self.nodata = band.GetNoDataValue()

        ## each band may have a different nodata value
        nodatas = []
        for b in range(1, self.nbands + 1):
            #print('band %d nodata: %.2f' % (b, self.ds.GetRasterBand(b).GetNoDataValue()))
            nodatas.append(self.ds.GetRasterBand(b).GetNoDataValue())
        self.nodatas = np.array(nodatas)
        '''
        for i in range(1, self.nbands + 1):
            b = self.ds.GetRasterBand(1)
            nd = b.GetNoDataValue()
            print('band %d nd %.2f' % (i, nd))
        '''
        self.block_ysize_base = band.GetBlockSize()[0]
        #print('self.fileList', self.fileList)
        if '.vrt' in self.srcRasterfile:
            self.block_xsize_base = gdal.Open(
                self.fileList[0],
                gdalconst.GA_ReadOnly).GetRasterBand(1).GetBlockSize()[0]
        else:
            #self.block_xsize_base = self.ds.GetRasterBand(1).GetBlockSize()[1]
            self.block_xsize_base = band.GetBlockSize()[1]

        #print('\t%d x %d' % (self.block_xsize_base, self.block_ysize_base))

        self.__N_TilesRead = 0
        self.xoff, self.yoff = xoff, yoff

        if xsize is None:
            self.xsize = self.block_xsize_base
        elif xsize > self.ncols:
            print('tile xsize exceeds RasterXsize %d' % self.ncols)
            sys.exit(1)
        else:
            self.xsize = xsize

        if ysize is None:
            self.ysize = self.block_ysize_base
        elif ysize > self.nrows:
            print('tile ysize exceeds RasterYsize %d' % self.nrows)
            sys.exit(1)
        else:
            self.ysize = ysize

        ## estimated data size (in MB)
        self.estimate_TotalSize_MB = self.estimateTileSize_MB(
            self.nrows, self.ncols)
        self.estimate_TileSize_MB = self.estimateTileSize_MB(
            self.xsize, self.ysize)

        # min, max, mean, stddev
        self.statistics = np.zeros((self.nbands, 4))
        for i in range(self.nbands):
            self.statistics[i] = self.ds.GetRasterBand(i + 1).GetStatistics(
                0, 1)
            #self.statistics[i] = np.array([0, 1, 0, 1])

        self.MP_pool = None

        #print('Done initializing reader...')

    def estimateTileSize_MB(self, xsize=None, ysize=None):
        '''
        '''
        if xsize is None:
            xsize = self.xsize
        if ysize is None:
            ysize = self.ysize
        return np.array([
            1.0
        ]).astype('float32').nbytes / 1024.0**2 * xsize * ysize * self.nbands

    def readWholeRaster(self, multithread=conf.MULTITHREAD_READ):
        data = None
        if multithread:

            def threadReadingByBand(i, rasterfile):
                ''' each thread reads a whole band
                    using multiprocess pool
                '''
                import gdal, gdalconst, psutil, conf
                import numpy as np
                ds = gdal.Open(rasterfile, gdalconst.GA_ReadOnly)
                data = ds.GetRasterBand(i).ReadAsArray()
                return data

            # optimal for multi-thread reading by band
            n_threads = self.nbands
            if self.MP_pool is None:
                self.MP_pool = Pool(n_threads)

            ## multi-thread reading by band
            band_idx = range(1, n_threads + 1)
            fns = np.array([self.srcRasterfile]).repeat(n_threads)
            data = self.MP_pool.map(threadReadingByBand, band_idx, fns)
            data = np.stack(data, axis=0)
            self.MP_pool.clear()
        else:
            data = self.ds.ReadAsArray(xoff=0, yoff=0, xsize=None, ysize=None)

        ## nodatavalues
        #if self.nodata < 0:
        #    data[data < self.nodata] = self.nodata
        return data

    def readNextTile(self,
                     xsize=None,
                     ysize=None,
                     multithread=conf.MULTITHREAD_READ):
        ## update xsize and ysize if needed
        ## PLEASE specify xsize, ysize ONLY ONCE (when reading the first tile)
        if xsize is not None: self.xsize = xsize
        if ysize is not None: self.ysize = ysize

        N_BLOCK_X = int(math.ceil(self.ncols * 1.0 / self.xsize))
        y = int(self.__N_TilesRead / N_BLOCK_X)
        x = self.__N_TilesRead - y * N_BLOCK_X

        self.xoff = min(x * self.xsize, self.ncols)
        xsize = min(self.xsize, self.ncols - self.xoff)

        self.yoff = min(y * self.ysize, self.nrows)
        ysize = min(self.ysize, self.nrows - self.yoff)

        if self.xoff == self.ncols or self.yoff == self.nrows:
            return (None, self.xoff, self.yoff, 0, 0)

        data = None
        if multithread:  ## multi-thread read

            def threadReadingByBand(i, param, rasterfile):
                ''' each thread reads a band, with tile dimension spec in param
                    using multiprocess pool
                '''
                import gdal, gdalconst
                import numpy as np
                ds = gdal.Open(rasterfile, gdalconst.GA_ReadOnly)
                data = ds.GetRasterBand(i).ReadAsArray(xoff=param[0],
                                                       yoff=param[1],
                                                       win_xsize=param[2],
                                                       win_ysize=param[3])
                return data

            # optimal for multi-thread reading by band
            n_threads = self.nbands  # - 1
            if self.MP_pool is None:
                self.MP_pool = Pool(n_threads)

            ## multi-thread reading by band
            params = []
            for i in range(n_threads):
                params.append([self.xoff, self.yoff, xsize, ysize])
            fns = np.array([self.srcRasterfile]).repeat(n_threads)
            band_idx = range(1, self.nbands + 1)
            data = self.MP_pool.map(threadReadingByBand, band_idx, params, fns)
            data = np.stack(data, axis=0)
            self.MP_pool.clear()

        else:  ## single-thread read
            data = self.ds.ReadAsArray(xoff=self.xoff,
                                       yoff=self.yoff,
                                       xsize=xsize,
                                       ysize=ysize)
        ## nodatavalues
        #if self.nodata < 0:
        #    data[data < self.nodata] = self.nodata

        self.__N_TilesRead += 1

        return (data, self.xoff, self.yoff, xsize, ysize)

    def setNTilesRead(self, N):
        self.__N_TilesRead = N

    def readNextTileOverlap(self,
                            xsize=None,
                            ysize=None,
                            overlap=2,
                            multithread=conf.MULTITHREAD_READ):
        ## update xsize and ysize if needed
        ## PLEASE specify xsize, ysize ONLY ONCE (when reading the first tile)
        if xsize is not None: self.xsize = xsize
        if ysize is not None: self.ysize = ysize

        N_BLOCK_X = int(math.ceil(self.ncols * 1.0 / self.xsize))
        y = int(self.__N_TilesRead / N_BLOCK_X)
        x = self.__N_TilesRead - y * N_BLOCK_X

        self.xoff = min(x * self.xsize, self.ncols)
        xsize = min(self.xsize, self.ncols - self.xoff)

        self.yoff = min(y * self.ysize, self.nrows)
        ysize = min(self.ysize, self.nrows - self.yoff)

        if self.xoff == self.ncols or self.yoff == self.nrows:
            return (None, self.xoff, self.yoff, 0, 0, -1, -1)

        data = None
        if multithread:  ## multi-thread read

            def threadReadingByBand(i, param, rasterfile):
                ''' each thread reads a band, with tile dimension spec in param
                    using multiprocess pool
                '''
                import gdal, gdalconst
                import numpy as np
                ds = gdal.Open(rasterfile, gdalconst.GA_ReadOnly)
                data = ds.GetRasterBand(i).ReadAsArray(xoff=param[0],
                                                       yoff=param[1],
                                                       win_xsize=param[2],
                                                       win_ysize=param[3])
                return data

            # optimal for multi-thread reading by band
            n_threads = self.nbands  # - 1
            if self.MP_pool is None:
                self.MP_pool = Pool(n_threads)

            ## multi-thread reading by band
            params = []
            for i in range(n_threads):

                _xoff = max(0, self.xoff - overlap)
                _yoff = max(0, self.yoff - overlap)

                if _xoff == 0:
                    _xsize = min(xsize + overlap, self.ncols - self.xoff)
                else:
                    _xsize = min(xsize + 2 * overlap, self.ncols - self.xoff)

                if _yoff == 0:
                    _ysize = min(ysize + overlap, self.nrows - self.yoff)
                else:
                    _ysize = min(ysize + 2 * overlap, self.nrows - self.yoff)
                params.append([_xoff, _yoff, _xsize, _ysize])

                #params.append([self.xoff, self.yoff, xsize, ysize])
            fns = np.array([self.srcRasterfile]).repeat(n_threads)
            band_idx = range(1, self.nbands + 1)
            data = self.MP_pool.map(threadReadingByBand, band_idx, params, fns)
            data = np.stack(data, axis=0)
            self.MP_pool.clear()

        else:  ## single-thread read
            _xoff = max(0, self.xoff - overlap)
            _yoff = max(0, self.yoff - overlap)

            if _xoff == 0:
                _xsize = min(xsize + overlap, self.ncols - self.xoff)
            else:
                _xsize = min(xsize + 2 * overlap, self.ncols - self.xoff)

            if _yoff == 0:
                _ysize = min(ysize + overlap, self.nrows - self.yoff)
            else:
                _ysize = min(ysize + 2 * overlap, self.nrows - self.yoff)

            #print('inside', self.xoff, self.yoff, self.xsize, self.ysize)
            #print('inside', _xoff, _yoff, _xsize, _ysize)
            data = self.ds.ReadAsArray(xoff=_xoff,
                                       yoff=_yoff,
                                       xsize=_xsize,
                                       ysize=_ysize)

            #data = self.ds.ReadAsArray(xoff=self.xoff, yoff=self.yoff, xsize=xsize, ysize=ysize)

        ## nodatavalues
        #if self.nodata < 0:
        #    data[data < self.nodata] = self.nodata

        self.__N_TilesRead += 1

        #return (data, _xoff, _yoff, _xsize, _ysize)
        return (data, self.xoff, self.yoff, xsize, ysize, _xoff, _yoff)

    def reset(self):
        ''' reset after reading tiles
        '''
        self.xoff, self.yoff = 0, 0
        self.__N_TilesRead = 0

    def extractByXY(self, x, y, xsize=1, ysize=1):
        ''' Extract raster value by x, y coordinates
        '''
        xoff = int((x - self.geotransform[0]) / self.geotransform[1])
        yoff = int((y - self.geotransform[3]) / self.geotransform[5])
        return self.ds.ReadAsArray(xoff, yoff, xsize, ysize)

    def extractByNbrhd(self, centerX, centerY, nbrXsize=1, nbrYsize=1):
        ''' Extract raster value by x, y coordinates
        '''
        xoff = int((centerX - self.geotransform[0]) / self.geotransform[1])
        yoff = int((centerY - self.geotransform[3]) / self.geotransform[5])
        return self.ds.ReadAsArray(xoff - int(nbrXsize / 2),
                                   yoff - int(nbrYsize / 2), nbrXsize,
                                   nbrYsize)

    def extractByNbrhd_batch(self, centerXs, centerYs, nbrXsize=1, nbrYsize=1):
        ''' Extract raster value by x, y coordinates
        '''
        xoffs = ((centerXs - self.geotransform[0]) /
                 self.geotransform[1]).astype(int) - int(nbrXsize / 2)
        yoffs = ((centerYs - self.geotransform[3]) /
                 self.geotransform[5]).astype(int) - int(nbrYsize / 2)

        data = None
        for xoff, yoff in zip(xoffs, yoffs):
            #print('Extracting NBRHD (%d, %d)' % (xoff, yoff))
            tmp = self.ds.ReadAsArray(xoff.item(), yoff.item(), nbrXsize,
                                      nbrYsize)

            #print(tmp.shape)
            tmp = np.expand_dims(tmp, axis=0)
            #print(tmp.shape)
            if data is None:
                data = tmp
            else:
                data = np.concatenate((data, tmp), axis=0)
            #print('data.shape:', data.shape)
        #print(data.shape)
        return data

    def extractByRC(self, c, r, xsize=1, ysize=1):
        '''Extract raster value by row, col
        '''
        return self.ds.ReadAsArray(c, r, xsize, ysize)

    def close(self):
        self.ds = None
        if self.MP_pool is not None:
            self.MP_pool.clear()
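A usage sketch for tiledRasterReader, assuming GDAL and the module's conf settings are available and that "input.tif" is a readable raster; the tile size and the processing step are placeholders:

reader = tiledRasterReader("input.tif", xsize=512, ysize=512)
while True:
    data, xoff, yoff, xsize, ysize = reader.readNextTile()
    if data is None:
        break                  # all tiles have been read
    # ... process the (nbands, ysize, xsize) block here ...
reader.reset()                 # rewind so the tiles can be read again
reader.close()                 # release the dataset and clear the pathos pool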
Code example #17
class RPKI_Validator_Wrapper:
    """This class gets validity data from ripe"""

    __slots__ = ['total_prefix_origin_pairs', "_process", "_table_input",
                 "_rpki_file"]

    # Sorry for the crazy naming scheme, must be done to avoid
    # having install file names in multiple locations
    temp_install_path = "/tmp/temp_rpki_validator_install"
    rpki_package_path = RPKI_PACKAGE_PATH
    rpki_run_name = RPKI_RUN_NAME
    rpki_run_path = RPKI_PACKAGE_PATH + RPKI_RUN_NAME
    rpki_db_paths = [RPKI_PACKAGE_PATH + x for x in ["db/", "rsync/"]]
    port = 8080
    api_url = "http://[::1]:8080/api/"

    def __init__(self, **kwargs):
        config_logging(kwargs.get("stream_level", logging.INFO),
                       kwargs.get("section"))
        self._table_input = kwargs.get("table_input", "mrt_rpki")
        if not os.path.exists(self.rpki_package_path):
            logging.warning("Looks like validator is not installed")
            logging.warning("Installing validator now")
            RPKI_Validator_Wrapper.install(**kwargs)

#################################
### Context Manager Functions ###
#################################

    def __enter__(self):
        """Runs the RPKI Validator"""

        utils.kill_port(self.port)
        # Must remove these to ensure a clean run
        utils.clean_paths(self.rpki_db_paths)
        cmds = [f"cd {self.rpki_package_path}",
                f"chown -R root:root {self.rpki_package_path}"]
        utils.run_cmds(cmds)
        # Writes validator file and serves it
        # Can't use a context manager here since it returns it
        self._rpki_file = RPKI_File(self._table_input)
        self._rpki_file.spawn_process()
        self._process = ProcessingPool()
        self._process.apipe(self._start_validator)
        self.total_prefix_origin_pairs = self._rpki_file.total_lines
        return self

    def __exit__(self, type, value, traceback):
        """Closes RPKI Validator"""

        self._process.close()
        self._process.terminate()
        self._process.join()
        self._process.clear()
        utils.kill_port(self.port, wait=False)
        logging.debug("Closed rpki validator")
        self._rpki_file.close()

    def _start_validator(self):
        """Sends start cmd to RPKI Validator"""

        logging.info("Starting RPKI Validator")
        utils.run_cmds((f"cd {self.rpki_package_path} && "
                        f"./{self.rpki_run_name}"))

#########################
### Wrapper Functions ###
#########################

    def load_trust_anchors(self):
        """Loads all trust anchors"""

        utils.write_to_stdout(f"{datetime.now()}: Loading RPKI Validator\n",
                              logging.root.level)
        time.sleep(60)
        while self._get_validation_status() is False:
            time.sleep(10)
            utils.write_to_stdout(".", logging.root.level)
        utils.write_to_stdout("\n", logging.root.level)
        self._wait(30, "Waiting for upload to bgp preview")

    def make_query(self, api_endpoint: str, data=True) -> dict:
        """Makes query to api of rpki validator"""

        result = utils.get_json(os.path.join(self.api_url, api_endpoint),
                                RPKI_Validator_Wrapper.get_headers())
        return result["data"] if data else result

    def get_validity_data(self) -> dict:
        """Gets the data from ripe and formats it for csv insertions"""

        logging.info("Getting data from ripe")
        assert self.total_prefix_origin_pairs < 10000000, "page size too small"
        # Then we get the data from the ripe RPKI validator
        # Todo for later, change 10mil to be total count
        return self.make_query("bgp/?pageSize=10000000")

########################
### Helper Functions ###
########################

    def _wait(self, time_to_sleep: int, msg: str):
        """logs a message and waits"""

        logging.debug(msg)
        if logging.root.level == logging.INFO:
            # Number of times per second to update tqdm
            divisor = 100
            for _ in trange(time_to_sleep * divisor,
                            desc=msg):
                time.sleep(1 / divisor)

    def _get_validation_status(self) -> bool:
        """Returns row count of json object for waiting"""

        try:
            for x in self.make_query("trust-anchors/statuses"):
                if x["completedValidation"] is False:
                    # If anything has not been validated return false
                    return False
            # All are validated. Return true
            return True
        except urllib.error.URLError as e:
            self._wait(60, "Connection was refused")
            return False

######################
### Static methods ###
######################

    @staticmethod
    def get_validity_dict() -> dict:
        """Returns the validity dict for the RPKI Validator to decode results

        I could have this as a class attribute but too messy I think.
        """

        return {"VALID": ROA_Validity.VALID.value,
                "UNKNOWN": ROA_Validity.UNKNOWN.value,
                "INVALID_LENGTH": ROA_Validity.INVALID_BY_LENGTH.value,
                "INVALID_ASN": ROA_Validity.INVALID_BY_ORIGIN.value}

    @staticmethod
    def get_headers() -> dict:
        """Gets the headers for all url queries to the validator"""

        return {"Connection": "keep-alive",
                "Cache-Control": "max-age=0",
                "Upgrade-Insecure-Requests": 1,
                "User-Agent": ("Mozilla/5.0 (X11; Linux x86_64)"
                               " AppleWebKit/537.36 (KHTML, like Gecko) "
                               "Chrome/73.0.3683.86 Safari/537.36"),
                "Accept": ("text/html,application/xhtml+xml,"
                           "application/xml;q=0.9,image/webp,"
                           "image/apng,*/*;q=0.8,"
                           "application/signed-exchange;v=b3"),
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "en-US,en;q=0.9"}

#########################
### Install Functions ###
#########################

    @staticmethod
    def install(**kwargs):
        """Installs RPKI validator with our configs.

        This might break in the future, but we need to do it this way
        for now to be able to do what we want with our own prefix origin
        table.
        """

        config_logging(kwargs.get("stream_level", logging.DEBUG),
                       kwargs.get("section"))
        utils.delete_paths([RPKI_Validator_Wrapper.rpki_package_path,
                            RPKI_Validator_Wrapper.temp_install_path])

        RPKI_Validator_Wrapper._download_validator()
        RPKI_Validator_Wrapper._change_file_hosted_location()
        path = RPKI_Validator_Wrapper._change_server_address()
        RPKI_Validator_Wrapper._config_absolute_paths(path)

    @staticmethod
    def _download_validator():
        """Downloads validator into proper location"""

        rpki_url = ("https://ftp.ripe.net/tools/rpki/validator3/beta/generic/"
                    "rpki-validator-3-latest-dist.tar.gz")
        arin_tal = ("https://www.arin.net/resources/manage/rpki/"
                    "arin-ripevalidator.tal")
        # This is the java version they use so we will use it
        cmds = [f"mkdir {RPKI_Validator_Wrapper.temp_install_path}",
                f"cd {RPKI_Validator_Wrapper.temp_install_path}",
                "sudo apt-get -y install openjdk-8-jre",
                f"wget {rpki_url}",
                "tar -xvf rpki-validator-3-latest-dist.tar.gz",
                "rm -rf rpki-validator-3-latest-dist.tar.gz",
                f"mv rpki-validator* {RPKI_Validator_Wrapper.rpki_package_path}",
                f"cd {RPKI_Validator_Wrapper.rpki_package_path}",
                "cd preconfigured-tals",
                f"wget {arin_tal}"]
        utils.run_cmds(cmds)

    @staticmethod
    def _change_file_hosted_location():
        """Changes location of input ann for bgp preview file"""

        # Changes where the file is hosted
        path = (f"{RPKI_Validator_Wrapper.rpki_package_path}conf"
                "/application-defaults.properties")
        prepend = "rpki.validator.bgp.ris.dump.urls="
        replace = ("https://www.ris.ripe.net/dumps/riswhoisdump.IPv4.gz,"
                   "https://www.ris.ripe.net/dumps/riswhoisdump.IPv6.gz")
        replace_with = (f"http://localhost:{RPKI_File.port}"
                        f"/{RPKI_File.hosted_name}")
        utils.replace_line(path, prepend, replace, replace_with)

    @staticmethod
    def _change_server_address():
        """Prob because of a proxy, but on our server this is necessary"""

        # Changes the server address
        path = (f"{RPKI_Validator_Wrapper.rpki_package_path}conf"
                "/application.properties")
        prepend = "server.address="
        replace = "localhost"
        replace_with = "0.0.0.0"
        utils.replace_line(path, prepend, replace, replace_with)
        return path

    @staticmethod
    def _config_absolute_paths(path):
        """Configure rpki validator to run off absolute paths

        This is necessary due to script being called from elsewhere
        In other words not from inside the RPKI dir.
        """

        # Since I am calling the script from elsewhere these must be
        # absolute paths
        prepend = "rpki.validator.data.path="
        replace = "."
        # Must remove trailing backslash at the end
        replace_with = RPKI_Validator_Wrapper.rpki_package_path[:-1]
        utils.replace_line(path, prepend, replace, replace_with)

        prepend = "rpki.validator.preconfigured.trust.anchors.directory="
        replace = "./preconfigured-tals"
        replace_with = (f"{RPKI_Validator_Wrapper.rpki_package_path}"
                        "preconfigured-tals")
        utils.replace_line(path, prepend, replace, replace_with)

        prepend = "rpki.validator.rsync.local.storage.directory="
        replace = "./rsync"
        replace_with = f"{RPKI_Validator_Wrapper.rpki_package_path}rsync"
        utils.replace_line(path, prepend, replace, replace_with)
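A usage sketch for the wrapper, mirroring its context-manager interface; this assumes the surrounding lib_bgp_data environment (root access, a populated mrt_rpki table, and network access to RIPE):

with RPKI_Validator_Wrapper(table_input="mrt_rpki") as validator:
    validator.load_trust_anchors()             # blocks until all trust anchors validate
    validity = validator.get_validity_data()   # "data" block returned by the validator API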
Code example #18
    def calcQM(self, data, parallel=mult.cpu_count()):
        #Change our working directory to gauDir+name and then back at the end
        curPath = os.getcwd()
        os.chdir(self.gauDir + self.name)

        traj = data['traj']
        totT = np.shape(traj)[0]
        numNuc = len(self.molGamma)

        #runList keeps track of the timepoints that still need to be calculated
        #only add if the log file does not yet exist or is incomplete
        runList = []

        #Create the variables that we will read in from Gaussian
        shield = np.zeros([totT, numNuc, 3, 3], np.float32)  #Shield matrix
        chargesMul = np.zeros([totT, numNuc], np.float32)
        chargesESP = np.zeros([totT, numNuc], np.float32)
        potential = np.zeros([totT, numNuc], np.float32)
        field = np.zeros([totT, numNuc, 3], np.float32)  #X, Y, Z
        gradient = np.zeros([totT, numNuc, 6],
                            np.float32)  # XX, YY, ZZ, XY, XZ, YZ

        #Read in any log files that already exist
        for t in range(totT):
            logFile = self.name + '-' + str(t) + 'NMR.log'
            #If the file doesn't exist, add to runlist and move to next timepoint
            if not os.path.isfile(logFile):
                runList.append([t, traj[t, :, :]])
                continue

            #Look at log file and read in all variables
            inFile = open(logFile, 'r')
            lines = inFile.readlines()
            inFile.close()

            ##NMR properties - Find the start of the NMR calculation
            NMRLoc = -1
            for j, l in enumerate(lines):
                if l.find("SCF GIAO Magnetic shielding tensor (ppm):") > -1:
                    NMRLoc = j
                    break

            if NMRLoc > 0:
                NMRdata = lines[NMRLoc + 1:]
                for nuc in range(numNuc):
                    #Skip to the position of the nucleus of interest
                    nucData = NMRdata[5 * nuc:5 * (nuc + 1)]

                    #Get each 3 separately - nucData[0] is total chemical shift
                    #Need to split on both whitespace and = as sometimes big
                    # values don't have a whitespace.
                    line = nucData[1].replace('=', ' ').split()  #XX YX ZX
                    shield[t, nuc, 0, :] = [
                        float(line[1]),
                        float(line[3]),
                        float(line[5])
                    ]
                    line = nucData[2].replace('=', ' ').split()  #XY YY ZY
                    shield[t, nuc, 1, :] = [
                        float(line[1]),
                        float(line[3]),
                        float(line[5])
                    ]
                    line = nucData[3].replace('=', ' ').split()  #XZ YZ ZZ
                    shield[t, nuc, 2, :] = [
                        float(line[1]),
                        float(line[3]),
                        float(line[5])
                    ]

            ##Mulliken charges
            MulLoc = -1
            for j, l in enumerate(lines):
                if l.find("Mulliken charges:") > -1:
                    MulLoc = j
                    break

            if MulLoc > 0:
                ChargeData = lines[MulLoc + 2:]
                for nuc in range(numNuc):
                    chargesMul[t, nuc] = float(ChargeData[nuc].split()[2])

            ##Electrostatic properties - Find the start of the Prop calculation
            PropLoc = -1
            for j, l in enumerate(lines):
                if l.find("Electrostatic Properties (Atomic Units)") > -1:
                    PropLoc = j
                    break

            if PropLoc > 0:
                PotField = lines[PropLoc + 6:]
                for nuc in range(numNuc):
                    line = PotField[nuc].split()
                    potential[t, nuc] = float(line[2])
                    field[t, nuc, :] = [
                        float(line[3]),
                        float(line[4]),
                        float(line[5])
                    ]
                for j, l in enumerate(PotField):
                    if l.find("Electric Field Gradient") > -1:
                        GradLoc = j
                        break
                Grad = PotField[GradLoc + 3:]
                for nuc in range(numNuc):
                    line = Grad[nuc].split()
                    gradient[t, nuc, :3] = [
                        float(line[2]),
                        float(line[3]),
                        float(line[4])
                    ]
                for j, l in enumerate(Grad):
                    if l.find("Electric Field Gradient") > -1:
                        GradLoc = j
                        break
                Grad2 = Grad[GradLoc + 3:]
                for nuc in range(numNuc):
                    line = Grad2[nuc].split()
                    gradient[t, nuc, 3:] = [
                        float(line[2]),
                        float(line[3]),
                        float(line[4])
                    ]

            ##ESP Charges:
            ESPLoc = -1
            for j, l in enumerate(lines):
                if l.find(" ESP charges:") > -1:
                    ESPLoc = j
                    break

            if ESPLoc > 0:
                ChargeData = lines[ESPLoc + 2:]
                for nuc in range(numNuc):
                    chargesESP[t, nuc] = float(ChargeData[nuc].split()[2])

            #If the data is not found, rename the log file and recalculate
            else:
                print(
                    "Error! NMR/Electrostatic Calculation not found in file " +
                    logFile)
                os.rename(logFile, logFile + '-fail.log')
                runList.append([t, traj[t, :, :]])

        #If there are timepoints left to calculate, compute them in parallel
        if len(runList) > 0:
            #Run the remaining calculations in parallel
            print("Calculating data for " + str(len(runList)) + " timepoints")
            p = Pool(parallel)
            p.map(self.buildSubSystem, runList)
            p.clear()
            print("Must run calcShield again to get results")
            os.chdir(curPath)
            return

        os.chdir(curPath)
        return {
            "traj": traj,
            "shield": shield,
            "chargesMul": chargesMul,
            "chargesESP": chargesESP,
            "potential": potential,
            "field": field,
            "gradient": gradient
        }
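
The parallel branch above follows the recurring pathos pattern in these examples: map the work items over a ProcessingPool, then clear() the pool so the instance pathos caches internally is not silently reused later. A minimal, self-contained sketch of that pattern (illustrative only; the worker function and worker count are placeholders):

from pathos.multiprocessing import ProcessingPool as Pool

def run_in_parallel(func, items, workers=4):
    # Map items over a worker pool, then tear the pool down; clear() drops
    # the pool object that pathos keeps cached between calls.
    p = Pool(workers)
    results = p.map(func, items)
    p.close()
    p.join()
    p.clear()
    return results

# Example: squares = run_in_parallel(lambda x: x * x, range(8))
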
コード例 #19
0
def build_hamiltonian_diagonal_parallel2(clustered_ham,
                                         ci_vector,
                                         nproc=None,
                                         batch_size=100):
    """
    Build hamiltonian diagonal in basis in ci_vector
    """
    # {{{
    print(" In build_hamiltonian_diagonal_parallel2. nproc=", nproc)

    global _clustered_ham
    global _delta_fock
    _clustered_ham = clustered_ham
    _delta_fock = tuple([(0, 0) for ci in range(len(clustered_ham.clusters))])

    def do_parallel_work(v_batch):
        tmpout = []
        for v_curr in v_batch:
            tmp = 0
            fockspace = v_curr[0]
            config = v_curr[1]

            terms = _clustered_ham.terms[_delta_fock]
            ## add diagonal energies

            for term in terms:
                #tmp += term.matrix_element(fockspace,config,fockspace,config)
                #tmp += term.diag_matrix_element(fockspace,config,opt_einsum=False)

                mats = []
                # state sign is always 1 here, since an even number of creation/annihilation operators can only
                # contribute to diagonal

                state_sign = 1
                n_active = 0
                for oi, o in enumerate(term.ops):
                    if o == '':
                        continue
                    n_active += 1

                if n_active == 1:
                    ci = term.active[0]
                    tmp += term.clusters[ci].ops['H'][(
                        fockspace[ci], fockspace[ci])][config[ci], config[ci]]
                elif n_active > 0:
                    for oi, o in enumerate(term.ops):
                        if o == '':
                            continue
                        try:
                            do = term.clusters[oi].ops[o]
                        except KeyError:
                            print(" Couldn't find:", term)
                            exit()
                        try:
                            d = do[(fockspace[oi],
                                    fockspace[oi])][config[oi],
                                                    config[oi]]  #D(I,J,:,:...)
                        except KeyError:
                            continue
                        mats.append(d)

                    if len(mats) < n_active:
                        continue
                    tmp += np.einsum(term.contract_string,
                                     *mats,
                                     term.ints,
                                     optimize=False)
            tmpout.append(tmp)
        print(".", end="", flush=True)
        return tmpout

    import multiprocessing as mp
    from pathos.multiprocessing import ProcessingPool as Pool

    if nproc == None:
        pool = Pool()
    else:
        pool = Pool(processes=nproc)

    print(" Using Pathos library for parallelization. Number of workers: ",
          pool.ncpus)

    # define batches
    conf_batches = []
    batch_size = min(batch_size, len(ci_vector))
    batch = []
    print(" Form batches. Max batch size: ", batch_size)
    for i, j, k in ci_vector:
        if len(batch) < batch_size:
            batch.append((i, j))
        else:
            conf_batches.append(batch)
            batch = []
            batch.append((i, j))
    if len(batch) > 0:
        conf_batches.append(batch)
        batch = []

    if len(ci_vector) == 0:
        return np.array([])

    print(" Number of configs: ", len(ci_vector))
    print(" Number of batches: ", len(conf_batches))
    print(" Batches complete : ")
    out = pool.map(do_parallel_work, conf_batches)
    print()
    pool.close()
    pool.join()
    pool.clear()

    Hdv = np.zeros((len(ci_vector), ))
    count = 0
    for oi in out:
        for oij in oi:
            Hdv[count] = oij
            count += 1

    return Hdv
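
The batching above is the key performance choice in this variant: each worker call processes a whole batch of configurations, so the per-task pickling and dispatch overhead is paid once per batch instead of once per configuration. A small sketch of the same chunking idea in isolation (illustrative; the names and default batch size are placeholders):

def make_batches(items, batch_size=100):
    # Group items into lists of at most batch_size elements so each worker
    # call handles a whole chunk instead of a single element.
    batches = []
    batch = []
    for item in items:
        batch.append(item)
        if len(batch) == batch_size:
            batches.append(batch)
            batch = []
    if batch:
        batches.append(batch)
    return batches

# results = pool.map(process_batch, make_batches(work_items, batch_size=100))
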
コード例 #20
0
def build_hamiltonian_diagonal_parallel1(clustered_ham_in,
                                         ci_vector,
                                         nproc=None):
    """
    Build hamiltonian diagonal in basis in ci_vector
    """
    # {{{
    global clusters
    global clustered_ham
    print(" In build_hamiltonian_diagonal_parallel1. nproc=", nproc)

    clustered_ham = clustered_ham_in
    clusters = clustered_ham_in.clusters

    global delta_fock
    delta_fock = tuple([(0, 0) for ci in range(len(clusters))])

    def do_parallel_work(v_curr):
        fockspace = v_curr[0]
        config = v_curr[1]
        coeff = v_curr[2]

        terms = clustered_ham.terms[delta_fock]
        ## add diagonal energies
        tmp = 0

        for term in terms:
            #tmp += term.matrix_element(fockspace,config,fockspace,config)
            tmp += term.diag_matrix_element(fockspace,
                                            config,
                                            opt_einsum=False)
        return tmp

    import multiprocessing as mp
    from pathos.multiprocessing import ProcessingPool as Pool

    if nproc == None:
        pool = Pool()
    else:
        pool = Pool(processes=nproc)

    print(" Using Pathos library for parallelization. Number of workers: ",
          pool.ncpus)

    #chunksize = 100
    #print(" Chunksize: ", chunksize)
    #out = pool.map(do_parallel_work, ci_vector, chunksize=chunksize)
    if len(ci_vector) == 0:
        return np.array([])
    out = pool.map(do_parallel_work, ci_vector)
    pool.close()
    pool.join()
    pool.clear()

    #out = pool.map(do_parallel_work, ci_vector, batches=100)
    #out = list(map(do_parallel_work, ci_vector))

    #Hdv = np.zeros((len(ci_vector)))
    #for o in out:
    #    Hdv[o[0]] = o[1]
    Hdv = np.array(out)

    return Hdv
コード例 #21
0
def grow_hamiltonian_parallel(h_old,
                              clustered_ham,
                              ci_vector,
                              ci_vector_old,
                              iprint=1,
                              nproc=None,
                              opt_einsum=True,
                              thresh=1e-14):
    """
    Grow the Hamiltonian matrix by building only the new matrix elements for the new space indicated by ci_vector
    parallelized over matrix elements
    """
    # {{{
    print(" In grow_hamiltonian_parallel. nproc=", nproc)

    start = time.time()
    ci_vector_old.prune_empty_fock_spaces()
    ci_vector.prune_empty_fock_spaces()

    old_dim = len(ci_vector_old)
    old_basis = ci_vector_old.copy()
    new_basis = ci_vector.copy()
    full_basis = ci_vector.copy()
    old_basis.set_vector(np.array(range(len(old_basis))))
    new_basis.set_vector(np.array(range(len(new_basis))))
    full_basis.set_vector(np.array(range(len(full_basis))))
    for f, c, v in old_basis:
        del new_basis[f][c]

    new_basis.prune_empty_fock_spaces()
    print(" Size of old space:", len(old_basis))
    print(" Size of new space:", len(new_basis))
    print(" Size of all space:", len(full_basis))
    assert (len(full_basis) == len(old_basis) + len(new_basis))

    clusters = clustered_ham.clusters

    H = np.zeros((len(ci_vector), len(ci_vector)))
    n_clusters = len(clusters)

    # find locations of old basis is full basis
    t1 = time.time()
    full_inds = np.zeros((len(old_basis)), dtype=int)
    count = 0
    for f1, cs1 in old_basis.items():
        for c1, i1 in old_basis[f1].items():
            full_inds[count] = full_basis[f1][c1]
            count += 1
    for idx, i in enumerate(full_inds):
        H[i, full_inds] = h_old[idx, :]
    print(" updating matrix:", time.time() - t1, flush=True)

    t1 = time.time()
    #for f1,cs1 in old_basis.items():
    #    for c1,i1 in old_basis[f1].items():
    #        for f2,cs2 in old_basis.items():
    #            for c2,i2 in old_basis[f2].items():
    #                H[full_basis[f1][c1],full_basis[f2][c2]] = h_old[i1,i2]
    #print("t:",time.time()-t1,flush=True)

    if len(new_basis) == 0:
        return H
    for f1, c1, i1 in new_basis:
        assert (new_basis[f1][c1] == full_basis[f1][c1])
    for f1, c1, i1 in old_basis:
        old_basis[f1][c1] = full_basis[f1][c1]
        if f1 in new_basis:
            assert (c1 not in new_basis[f1])

    global _h
    global _new_basis
    #global _old_basis
    #global _full_basis
    _h = clustered_ham
    _new_basis = new_basis
    #_old_basis  = old_basis
    #_full_basis  = full_basis

    debug = 0
    if debug:
        try:
            assert (np.amax(np.abs(H - H.T)) < 1e-14)
        except AssertionError:
            for f1, c1, i1 in full_basis:
                for f2, c2, i2 in full_basis:
                    if abs(H[i1, i2] - H[i2, i1]) > 1e-14:
                        print(f1, c1, i1)
                        print(f2, c2, i2)
                        print(H[i1, i2] - H[i2, i1])
            raise AssertionError

    #def do_parallel_work(fock_l, conf_l, idx_l, basis_r):
    def do_parallel_work(inp):
        fock_l = inp[0]
        conf_l = inp[1]
        idx_l = inp[2]
        new = inp[3]  # which subspace is _l in? 0 for old, 1 for new

        out = []
        if new:
            for fock_r in _new_basis.fblocks():
                confs_r = _new_basis[fock_r]
                delta_fock = tuple([(fock_l[ci][0] - fock_r[ci][0],
                                     fock_l[ci][1] - fock_r[ci][1])
                                    for ci in range(len(_h.clusters))])
                if delta_fock in _h.terms:
                    for conf_r in confs_r:
                        idx_r = _new_basis[fock_r][conf_r]
                        if idx_l <= idx_r:
                            me = 0
                            for term in _h.terms[delta_fock]:
                                me += term.matrix_element(
                                    fock_l, conf_l, fock_r, conf_r)
                            out.append((idx_r, me))
        else:
            for fock_r in _new_basis.fblocks():
                confs_r = _new_basis[fock_r]
                delta_fock = tuple([(fock_l[ci][0] - fock_r[ci][0],
                                     fock_l[ci][1] - fock_r[ci][1])
                                    for ci in range(len(_h.clusters))])
                if delta_fock in _h.terms:
                    for conf_r in confs_r:
                        idx_r = _new_basis[fock_r][conf_r]
                        me = 0
                        for term in _h.terms[delta_fock]:
                            me += term.matrix_element(fock_l, conf_l, fock_r,
                                                      conf_r)
                        out.append((idx_r, me))
        print(".", end='', flush=True)
        return ([idx_l, out])

    import multiprocessing as mp
    from pathos.multiprocessing import ProcessingPool as Pool
    if nproc == None:
        pool = Pool()
    else:
        pool = Pool(processes=nproc)

    jobs = [(i[0], i[1], i[2], 1) for i in new_basis]
    jobs.extend([(i[0], i[1], i[2], 0) for i in old_basis])

    stop = time.time()
    print(" Time spent finding new subspace:", stop - start)
    start = time.time()
    print(" Number of jobs to do:", len(jobs), flush=True)
    results = pool.map(do_parallel_work, jobs)
    print("")
    stop = time.time()
    print(" Time spent building new subspace:", stop - start)

    pool.close()
    pool.join()
    pool.clear()

    for result in results:
        row_idx = result[0]
        row = result[1]
        for col in row:
            col_idx = col[0]
            term = col[1]
            #assert( col_idx >= row_idx)
            assert (abs(H[row_idx, col_idx]) < 1e-16)
            assert (abs(H[col_idx, row_idx]) < 1e-16)
            H[row_idx, col_idx] = term
            H[col_idx, row_idx] = term

    return H
コード例 #22
0
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('env_name', type=str)
    parser.add_argument('--exp_name', type=str, default='vpg')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--logdir', '-dir', type=str, default='data')
    parser.add_argument('--discount', type=float, default=1.0)
    parser.add_argument('--n_iter', '-n', type=int, default=100)
    parser.add_argument('--batch_size', '-b', type=int, default=1000)
    parser.add_argument('--ep_len', '-ep', type=float, default=-1.)
    parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3)
    parser.add_argument('--reward_to_go', '-rtg', action='store_true')
    parser.add_argument('--dont_normalize_advantages',
                        '-dna',
                        action='store_true')
    parser.add_argument('--nn_baseline', '-bl', action='store_true')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--n_experiments', '-e', type=int, default=1)
    parser.add_argument('--n_layers', '-l', type=int, default=1)
    parser.add_argument('--size', '-s', type=int, default=32)
    parser.add_argument('--gae', '-gae', action='store_true')
    parser.add_argument('--lambd', '-ld', type=float, default=1.0)
    parser.add_argument('--threads', '-th', type=int, default=1)
    parser.add_argument('--max_threads_pool', '-max_tp', type=int, default=16)
    parser.add_argument('--thread_timeout', '-th_to', type=int, default=None)
    parser.add_argument('--offpol', '-ofp', action='store_true')
    parser.add_argument('--n_iter_pol', '-np', type=int, default=1)
    parser.add_argument('--n_iter_pol_sched',
                        '-nps',
                        type=str,
                        default='const',
                        choices=['const', 'exp_dec'])
    parser.add_argument('--n_iter_pol_exp_base',
                        '-npexpb',
                        type=int,
                        default=5)
    parser.add_argument('--n_iter_pol_exp_decay',
                        '-npexpd',
                        type=float,
                        default=0.95)
    parser.add_argument('--weight_importance_samp',
                        '-wis',
                        action='store_true')
    parser.add_argument('--record', '-rec', type=int, default=None)
    args = parser.parse_args()

    it_pol_fn = None
    if args.offpol:
        if args.n_iter_pol_sched == 'exp_dec':
            it_pol_fn = lambda it: \
                int(np.ceil(args.n_iter_pol * pow(args.n_iter_pol_exp_decay,it / args.n_iter_pol_exp_base)))

    if not (os.path.exists(args.logdir)):
        os.makedirs(args.logdir)
    logdir = args.exp_name + '_' + args.env_name + '_' + time.strftime(
        "%d-%m-%Y_%H-%M-%S")
    logdir = os.path.join(args.logdir, logdir)
    if not (os.path.exists(logdir)):
        os.makedirs(logdir)

    max_path_length = args.ep_len if args.ep_len > 0 else None
    start = time.time()

    for e in range(args.n_experiments):
        seed = args.seed + 10 * e
        print('Running experiment with seed %d' % seed)

        def train_func():
            train_PG(exp_name=args.exp_name,
                     env_name=args.env_name,
                     n_iter=args.n_iter,
                     gamma=args.discount,
                     min_timesteps_per_batch=args.batch_size,
                     max_path_length=max_path_length,
                     learning_rate=args.learning_rate,
                     reward_to_go=args.reward_to_go,
                     animate=args.render,
                     logdir=os.path.join(logdir, '%d' % seed),
                     normalize_advantages=not (args.dont_normalize_advantages),
                     nn_baseline=args.nn_baseline,
                     seed=seed,
                     n_layers=args.n_layers,
                     size=args.size,
                     gae=args.gae,
                     lambd=args.lambd,
                     threads=args.threads,
                     max_threads_pool=args.max_threads_pool,
                     thread_timeout=args.thread_timeout,
                     offpol=args.offpol,
                     n_it_pol=args.n_iter_pol,
                     n_it_pol_fn=it_pol_fn,
                     wis=args.weight_importance_samp,
                     record=args.record)

        # Awkward hacky process runs, because Tensorflow does not like
        # repeatedly calling train_PG in the same thread.
        # p = Process(target=train_func)
        # p.start()
        # p.join()
        p = ProcessingPool(1)
        p.apipe(train_func).get()
        p.clear()
    print('All training took: {:.3f}s'.format(time.time() - start))
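
The single-worker pool used above is effectively a "run this function in a fresh process" helper: each experiment gets its own interpreter, which sidesteps TensorFlow's dislike of building the training graph repeatedly in one process. A minimal sketch of that pattern on its own (illustrative; the function and arguments are placeholders):

from pathos.multiprocessing import ProcessingPool

def run_in_fresh_process(func, *args, **kwargs):
    # apipe() schedules the call asynchronously in a worker process;
    # get() blocks until it finishes and returns the result.
    p = ProcessingPool(1)
    result = p.apipe(func, *args, **kwargs).get()
    p.clear()  # drop the cached pool so the next call starts clean
    return result

# Example: run_in_fresh_process(print, "hello from a worker process")
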
コード例 #23
0
ファイル: gdalwrapper.py プロジェクト: Guiming/PyCLiPSM
class tiledRasterReader:
    '''
    '''
    def __init__(self, srcRasterfile, xoff=0, yoff=0, xsize=None, ysize=None):
        '''
        '''
        self.srcRasterfile = srcRasterfile
        gdal.SetCacheMax(2**30)  # 1 GB
        self.ds = gdal.Open(self.srcRasterfile, gdalconst.GA_ReadOnly)
        self.fileList = self.ds.GetFileList()[1:]
        self.measurement_level_ints = []
        for fn in self.fileList:
            # default level of measurement
            msrlevel = conf.MSR_LEVEL_RATIO
            for keyword in conf.NOMINAL_KEYWORD_IN_FN:
                if keyword in fn:
                    msrlevel = conf.MSR_LEVEL_NOMINAL
                    break
            for key in conf.MSR_LEVELS:
                if conf.MSR_LEVELS[key] == msrlevel:
                    self.measurement_level_ints.append(int(key))
                    break
        self.measurement_level_ints = np.array(self.measurement_level_ints)

        self.nbands = self.ds.RasterCount
        self.nrows = self.ds.RasterYSize
        self.ncols = self.ds.RasterXSize
        self.geotransfrom = self.ds.GetGeoTransform()
        self.projection = self.ds.GetProjection()

        band = self.ds.GetRasterBand(1)
        self.nodata = band.GetNoDataValue()

        self.block_ysize_base = band.GetBlockSize()[0]
        self.block_xsize_base = gdal.Open(
            self.fileList[0],
            gdalconst.GA_ReadOnly).GetRasterBand(1).GetBlockSize()[0]

        self.__N_TilesRead = 0
        self.xoff, self.yoff = xoff, yoff

        if xsize is None:
            self.xsize = self.block_xsize_base
        elif xsize > self.ncols:
            print 'tile xsize exceeds RasterXsize', self.ncols
            sys.exit(1)
        else:
            self.xsize = xsize

        if ysize is None:
            self.ysize = self.block_ysize_base
        elif ysize > self.nrows:
            print 'tile ysize exceeds RasterYsize', self.nrows
            sys.exit(1)
        else:
            self.ysize = ysize

        ## estimated data size (in MB)
        self.estimate_TotalSize_MB = self.estimateTileSize_MB(
            self.nrows, self.ncols)
        self.estimate_TileSize_MB = self.estimateTileSize_MB(
            self.xsize, self.ysize)

        self.statistics = np.zeros((self.nbands, 4))
        for i in range(self.nbands):
            self.statistics[i] = self.ds.GetRasterBand(i + 1).GetStatistics(
                0, 1)

        self.MP_pool = None

    def estimateTileSize_MB(self, xsize=None, ysize=None):
        '''
        '''
        if xsize is None:
            xsize = self.xsize
        if ysize is None:
            ysize = self.ysize
        return np.array([
            1.0
        ]).astype('float32').nbytes / 1024.0**2 * xsize * ysize * self.nbands

    def readWholeRaster(self, multithread=conf.MULTITHREAD_READ):
        data = None
        if multithread:

            def threadReadingByBand(i, rasterfile):
                ''' each thread reads a whole band
                    using multiprocess pool
                '''
                import gdal, gdalconst, psutil, conf
                import numpy as np
                ds = gdal.Open(rasterfile, gdalconst.GA_ReadOnly)
                data = ds.GetRasterBand(i).ReadAsArray()
                return data

            # optimal for multi-thread reading by band
            n_threads = self.nbands
            if self.MP_pool is None:
                self.MP_pool = Pool(n_threads)

            ## multi-thread reading by band
            band_idx = range(1, n_threads + 1)
            fns = np.array([self.srcRasterfile]).repeat(n_threads)
            data = self.MP_pool.map(threadReadingByBand, band_idx, fns)
            data = np.stack(data, axis=0)
            self.MP_pool.clear()
        else:
            data = self.ds.ReadAsArray(xoff=0, yoff=0, xsize=None, ysize=None)

        ## nodatavalues
        data[data < self.nodata] = self.nodata
        return data

    def readNextTile(self,
                     xsize=None,
                     ysize=None,
                     multithread=conf.MULTITHREAD_READ):
        ## update xsize and ysize if needed
        ## PLEASE specify xsize, ysize ONLY ONCE (when reading the first tile)
        if xsize is not None: self.xsize = xsize
        if ysize is not None: self.ysize = ysize

        N_BLOCK_X = int(math.ceil(self.ncols * 1.0 / self.xsize))
        y = int(self.__N_TilesRead / N_BLOCK_X)
        x = self.__N_TilesRead - y * N_BLOCK_X

        self.xoff = min(x * self.xsize, self.ncols)
        xsize = min(self.xsize, self.ncols - self.xoff)

        self.yoff = min(y * self.ysize, self.nrows)
        ysize = min(self.ysize, self.nrows - self.yoff)

        if self.xoff == self.ncols or self.yoff == self.nrows:
            return (None, self.xoff, self.yoff, 0, 0)

        data = None
        if multithread:  ## multi-thread read

            def threadReadingByBand(i, param, rasterfile):
                ''' each thread reads a band, with tile dimension spec in param
                    using multiprocess pool
                '''
                import gdal, gdalconst
                import numpy as np
                ds = gdal.Open(rasterfile, gdalconst.GA_ReadOnly)
                data = ds.GetRasterBand(i).ReadAsArray(xoff=param[0],
                                                       yoff=param[1],
                                                       win_xsize=param[2],
                                                       win_ysize=param[3])
                return data

            # optimal for multi-thread reading by band
            n_threads = self.nbands  # - 1
            if self.MP_pool is None:
                self.MP_pool = Pool(n_threads)

            ## multi-thread reading by band
            params = []
            for i in range(n_threads):
                params.append([self.xoff, self.yoff, xsize, ysize])
            fns = np.array([self.srcRasterfile]).repeat(n_threads)
            band_idx = range(1, self.nbands + 1)
            data = self.MP_pool.map(threadReadingByBand, band_idx, params, fns)
            data = np.stack(data, axis=0)
            self.MP_pool.clear()

        else:  ## single-thread read
            data = self.ds.ReadAsArray(xoff=self.xoff,
                                       yoff=self.yoff,
                                       xsize=xsize,
                                       ysize=ysize)
        ## nodatavalues
        data[data < self.nodata] = self.nodata

        self.__N_TilesRead += 1

        return (data, self.xoff, self.yoff, xsize, ysize)

    def reset(self):
        ''' reset after reading tiles
        '''
        self.xoff, self.yoff = 0, 0
        self.__N_TilesRead = 0

    def extractByXY(self, x, y):
        ''' Extract raster value by x, y coordinates
        '''
        xoff = int((x - self.geotransfrom[0]) / self.geotransfrom[1])
        yoff = int((y - self.geotransfrom[3]) / self.geotransfrom[5])
        return self.ds.ReadAsArray(xoff, yoff, 1, 1)

    def extractByRC(self, c, r):
        '''Extract raster value by row, col
        '''
        return self.ds.ReadAsArray(c, r, 1, 1)

    def close(self):
        self.ds = None
        if self.MP_pool is not None:
            self.MP_pool.clear()
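
One detail worth noting in the reader above: pathos' ProcessingPool.map accepts several sequences and zips them across workers, like the built-in map, which is how a band index and a filename are handed to each worker together. A tiny illustrative sketch of that calling convention (the function and filenames are placeholders):

from pathos.multiprocessing import ProcessingPool as Pool

def read_band(band_index, filename):
    # Placeholder per-band work; in the class above each worker opens the
    # raster itself and reads one band.
    return (filename, band_index)

pool = Pool(3)
# Two argument sequences are zipped element-wise, like map(f, xs, ys).
out = pool.map(read_band, [1, 2, 3], ["a.tif", "a.tif", "a.tif"])
pool.close()
pool.join()
pool.clear()
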
コード例 #24
0
def compute_DTW_to_each_drone(drones_df_ls,
                              win_size,
                              no_sensors_cols,
                              per_series=False,
                              process_gps=True,
                              use_scaler=True):
    print('Start compute DTW')

    dataset = pd.concat(drones_df_ls)
    dataset = dataset.sort_values(['iter', 'update_step',
                                   'drone']).reset_index(drop=True)
    drones = dataset.drone.unique()
    numOfDrones = len(drones)

    start = time.time()
    # iter = '0simple'
    # dataset_iteri = dataset.loc[dataset['iter'] == iter, :]
    iters = dataset.iter.unique()

    # create empty df for results

    # iterate over iterations
    def compute_DTW_on_iter(dataset,
                            iter,
                            numOfDrones,
                            drones,
                            per_series=True):
        print('iter: ', iter)
        dtw_results_dict = {
            'iter': [],
            'update_step': [],
            'drone': [],
            'comparison_drone': [],
            'DTW_dist': []
        }
        # print('iter: ',iter )
        dataset_iter = dataset.loc[dataset['iter'] == iter, :]
        # cut the df by current update step-win size
        update_step_ls = dataset_iter.update_step.unique()
        # number of features (all columns minus the non-sensor columns and the label)
        num_of_features = dataset_iter.shape[1] - len(no_sensors_cols +
                                                      ['label'])
        # iterate over time steps
        for update_step in update_step_ls:
            current_seq = dataset_iter.loc[
                (dataset_iter['update_step'] <= update_step)
                & (dataset_iter['update_step'] > (update_step - win_size))]
            # iterate over drones
            for droneIidx in range(numOfDrones):
                currentDrone = drones[droneIidx]
                currentDroneDf = current_seq.loc[current_seq.drone ==
                                                 currentDrone, :]
                # drop irrelevant cols and convert to numpy
                currentDroneNp = currentDroneDf.drop(
                    no_sensors_cols + ['label'], 1).to_numpy()
                if use_scaler:
                    scaled_currentDroneNp = StandardScaler().fit_transform(
                        currentDroneNp)
                else:
                    scaled_currentDroneNp = currentDroneNp
                for droneJidx in range(numOfDrones):
                    # skip self-comparisons and compute each pair only once (droneIidx < droneJidx)
                    if (droneIidx >= droneJidx): continue
                    # print(droneIidx, droneJidx)
                    otherDrone = drones[droneJidx]
                    otherDroneDf = current_seq.loc[current_seq.drone ==
                                                   otherDrone, :]
                    otherDroneNp = otherDroneDf.drop(
                        no_sensors_cols + ['label'], 1).to_numpy()
                    if use_scaler:
                        scaled_otherDroneNp = StandardScaler().fit_transform(
                            otherDroneNp)
                    else:
                        scaled_otherDroneNp = otherDroneNp
                    """compute DTW"""

                    if per_series:  # compute between each pair of series, return list

                        dist = [
                            dtw_path(scaled_currentDroneNp[:, i],
                                     scaled_otherDroneNp[:, i])[1]
                            for i in range(num_of_features)
                        ]
                        dist = np.array(dist)
                    else:
                        # path, dist = dtw_path(scaled_currentDroneNp, scaled_otherDroneNp)
                        path = ''
                        dist = dtw(scaled_currentDroneNp,
                                   scaled_otherDroneNp,
                                   window_type="sakoechiba",
                                   window_args={
                                       'window_size': 60
                                   }).distance
                    # print('Iter {} updatestep {} DroneI {} DroneJ {} DTW {}'.format(iter,update_step,currentDrone, otherDrone, dist))
                    # save results of current drone
                    dtw_results_dict['iter'].append(iter)
                    dtw_results_dict['update_step'].append(update_step)
                    dtw_results_dict['drone'].append(currentDrone)
                    dtw_results_dict['comparison_drone'].append(otherDrone)
                    dtw_results_dict['DTW_dist'].append(
                        dist)  # ; dtw_results_dict['DTW_path'].append(path)
                    # save results of other drone
                    dtw_results_dict['iter'].append(iter)
                    dtw_results_dict['update_step'].append(update_step)
                    dtw_results_dict['drone'].append(otherDrone)
                    dtw_results_dict['comparison_drone'].append(currentDrone)
                    dtw_results_dict['DTW_dist'].append(
                        dist)  # ; dtw_results_dict['DTW_path'].append(path)

        print('iter done: ', iter)
        return dtw_results_dict

    workers = multiprocessing.cpu_count()
    print('Number of workers: ', workers)
    workers = np.min([workers, len(iters)])
    pool = ProcessingPool(workers)
    list_of_iters_dict = list(
        pool.map(
            lambda iter: compute_DTW_on_iter(dataset, iter, numOfDrones,
                                             drones, per_series), iters))
    pool.close()
    pool.join()
    pool.terminate()
    pool.clear()
    # from list of dicts to one dict
    dtw_results_dict = {
        'iter': [],
        'update_step': [],
        'drone': [],
        'comparison_drone': [],
        'DTW_dist': []
    }
    for iter_results in list_of_iters_dict:
        for result_key, values in iter_results.items():
            dtw_results_dict[result_key].extend(values)

    print('time took: ', time.time() - start)

    dtw_results_df = pd.DataFrame.from_dict(dtw_results_dict)
    dtw_results_df = dtw_results_df.sort_values(
        ['iter', 'update_step', 'drone']).reset_index(drop=True)

    dtw_results_df_after_removal_ls = []

    return dtw_results_df
コード例 #25
0
import numpy as np
import pandas as pd
import multiprocessing as mp
from pathos.multiprocessing import ProcessingPool as Pool

df = pd.DataFrame(np.random.randint(3, 10, size=[500, 2]))


def func(df):
    return df.shape


#cores=mp.cpu_count()
cores = 8

df_split = np.array_split(df, cores, axis=0)

# create the multiprocessing pool
pool = Pool(cores)

# process the DataFrame by mapping function to each df across the pool
df_out = np.vstack(pool.map(func, df_split))

# close down the pool and join
pool.close()
pool.join()
pool.clear()
コード例 #26
0
    def __simLocs2Samples(self, X, parallel = True, nprocess = conf.N_PROCESS):
        ''' compute similarity between locations to predict and samples
            return: a matrix of similarity values, each row is a location, each column is a sample
        '''
        ## this import is necessary [on Windows]:
        # http://stackoverflow.com/questions/28445373/python-import-numpy-as-np-from-outer-code-gets-lost-within-my-own-user-defined
        import numpy as np
        import raster, points, util, conf
        def simLoc2SamplesV0(loc_ev, datapkg): # this function is needed for parallel computing using multiprocessing
            import conf
            # unpack data in datapkg
            t0 = time.time()
            sample_evs = datapkg[0]
            evs = datapkg[1]
            SD_evs = datapkg[2]
            conf.TIME_KEEPING_DICT['parts']['data_transfer'].append(time.time()-t0)
            # number of environmental variables
            M = SD_evs.size
            # number of samples
            N = np.shape(sample_evs)[0]
            sim = np.zeros(N)
            t0 = time.time()
            for i in range(N): # for each sample
                sim0 = np.zeros(M)
                sample_ev = sample_evs[i]
                for j in range(M): # for each environmental variable
                    evi = loc_ev[j]
                    evj= sample_ev[j]
                    msrlevel = self.__envrasters[j].getMsrLevel()
                    if msrlevel == conf.MSR_LEVEL_NOMINAL or msrlevel == conf.MSR_LEVEL_ORDINAL:
                        if evi == evj:
                            sim_i = 1.0
                        else:
                            sim_i = 0.0
                    else:
                        SD_ev = SD_evs[j]
                        ev = evs[:,j]
                        SD_evj = np.sqrt(np.mean((ev - evj) ** 2))
                        sim_i = np.exp(-0.5 * (evi - evj) ** 2 / (SD_ev ** 2 / SD_evj) ** 2)
                    sim0[j] = sim_i
                sim[i] = np.min(sim0) ## limiting factor
            conf.TIME_KEEPING_DICT['parts']['compute'].append(time.time()-t0)
            return sim

        def simLoc2Samples(loc_ev, datapkg): # this function is needed for parallel computing using multiprocessing
            import conf ## IMPORTANT - makes **conf.MSR_LEVELS** visible
            # unpack data in datapkg
            t0 = time.time()
            sample_evs = datapkg[0]
            REVS = datapkg[1]
            SD_evs = datapkg[2]
            AVG_evs = datapkg[3]
            SUM_DIF_SQ_AVG = datapkg[4]
            # Guiming 3/31/2019
            MSRLEVES = datapkg[5]
            conf.TIME_KEEPING_DICT['parts']['data_transfer'].append(time.time()-t0)
            # number of environmental variables
            M = SD_evs.size
            # number of samples
            N = np.shape(sample_evs)[0]

            sim = np.zeros(N)
            t0 = time.time()
            for i in range(N): # for each sample
                sim0 = np.zeros(M)
                sample_ev = sample_evs[i]

                for j in range(M): # for each environmental variable
                    evi = loc_ev[j]
                    evj= sample_ev[j]
                    # Guiming 3/31/2019 - SAVES MEM, NO NEED TO DISPATCH self.__envrasters TO EACH THREAD
                    msrlevel = MSRLEVES[j]
                    ## the line below does not work without ** import conf ** at the beginning of this function
                    if msrlevel == conf.MSR_LEVEL_NOMINAL or msrlevel == conf.MSR_LEVEL_ORDINAL:
                    #if msrlevel == 'nominal' or msrlevel == 'ordinal':
                        if evi == evj:
                            sim_i = 1.0
                        else:
                            sim_i = 0.0
                    else:
                        SD_ev = SD_evs[j]
                        delta = sample_ev[j] - AVG_evs[j]
                        tmp = SUM_DIF_SQ_AVG[j] + REVS * delta**2
                        SD_evj = np.sqrt(tmp/REVS)
                        sim_i = np.exp(-0.5 * (evi - evj) ** 2 / (SD_ev ** 2 / SD_evj) ** 2)

                    sim0[j] = sim_i
                sim[i] = np.min(sim0) ## limiting factor
            conf.TIME_KEEPING_DICT['parts']['compute'].append(time.time()-t0)
            return sim

        try:
            # do dimension match check here
            if np.shape(X)[1] != len(self.__envrasters):
                print 'dimension mismatch in computing similarity in iPSM'
                sys.exit(1)

            msr_levels = []
            if conf.NAIVE:
                evs = np.zeros((self.__envrasters[0].getData().size, len(self.__envrasters)))
            SD_evs = np.zeros(len(self.__envrasters))
            AVG_evs = np.zeros(len(self.__envrasters))
            for i in range(len(self.__envrasters)):
                if conf.NAIVE:
                    evs[:, i] = self.__envrasters[i].getData().T
                msr_levels.append(self.__envrasters[i].getMsrLevel())
                SD_evs[i] = self.__envrasters[i].std
                AVG_evs[i] = self.__envrasters[i].mean

            NROWS = np.shape(X)[0]

            REVS = self.__envrasters[0].getData().size
            SUM_DIF_SQ_AVG = REVS * SD_evs**2

            samples_evs = util.extractCovariatesAtPoints(self.__envrasters, self.__soilsamples)
            samples_evs = np.array(samples_evs).T

            if not parallel:
                sim = np.zeros((NROWS, self.__soilsamples.size))
                for i in range(NROWS):
                    if conf.NAIVE: ## naive implementaton
                        sim[i,:] = self.__simLoc2SamplesV0(X[i], evs, SD_evs)
                    else: ## with optimizations
                        sim[i,:] = self.__simLoc2Samples(X[i], samples_evs, REVS, SD_evs, AVG_evs, SUM_DIF_SQ_AVG)
            else:
                datapkg = []
                for i in range(NROWS):
                    if conf.NAIVE: ## naive implementaton
                        datapkg.append([samples_evs, evs, SD_evs])
                    else:
                        # Guiming 3/31/2019
                        datapkg.append([samples_evs, REVS, SD_evs, AVG_evs, SUM_DIF_SQ_AVG, msr_levels])

                #print 'n process', nprocess
                pool = Pool(nprocess)

                t0 = time.time()
                if conf.NAIVE: ## naive implementaton
                    sim = np.array(pool.map(simLoc2SamplesV0, X, datapkg))
                else:
                    sim = np.array(pool.map(simLoc2Samples, X, datapkg))
                conf.TIME_KEEPING_DICT['parts']['compute'].append(time.time()-t0)
                pool.clear()

            return sim

        except Exception as e:
            raise
コード例 #27
0
 def compute_ota_relative_coordinates( self ):
     
     Logger.get_instance().info( 'Starting the computation of relative ORF start and stop coordinates' +
                                 ' (registered in the ORFTranscriptAsso table).')
     
     # Get information related to the ORF
     # Query the database in order to get, for each unique entry of the ORFTranscriptAsso table:
     # - Its unique ID in the database
     # - The ID of its ORF-related entry, as well as the chromosome, 
     #   start and stop positions of the ORF
     # NB: Query is performed using raw SQL statement for better efficiency
     orf_info_sql_statement = 'SELECT ORFTranscriptAsso.id, ORFTranscriptAsso.orf_id,\
                                      ORF.chromosome, ORF.start_pos, ORF.stop_pos AS end_pos \
                               FROM ORF \
                               INNER JOIN ORFTranscriptAsso ON ORFTranscriptAsso.orf_id = ORF.id'
     if ( not self.force_overwrite ):
         orf_info_sql_statement += ' WHERE ( ORFTranscriptAsso.rel_start_pos IS NULL ) \
                                           OR ( ORFTranscriptAsso.rel_stop_pos IS NULL)'
     orf_info_df = pd.read_sql( orf_info_sql_statement, SQLManagerPRO.get_instance().get_engine() )
     SQLManagerPRO.get_instance().close_session()
     
     
     # Get information related to the transcript
     # Query the database in order to get, for each unique entry of the ORFTranscriptAsso table:
     # - Its unique ID in the database
     # - The ID of its Transcript-related entry
     # NB: All "UNKNOWN_TRANSCRIPT" entries are excluded as an official ID is needed to perform
     #     the conversion.
     # NB: Query is performed using raw SQL statement for better efficiency
     transcript_info_sql_statement = "SELECT ORFTranscriptAsso.id, ORFTranscriptAsso.transcript_id, \
                                             Transcript.transcript_id AS tr_id \
                                      FROM Transcript \
                                      INNER JOIN ORFTranscriptAsso ON ORFTranscriptAsso.transcript_id = Transcript.id \
                                      WHERE Transcript.transcript_id != '" + Constants.UNKNOWN_TRANSCRIPT + "'"        
     transcript_info_df = pd.read_sql( transcript_info_sql_statement, SQLManagerPRO.get_instance().get_engine() )
     SQLManagerPRO.get_instance().close_session()
     
     
     # Merge information from the two data frames in order to get
     # a data frame with the following columns:
     # - id: The ORFTranscriptAsso unique ID
     # - orf_id: The ORF unique ID
     # - chromosome: The ORF chromosome name
     # - start_pos: The ORF start position
     # - end_pos: The ORF stop position
     # - transcript_id: The Transcript unique ID
     # - tr_id: The transcript official ID (e.g. Ensembl ID)
     ota_info_df = orf_info_df.merge( transcript_info_df, 
                                      on='id', 
                                      how = 'inner', 
                                      validate = 'one_to_one' )
     Logger.get_instance().debug( 'ComputeRelCoordStrategy.compute_ota_relative_coordinates(): ' +
                                  str( ota_info_df.shape[0] ) + ' ORFTranscriptAsso entries are' +
                                  ' expected to be processed.')
     
     # As the conversion of coordinates in R may be highly time-consuming,
     # split the data frame into small data frames and multi-process the 
     # computation
     # Split the data frame into smaller data frames that can be processed 
     # independently from each other
     subset_data_frames = [ ota_info_df[ min_bound : min_bound + Constants.MAX_ENTRIES_PER_DATAFRAME ] \
                            for min_bound in xrange( 0, 
                                                     ota_info_df.shape[ 0 ], 
                                                     Constants.MAX_ENTRIES_PER_DATAFRAME ) ]
     
     # For each of the subset data frame, process it with R in order
     # to build a dataset containing the start and stop relative
     # coordinates.
     # Instantiate the list of tuple-embedded arguments necessary to
     # compute the relative coordinates
     args_to_run_r = []
     filename_prefix = self.OTA_CSV_FILE_PREFIX
     filename_suffix = 0
     for df in subset_data_frames:
         args_to_run_r.append( ( df,
                                 self.species, 
                                 self.ensembl_release_version, 
                                 filename_prefix,
                                 filename_suffix ) )
         filename_suffix += 1
             
     # Instantiate the pool of processes
     p = Pool( self.thread_nb )
     messages_to_log = p.map( self.compute_relative_coord_r, args_to_run_r )
     p.close()
     # Wait for all processes to be completed
     p.join()
     
     # Log the messages generated by the processes
     for messages in messages_to_log:
         
         ( debug_messages_to_log,
           stdout,
           stderr ) = messages
           
         for message in debug_messages_to_log:
             Logger.get_instance().debug( message )
         
         if ( stdout != '' ):
             Logger.get_instance().debug( 'ComputeRelCoordStrategy.compute_relative_coord_r():' +
                                          ' The R script returned the following standard output: \n' + 
                                          stdout )
         
         # NB: As the R function may write messages to stderr that are
         #     not error-related, these messages are also logged at the
         #     debug level
         if ( stderr != '' ):
             Logger.get_instance().debug( 'ComputeRelCoordStrategy.compute_relative_coord_r():' +
                                          ' The R script returned the following error output: \n' + 
                                          stderr )
     
     # Sequentially open CSV files to get the relative positions
     # Instantiate a dictionary that associate to the ORFTranscriptAsso ID
     # the relative start and stop positions of the ORF
     rel_positions_dict = {}
     for file_nb in range( filename_suffix ):
         
         df = pd.read_csv( os.path.join( ComputeRelCoordStrategy.RELATIVE_COORD_CSV_FOLDER,
                                         filename_prefix + str( file_nb ) + '.csv' ),
                           sep = ',',
                           encoding = 'utf-8' )
         
         for ( index, row ) in df.iterrows():
             rel_positions_dict[ row[ 'id' ] ] = ( row[ 'rel_start_pos' ], row[ 'rel_end_pos' ] )
     
     
     # Add the relative start and stop positions for all the ORFTranscriptAsso entries 
     all_ota = SQLManagerPRO.get_instance().get_session().query( 
                                                                     ORFTranscriptAsso 
                                                                 ).filter( 
                                                                             ORFTranscriptAsso.id.in_( rel_positions_dict.keys() ) 
                                                                         ).all()
     for ota in all_ota:
         
         # Get the start and stop positions
         positions = rel_positions_dict.get( ota.id )
         rel_start_pos = positions[ 0 ] 
         rel_stop_pos = positions[ 1 ] 
         
         if not pd.isna( rel_start_pos ):
             ota.rel_start_pos = int( rel_start_pos )
         
         if not pd.isna( rel_stop_pos ):
             ota.rel_stop_pos = int( rel_stop_pos )
     
     # Commit the updates and close the session
     SQLManagerPRO.get_instance().commit()
     SQLManagerPRO.get_instance().close_session()
     
     # Delete the pool instance
     p.clear()
コード例 #28
0
ファイル: rpki_file.py プロジェクト: jfuruness/lib_bgp_data
class RPKI_File:
    """This class gets validity data from ripe"""

    __slots__ = ["path", "total_lines", "_process"]

    _dir = "/tmp/"
    hosted_name = "upo_csv_path.csv.gz"
    port = 8000

    def __init__(self, table_input):
        """Downloads and stores roas from a json"""

        self.path = self._dir + self.hosted_name.replace(".gz", "")
        with Unique_Prefix_Origins_Table(clear=True) as _db:
            _db.fill_table(table_input)
            _db.copy_table(self.path)
            self.total_lines = utils.get_lines_in_file(self.path)
            self._gzip_file()

#################################
### Context Manager Functions ###
#################################

    def __enter__(self):
        """What to do when the context manager is called on this class

        Starts the process for serving the file"""

        self.spawn_process()
        return self

    def __exit__(self, type, value, traceback):
        """Closes the file process"""

        self.close()

############################
### Serve File Functions ###
############################

    def spawn_process(self):
        """Spawns file serving process"""

        utils.kill_port(self.port)
        self._process = ProcessingPool()
        self._process.apipe(self._serve_file)
        logging.debug("Served RPKI File")

    def close(self):
        """Closes file process"""

        utils.kill_port(self.port, wait=False)
        self._process.close()
        self._process.terminate()
        self._process.join()
        self._process.clear()
        # changed to absolute path
        utils.delete_paths(os.path.join(self._dir, self.hosted_name))
        logging.debug("Closed RPKI File")

########################
### Helper Functions ###
########################

    def _gzip_file(self):
        """gzips the file for proper formatting in rpki validator"""

        with open(self.path, 'rb') as f_in, gzip.open(
                os.path.join(self._dir, self.hosted_name), 'wb') as f_out:

            f_out.writelines(f_in)

        utils.delete_paths(self.path)

    def _serve_file(self):
        """Makes a simple http server and serves a file in /tmp"""
        class Handler(http.server.SimpleHTTPRequestHandler):
            def __init__(self, *args, **kwargs):
                super().__init__(*args, **kwargs)

        # Changes directory to be in /tmp
        os.chdir(self._dir)
        # Serve the file on port 8000
        socketserver.TCPServer(("", RPKI_File.port), Handler).serve_forever()
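
spawn_process above shows the fire-and-forget use of apipe: the result is never fetched, so the worker simply keeps running the blocking serve_forever() call until close() tears the pool down. A stripped-down sketch of that idea (illustrative; the blocking function is a placeholder):

import time
from pathos.multiprocessing import ProcessingPool

def blocking_server():
    # Stand-in for a call such as serve_forever() that never returns.
    while True:
        time.sleep(1)

pool = ProcessingPool(1)
pool.apipe(blocking_server)   # returns immediately; the worker keeps serving
# ... do other work while the server runs ...
pool.close()
pool.terminate()              # stop the worker stuck in the blocking call
pool.join()
pool.clear()
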
コード例 #29
0
def make_patches(data_root, patches_root, patch_size, outline_filled=None, remove_filled=False, min_widths=('def',),
                 mirror=True, rotations=(0,), translations=((0, 0),), distinguishability_threshold=.5, num_workers=0,
                 random_samples=None, leave_width_percentile=None):
    if num_workers != 0:
        from pathos.multiprocessing import cpu_count, ProcessingPool
        from pathos.threading import ThreadPool
        if num_workers == -1:
            optimal_workers = cpu_count() - 1
            workers_pool = ProcessingPool(optimal_workers)
        else:
            workers_pool = ProcessingPool(num_workers)
        print(f'Workers pool: {workers_pool}')

        savers_pool = ThreadPool(1)
        saving_patches_in_bg = savers_pool.amap(lambda a: None, [])
    else:
        workers_pool = 0

    path = lambda basename, origin, width='def', ori='def', rot=0, t=(0, 0): os.path.join(patches_root, basename,
                                                                                          '{}x{}'.format(*patch_size),
                                                                                          'width_{}'.format(width),
                                                                                          'orientation_{}'.format(ori),
                                                                                          'rotated_deg_{}'.format(rot),
                                                                                          'translated_{}_{}'.format(*t),
                                                                                          '{}_{}.svg'.format(*origin))

    orientations = ['def']
    if mirror:
        orientations.append('mir')

    if random_samples is not None:
        min_widths_all = deepcopy(min_widths)
        orientations_all = deepcopy(orientations)
        rotations_all = deepcopy(rotations)
        translations_all = deepcopy(translations)

    source_images = glob(os.path.join(data_root, '**', '*.svg'), recursive=True)
    for file in source_images:
        print('Processing file {}'.format(file))
        basename = file[len(data_root) + 1:-4]  # strip data_root prefix and the .svg extension

        vector_image = VectorImage.from_svg(file)
        if remove_filled:
            vector_image.remove_filled()
        if outline_filled is not None:
            vector_image.leave_only_contours(outline_filled)
        if leave_width_percentile is not None:
            vector_image.leave_width_percentile(leave_width_percentile)

        if random_samples is not None:
            min_widths = np.random.choice(min_widths_all, size=min(random_samples, len(min_widths_all)), replace=False)
            orientations = np.random.choice(orientations_all, size=min(random_samples, len(orientations_all)),
                                            replace=False)
            rotations = np.random.choice(rotations_all, size=min(random_samples, len(rotations_all)), replace=False)
            translations = translations_all[
                np.random.choice(len(translations_all), size=min(random_samples, len(translations_all)), replace=False)]

        for width in min_widths:
            print('\twidth {}'.format(width))
            if width == 'def':
                vector_image_scaled = vector_image
            else:
                vector_image_scaled = vector_image.copy()
                vector_image_scaled.scale_to_width('min', width)
            for orientation in orientations:
                print('\t\torientation {}'.format(orientation))
                if orientation == 'def':
                    vector_image_reoriented = vector_image_scaled
                else:
                    vector_image_reoriented = vector_image_scaled.mirrored()
                for rotation in rotations:
                    print('\t\t\trotation {}'.format(rotation))
                    vector_image_rotated = vector_image_reoriented.rotated(rotation, adjust_view=True)
                    for translation in translations:
                        print('\t\t\t\ttranslation {}'.format(translation))
                        vector_image_translated = vector_image_rotated.translated(translation, adjust_view=True)

                        vector_patches = vector_image_translated.split_to_patches(patch_size, workers=workers_pool)
                        if num_workers != 0:
                            print('\t\t\t\t\twaiting for previous batch to be saved')
                            saving_patches_in_bg.get()

                        def simplify_and_save(vector_patch, basename=basename, width=width, orientation=orientation,
                                              rotation=rotation, translation=translation):
                            vector_patch.simplify_segments(distinguishability_threshold=distinguishability_threshold)
                            if len(vector_patch.paths) == 0:
                                return
                            save_path = path(basename,
                                             (int(vector_patch.x.as_pixels()), int(vector_patch.y.as_pixels())), width,
                                             orientation, rotation, translation)
                            os.makedirs(os.path.dirname(save_path), exist_ok=True)
                            vector_patch.save(save_path)

                        if num_workers == 0:
                            print('\t\t\t\t\tsaving patches')
                            for vector_patch in vector_patches.reshape(-1):
                                simplify_and_save(vector_patch)
                        else:
                            print('\t\t\t\t\tsaving patches')
                            saving_patches_in_bg = savers_pool.amap(simplify_and_save, vector_patches.reshape(-1))

    if num_workers != 0:
        workers_pool.close()
        workers_pool.join()
        workers_pool.clear()

        savers_pool.close()
        savers_pool.join()
        savers_pool.clear()
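
Note on the pattern above: savers_pool.amap returns immediately with an async result, and .get() blocks until that batch of patches has been written, so saving overlaps with preparing the next transform. A minimal, self-contained sketch of this overlap with pathos (the function and sizes below are illustrative, not taken from the example above):

from pathos.multiprocessing import ProcessingPool as Pool

def slow_square(x):
    return x * x

pool = Pool(4)
async_result = pool.amap(slow_square, range(8))   # returns immediately
# ... prepare the next batch of work here while the pool computes ...
print(async_result.get())                         # blocks until done: [0, 1, 4, 9, 16, 25, 36, 49]
pool.close()
pool.join()
pool.clear()
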
Code example #30
0
def build_full_hamiltonian_parallel1(clustered_ham_in,
                                     ci_vector_in,
                                     iprint=1,
                                     nproc=None,
                                     opt_einsum=True):
    """
    Build hamiltonian in basis in ci_vector

    parallelized over fock space blocks -- inefficient
    """
    # {{{
    global clusters
    global ci_vector
    global clustered_ham

    print(" In build_full_hamiltonian_parallel1. nproc=", nproc)

    clustered_ham = clustered_ham_in
    ci_vector = ci_vector_in
    clusters = clustered_ham_in.clusters

    H = np.zeros((len(ci_vector), len(ci_vector)))
    n_clusters = len(clusters)

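    # Compute one (bra, ket) fock-sector block of H; returns None if the pair is skipped or the sectors are not coupled by any term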
    def compute_parallel_block(f):
        fock_l = f[0]
        fock_li = f[1]
        fock_r = f[2]
        fock_ri = f[3]

        diagonal = False
        if fock_l == fock_r:
            diagonal = True

        if fock_li > fock_ri:
            return

        #print("Processing the block: ")
        #print(fock_l,fock_r)

        configs_l = ci_vector[fock_l]
        configs_r = ci_vector[fock_r]

        Hblock = np.zeros((len(configs_l), len(configs_r)))

        delta_fock = tuple([(fock_l[ci][0] - fock_r[ci][0],
                             fock_l[ci][1] - fock_r[ci][1])
                            for ci in range(len(clusters))])
        try:
            terms = clustered_ham.terms[delta_fock]
        except KeyError:
            return
        for term in terms:
            # Compute the state sign now - since it only depends on fock spaces
            state_sign = 1

            term_exists = True
            for oi, o in enumerate(term.ops):
                if o == '':
                    continue
                if len(o) == 1 or len(o) == 3:
                    for cj in range(oi):
                        state_sign *= (-1)**(fock_r[cj][0] + fock_r[cj][1])

                # Check to make sure each cluster is allowed to make the requested transition
                try:
                    do = clusters[oi].ops[o]
                except KeyError:
                    print(" Couldn't find:", term)
                    exit()
                try:
                    d = do[(fock_l[oi], fock_r[oi])]
                    #d = do[(fock_bra[oi],fock_ket[oi])][bra[oi],ket[oi]] #D(I,J,:,:...)
                except KeyError:
                    term_exists = False
            if not term_exists:
                continue

            for config_li, config_l in enumerate(configs_l):
                idx_l = config_li
                #idx_l = fock_space_shifts[fock_li] + config_li
                for config_ri, config_r in enumerate(configs_r):
                    idx_r = config_ri
                    #idx_r = fock_space_shifts[fock_ri] + config_ri

                    if diagonal and idx_r < idx_l:
                        continue

                    # Check to make sure each cluster is diagonal except if active
                    allowed = True
                    for ci in range(n_clusters):
                        if (config_l[ci] !=
                                config_r[ci]) and (ci not in term.active):
                            allowed = False
                    if not allowed:
                        continue

                    me = term.matrix_element(fock_l, config_l, fock_r,
                                             config_r)
                    #                    #d = do[(fock_bra[oi],fock_ket[oi])][bra[oi],ket[oi]] #D(I,J,:,:...)
                    #                    mats = []
                    #                    for ci in term.active:
                    #                        mats.append( clusters[ci].ops[term.ops[ci]][(fock_l[ci],fock_r[ci])][config_l[ci],config_r[ci]] )
                    #
                    #                    me = 0.0
                    #
                    #                    if len(mats) != len(term.active):
                    #                        continue
                    #
                    #                    #check that the mats where treated as views and also contiguous
                    #                    #for m in mats:
                    #                    #    print(m.flags['OWNDATA'])  #False -- apparently this is a view
                    #                    #    print(m.__array_interface__)
                    #                    #    print()
                    #
                    #                    # todo:
                    #                    #    For some reason, precompiled contract expression is slower than direct einsum - figure this out
                    #                    #me = term.contract_expression(*mats) * state_sign
                    #                    me = np.einsum(term.contract_string,*mats,term.ints) * state_sign

                    Hblock[idx_l, idx_r] += me

                    if diagonal and idx_r > idx_l:
                        Hblock[idx_r, idx_l] += me
        return Hblock

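    # Row/column offset of each fock block within the full Hamiltonian (prefix sums of the block sizes)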
    fock_space_shifts = [0]
    for fi, f in enumerate(ci_vector.fblocks()):
        configs_i = ci_vector[f]
        fock_space_shifts.append(fock_space_shifts[-1] + len(configs_i))

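    # Enumerate unique (bra, ket) fock-block pairs; only the upper triangle is computed, the rest follows by symmetry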
    fock_space_blocks = []
    for fock_li, fock_l in enumerate(ci_vector.data):
        for fock_ri, fock_r in enumerate(ci_vector.data):
            if fock_li > fock_ri:
                continue
            fock_space_blocks.append((fock_l, fock_li, fock_r, fock_ri))

    #for f in fock_space_blocks:
    #    compute_parallel_block(f)

    from pathos.multiprocessing import ProcessingPool as Pool

    if nproc is None:
        pool = Pool()
    else:
        pool = Pool(nproc)

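    # Debug helper (unused): prints which fock-block pairs would be processed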
    def test(f):
        fock_l = f[0]
        fock_li = f[1]
        fock_r = f[2]
        fock_ri = f[3]

        if fock_li > fock_ri:
            return
        print(fock_l, fock_r)

        configs_l = ci_vector[fock_l]
        configs_r = ci_vector[fock_r]

    #pool.map(test, fock_space_blocks)
    Hblocks = pool.map(compute_parallel_block, fock_space_blocks)

    pool.close()
    pool.join()
    pool.clear()

    for fi, f in enumerate(fock_space_blocks):
        fock_l = f[0]
        fock_li = f[1]
        fock_r = f[2]
        fock_ri = f[3]
        start_l = fock_space_shifts[fock_li]
        stop_l = fock_space_shifts[fock_li + 1]
        start_r = fock_space_shifts[fock_ri]
        stop_r = fock_space_shifts[fock_ri + 1]

        # Skipped blocks (no coupling terms between these fock sectors) come back as None
        if Hblocks[fi] is not None:
            H[start_l:stop_l, start_r:stop_r] = Hblocks[fi]
            if fock_l != fock_r:
                H[start_r:stop_r, start_l:stop_l] = Hblocks[fi].T

    return H
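
The assembly loop above relies on symmetry: only blocks with fock_li <= fock_ri are built in parallel, and each off-diagonal block is transposed into the lower triangle. A minimal, self-contained sketch of that blocked upper-triangle pattern with plain NumPy blocks (the block sizes and the fill function are illustrative assumptions, not part of the code above):

import numpy as np
from pathos.multiprocessing import ProcessingPool as Pool

sizes = [2, 3, 4]                                   # illustrative block sizes
shifts = np.concatenate(([0], np.cumsum(sizes)))    # block offsets in the full matrix
pairs = [(i, j) for i in range(len(sizes)) for j in range(len(sizes)) if i <= j]

def fill_block(pair):
    i, j = pair
    # stand-in for the expensive matrix-element work done per block
    return np.full((sizes[i], sizes[j]), float(i + j))

pool = Pool(2)
blocks = pool.map(fill_block, pairs)
pool.close()
pool.join()
pool.clear()

H = np.zeros((shifts[-1], shifts[-1]))
for (i, j), block in zip(pairs, blocks):
    H[shifts[i]:shifts[i + 1], shifts[j]:shifts[j + 1]] = block
    if i != j:                                      # mirror into the lower triangle
        H[shifts[j]:shifts[j + 1], shifts[i]:shifts[i + 1]] = block.T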