Code Example #1
def GroupByParallelProcess(tweetsDF, cores, groupMethod):
    """
    Group by and aggregate on time via a parallel process
    """

    tweetsDF.label_date = tweetsDF.label_date.astype(int)
    tweetsDF = tweetsDF.set_index("label_date")
    # Parallelizing using Pool.apply()
    df_split = GetListOfSplitDFs(tweetsDF, cores)
    # create the multiprocessing pool
    pool = Pool(cores)
    # process the DataFrame by mapping function to each df across the pool
    logging.info("Starting the grouping and aggregating process.")
    if groupMethod == "weighted-average":
        df_out = pool.map(PerformGroupbyAndAggregate, df_split)
    elif groupMethod == "sum":
        df_out = pool.map(PerformSum, df_split)
    elif groupMethod == "mean":
        df_out = pool.map(PerformMean, df_split)
    else:
        logging.error("Invalid groupMethod %r; expected 'weighted-average', 'sum' or 'mean'.", groupMethod)
        pool.close()
        pool.join()
        return None

    # close down the pool and join
    pool.close()
    pool.join()
    pool.clear()

    logging.info("Ended the grouping and aggregating process.")

    return df_out
Code Example #2
File: submission.py  Project: ruanchaves/BERT-WS
def parallelize_dataframe(df, func, n_cores=16):
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df
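
A minimal usage sketch for the split/map/concat pattern above (a hedged illustration, assuming Pool is pathos.multiprocessing.ProcessingPool; add_length and the toy DataFrame are hypothetical):

import numpy as np
import pandas as pd
from pathos.multiprocessing import ProcessingPool as Pool

def add_length(chunk):
    # runs in a worker process on one chunk of the DataFrame
    chunk = chunk.copy()
    chunk['text_len'] = chunk['text'].str.len()
    return chunk

df = pd.DataFrame({'text': ['foo', 'barbaz', 'qux quux', 'x']})
df = parallelize_dataframe(df, add_length, n_cores=2)
print(df)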
Code Example #3
    def multi_word_cut(self, sentences):
        print('Multiprocessing Word cut ')
        if self.language == 'ch':
            jieba.initialize(
            )  # initialize first, or it will initialize in each process
            jieba.disable_parallel()

            def func(line):
                line = [i.strip() for i in jieba.cut(line, cut_all=False)]
                return [
                    i for i in line
                    if ((not i.isdigit()) and (i not in self.stop_words))
                ]
        else:

            def func(line):
                return [i.lower() for i in line.split(" ") if ((not i.isdigit()) and \
                                                       (i not in self.stop_words) and \
                                                       (len(i) >1 ) )]

        pool = Pool(nodes=5)
        t0 = time.time()
        word_cut = pool.map(func, sentences)
        pool.close()
        pool.join()
        pool.clear()
        print('Multiprocess time: {:.0f}s'.format(time.time() - t0))
        return word_cut
Code Example #4
def perplexity(lang="eng"):
    """
    Finds the statistical perplexity of the language model on the Google Books
    N-Gram dataset.
    """
    pool = ProcessingPool(4)
    unigram_counter, mgram_counter, ngram_counter = pool.map(get_ngram_counter,
                                                             [1, 2, 3],
                                                             [lang] * 3)
    pool.close()
    pool.join()

    total_words = np.sum(np.array(list(unigram_counter.values())))
    print("total_words = ", total_words)

    ngram_conditionals = get_ngram_conditionals(ngram_counter,
                                            mgram_counter)

    probs = np.power(np.array(list(ngram_conditionals.values()),
                              dtype=np.float64),
             -np.array(list(ngram_counter.values()), dtype=np.float64) \
             / total_words)

    print("probs shape = ", probs.shape)

    PP = (np.prod(probs, dtype=np.float64))

    return PP
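
In other words, assuming get_ngram_conditionals returns the conditional probabilities P(w | h) keyed consistently with ngram_counter, and writing c_g for the count of n-gram g and N for the total number of unigram tokens, the quantity computed above is the usual corpus-level perplexity

    PP = \prod_g P(w_g \mid h_g)^{-c_g / N}
       = \exp\Big(-\frac{1}{N} \sum_g c_g \log P(w_g \mid h_g)\Big)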
Code Example #5
File: data.py  Project: MunKeat/cs4224f-mongodb
    def preprocess(self):
        # Check if orderline should be extracted
        extract_orderline = conf['extract_orderline']
        pool = Pool()
        start = time.time()
        # Run in parallel
        if extract_orderline:
            res_orderline = pool.apipe(self.create_orderline,
                                       return_dataframe=False)
        res_warehouse = pool.apipe(self.create_warehouse)
        res_district = pool.apipe(self.create_district)
        res_order = pool.apipe(self.create_order)
        res_customer = pool.apipe(self.create_customer)
        res_stock = pool.apipe(self.create_stock)
        # Consolidate result
        pool.close()
        pool.join()
        list_of_processed_files = [res_warehouse.get(), res_district.get(),
                                   res_order.get(), res_customer.get(),
                                   res_stock.get()]
        if extract_orderline:
            list_of_processed_files.append(res_orderline.get())
        end = time.time()
        self.debug("Preprocessing of csv file took {}s".format(end - start))
        return list_of_processed_files
Code Example #6
    def multi_Non_Tweep_friends(self, handle):
        min_position, links = self.get_tweets(handle)
        print("Scraping last 100 days of activity")

        while True:
            min_position1, links1 = self.get_tweets(handle, min_position)
            links = links + links1
            if min_position1 is None:
                break
            min_position = min_position1

        people_list = []

        link = [x for x in links if handle in x]
        link = self.duplicates(link)

        with Pool(10) as p:  # Pool tells how many at a time

            records = list(tqdm(p.imap(self.get_people, link),
                                total=len(link)))
            p.terminate()
            p.join()
            p.close()
            people_list = [item for sublist in records for item in sublist]
            people_list = self.duplicates(people_list)

        people_list = [x for x in people_list if x != handle]

        return people_list
Code Example #7
def q5_plot_chromatic_num_bounds_by_prob(n, prange, pstep, k=None,\
    clique_finder=greedy_find_clique_number, multi=False):
    """Plots a graph of number of colours against edge probability,
    for each of the various lower/upper bounds of chromatic number
    multi: True/False/int multiprocessing - yes/no/ num processes (default 4 if true)
    """
    probs = np.arange(prange[0], prange[1], pstep)
    graphs = [[get_random_graph(n, p, k) for _ in range(10)] for p in probs]
    mean_bounds = []
    pool = Pool(multi if type(multi) is int else 4)
    # graph_generator = pool.imap(multiprocessing_chrom_bounds_func, graphs) if multi else map(f, graphs)
    f = lambda graphs_list: list(map(get_chromatic_number_bounds, graphs_list))
    graph_generator = pool.imap(f, graphs) if multi else map(f, graphs)

    for bounds in tqdm.tqdm(graph_generator, total=len(graphs)):
        mean_bounds.append(np.mean(bounds, axis=0))

    pool.close()
    pool.join()

    mean_bounds = np.array(mean_bounds)
    plt.figure()
    for i, label in zip(range(mean_bounds.shape[1]), \
        ['lb_comp', 'lb_clique', 'ub_clique', 'ub_greedy_rand', 'ub_greedy_msd']):
        plt.plot(probs, mean_bounds[:, i], label=label)
    plt.legend()

    return probs, mean_bounds
Code Example #8
class ConsensusMHSampler(MHSampler):
	def __init__(self, log_f, log_g, g_sample, x0, iterations, shards=1):
		super(ConsensusMHSampler, self).__init__(log_f, log_g, g_sample, x0, iterations)
		self.shards = shards

		assert len(self.log_distribution_fn) == self.shards
		self.log_fn_dict = {} # for pickling purposes
		for i in range(self.shards):
			self.log_fn_dict[i] = self.log_distribution_fn[i]

		self.pool = Pool(nodes=self.shards)

	def sample(self):
		map_results = self.pool.map(self.map_sample, range(self.shards))
		self.pool.close()
		self.pool.join()
		self.pool.terminate()
		self.pool.restart()
		self.saved_states = self.reduce_sample(map_results)

	def map_sample(self, index):
		np.random.seed(1)
		cur_state = self.start_state
		sample_results = [cur_state]
		prob, count = 0, 0

		for i in range(self.iterations):
			if i % 5000 == 0:
				print("iteration {}".format(i))
			candidate_state = self.get_transition_sample(cur_state)
			acceptance = self.calculate_acceptance_ratio(candidate_state, self.log_fn_dict[index])
			prob += acceptance
			count += 1

			new_state = self.transition_step(cur_state, candidate_state, acceptance)
			sample_results.append(new_state)
			cur_state = new_state
		sample_results = np.array(sample_results)

		print("INDEX {}: Avg acceptance prob is {}".format(index, prob/count))

		return (sample_results, 1.0 / (1e-8 + self.get_sample_variance(sample_results)))

	def get_sample_variance(self, data):
		return np.linalg.norm(np.var(np.array(data), axis=0))

	def reduce_sample(self, results):
		'''
			results is a list of (sample_array, weight) tuples
		'''
		sample_results = 0
		total_weight = 0
		for sample, weight in results:
			sample_results += weight * sample
			total_weight += weight

		return sample_results / total_weight
Code Example #9
def test_multiprocess():
    x_list = [1, 2, 3, 4, 5, 6, 7]
    y_list = ['1', '2', '3', '4', '5', '6', '7']
    epoch = 8
    pool = Pool(epoch)
    res = pool.amap(test_task, x_list, y_list)
    pool.pipe(test_task, '22', '222')
    pool.close()
    pool.join()
Code Example #10
def parallelize_dataframe(df: pd.DataFrame, func, n_cores=4) -> pd.DataFrame:
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    # have to include this to prevent leakage and allow multiple parallel function calls
    pool.terminate()
    pool.restart()
    return df
Code Example #11
File: spatial_utils.py  Project: bubalis/VT_pIndex
def parallelize(data, func, num_of_processes=8):
    '''Function for paralellizing any function on a dataframe.
    Stolen from stack overflow, user Tom Raz:
    https://stackoverflow.com/questions/26784164/pandas-multiprocessing-apply'''
    data_split = np.array_split(data, num_of_processes)
    pool = Pool(num_of_processes)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data
Code Example #12
def goo():
    pool = Pool(4)
    #    def f(x):
    #        return foo(100 + x)
    stuff = list(tqdm.tqdm(pool.imap(foo, range(20)), total=20))
    print(stuff)
    print('aaa')
    pool.close()
    pool.join()
    print('bbb')
Code Example #13
File: helper_funcs.py  Project: jfuruness/lib_utils
@contextmanager  # required for this yield-based context manager (from contextlib)
def Pool(cpus=cpu_count()) -> ProcessingPool:
    """Context manager for pathos ProcessingPool"""

    # Creates a pool with processes
    p = ProcessingPool(cpus)
    yield p
    # Need to clear due to:
    # https://github.com/uqfoundation/pathos/issues/111
    p.close()
    p.join()
    p.clear()
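
A minimal usage sketch for the context manager above (a hedged illustration, assuming the contextlib @contextmanager decorator noted above and that pathos is installed; square is a hypothetical worker function):

def square(x):
    return x * x

with Pool(cpus=4) as p:
    results = p.map(square, range(10))
# on exit the pool is closed, joined, and cleared
print(results)  # [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]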
Code Example #14
def parallelize_dataframe(df,
                          func,
                          num_partitions=num_cores,
                          num_cores=num_cores):
    df_split = np.array_split(df, num_partitions, axis=0)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    pool.clear()
    return df
Code Example #15
File: utils.py  Project: hoidn/packages
def parallelmap(func, data, nodes = None):
    """
    Map func over data in parallel using a pathos ProcessingPool.
    """
    if not nodes:
        nodes = multiprocessing.cpu_count() - 2
    pool = ProcessingPool(nodes=nodes)
    try:
        return pool.map(func, data)
    except KeyboardInterrupt:
        pool.terminate()
        pool.join()
Code Example #16
File: utils.py  Project: jfuruness/lib_bgp_data
@contextmanager  # required for this yield-based context manager (from contextlib)
def Pool(threads: int, multiplier: int, name: str):
    """Context manager for pathos ProcessingPool"""

    # Creates a pool with threads else cpu_count * multiplier
    p = ProcessingPool(threads if threads else cpu_count() * multiplier)
    logging.debug(f"Created {name} pool")
    yield p
    # Need to clear due to:
    # https://github.com/uqfoundation/pathos/issues/111
    p.close()
    p.join()
    p.clear()
Code Example #17
def finally_ip():
	(ipList,portList) = get_ip_list(url,headers)
	# Running pool = ThreadPool(2) sometimes raises "module '__main__' has no attribute '__spec__'"; not sure how to fix it
	# Tried setting __spec__ = None, but it did not help
	pool = ThreadPool(4)
	start_time = time.time()
	results = pool.map(test_ip,ipList,portList)
	pool.close()
	pool.join()
	end_time = time.time()
	print("并行耗时:"+str(end_time-start_time))
	return results
Code Example #18
def compute_scores(args):
    results_file = args.results_file
    scores_file = args.score_file
    num_captions = args.num_captions
    is_exp = args.exp
    generated_image_tokens = get_generated_tokens(res_file=results_file,
                                                  num=num_captions)
    gt_image_tokens = get_gt_tokens()
    print('number of test images: %d, all images: %d' % (
        len(generated_image_tokens), len(gt_image_tokens)))
    all_image_ids = list(generated_image_tokens.keys())

    def f(image_id_thread):
        image_ids = image_id_thread[0]
        thread_num = image_id_thread[1]
        scores = {}
        for image_id in image_ids:
            res_tokens = generated_image_tokens[str(image_id)]
            gt_tokens = gt_image_tokens[str(image_id)]
            wmd_score = word_mover_distance(res_tokens,
                                            gt_tokens,
                                            wvmodel=wvmodel,
                                            is_exp=is_exp)
            scores[image_id] = wmd_score
            print('Thread: %d, Image ID: %s, WMD score: %.5f' % (
                thread_num, image_id, wmd_score))
        return scores

    num_images = len(all_image_ids)
    num_workers = 20
    num_per_split = num_images // num_workers
    images_split = []
    for i in range(num_workers):
        if i == (num_workers - 1):
            images_split.append([all_image_ids[(i * num_per_split):], i])
        else:
            images_split.append([
                all_image_ids[(i * num_per_split):((i + 1) * num_per_split)], i
            ])
    pool = Pool(num_workers)
    all_scores = pool.map(f, images_split)
    pool.close()
    pool.join()
    scores = {}
    for s in all_scores:
        scores.update(s)
    with open(scores_file, 'w') as f:
        json.dump(scores, f)
    total_score = 0
    for key in scores.keys():
        total_score += scores[key]
    print('WMD score: %.5f' % (total_score / len(scores)))
Code Example #19
File: initsync_pipe.py  Project: iagcl/data_pipeline
def parallelise_initsync(argv, ssp_params, process_control_id, logger):
    # Pivot the collection of source_system_profile records into
    # three separate lists to enable us to call pool.map on each record
    (source_schemas, tables, target_schemas,
     query_conditions) = map(list, zip(*ssp_params))

    source_conn_detail = dbuser.get_dbuser_properties(argv.sourceuser)
    target_conn_detail = dbuser.get_dbuser_properties(argv.targetuser)

    logger.info("Processing tables with {} dedicated worker processes".format(
        argv.numprocesses))
    pool = Pool(nodes=argv.numprocesses)

    argvs = [argv] * len(tables)
    source_conn_details = [source_conn_detail] * len(tables)
    target_conn_details = [target_conn_detail] * len(tables)
    pcids = [process_control_id] * len(tables)
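    # Note: the next line repeats one shared manager.Queue() len(tables) times,
    # so every worker puts its result message onto the same queue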
    queues = [manager.Queue()] * len(tables)

    logger.debug("Starting a new process for each table in: {tables}".format(
        tables=tables))
    # Execute initsync for each schema/table combination in parallel
    pool.map(initsync_table,
             argvs,
             source_conn_details,
             target_conn_details,
             source_schemas,
             tables,
             target_schemas,
             pcids,
             query_conditions,
             queues,
             chunksize=1)  # Ensure tables are processed in sequence
    # and workers are fully utilised

    pool.close()
    logger.debug("parallelise_initsync: Pool joining")
    pool.join()
    logger.debug("parallelise_initsync: Pool joined")

    all_table_results = {}
    for q in queues:
        size = q.qsize()
        message = q.get()
        logger.debug("Message queue size = {s}, message = {m}".format(
            s=size, m=message))
        all_table_results.update(message)

    logger.debug("all_table_results = {r}".format(r=all_table_results))
    return all_table_results
Code Example #20
File: multiprocessing.py  Project: notsoprocoder/mupa
    def parallel_apply(self, df, func):
        # add try statement re function not returning a DataFrame
        if self.preprocessing_checks(df, func):
            # split DataFrame into a list of smaller DataFrames
            self.df_split = np.array_split(df, self.partitions, axis=0)
            # create the multiprocessing pool
            pool = Pool(self.cores)
            # process the DataFrame by mapping function to each df across the pool
            df = pd.concat(pool.map(func, self.df_split), axis=0).copy()
            # close down the pool and join
            pool.close()
            pool.join()
            pool.clear()
            return df
Code Example #21
def get_gt_tokens(
        coco_file='../data/files/dataset_coco.json',
        coco_tokens_file='../data/files/coco_tokens_Google_news.json'):
    if os.path.exists(coco_tokens_file):
        with open(coco_tokens_file, 'r') as f:
            dataset = json.load(f)
        return dataset

    print('Processing ground-truth data...')
    with open(coco_file, 'r') as f:
        dataset = json.load(f)

    def f(images):
        image_tokens = {}
        # images = dataset['images']
        for image in images:
            sentence = image['sentences']
            image_id = str(image['cocoid'])
            tokens = []
            for s in sentence:
                tokens.extend(s['tokens'])
            filter_token = []
            for token in tokens:
                if (token not in stop_words) and (token in vocab):
                    filter_token.append(token)
            # tokens = [token for token in tokens if token not in stop_words and token in vocab]
            image_tokens[image_id] = filter_token
        return image_tokens

    all_images = dataset['images']
    num_images = len(all_images)
    num_workers = 30
    num_per_split = num_images // num_workers
    images_split = []
    for i in range(num_workers):
        if i == (num_workers - 1):
            images_split.append(all_images[(i * num_per_split):])
        else:
            images_split.append(
                all_images[(i * num_per_split):((i + 1) * num_per_split)])
    pool = Pool(num_workers)
    all_images_tokens = pool.map(f, images_split)
    pool.close()
    pool.join()
    all_token_dict = {}
    for d in all_images_tokens:
        all_token_dict.update(d)
    with open(coco_tokens_file, 'w') as f:
        json.dump(all_token_dict, f)
    return all_token_dict
Code Example #22
def parallelmap(func, lst, nodes=None):
    """
    Map func over lst in parallel using multiprocessing (as opposed to MPI).
    """
    from pathos.multiprocessing import ProcessingPool
    from pathos import multiprocessing
    if not nodes:
        nodes = multiprocessing.cpu_count() - 2
    pool = ProcessingPool(nodes=nodes)
    try:
        return pool.map(func, lst)
    except KeyboardInterrupt:
        pool.terminate()
        pool.join()
Code Example #23
File: utils.py  Project: hoidn/utils
def parallelmap(func, lst, nodes = None):
    """
    Map func over lst in parallel using multiprocessing (as opposed to MPI).
    """
    from pathos.multiprocessing import ProcessingPool
    from pathos import multiprocessing
    if not nodes:
        nodes = multiprocessing.cpu_count() - 2
    pool = ProcessingPool(nodes=nodes)
    try:
        return pool.map(func, lst)
    except KeyboardInterrupt:
        pool.terminate()
        pool.join()
Code Example #24
    def make_query(self, size=1):

        ## quit if nr_unlabeled_samples = 1
        if self.dataset.len_unlabeled() == 1:
            return self.dataset.get_unlabeled_entries()[0].astype(int)

        ## Set the possible labels
        self.possible_labels = list(set(self.dataset.get_labeled_entries()[1]))

        ## Train the model
        self.model.train(self.dataset)

        ## Get probabilities
        X_ids, X = self.dataset.get_unlabeled_entries()
        pred = self.model.predict_proba(
            X)  # pred.shape = (n_unlabeled, nr_of_labels)

        ## Setup pool for cpu parallelisation
        p = Pool(cpu_count(), maxtasksperchild=1000)

        ## nr of unlabeled samples -> len(X)

        ## Get uncertainty after adding every sample with every label
        total = np.asarray(
            p.map(self._eer, X_ids,
                  len(X) * [self.dataset],
                  len(X) * [self.depth]))
        # total.shape = (n_unlabeled, nr_of_labels)

        ## Close the Pool again
        p.close()
        p.join()
        p.clear()

        ## Get the total uncertainty of one sample after adding a label weighted by the labels probability
        total = np.inner(
            pred,
            total,
        ).diagonal()  # total.shape = (n_unlabeled,)

        ## Zip it
        total = zipit(X_ids, total)

        ## Sort it
        results = sort_by_2nd(total, 'min')

        return results[:size, 0].astype(int)
Code Example #25
    def start(self, text_data_dir, res_dir, nprocs=8):
        '''
        Entry function.

        text_data_dir: folder of raw data
        res_dir: folder of output
        nprocs: number of cores to use in parallel
        '''
        p = PathosPool(nprocs)

        filepathsvec, filenamesvec, respaths = list(), list(), list()
        for dirpath, _, filenames in os.walk(text_data_dir):
            for filename in filenames:
                if (("gz" in filename) and ('md5' not in filename)
                        and ('copy' not in filename)):
                    filepath = os.path.join(dirpath, filename)
                    print(filepath)
                    res_name = filename.split(".")[0] + ".csv.gz"
                    respath = os.path.join(res_dir, res_name)
                    #if os.path.exists(respath):
                    # pass
                    #else:
                    if True:
                        filepathsvec.append(filepath)
                        filenamesvec.append(filename)
                        respaths.append(respath)
                        #p.apply_async(process_data, args = (filepath,filename,
                        # respath, True,
                        # [title_stop_path,
                        #  affil_stop_path,
                        #  mesh_stop_path]))
        self.affildicts = p.amap(
            partial(self.process_data,
                    stop_paths=[
                        self.title_stop_path, self.affil_stop_path,
                        self.mesh_stop_path
                    ],
                    rm_stopwords=True,
                    affiliation_correction=True,
                    select_journals=self.select_journals), filepathsvec,
            filenamesvec, respaths)

        p.close()
        p.join()  # Having an issue joining
        print("joined")
        p.clear()  # Delete the pool
Code Example #26
    def _mp_improve(self, container, scenario_builder):
        """Improves b/2 best solutions from the container and updates
        the score table with the generated solutions
        """
        container.sort()
        pool = Pool(processes=self._proc_count)

        logging.info("Starting processes")
        start = datetime.now()
        best = []
        builders = []
        for i in range(self._b // 2):
            best.append(container.get(i))
            builders.append(scenario_builder)

        try:
            result = pool.map(self._improve, best, builders)
            pool.close()
            pool.join()
        except MemoryError as e:
            send_email("I crashed again, please help!")
            import pudb
            pudb.set_trace()
            print(e)

        logging.info("Processes finished - %s" % (datetime.now() - start))
        # How infuriating was that?!
        # pathos was being smart and was caching pool so this is needed
        # to prevent from erroring out
        pool.restart()

        start = datetime.now()
        logging.info("mp_improve second loop")
        for entry in result:
            index = container.index(entry['individual'])
            best = entry['improvements'].get(0)
            if best.get_utility() < entry['individual'].get_utility():
                container.replace(best, index)

            for improvement in entry['improvements'].get_all():
                self._update_score_table(improvement)

        logging.info("mp_improve second loop - %s" % (datetime.now() - start))
        logging.info("Improved %d solutions" % container.get_changes())
        container.reset_changes()
        return container
Code Example #27
File: experiment.py  Project: hobotrl/hobotrl
class ParallelGridSearch(Experiment):

    param_queue = []

    def __init__(self, exp_class, parameters, parallel=4):
        """
        :param exp_class: subclass of Experiment to run
        :type exp_class: class<Experiment>
        :param parameters: dict of list, experiment parameters to search within, i.e.:
            {
                "entropy": [1e-2, 1e-3],
                "learning_rate": [1e-3, 1e-4],
                ...
            }
            or list of dict-of-list, representing multiple groups of parameters:
            [
            {
                "entropy": [1e-2, 1e-3],
                "learning_rate": [1e-3, 1e-4],
                ...
            },
            {
                "batch_size": [32, 64],
                ...
            }
            ]

        """
        super(ParallelGridSearch, self).__init__()
        self._exp_class, self._parameters, self._parallel = exp_class, parameters, parallel

    def run(self, args):
        self.log_root = args.logdir
        for parameter in GridSearch.product(self._parameters):
            label = GridSearch.labelize(parameter)
            ParallelGridSearch.param_queue.append(
                [self._exp_class, self.log_root, parameter, label, args])
        n = len(ParallelGridSearch.param_queue)
        task_index = list(range(n))
        logging.warning("total searched combination:%s", n)
        self.pool = Pool(self._parallel)
        ret = self.pool.amap(subprocess_run, task_index)
        ret.wait()
        self.pool.close()
        self.pool.join()
Code Example #28
def combine_scores():
    """Combine the scores from all patients and dump into all_dict.txt.
    """

    all_dicts = {}
    duration_dict = {}
    all_dict_q = multiprocessing.Manager().Queue()
    duration_dict_q = multiprocessing.Manager().Queue()
    dirs = [
        y for y in os.listdir(patient_dir)
        if os.path.isdir(os.path.join(patient_dir, y))
    ]
    bar = progressbar.ProgressBar(redirect_stdout=True, max_value=len(dirs))
    f = functools.partial(scores_and_duration_dict, all_dict_q,
                          duration_dict_q)
    p = Pool()

    for i, _ in enumerate(p.imap(f, dirs, chunksize=50), 1):
        bar.update(i)
    p.close()
    p.join()

    while not all_dict_q.empty():
        patient_dict = all_dict_q.get()
        dur_dict = duration_dict_q.get()

        for i in patient_dict:
            print(i)

            if i not in all_dicts:
                all_dicts[i] = patient_dict[i]
            else:
                all_dicts[i].update(patient_dict[i])

        for i in dur_dict:
            print(i)

            if i not in duration_dict:
                duration_dict[i] = dur_dict[i]
            else:
                duration_dict[i].update(dur_dict[i])
    print('done combining scores, dumping...')
    json.dump(all_dicts, open(os.path.join(patient_dir, 'all_dict.txt'), 'w'))
    json.dump(duration_dict,
              open(os.path.join(patient_dir, 'duration_dict.txt'), 'w'))
Code Example #29
File: stalker.py  Project: abhidya/Rewteet_graph
    def multi_get_followers_location(self, followers_ids, amount=-1,
                                     workers=10):  # takes a list of follower ids, returns a dict of location counts
        locations = {}
        if amount != -1:
            followers_ids = random.sample(followers_ids, amount)

        p = Pool(workers)  # Pool tells how many at a time
        records = p.map(self.get_loc, followers_ids)
        p.terminate()
        p.join()
        print(records)

        for i in records:
            if i not in locations:
                locations[i] = 0
            locations[i] = locations[i] + 1

        return locations
Code Example #30
class ProcessPool():
    def __init__(self, maxprocesses):
        self.pool = Pool(processes=maxprocesses)
        logger.info("Initialized process pool of size {}".format(maxprocesses))

    def start(self, function, local_dirs, remote_dirs):
        atexit.register(self.exit)
        logger.info(
            "Starting parallel rsync from a total of {} dirs ({} ...)".format(
                len(remote_dirs), remote_dirs[:3]))
        logger.info("Started with parallelization")
        results = self.pool.map(function, list(zip(local_dirs, remote_dirs)))
        logger.info("Stopped with results: {}".format(results))
        return results

    def exit(self):
        self.pool.close()
        self.pool.join()
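
A minimal usage sketch for the ProcessPool wrapper above (a hedged illustration; rsync_pair, local_dirs, and remote_dirs are hypothetical and the worker only simulates the rsync call):

def rsync_pair(pair):
    local_dir, remote_dir = pair
    # ... invoke rsync for this (local, remote) pair here and return a status ...
    return (local_dir, remote_dir, 0)

local_dirs = ['/data/a', '/data/b']
remote_dirs = ['backup:/srv/a', 'backup:/srv/b']

pp = ProcessPool(maxprocesses=2)
results = pp.start(rsync_pair, local_dirs, remote_dirs)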
Code Example #31
File: annotation.py  Project: WingCHWang/KG-Tools
        def wrapper(*args, **kwargs):

            obj, data, _args = tuple(), tuple(), tuple()
            if hasattr(args[0].__class__, fn.__name__):
                obj, data, *_args = args
                obj = (obj, )
            else:
                data, *_args = args

            if type(data) != list:
                data = list(data)

            total_size = len(data)
            _batch_size = total_size // workers + 1 if batch_size is None else batch_size
            # assert type(data) == list, "Type of data must be list"
            print(
                f"@Parallel[workers={workers}, data_size={total_size}, batch_size={_batch_size}]: parallel for {fn.__qualname__}."
            )

            if shuffle:
                print(
                    f"@Parallel[workers={workers}, data_size={total_size}, batch_size={_batch_size}]: shuffle data for {fn.__qualname__}."
                )
                random.shuffle(data)

            pool = Pool(workers)
            pool.terminate()
            pool.restart()

            proc = []
            for beg, end in zip(
                    range(0, total_size, _batch_size),
                    range(_batch_size, total_size + _batch_size, _batch_size)):
                batch = data[beg:end]
                p = pool.apipe(fn, *obj, batch, *_args, **kwargs)
                proc.append(p)
            pool.close()
            pool.join()

            result = reduce_seqs([p.get() for p in proc])
            if after_hook is not None:
                result = after_hook(result)

            return result
Code Example #32
File: auto_update.py  Project: jankiwtf/Judge_bot
def get_result():
    success = []
    errors = []
    updates = {'success': success, 'errors': errors}

    keys = url_db.sql_saved_keys()
    list_keys = [key for key in keys]
    pool = Pool()
    results = pool.map(check_key, list_keys)
    for key in results:
        if key['bool']:
            success.append(key['update'])
        elif key['bool'] is False:
            errors.append(key['link'])
        else:
            continue
    pool.close()
    pool.join()
    return updates