Example #1
def parse(document, pages, parse_refs=True,
        progress_monitor=NullProgressMonitor(),
        pool_size=DEFAULT_POOL_SIZE):
    progress_monitor.start('Parsing Pages', pool_size + 1)

    # Prepare input
    pages = [(page.local_url, page.url) for page in
            pages.values() if page.local_url is not None]
    pages_chunks = chunk_it(pages, pool_size)
    inputs = []
    for pages_chunk in pages_chunks:
        inputs.append((document.parser, document.pk, parse_refs, pages_chunk))

    # Close connection to allow the new processes to create their own.
    connection.close()

    # Split work
    progress_monitor.info('Sending {0} chunks to worker pool'
            .format(len(inputs)))
    pool = Pool(pool_size)
    for result in pool.imap_unordered(sub_process_parse, inputs, 1):
        progress_monitor.work('Parsed 1/{0} of the pages'.\
                format(pool_size), 1)

    # Word Count
    word_count = 0
    for page in document.pages.all():
        word_count += page.word_count
    document.word_count = word_count
    document.save()
    progress_monitor.work('Counted Total Words', 1)

    pool.close()
    progress_monitor.done()
Example #2
def test_stemming():
    with open("tests.txt") as file:
        pool = Pool(4)
        results = pool.map(validate, file)
    for result in results:
        if result:
            yield assert_output, result[0], result[1]
Example #3
def extract_all_labels(filenames, out_filepath=DATA_FOLDER+'labels.p', chunk_size=2000):
    print "EXTRACTING ALL LABELS INTO {0}".format(out_filepath)
    all_labels = []
    label_dict = {}

    filenames_chunks = util.chunks(filenames, chunk_size)

    for i, chunk in enumerate(filenames_chunks):
        pool = Pool(processes=util.CPU_COUNT)
        chunk_labels = pool.map(extract_labels, chunk)
        pool.close()

        for filepath, labels in zip(chunk, chunk_labels):
            if labels is not None:
                file_id = util.filename_without_extension(filepath)
                label_dict[file_id] = labels
                all_labels += labels

        print i+1, '/', len(filenames_chunks)

    #Write labels to file
    with open(out_filepath,'w') as f:
        pickle.dump(label_dict, f)

    print '\nLabels:'
    print len(set(all_labels))
    print Counter(all_labels)
Example #4
    def start(self):
        """Starts a server that controls local workers.

        Calling this function starts a pool of `num_workers` workers used to run
        targets sent to the server. The server will run indefinitely unless shut
        down by the user.
        """
        try:
            serv = Listener((self.hostname, self.port))
            workers = Pool(
                processes=self.num_workers,
                initializer=Worker,
                initargs=(self.status, self.queue, self.waiting),
            )

            logging.info(
                "Started %s workers, listening on port %s",
                self.num_workers,
                serv.address[1],
            )
            self.wait_for_clients(serv)
        except OSError as e:
            if e.errno == 48:
                raise ServerError(
                    (
                        "Could not start workers listening on port {}. "
                        "The port may already be in use."
                    ).format(self.port)
                )
        except KeyboardInterrupt:
            logging.info("Shutting down...")
            workers.close()
            workers.join()
            self.manager.shutdown()
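
The docstring above describes a server that accepts work over a multiprocessing Listener, but the client side and the message format handled by wait_for_clients are not shown. A hedged client sketch, with the payload shape purely an assumption, might look like:

from multiprocessing.connection import Client

def submit_target(hostname, port, target_name, args=()):
    # Hypothetical client for the worker server above; the dict layout is an
    # assumption, since wait_for_clients() is not part of this snippet.
    conn = Client((hostname, port))
    try:
        conn.send({"target": target_name, "args": args})
        return conn.recv()  # e.g. an acknowledgement or a job id
    finally:
        conn.close()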
Example #5
 def __init__(self):
     try:
         cpus = cpu_count()
     except NotImplementedError:
         cpus = 1
     self._taskqueue = Queue(maxsize=(2 * cpus))
     Pool.__init__(self)
Example #6
 def check(self, artdict):
     print("Checking for infobox existence")
     pool = Pool(processes=100)
     revs = []
     for a in artdict:
         rev = artdict[a]["Revision"].split('oldid=')[1].strip()
         revs.append((a, rev))
     texts = dict(pool.map(self.get_text, revs))
     for a in artdict:
         text = texts[a]
         if text is None:
             artdict[a]["MultiInfobox"] = 0
             artdict[a]["Infobox programming language"] = -1
             artdict[a]["Infobox software"] = -1
             artdict[a]["Infobox file format"] = -1
         else:
             if 'infobox programming language' in text.lower():
                 artdict[a]["Infobox programming language"] = text.lower().index('infobox programming language')
             else:
                 artdict[a]["Infobox programming language"] = -1
             if 'infobox software' in text.lower():
                 artdict[a]["Infobox software"] = text.lower().index('infobox software')
             else:
                 artdict[a]["Infobox software"] = -1
             if 'infobox file format' in text.lower():
                 artdict[a]["Infobox file format"] = text.lower().index('infobox file format')
             else:
                 artdict[a]["Infobox file format"] = -1
             artdict[a]["MultiInfobox"] = text.lower().count("{{infobox")
     return artdict
Example #7
def _get_data(data_folder = "data/crcm4_data", v_name = "pcp",
              member_list = None, year_range = None, months = None):
    """
    returns seasonal means of each year for all members in the list
    Note!: uses caching
    """
    year_range = list(year_range)
    cache_file = "_".join(member_list) + "_" + "_".join(map(str, months)) + \
                 "_{0}_from_{1}_to_{2}_cache.bin".format(v_name, year_range[0], year_range[-1])



    if os.path.isfile(cache_file):
        return pickle.load(open(cache_file))

    p = Pool(processes=len(member_list))

    #prepare input for the parallel processes
    m_folders = map(lambda x: os.path.join(data_folder,"{0}_p1{1}".format(x, v_name)), member_list)
    year_ranges = [year_range] * len(member_list)
    months_for_p = [months] * len(member_list)
    #calculate means
    result = p.map(_get_annual_means_for_year_range_p, zip(m_folders, year_ranges, months_for_p))

    result = np.concatenate(result, axis = 0) #shape = (n_members * len(year_range)) x nx x ny
    print result.shape

    pickle.dump(result, open(cache_file, "w"))
    return result
Example #8
    def sum_lines(self, SPEEDUP=True):
        filesname = []
        for item_dir in self.dirlist.keys():
            for item_file in self.dirlist[item_dir][1]:
                filesname.append(item_dir + '/' + item_file)

        if SPEEDUP:
            # Before Python 3.3, multiprocessing.pool.Pool does not support
            # the context management protocol.
            if sys.version_info.major == 3 and sys.version_info.minor >= 3:
                with Pool(self.MAX_RES) as res_pool:
                    return reduce(self._adder, res_pool.map(self._count_filelines, filesname))
            else:
                # In Python 2.x (and maybe Python 3.0-3.2), multiprocessing must
                # pickle objects to pass them between processes, and bound methods
                # are not picklable. The workaround (whether you consider it
                # "easy" or not ;-) is to add infrastructure that allows such
                # methods to be pickled, registering it with the copy_reg standard
                # library module. Mapping the module-level _filecounter below is a
                # simpler workaround that works in Python 2.x.
                res_pool = Pool(processes=self.MAX_RES)
                retval = res_pool.map(_filecounter, filesname)
                return reduce(self._adder, retval)
        else:
            for filename in filesname:
                with open(filename, 'rb') as filebuf:
                    self.filesline += len(filebuf.readlines())
            return self.filesline
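
The Python 2 branch above sidesteps the pickling limitation by mapping the module-level _filecounter instead of the bound method. The copy_reg registration the comment alludes to is not part of this snippet; a rough Python 2-only sketch of that approach could look like this (names and placement are assumptions):

import copy_reg
import types

def _reduce_method(m):
    # Rebuild the bound method on unpickling by looking it up again with getattr,
    # so only the instance (or class) and the method name travel between processes.
    if m.im_self is None:
        return getattr, (m.im_class, m.im_func.func_name)
    return getattr, (m.im_self, m.im_func.func_name)

copy_reg.pickle(types.MethodType, _reduce_method)

With this registered, res_pool.map(self._count_filelines, filesname) would also work on Python 2, provided the instance itself is picklable.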
Example #9
def main_canesm2_rcp85():
    samples_dir_p = Path("/RECH/data/Simulations/CRCM5/North_America/NorthAmerica_0.44deg_CanRCP85_B1/Samples")

    out_dir_root = Path("/RECH2/huziy/BenAlaya/")


    if samples_dir_p.name.lower() == "samples":
        out_folder_name = samples_dir_p.parent.name
    else:
        out_folder_name = samples_dir_p.name


    varnames = ["PR", integrated_wv_RPN_name]  # Total precipitation m/s; integrated ice, liquid water and vapor (in kg/m**2) averaged over last MOYHR

    # ======================================

    out_dir_p = out_dir_root.joinpath(out_folder_name)

    if not out_dir_p.is_dir():
        out_dir_p.mkdir()


    inputs = []
    for y in range(2006, 2101):
        inputs.append(dict(year=y, varnames=varnames, samples_dir=samples_dir_p, out_dir=out_dir_p, target_freq_hours=6, calendar_str="365_day"))

    # Extract the data for each year in parallel
    pool = Pool(processes=3)
    pool.map(extract_data_for_year_in_parallel, inputs)
Example #10
def run():
    setup_logger()
    logger.info('Started')
    queue = multiprocessing.Queue(maxsize=EVENT_QUEUE_MAX_SIZE)
    pool = Pool(processes=WORKERS,
            initializer=worker,
            initargs=(queue,))

    event_handler = EventHandler(queue)
    observer = init_observer()
    try:
        delete_all_files(FRAMES_PATH)
        observer.schedule(event_handler, path=FRAMES_PATH, recursive=True)
        signal.signal(signal.SIGINT, signal_handler)
        observer.start()

        while True:
            pool._maintain_pool() #restart workers if needed
            time.sleep(1)
            now = datetime.datetime.now()
            if now - event_handler.last_event > datetime.timedelta(minutes=1):
                logger.warning("No events received in the last minute.")
                # Sometimes watchdog stops receiving events.
                # We exit, so the process can be restarted.
                break
    except KeyboardInterrupt as err:
        logger.warning("Keyboard interruption")
    except Exception as err:
        logger.exception(err)
    finally:
        observer.stop()
    observer.join()
    pool.terminate()
    logger.warning("Bye")
Example #11
def parallel_main():
    recs = sys.stdin.readlines()
    vals = [int(rec) for rec in recs]
    p = Pool()
    results = p.map(solve, vals)
    for v1, v2 in results:
        print("{} {}".format(v1, v2))
Example #12
def get_urls1():
	f2 = open('app_links.txt','r')

	nprocs = 100 # nprocs is the number of processes to run
	ParsePool = Pool(nprocs)
	#ParsePool.map(btl_test,url)
	ParsedURLS = ParsePool.map(urlsDeatilsExtract,f2)
Example #13
def run_parallel(num_processes, experiment_names, methods, sparsity_factors, run_ids):
    """
    Run multiple experiments in parallel.

    Parameters
    ----------
    num_processes : int
        The maximum number of processes that can run concurrently.
    experiment_names : list of str
        The names of experiments to run.
    methods : list of str
        The methods to run the experiments under (mix1, mix2, or full).
    sparsity_factors : list of float
        The sparsity of inducing points to run the experiments at.
    run_ids : list of int
        The ids of the configurations under which to run the experiments.
    """
    # Setup an array of individual experiment configurations.
    experiment_configs = []
    for experiment in experiment_names:
        for method in methods:
            for sparsity_factor in sparsity_factors:
                for run_id in run_ids:
                    experiment_configs.append({'experiment_name': experiment,
                                               'method': method,
                                               'sparsity_factor': sparsity_factor,
                                               'run_id': run_id})

    # Now run the experiments.
    pool = Pool(num_processes)
    pool.map(run_config, experiment_configs)
Example #14
def parallel_main():
    recs = iter(sys.stdin.readlines())
    cuts_list = []
    cuts_list_append = cuts_list.append
    cuts = []
    cuts_extend = cuts.extend
    cuts_append = cuts.append
    cuts_clear = cuts.clear

    while True:
        # length of stick
        L = int(next(recs))
        if L == 0:
            break

        # number of cut
        n_cut = int(next(recs))
        # cutting points
        cuts_clear()
        cuts_append(0)
        cuts_extend(list(map(int, next(recs).split())))
        cuts_append(L)
        cuts_list_append(cuts[:])

    p = Pool(4)
    results = p.map(min_cut, cuts_list)
    for res in results:
        print(res)
Example #15
def runexp(n_topics_lst, n_vocab_lst, n_words_lst, alpha_lst, beta_lst,
           n_docs, n_runs, tol=.05):
    kwargsin = locals()
    params = it.product(n_topics_lst, n_vocab_lst, n_words_lst, alpha_lst,
                        beta_lst)

    args = []
    for i, (n_topics, n_vocab, n_words, α, β) in enumerate(params):
        args.append([n_topics, n_vocab, n_words, α, β, n_docs, tol])

    args = [tuple(arg + [random.randrange(2**31), i]) for
            i, arg in enumerate(args*n_runs)]

    pool = Pool()
    res = pool.map(_expitr, args)

    data = []
    for t, n, n_topics, n_vocab, n_words, α, β, n_docs in res:
        datum = {'time': t,
                 'Number of samples': n,
                 'Number of topics': n_topics,
                 'Vocabulary size': n_vocab,
                 'Number of words': n_words,
                 'Number of documents': n_docs,
                 'α': α,
                 'β': β, }
        data.append(datum)

    res = {
        'df': pandas.DataFrame(data),
    }
    return {'res': res, 'args': kwargsin}
Example #16
	def __init__(self, processes=None, initializer=None, initargs=(), process=None):
		"""
		@param process: Process subclass to use
		"""
		if process is not None:
			self.Process = process
		Pool.__init__(self, processes, initializer, initargs)
Example #17
def main_mh():
    samples_dir_p = Path("/RECH2/huziy/BC-MH/bc_mh_044deg/Samples")

    out_dir_root = Path("/RECH2/huziy/MH_streamflows/")


    if samples_dir_p.name.lower() == "samples":
        out_folder_name = samples_dir_p.parent.name
    else:
        out_folder_name = samples_dir_p.name


    varnames = ["STFA", ]

    # ======================================

    out_dir_p = out_dir_root.joinpath(out_folder_name)

    if not out_dir_p.is_dir():
        out_dir_p.mkdir(parents=True)


    inputs = []
    for y in range(1981, 2010):
        inputs.append(dict(year=y, varnames=varnames, samples_dir=samples_dir_p, out_dir=out_dir_p, target_freq_hours=24))

    # Extract the data for each year in parallel
    pool = Pool(processes=3)
    pool.map(extract_data_for_year_in_parallel, inputs)
Example #18
def main():
	global pool
	pool = Pool(POOL_SIZE)
	
	
	nseeds = 100
	
#	print("== generating seeds...")
#	generate_seeds(nseeds)
	
	#print("running const density experiments...")
	#run_constant_density(0.1, range(100, 1000, 100), nseeds)
	
	#print("running const size experiments...")
	#run_constant_size(50, range(100, 1000, 100), nseeds)
	
	print("== running aggregate interval experiments (const density)...")
#	run_aggregate_interval_constant_density(0.1, range(100, 1000, 100), nseeds, [100, 500] + list(range(1000, 4000, 1000)))

	run_aggregate_interval_constant_density(0.1, range(100, 1000, 100), nseeds, [3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 50000])
	reset_pool()
	run_aggregate_interval_constant_density(0.2, range(100, 1000, 100), nseeds, [100, 500, 1000, 2000,4000, 5000, 6000, 7000, 8000, 9000, 10000, 50000])
	reset_pool()
	run_aggregate_interval_constant_density(0.3, range(100, 1000, 100), nseeds, [100, 500, 1000, 2000,4000, 5000, 6000, 7000, 8000, 9000, 10000, 50000])
	reset_pool()
	run_aggregate_interval_constant_density(0.4, range(100, 1000, 100), nseeds, [100, 500, 1000, 2000,4000, 5000, 6000, 7000, 8000, 9000, 10000, 50000])
	reset_pool()
	run_aggregate_interval_constant_density(0.5, range(100, 1000, 100), nseeds, [100, 500, 1000, 2000,4000, 5000, 6000, 7000, 8000, 9000, 10000, 50000])

	pool.close()
	pool.join()
Example #19
def runexp(filename, n_topics_range, β=.1, n_samples=1000, n_runs=8,
           seed=1337):
    kwargsin = locals()
    docs, n_words = get_imdb_docs(filename)
    random.seed(seed)

    args = []
    pool = Pool()
    for tid, (_, n_topics) in enumerate(product(range(n_runs), n_topics_range)):
        seed_i = random.randrange(2**31)
        args.append((docs, n_topics, n_words, β, n_samples, seed_i, tid,))

    data = []
    runtime = []
    res = pool.map(_expitr, args)
    for evx, rt, nt in res:
        data.append({'Log evidence': evx,
                     'Runtime': rt,
                     'Number of topics': nt})

    out = {
        'res': {
            'data': pd.DataFrame(data),
        },
        'args': kwargsin,
    }
    return out
Example #20
def multi_main(syntype,stpYN,inj):
    from multiprocessing.pool import Pool
    p = Pool(12,maxtasksperchild=1)
    # Apply main simulation varying cortical fractions:
    stimfreqs=[1,5,10,20,40]
    params=[(freq,syntype,stpYN,inj) for freq in stimfreqs]
    results = p.map(moose_main,params)
    return dict(zip(stimfreqs,results))
Example #21
 def parallel(self):
     self.getInput()
     p = Pool(4)
     millis1 = int(round(time.time() * 1000))
     self.results = p.map(solve, self.input)
     millis2 = int(round(time.time() * 1000))
     print("Time in milliseconds: %d " % (millis2 - millis1))
     self.makeOutput()
Example #22
    def run_parallel(n_process):
        """
        Creates a process for each element in the array returned by ``get_configs()`` and runs the experiment
        corresponding to each element. The maximum number of processes to run in parallel is determined by ``n_process``.
        """

        p = Pool(n_process)
        p.map(run_config, ExperimentRunner.get_configs())
Example #23
def main():
    # update_item_list(SQL_USER, SQL_PASS, SQL_DATABASE)
    engine = create_engine('mysql+mysqlconnector://%s:%s@localhost/%s' % (SQL_USER, SQL_PASS, SQL_DATABASE))
    region_id = 10000002
    item_id_list = [int(index) for (index, row) in pd.read_sql_table('items', engine, index_col='item_id').iterrows()]
    data_write = partial(update_price_data, region_id)
    p = Pool(initializer=init_function, initargs=(SQL_USER, SQL_PASS, SQL_DATABASE))
    p.map(data_write, item_id_list)
Example #24
def main_crcm5_hl():
    label = "CRCM5_HL"

    period = Period(
        datetime(1980, 1, 1), datetime(2009, 12, 31)
    )


    pool = Pool(processes=12)

    input_params = []
    for month_start in period.range("months"):

        month_end = month_start.add(months=1).subtract(seconds=1)

        current_month_period = Period(month_start, month_end)
        current_month_period.months_of_interest = [month_start.month, ]


        vname_to_level_erai = {
            T_AIR_2M: VerticalLevel(1, level_kinds.HYBRID),
            U_WE: VerticalLevel(1, level_kinds.HYBRID),
            V_SN: VerticalLevel(1, level_kinds.HYBRID),
        }

        vname_map = {}
        vname_map.update(vname_map_CRCM5)

        vname_map = {}
        vname_map.update(vname_map_CRCM5)
        vname_map.update({
            default_varname_mappings.SNOWFALL_RATE: "U3"
        })

        label_to_config = OrderedDict([(
            label, {
                DataManager.SP_BASE_FOLDER: "/RECH2/huziy/coupling/GL_440x260_0.1deg_GL_with_Hostetler/Samples_selected",
                DataManager.SP_DATASOURCE_TYPE: data_source_types.SAMPLES_FOLDER_FROM_CRCM_OUTPUT_VNAME_IN_FNAME,
                DataManager.SP_INTERNAL_TO_INPUT_VNAME_MAPPING: vname_map,
                DataManager.SP_LEVEL_MAPPING: vname_to_level_erai,
                DataManager.SP_OFFSET_MAPPING: vname_to_offset_CRCM5,
                DataManager.SP_MULTIPLIER_MAPPING: vname_to_multiplier_CRCM5,
                "out_folder": "lake_effect_analysis_{}_{}-{}_monthly".format(label, period.start.year, period.end.year)
            }
        )])

        kwargs = dict(
            label_to_config=label_to_config,
            period=current_month_period,
            months_of_interest=current_month_period.months_of_interest,
            nprocs_to_use=1
        )

        print(current_month_period.months_of_interest)
        input_params.append(kwargs)

    # execute in parallel
    pool.map(monthly_func, input_params)
Example #25
def sanity_run_splitter(uncr_bin, config_list, input_files, formatted_files,
                        langs, tmp_dir, jobs):
    """
    writes the config options into a file and tests whether every input file is
    formatted so that it matches the content of the corresponding expected file


    Parameters
    ----------------------------------------------------------------------------
    :param uncr_bin: str
        path to the Uncrustify binary

    :param config_list: list< tuple< str, str > >
        a list of tuples containing option names and values

    :param input_files: list / tuple< str >
        a list containing paths to a files that are going to be formatted

    :param formatted_files: list / tuple< str >
        a list containing paths to files containing the expected contents

    :param langs: list / tuple< str > / None
        a list of languages for the files, used as Uncrustify's -l argument;
        can be None or shorter than the number of provided files

    :param tmp_dir: str
        the directory in which the config files will be written to

    :param jobs: int
        number of processes to use


    :return: bool
    ----------------------------------------------------------------------------
        True if all files generate correct results, False otherwise
    """

    file_len = len(input_files)
    if len(formatted_files) != file_len:
        raise Exception("len(input_files) != len(formatted_files)")

    gen_cfg_path = path_join(tmp_dir, "gen.cfg")
    with open(gen_cfg_path, 'w') as f:
        print_config(config_list, target_file_obj=f)

    lang_max_idx = -1 if langs is None else len(langs) - 1
    args = []

    for idx in range(file_len):
        lang = None if idx > lang_max_idx else langs[idx]

        args.append((formatted_files[idx], uncr_bin, gen_cfg_path,
                     input_files[idx], lang))

    pool = Pool(processes=jobs)
    sr = pool.map(sanity_run, args)

    return False not in sr
Example #26
def main_crcm5_nemo():
    label = "CRCM5_NEMO"

    period = Period(
        datetime(1980, 1, 1), datetime(2015, 12, 31)
    )


    pool = Pool(processes=10)

    input_params = []
    for month_start in period.range("months"):

        month_end = month_start.add(months=1).subtract(seconds=1)

        current_month_period = Period(month_start, month_end)
        current_month_period.months_of_interest = [month_start.month, ]


        vname_to_level_erai = {
            T_AIR_2M: VerticalLevel(1, level_kinds.HYBRID),
            U_WE: VerticalLevel(1, level_kinds.HYBRID),
            V_SN: VerticalLevel(1, level_kinds.HYBRID),
        }



        vname_map = {}
        vname_map.update(vname_map_CRCM5)

        vname_map = {}
        vname_map.update(vname_map_CRCM5)
        vname_map.update({
            default_varname_mappings.SNOWFALL_RATE: "SN"
        })

        label_to_config = OrderedDict([(
            label, {
                DataManager.SP_BASE_FOLDER: "/snow3/huziy/NEI/GL/erai0.75deg_driven/GL_with_NEMO_dtN_1h_and_30min/Samples",
                DataManager.SP_DATASOURCE_TYPE: data_source_types.SAMPLES_FOLDER_FROM_CRCM_OUTPUT,
                DataManager.SP_INTERNAL_TO_INPUT_VNAME_MAPPING: vname_map,
                DataManager.SP_LEVEL_MAPPING: vname_to_level_erai,
                DataManager.SP_OFFSET_MAPPING: vname_to_offset_CRCM5,
                DataManager.SP_MULTIPLIER_MAPPING: vname_to_multiplier_CRCM5,
                DataManager.SP_VARNAME_TO_FILENAME_PREFIX_MAPPING: default_varname_mappings.vname_to_fname_prefix_CRCM5,
                "out_folder": "lake_effect_analysis_{}_{}-{}_monthly".format(label, period.start.year, period.end.year)
            }
        )])

        kwargs = dict(
            label_to_config=label_to_config, period=current_month_period, months_of_interest=current_month_period.months_of_interest, nprocs_to_use=1
        )

        print(current_month_period.months_of_interest)
        input_params.append(kwargs)

    # execute in parallel
    pool.map(monthly_func, input_params)
Example #27
 def run(self):
     if(self.fileName[0] != None and self.fileName[0] !=""):
         # p = Process(target=self.creationPDF, args=(self.fileName[0],))
         # self.creationPDF(self.fileName[0])
         # p.start()
         pool = Pool(processes=4)  # start 4 worker processes
         result = pool.apply_async(self.creationPDF, [self.fileName[0]])
     else:
         print("Sauvegarde annule")
Example #28
def run_jar_new_thread(config_path, **kwargs):
    """
    kwargs - arguments dictionary of run_jar()
    """
    pool = Pool(processes=1)
    __import__("ipdb").set_trace()
    result = pool.apply_async(_run_jar_with_config, [config_path], kwargs['kwargs'])
    job_id = result.get()
    return job_id
Example #29
 def f():
     setting = Setting().parse_args(argv, stdout)
     n = min(len(setting.tasks), setting.parallelism)
     if n <= 1:
         ret = map(run_task, setting.tasks)
     else:
         pool = Pool(n)
         ret = pool.map(run_task, setting.tasks)
     return max(ret)
Example #30
def main_future(nprocs=20):

    period = Period(
        datetime(2079, 1, 1), datetime(2100, 12, 31)
    )

    label = "CRCM5_NEMO_fix_TT_PR_CanESM2_RCP85_{}-{}_monthly".format(period.start.year, period.end.year)

    vname_to_level_erai = {
        T_AIR_2M: VerticalLevel(1, level_kinds.HYBRID),
        U_WE: VerticalLevel(1, level_kinds.HYBRID),
        V_SN: VerticalLevel(1, level_kinds.HYBRID),
    }

    base_folder = "/scratch/huziy/Output/GL_CC_CanESM2_RCP85/coupled-GL-future_CanESM2/Samples"

    vname_map = {}
    vname_map.update(vname_map_CRCM5)
    # vname_map[default_varname_mappings.SNOWFALL_RATE] = "SN"
    vname_map[default_varname_mappings.SNOWFALL_RATE] = "XXX"



    pool = Pool(processes=nprocs)

    input_params = []
    for month_start in period.range("months"):

        month_end = month_start.add(months=1).subtract(seconds=1)

        current_month_period = Period(month_start, month_end)
        current_month_period.months_of_interest = [month_start.month, ]

        label_to_config = OrderedDict([(
            label, {
                # "base_folder": "/HOME/huziy/skynet3_rech1/CRCM5_outputs/cc_canesm2_rcp85_gl/coupled-GL-future_CanESM2/Samples",
                DataManager.SP_BASE_FOLDER: base_folder,
                DataManager.SP_DATASOURCE_TYPE: data_source_types.SAMPLES_FOLDER_FROM_CRCM_OUTPUT,
                DataManager.SP_INTERNAL_TO_INPUT_VNAME_MAPPING: vname_map,
                DataManager.SP_LEVEL_MAPPING: vname_to_level_erai,
                DataManager.SP_OFFSET_MAPPING: vname_to_offset_CRCM5,
                DataManager.SP_MULTIPLIER_MAPPING: vname_to_multiplier_CRCM5,
                DataManager.SP_VARNAME_TO_FILENAME_PREFIX_MAPPING: vname_to_fname_prefix_CRCM5,
                "out_folder": "lake_effect_analysis_{}_{}-{}".format(label, period.start.year, period.end.year)
            }
        )])

        kwargs = dict(
            label_to_config=label_to_config, period=current_month_period, months_of_interest=current_month_period.months_of_interest, nprocs_to_use=1
        )

        print(current_month_period.months_of_interest)
        input_params.append(kwargs)

    # execute in parallel
    pool.map(monthly_func, input_params)
Example #31
        x, y = x
        time.sleep(3)
        print(os.getpid(), x, y, x + y)
        return x

    def testresult(self):
        poola = Pool(processes=3)
        results = poola.map(funca, [[1, 2], (2, 3), (3, 4)])
        print(results)

    def testresult2(self):
        print(os.getpid())
        results = list()
        # oooa = OOO(3)
        timestart = time.time()
        poola = Pool(processes=2)
        for i in range(5):
            # map_async returns an AsyncResult, so the wait() calls below work
            result = self._poola.map_async(AAAAA.ooofunc, [(1, 2), (2, 3), (4, 4)])
            results.append(result)
        for result in results:
            result.wait()
        # time.sleep(100)
        # print (result.get())
        timeend = time.time()
        print(timeend - timestart)


if __name__ == '__main__':
    # testresult2()
    AAAAA(Pool(3), 3).testresult2()
Example #32
 def apply_async(self, func, args=(), kwds={}, callback=None):
     return Pool.apply_async(self, LogExceptions(func), args, kwds,
                             callback)
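
LogExceptions is referenced but not defined in this snippet; a common shape for such a wrapper, assuming its purpose is to surface worker tracebacks that apply_async would otherwise hide until get() is called, is sketched below:

import traceback

class LogExceptions(object):
    """Wrap a callable so any exception raised in a worker process is printed
    with its full traceback before being re-raised."""

    def __init__(self, func):
        self.func = func

    def __call__(self, *args, **kwargs):
        try:
            return self.func(*args, **kwargs)
        except Exception:
            traceback.print_exc()  # make the failure visible immediately
            raise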
Example #33
def read_files_batched(filenames,
                       file_batch_size=8192,
                       file_batch_shuffle=False,
                       max_batches=math.inf,
                       return_mode='array',
                       n_jobs=-1,
                       max_batches_in_queue=1000,
                       max_queue_wait_seconds=0.5,
                       pd_kwargs={}):
    """Read multiple files in parallel."""
    def listify_generator(func, *args, **kwargs):
        listified_generator = list(func(*args, **kwargs))
        return (listified_generator)

    if n_jobs == -1:
        n_jobs = cpu_count() - 1
        n_jobs = min((n_jobs, len(filenames)))

    # Parallel
    if n_jobs > 1:

        # Batch queue, appended in callback
        batch_queue = deque(maxlen=max_batches_in_queue)

        def callback(batch):
            while True:
                if len(batch_queue) < max_batches_in_queue:
                    batch_queue.append(batch)
                    break
                else:
                    time.sleep(0.1)

        # Create processes
        p = Pool(n_jobs)
        for filename in filenames:
            p.apply_async(listify_generator, (read_file_batched, filename),
                          dict(file_batch_size=file_batch_size,
                               file_batch_shuffle=file_batch_shuffle,
                               max_batches=max_batches,
                               return_mode=return_mode,
                               pd_kwargs=pd_kwargs),
                          callback=callback)

        # Yield from queue
        keep_trying = True
        last_non_empty_batch = None
        while keep_trying:
            if len(batch_queue) > 0:
                for batch in batch_queue.popleft():
                    yield batch
                last_non_empty_batch = time.monotonic()  # time.clock() was removed in Python 3.8

            if len(batch_queue) == 0:
                if last_non_empty_batch is not None:
                    if (time.monotonic() - last_non_empty_batch
                            >= max_queue_wait_seconds):
                        keep_trying = False
        p.close()
        p.join()

    # Single process
    else:
        for filename in filenames:
            for batch in read_file_batched(
                    filename,
                    file_batch_size=file_batch_size,
                    file_batch_shuffle=file_batch_shuffle,
                    max_batches=max_batches,
                    return_mode=return_mode,
                    pd_kwargs=pd_kwargs):
                yield batch

Example #34
def prime_factor(value):
    factors = []
    for divisor in range(2, value-1):
        quotient, remainder = divmod(value, divisor)
        if not remainder:
            factors.extend(prime_factor(divisor))
            factors.extend(prime_factor(quotient))
            break
    else:
        # no divisor evenly divides value, so value itself is prime
        factors = [value]
    return factors


if __name__ == '__main__':
    pool = Pool()  # defaults to create cpu_count() processes
    to_factor = [
        random.randint(100000, 50000000) for _ignored in range(20)
    ]
    # pool pickles each element in to_factor, pass to available process, which executes
    # when done, result is pickled and passed back to pool.
    # after all done, results is passed back to original process, which has been waiting
    # pools also have map_async method, which calls results.get() when done
    # async with other methods like ready() and wait()
    # also apply_async method to queue up jobs
    # pools can also be close(), then all further tasks are refused
    # or terminate(), which also ignores all waiting jobs in pool,
    # (but running jobs are allowed to complete.)
    results = pool.map(prime_factor, to_factor)
    for value, factors in zip(to_factor, results):
        print('The factors of {} are {}'.format(value, factors))
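
The comments above mention map_async and the AsyncResult helpers (ready(), wait(), get()); here is a small self-contained sketch of that asynchronous variant, separate from the factoring example:

from multiprocessing import Pool

def square(x):
    return x * x

if __name__ == '__main__':
    with Pool() as pool:
        async_result = pool.map_async(square, range(10))  # returns an AsyncResult
        async_result.wait()                               # block until all tasks finish
        if async_result.ready():                          # results are available
            print(async_result.get())                     # [0, 1, 4, ..., 81]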
Example #35
class EvolutionaryStrategy():
    '''
    EvolutionStrategy trainer
    '''

    evolution = 0
    _previous_scores = -1

    def __init__(self,
                 name,
                 model_function,
                 env_function,
                 population_size=50,
                 mutation=3.0,
                 mutation_rate=0.80,
                 mutation_decay=True,
                 mutation_decay_rate=0.99,
                 variable_crossed_progeny=True,
                 selection_cutoff=0.20,
                 selection_cutoff_decay=True,
                 selection_cutoff_decay_rate=0.95,
                 test_episodes=5):
        '''
        Constructor

        Arguments:
            name -- model name
            model_function -- function which returns the desired keras model
            env_function -- function which returns a new environment
            population_size -- size of the population to breed
            mutation -- amount of possible mutation for a given feature
            mutation_rate -- how many possible features to mutate
            mutation_decay -- decay the mutation_rate and mutation?
            mutation_decay_rate -- decay rate for mutation_rate and mutation
            variable_crossed_progeny -- non uniform dependence of progeny on parents,
                                        based on their individual performance
            selection_cutoff -- selection_cutoff
            selection_cutoff_decay -- decay the number of candidates selected from
                                        the given population?
            selection_cutoff_decay_rate -- decay rate for the selection_cutoff
            test_episodes -- number of episodes to evaluate the given candidate over
        '''

        self.name = name
        self.env_function = env_function

        self.model_file, self.weights_file = _create_model(
            self.name, model_function, self.env_function)

        self.population_size = population_size

        self.variable_crossed_progeny = variable_crossed_progeny
        self.test_episodes = test_episodes

        self.mutation_rate = mutation_rate
        self.mutation_decay_rate = mutation_decay_rate if mutation_decay else 1
        self.mutation = mutation

        self.selection_cutoff_decay_rate = selection_cutoff_decay_rate \
            if selection_cutoff_decay else 1
        self.selection_cutoff = selection_cutoff

        self.pool = Pool(processes=mp.cpu_count())

        args = [[self.weights_file]]
        results = []
        for result in self.pool.imap(load_worker, args):
            results.append(result)
        weights = results[0]

        self.population = []
        for _ in range(self.population_size):
            candidate = []
            for weight in weights:
                candidate.append(np.random.randn(*weight.shape))
            self.population.append(candidate)
        self.n_layers = len(self.population[0])

        os.remove(self.weights_file)

    def evolve_step(self, return_population=False):
        '''
        Complete one evolution step, including selection, breeding, and mutation.

        Arguments:
            return_population -- return the complete evolved population

        Returns
            return_population is True -- complete evolved population,
                                            average performance of evolution
            return_population is False -- top performing candidate,
                                            average performance of evolution
        '''

        self.evolution += 1
        print('EVOLUTION {}'.format(self.evolution))

        print('\tselecting from population...')
        if self.evolution - 1:
            self.selection_cutoff *= self.selection_cutoff_decay_rate
            self.mutation_rate *= self.mutation_decay_rate
            self.mutation *= self.mutation_decay_rate

        n_selected = int(self.population_size * self.selection_cutoff)

        if not isinstance(self._previous_scores, int):
            scores = self._previous_scores
        else:
            scores = self._evaluate(self.population)

        lucky_factor = 0.20
        top_selection, bottom_selection = int(
            n_selected * (1 - lucky_factor)), int(n_selected * lucky_factor)
        scores, scrap = scores[:n_selected], scores[n_selected:]
        scores = scores[np.random.choice(np.arange(n_selected), top_selection)]
        scrap = scrap[np.random.choice(
            np.arange(self.population_size - n_selected), bottom_selection)]
        selected_candidates = np.vstack((scores, scrap))

        selected_population = []
        for index in np.array(selected_candidates[:, 0], dtype=np.int32):
            selected_population.append(self.population[index])

        print('\tbreeding from selected population...')
        n_bred = self.population_size
        progeny = self._breed(selected_population, n_bred)

        if isinstance(progeny, int):  # none survived
            progeny = self.population

        print('\tevaluating progeny...')
        self.population = progeny
        generation_evaluation = self._evaluate(self.population)
        best_performance = generation_evaluation[0, 1]
        average_performance = np.average(generation_evaluation[:, 1])

        print('evolution {}: top_generation_performance = {}, '
              'average_generation_performance = {}'.format(
                  self.evolution, best_performance, average_performance))

        self._previous_scores = generation_evaluation

        if not return_population:
            return self.population[int(generation_evaluation[0, 0])], \
                average_performance
        else:
            return self.population, average_performance

    def _evaluate(self, population, test_episodes=None):
        '''
        Evaluate the given population

        Arguments:
            population -- the population to evaluate
            test_episodes -- the number of episodes to evaluate each candidate over

        Returns:
            sorted list of [average_score, index] for each candidate in the population
        '''

        if not test_episodes:
            test_episodes = self.test_episodes

        args = []
        for index, candidate in enumerate(population):
            args.append([
                candidate, self.model_file, self.env_function, test_episodes,
                index
            ])

        scores = []
        for result in self.pool.imap_unordered(evaluate_worker, args):
            scores.append(result)

        scores = np.array(scores)
        scores = scores[scores[:, 1].argsort(axis=0)][::-1]

        return scores

    def _breed(self, population, progeny_to_generate):
        '''
            Breed within the given population to generate progeny.
            Involves crossing, mutating.

            Arguments:
                population -- the population to breed within
                progeny_to_generate -- the number of progeny to generate

            Returns:
                the bred (cross, mutated) progeny
        '''

        if len(population) == 0:
            return -1

        args = []
        for _ in range(progeny_to_generate):
            left, right = np.random.choice(len(population), 2)
            parents = [population[left], population[right]]
            args.append([
                parents, self.n_layers, self.mutation_rate, self.mutation,
                self.variable_crossed_progeny, self.model_file,
                self.env_function, self.test_episodes
            ])

        bred = []
        for progeny in self.pool.imap_unordered(breed_worker, args):
            bred.append(progeny)

        return np.array(bred)

    def _mutate(self, progeny):
        '''
        Mutates the given progeny

        Arguments:
            progeny -- progeny to mutate

        Returns:
            the mutated progeny
        '''

        for layer in np.random.choice(
                np.arange(self.n_layers),
                int(np.random.sample() * (self.n_layers))):
            original_shape = np.array(progeny[layer]).shape
            flat = np.reshape(progeny[layer], -1)

            padded_layer = False
            if len(original_shape) == 1:
                original_shape += (1, )
                padded_layer = True

            layer_mutations = int(self.mutation_rate *
                                  np.multiply(*original_shape))
            while layer_mutations != 0:
                chromosome_index = np.random.randint(len(flat))
                chromosome = flat[chromosome_index]

                low = chromosome - self.mutation * chromosome
                high = chromosome + self.mutation * chromosome
                flat[chromosome_index] = (high -
                                          low) * np.random.sample() + low
                layer_mutations -= 1

            if padded_layer:
                original_shape = original_shape[:-1]

            progeny[layer] = flat.reshape(original_shape)

        return progeny

    def _crossover(self, parents):
        '''
        Crossovers the given parents to generate a progeny

        Arguments:
            parents -- [parent1, parent2] to cross over

        Returns:
            the crossed progeny
        '''

        if not self.variable_crossed_progeny:
            crossover_rate = 0.50
            crossover_p = [crossover_rate, 1 - crossover_rate]
        else:
            scores = self._evaluate(parents)
            if scores[0, 0] == 0:
                p_left, p_right = scores[0, 1], scores[1, 1]
            else:
                p_left, p_right = scores[1, 1], scores[0, 1]

            crossover_p = [
                p_left / (p_left + p_right), p_right / (p_left + p_right)
            ]

        left, right = parents[0], parents[1]

        progeny = []
        for layer, _ in enumerate(left):
            original_shape = np.array(left[layer]).shape
            left_flat, right_flat = np.reshape(left[layer], -1), np.reshape(
                right[layer], -1)

            progeny_layer = []
            for index, _ in enumerate(left_flat):
                chance = np.random.choice(2, p=crossover_p)

                if chance:
                    progeny_layer.append(right_flat[index])
                else:
                    progeny_layer.append(left_flat[index])

            progeny.append(np.reshape(progeny_layer, original_shape))

        return progeny

    def performance(self, candidate, test_episodes=50, get_rewards=False):
        '''
        Evaluate the performance of the given candidate over the given episodes

        Arguments:
            candidate -- the candidate to evaluate
            test_episodes -- the number of episodes to evaluate the candidate over
            get_rewards -- return all the rewards?

        Returns:
            get_rewards is False -- None
            get_rewards is True -- rewards of the candidate over test_episodes
        '''

        args = [[candidate, self.model_file, self.env_function, test_episodes]]

        rewards = []
        for result in self.pool.imap_unordered(performance_worker, args):
            rewards.append(result)

        print('\nmodel_average_performance over {} episodes = {}'.format(
            test_episodes, np.average(rewards)))
        print('model_peak_performance = {}, model_min_performance = {}'.format(
            np.max(rewards), np.min(rewards)))

        if get_rewards:
            return rewards
Example #36
def spider():
    url_data = get_all_urls()
    pool = Pool(10)
    pool.map(parse, url_data)
Example #37
    'Retrieve a target url and return the download status as a string'
    filename = target.rsplit('/', 1)[-1]
    fullname = os.path.join(dirname, filename)
    r = urlretrieve(target, fullname, etags)
    if r.code != 200:
        return '%3d  %-16s %s' % (r.code, r.msg, target)
    compressed = '*' if r.compressed else ' '
    written = '(updated)' if r.written else '(current)'
    return '%3d%1s %-16s %-55s --> %-25s %s ' % \
           (r.code, compressed, r.msg, target, fullname, written)


if __name__ == '__main__':
    try:
        os.mkdir(dirname)
    except OSError:
        pass

    links_url = 'https://dl.dropboxusercontent.com/u/3967849/%s/links.txt' % class_id
    print((' Source: %s ' % links_url).center(117, '='))
    print((' Starting download at %s ' % time.ctime()).center(117))

    etags = dbm.dumb.open(os.path.join(dirname, 'etag_db'))
    links_text = urllib.request.urlopen(links_url).read().decode('utf-8')
    targets = re.findall(r'^http(?:s?)://\S+', links_text, re.M)
    mapper = Pool(25).imap_unordered
    mapper = map
    for line in mapper(download, targets):
        print(line)
    etags.close()
Example #38
              'REMAINING', duration_to_string(remaining_time),
               'TOTAL', duration_to_string(total_time))

        frogged_filename = util.filename_without_extension(filename, '.txt')

        with open(OUTPUT_FOLDER+frogged_filename+'.frog.out', 'w') as f:
            f.write(output)

if __name__ == '__main__':
    INPUT_FOLDER = '../data/plaintext/'
    OUTPUT_FOLDER = '../data/frogged/'

    files = util.todo_filepaths(INPUT_FOLDER, '.txt', OUTPUT_FOLDER, '.frog.out')
    files = sorted(files)[::-1]
    if os.path.exists('../data/frog_todo.p'):
        print ("USING FROG TODO!")
        with open('../data/frog_todo.p', 'rb') as f:
            files = pickle.load(f)
            files = [s.replace('\\','/') for s in files]


    n_processes = 2
    print ("N_CPU", util.CPU_COUNT, " N PROCESSES", n_processes)

    file_chunks = util.split(files, n_processes)

    pool = Pool(processes=n_processes)
    pool.map(frog_process_files, file_chunks)
    pool.close()
    pool.join()  # close() must be called before join()
Example #39
                      '/wiki/Special:Export/Template:Periodic_table')
    categories = []
    params = []
    for row in article.get_table('table 1'):
        for key, value in row.items():
            segments = [segment.strip() for segment in value.split(';')]
            if len(segments) >= 7:
                if segments[5].lower() not in categories:
                    categories.append(segments[5].lower())
                params.append(
                    (segments[1], segments[7].replace(' ', '_')
                     if len(segments) > 7 else segments[1].capitalize(),
                     ionization_energies, element_names,
                     categories.index(segments[5].lower())))

    pool = Pool(processes=multiprocessing.cpu_count() * 2)

    json_data = pool.starmap(parse, params)
    pool.close()
    pool.join()

    # Save

    json_data.sort(key=lambda k: int(k['number']))

    with open(OUTPUT_JSON, 'w+') as outfile:
        json.dump(json_data,
                  outfile,
                  sort_keys=True,
                  indent=4,
                  ensure_ascii=False)
Example #40
            file_path = '{0}/{1}.{2}'.format(item.get('title'),
                                             md5(response.content).hexdigest(),
                                             'jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(response.content)

            else:
                print('已经下载了')

    except requests.ConnectionError:
        print('保存图片失败')


def main(offset):
    json = get_page(offset)
    for item in get_images(json):
        print(item)
        save_image(item)


GROUP_START = 1
GROUP_END = 20

if __name__ == '__main__':
    pool = Pool()
    groups = (x * 20 for x in range(GROUP_START, GROUP_END + 1))
    pool.map(main, groups)
    pool.close()
    pool.join()
Example #41
    return counter


def shard_batches(shard):
    for entry in os.scandir(shard):
        if entry.name.isdigit():
            yield entry.path


def shard_size_per_domain(shard, pool):
    # Since 3.8 you can do this with sum(..., start=Counter()) I think?
    totals = Counter()
    for counter in pool.imap_unordered(batch_size_per_domain,
                                       shard_batches(shard)):
        totals += counter
    return totals


pool = Pool(8)

for shard in sys.argv[1:]:
    totals = shard_size_per_domain(shard, pool)
    # for domain, size in totals.most_common():
    # 	print("{}\t{}".format(domain, size))

    with gzip.open(os.path.join(shard, 'sizes.gz'), 'wb') as fh, \
    TextIOWrapper(fh) as fout:
        for domain, size in totals.most_common():
            print("{}\t{}".format(domain, size), file=fout)
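
As the comment in shard_size_per_domain notes, Python 3.8+ lets sum() take a keyword start value, so the running Counter total can be written in one expression; a sketch of that equivalent (same helper names as above):

def shard_size_per_domain_38(shard, pool):
    # Requires Python 3.8+ for the start= keyword; behaviour matches the loop above.
    return sum(pool.imap_unordered(batch_size_per_domain, shard_batches(shard)),
               start=Counter())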
        print(f"failure to download or save {str(fname_out)}")


# ### list of GCMs
list_GCMs = ['ECMWF', 'UKMO', 'METEO_FRANCE', 'CMCC', 'DWD']

# ### variable name
varname = 'T2M'

# year = 2007
# months = list(range(8, 13))

# for month in months:
#     # constructs the list of arguments for the function `fetch_CDS`
#     args = [(GCM, year, month, varname, opath) for GCM in list_GCMs]
#     # initialise the pool of workers
#     p = Pool(len(args))
#     # send the function to the workers
#     p.map(fetch_CDS, args)

### loop over the years and months

for year in range(2007, 2016 + 1):
    for month in range(1, 12 + 1):
        # constructs the list of arguments for the function `fetch_CDS`
        args = [(GCM, year, month, varname, opath) for GCM in list_GCMs]
        # initialise the pool of workers
        p = Pool(len(args))
        # send the function to the workers
        p.map(fetch_CDS, args)
Example #43
def parallel(team):
    # k, df = group
    df = pd.read_csv("e_league_extracted.csv")
    df = df[((df['HomeTeam'] == team) | (df['AwayTeam'] == team))]
    df['Date'] = pd.to_datetime(df['Date'])

    days1 = [10, 20, 30, 45]
    days = list(range(60, 800, 30))
    days = days1 + days
    timeDeltaObjs = [pd.Timedelta(days=x) for x in days]
    for index, row in df.iterrows():
        if team == row.HomeTeam:
            for idx in range(len(days)):
                cond = ((team == row.HomeTeam) & (row.FTR == "H")) & (
                    df.Date >
                    (row.Date - timeDeltaObjs[idx])) & (df.Date < row.Date)
                df.loc[index, "home_count_days_" + str(days[idx])] = sum(cond)
        if team == row.AwayTeam:
            for idx in range(len(days)):
                cond = ((team == row.AwayTeam) & (row.FTR == "A")) & (
                    df.Date >
                    (row.Date - timeDeltaObjs[idx])) & (df.Date < row.Date)
                df.loc[index, "away_count_days_" + str(days[idx])] = sum(cond)
    return df


# df.to_csv("e_league_feature_extracted.csv", index = False)
with Pool(40) as pool:
    result = pd.concat(pool.map(parallel, intersectList))

result.to_csv("e_league_feature_extracted.csv", index=False)
Example #44
    def __init__(self,
                 name,
                 model_function,
                 env_function,
                 population_size=50,
                 mutation=3.0,
                 mutation_rate=0.80,
                 mutation_decay=True,
                 mutation_decay_rate=0.99,
                 variable_crossed_progeny=True,
                 selection_cutoff=0.20,
                 selection_cutoff_decay=True,
                 selection_cutoff_decay_rate=0.95,
                 test_episodes=5):
        '''
        Constructor

        Arguments:
            name -- model name
            model_function -- function which returns the desired keras model
            env_function -- function which returns a new environment
            population_size -- size of the population to breed
            mutation -- amount of possible mutation for a given feature
            mutation_rate -- how many possible features to mutate
            mutation_decay -- decay the mutation_rate and mutation?
            mutation_decay_rate -- decay rate for mutation_rate and mutation
            variable_crossed_progeny -- non uniform dependence of progeny on parents,
                                        based on their individual performance
            selection_cutoff -- selection_cutoff
            selection_cutoff_decay -- decay the number of candidates selected from
                                        the given population?
            selection_cutoff_decay_rate -- decay rate for the selection_cutoff
            test_episodes -- number of episodes to evaluate the given candidate over
        '''

        self.name = name
        self.env_function = env_function

        self.model_file, self.weights_file = _create_model(
            self.name, model_function, self.env_function)

        self.population_size = population_size

        self.variable_crossed_progeny = variable_crossed_progeny
        self.test_episodes = test_episodes

        self.mutation_rate = mutation_rate
        self.mutation_decay_rate = mutation_decay_rate if mutation_decay else 1
        self.mutation = mutation

        self.selection_cutoff_decay_rate = selection_cutoff_decay_rate \
            if selection_cutoff_decay else 1
        self.selection_cutoff = selection_cutoff

        self.pool = Pool(processes=mp.cpu_count())

        args = [[self.weights_file]]
        results = []
        for result in self.pool.imap(load_worker, args):
            results.append(result)
        weights = results[0]

        self.population = []
        for _ in range(self.population_size):
            candidate = []
            for weight in weights:
                candidate.append(np.random.randn(*weight.shape))
            self.population.append(candidate)
        self.n_layers = len(self.population[0])

        os.remove(self.weights_file)
Example #45
    def validate(self,
                 do_mirroring: bool = True,
                 use_sliding_window: bool = True,
                 step_size: float = 0.5,
                 save_softmax: bool = True,
                 use_gaussian: bool = True,
                 overwrite: bool = True,
                 validation_folder_name: str = 'validation_raw',
                 debug: bool = False,
                 all_in_gpu: bool = False,
                 segmentation_export_kwargs: dict = None,
                 run_postprocessing_on_folds: bool = True):

        current_mode = self.network.training
        self.network.eval()

        assert self.was_initialized, "must initialize, ideally with checkpoint (or train first)"
        if self.dataset_val is None:
            self.load_dataset()
            self.do_split()

        if segmentation_export_kwargs is None:
            if 'segmentation_export_params' in self.plans.keys():
                force_separate_z = self.plans['segmentation_export_params'][
                    'force_separate_z']
                interpolation_order = self.plans['segmentation_export_params'][
                    'interpolation_order']
                interpolation_order_z = self.plans[
                    'segmentation_export_params']['interpolation_order_z']
            else:
                force_separate_z = None
                interpolation_order = 1
                interpolation_order_z = 0
        else:
            force_separate_z = segmentation_export_kwargs['force_separate_z']
            interpolation_order = segmentation_export_kwargs[
                'interpolation_order']
            interpolation_order_z = segmentation_export_kwargs[
                'interpolation_order_z']

        output_folder = join(self.output_folder, validation_folder_name)
        maybe_mkdir_p(output_folder)

        if do_mirroring:
            mirror_axes = self.data_aug_params['mirror_axes']
        else:
            mirror_axes = ()

        pred_gt_tuples = []

        export_pool = Pool(2)
        results = []

        transpose_backward = self.plans.get('transpose_backward')

        for k in self.dataset_val.keys():
            properties = load_pickle(self.dataset[k]['properties_file'])
            data = np.load(self.dataset[k]['data_file'])['data']

            # concat segmentation of previous step
            seg_from_prev_stage = np.load(
                join(self.folder_with_segs_from_prev_stage,
                     k + "_segFromPrevStage.npz"))['data'][None]

            print(data.shape)
            data[-1][data[-1] == -1] = 0
            data_for_net = np.concatenate(
                (data[:-1],
                 to_one_hot(seg_from_prev_stage[0], range(1,
                                                          self.num_classes))))

            softmax_pred = self.predict_preprocessed_data_return_seg_and_softmax(
                data_for_net,
                do_mirroring=do_mirroring,
                mirror_axes=mirror_axes,
                use_sliding_window=use_sliding_window,
                step_size=step_size,
                use_gaussian=use_gaussian,
                all_in_gpu=all_in_gpu,
                mixed_precision=self.fp16)[1]

            if transpose_backward is not None:
                transpose_backward = self.plans.get('transpose_backward')
                softmax_pred = softmax_pred.transpose(
                    [0] + [i + 1 for i in transpose_backward])

            fname = Path(properties['list_of_data_files'][0]).parts[-1][:-12]

            if save_softmax:
                softmax_fname = join(output_folder, fname + ".npz")
            else:
                softmax_fname = None
            """There is a problem with python process communication that prevents us from communicating obejcts 
            larger than 2 GB between processes (basically when the length of the pickle string that will be sent is 
            communicated by the multiprocessing.Pipe object then the placeholder (\%i I think) does not allow for long 
            enough strings (lol). This could be fixed by changing i to l (for long) but that would require manually 
            patching system python code. We circumvent that problem here by saving softmax_pred to a npy file that will 
            then be read (and finally deleted) by the Process. save_segmentation_nifti_from_softmax can take either 
            filename or np.ndarray and will handle this automatically"""
            if np.prod(softmax_pred.shape) > (2e9 / 4 *
                                              0.85):  # *0.85 just to be save
                np.save(fname + ".npy", softmax_pred)
                softmax_pred = fname + ".npy"

            results.append(
                export_pool.starmap_async(
                    save_segmentation_nifti_from_softmax,
                    ((softmax_pred, join(output_folder, fname + ".nii.gz"),
                      properties, interpolation_order,
                      self.regions_class_order, None, None, softmax_fname,
                      None, force_separate_z, interpolation_order_z), )))

            pred_gt_tuples.append([
                join(output_folder, fname + ".nii.gz"),
                join(self.gt_niftis_folder, fname + ".nii.gz")
            ])

        _ = [i.get() for i in results]

        task = Path(self.dataset_directory).parts[-1]
        job_name = self.experiment_name
        _ = aggregate_scores(pred_gt_tuples,
                             labels=list(range(self.num_classes)),
                             json_output_file=join(output_folder,
                                                   "summary.json"),
                             json_name=job_name,
                             json_author="Fabian",
                             json_description="",
                             json_task=task)

        if run_postprocessing_on_folds:
            # in the old nnU-Net we would stop here. Now we add a postprocessing step that can remove everything
            # except the largest connected component for each class. To see whether this improves results, we do this
            # for all classes and then rerun the evaluation. Classes for which this resulted in an improved Dice score
            # will have the postprocessing applied during inference as well
            self.print_to_log_file("determining postprocessing")
            determine_postprocessing(self.output_folder,
                                     self.gt_niftis_folder,
                                     validation_folder_name,
                                     final_subf_name=validation_folder_name +
                                     "_postprocessed",
                                     debug=debug)
            # after this, the final predictions for the validation set can be found in validation_folder_name_base + "_postprocessed"
            # They are always in that folder, even if no postprocessing was applied!

        # Determining postprocessing on a per-fold basis may be OK for this fold, but what if another fold finds a
        # different postprocessing to be better? In that case we need to consolidate. At the time the consolidation is
        # done we won't know what self.gt_niftis_folder was, so we copy all the niftis into a separate folder now to
        # be used later
        gt_nifti_folder = join(self.output_folder_base, "gt_niftis")
        maybe_mkdir_p(gt_nifti_folder)
        for f in subfiles(self.gt_niftis_folder, suffix=".nii.gz"):
            success = False
            attempts = 0
            while not success and attempts < 10:
                try:
                    shutil.copy(f, gt_nifti_folder)
                    success = True
                except OSError:
                    attempts += 1
                    sleep(1)

        self.network.train(current_mode)
        export_pool.close()
        export_pool.join()
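
The docstring above describes a workaround for the roughly 2 GB pickling limit of multiprocessing pipes: oversized softmax arrays are written to a .npy file and the worker receives the filename instead of the array. The following is only a minimal sketch of that pattern; process_array and the size threshold are placeholders, not the original nnU-Net helpers.

import os

import numpy as np
from multiprocessing import Pool


def process_array(arr_or_path, out_path):
    # Mirror the idea of save_segmentation_nifti_from_softmax: accept either
    # an in-memory array or a path to a temporary .npy file.
    if isinstance(arr_or_path, str):
        arr = np.load(arr_or_path)
        os.remove(arr_or_path)  # clean up the temporary file once loaded
    else:
        arr = arr_or_path
    np.save(out_path, arr.astype(np.float16))


if __name__ == '__main__':
    softmax = np.random.rand(4, 64, 64, 64).astype(np.float32)
    payload = softmax
    # If the pickled payload would approach the ~2 GB pipe limit, spill it to
    # disk and send the filename instead of the array.
    if softmax.nbytes > (2e9 * 0.85):
        np.save('tmp_softmax.npy', softmax)
        payload = 'tmp_softmax.npy'
    with Pool(2) as export_pool:
        result = export_pool.starmap_async(process_array, [(payload, 'out.npy')])
        result.get()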
Beispiel #46
0
    def testresult(self):
        poola = Pool(processes=3)
        results = poola.map(funca, [[1, 2], (2, 3), (3, 4)])
        print(results)
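
Note that Pool.map hands each element of the iterable to funca as a single argument (a list or tuple here), so funca must accept one sequence. If the intent were to unpack each pair into two positional arguments, Pool.starmap would be the usual choice; a small sketch with a placeholder add in place of funca:

from multiprocessing import Pool


def add(a, b):
    return a + b


if __name__ == '__main__':
    with Pool(processes=3) as pool:
        # map would call add([1, 2]) and fail with a TypeError;
        # starmap unpacks each pair into positional arguments.
        print(pool.starmap(add, [(1, 2), (2, 3), (3, 4)]))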
Beispiel #47
0
    def map_async(self, func, iterable, chunksize=None, callback=None):
        return Pool.map_async(self, LogExceptions(func), iterable, chunksize,
                              callback)
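
LogExceptions is not shown in this example. A minimal sketch of what such a wrapper presumably does (an assumption about its intent, not the original implementation): log the full traceback inside the worker before re-raising, so failures are visible even if the caller never calls get() on the async result.

import logging
import traceback


class LogExceptions:
    """Wrap a callable so exceptions raised inside a pool worker are logged."""

    def __init__(self, func):
        self.func = func

    def __call__(self, *args, **kwargs):
        try:
            return self.func(*args, **kwargs)
        except Exception:
            # Log the traceback in the worker; otherwise it only surfaces
            # when (and if) the async result's get() is called.
            logging.error(traceback.format_exc())
            raise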
Beispiel #48
0
    def __init__(self, function, n_process=None):
        self.function = self._construct_filter_function(function)
        self.pool = Pool(processes=n_process)
Beispiel #49
0
def hotspider():
    pool = Pool(10)
    pool.map(parse, JSON_INDEX_URLS)
Beispiel #50
0
def interpolate_bs(kpts,
                   interp_params,
                   iband,
                   sgn=None,
                   method="boltztrap1",
                   scissor=0.0,
                   matrix=None,
                   n_jobs=1,
                   return_mass=True):
    """
    Args:
        kpts ([1x3 array]): list of fractional coordinates of k-points
        interp_params (tuple): a tuple or list containing positional
            arguments fed to the interpolation method.
            e.g. for boltztrap1:
                engre, nwave, nsym, nstv, vec, vec2, out_vec2, br_dir
            and for boltztrap2:
                (equivalences, lattvec, coeffs)
        iband (int): the band index for which the list of energy, velocity
            and mass is returned. If the "boltztrap2" method is used, this is the
            actual band index, while if the "boltztrap1" method is used, this is the
            ith band among the bands that were included in the fit (i.e. when
            get_energy_args is called)
        sgn (float): options are +1 for valence bands and -1 for conduction bands;
            sgn is effectively ignored if scissor == 0.0
        method (str): the interpolation method. Current options are
            "boltztrap1", "boltztrap2"
        scissor (float): the amount by which the band gap is modified/scissored
        matrix (3x3 np.ndarray): the direct lattice matrix used to convert
            the velocity (in fractional coordinates) to cartesian in
            boltztrap1 method.
        n_jobs (int): number of processes used in boltztrap1 interpolation
        return_mass (bool): whether to return the effective mass values or not

    Returns (tuple of energies, velocities, masses lists/np.ndarray):
        energies ([float]): energy values at kpts for a corresponding iband
        velocities ([3x1 array]): velocity vectors
        masses ([3x3 matrix]): list of effective mass tensors
    """
    # TODO: the effective mass is still inconsistent between btp1 and btp2 when no
    # transformation is used; it is not used in Amset, but this has to be checked
    # with the right transformation.
    if matrix is None:
        matrix = np.eye(3)
    if not sgn:
        if scissor == 0.0:
            sgn = 0.0
        else:
            raise ValueError('To apply scissor "sgn" is required: -1 or +1')
    masses = []
    if method == "boltztrap1":
        engre, nwave, nsym, nstv, vec, vec2, out_vec2, br_dir = interp_params
        energies = []
        velocities = []
        if n_jobs == 1:
            results = []
            for kpt in kpts:
                result = get_energy(kpt,
                                    engre[iband],
                                    nwave,
                                    nsym,
                                    nstv,
                                    vec,
                                    vec2,
                                    out_vec2,
                                    br_dir,
                                    return_dde=return_mass)
                results.append(result)
        else:
            inputs = [(kpt, engre[iband], nwave, nsym, nstv, vec, vec2,
                       out_vec2, br_dir) for kpt in kpts]
            with Pool(n_jobs if n_jobs != -1 else cpu_count()) as p:
                results = p.starmap(get_energy, inputs)
        for result in results:
            energy = result[0] * Ry_to_eV - sgn * scissor / 2.0
            velocity = abs(
                np.dot(matrix / np.linalg.norm(matrix), result[1]
                       )) / hbar / 0.52917721067 * A_to_m * m_to_cm * Ry_to_eV
            if return_mass:
                effective_m = 1 / (result[2] / 0.52917721067**2 *
                                   Ry_to_eV) * e / A_to_m**2 * hbar**2 / m_e
                masses.append(effective_m)
            energies.append(energy)
            velocities.append(velocity)
    elif method == "boltztrap2":
        if n_jobs != 1:
            warnings.warn(
                'n_jobs={}: Parallel not implemented w/ boltztrap2'.format(
                    n_jobs))
        equivalences, lattvec, coeffs = interp_params
        fitted = fite.getBands(np.array(kpts),
                               equivalences,
                               lattvec,
                               coeffs,
                               curvature=return_mass)
        energies = fitted[0][iband - 1] * Hartree_to_eV - sgn * scissor / 2.
        velocities = abs(
            np.matmul(matrix / np.linalg.norm(matrix),
                      fitted[1][:, iband - 1, :]).T
        ) * Hartree_to_eV / hbar * A_to_m * m_to_cm / 0.52917721067
        if return_mass:
            masses = 1 / (fitted[2][:, :, iband - 1, :].T / 0.52917721067**2 *
                          Hartree_to_eV) * e / A_to_m**2 * hbar**2 / m_e
    else:
        raise AmsetError("Unsupported interpolation method: {}".format(method))
    if return_mass:
        return energies, velocities, masses
    else:
        return energies, velocities
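
When n_jobs != 1, the boltztrap1 branch above fans the k-points out to Pool.starmap, one tuple of positional arguments per k-point. A stripped-down sketch of the same pattern, with a placeholder energy_at_kpt instead of the real get_energy and its interpolation parameters:

import numpy as np
from multiprocessing import Pool, cpu_count


def energy_at_kpt(kpt, coeffs):
    # Placeholder for get_energy: any per-k-point computation works here.
    return float(np.dot(coeffs, np.cos(2 * np.pi * np.asarray(kpt))))


if __name__ == '__main__':
    kpts = [(0.0, 0.0, 0.0), (0.25, 0.0, 0.0), (0.5, 0.5, 0.5)]
    coeffs = np.array([1.0, 0.5, 0.25])
    n_jobs = 2
    inputs = [(kpt, coeffs) for kpt in kpts]
    with Pool(n_jobs if n_jobs != -1 else cpu_count()) as p:
        energies = p.starmap(energy_at_kpt, inputs)
    print(energies)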
Beispiel #51
0
def rankspider():
    rank_urls = get_rank_urls()
    pool = Pool(5)
    pool.map(parse_rank, rank_urls)
Beispiel #52
0
def tree_search(tree, n, owner_map, disp=False):
    """ Perform MCTS search from a given position for a given #iterations """
    # Initialize root node
    if tree.children is None:
        tree.expand()

    # We could simply run tree_descend(), mcplayout(), tree_update()
    # sequentially in a loop.  This is essentially what the code below
    # does, if it seems confusing!

    # However, we also have an easy (though not optimal) way to parallelize
    # by distributing the mcplayout() calls to other processes using the
    # multiprocessing Python module.  mcplayout() consumes maybe more than
    # 90% CPU, especially on larger boards.  (Except that with large patterns,
    # expand() in the tree descent phase may be quite expensive - we can tune
    # that tradeoff by adjusting the EXPAND_VISITS constant.)

    n_workers = multiprocessing.cpu_count() if not disp else 1  # set to 1 when debugging
    global worker_pool
    if worker_pool is None:
        worker_pool = Pool(processes=n_workers)
    outgoing = []  # positions waiting for a playout
    incoming = []  # positions that finished evaluation
    ongoing = []  # currently ongoing playout jobs
    i = 0
    while i < n:
        if not outgoing and not (disp and ongoing):
            # Descend the tree so that we have something ready when a worker
            # stops being busy
            amaf_map = W*W*[0]
            nodes = tree_descend(tree, amaf_map, disp=disp)
            outgoing.append((nodes, amaf_map))

        if len(ongoing) >= n_workers:
            # Too many playouts running? Wait a bit...
            ongoing[0][0].wait(0.01 / n_workers)
        else:
            i += 1
            if i > 0 and i % REPORT_PERIOD == 0:
                print_tree_summary(tree, i, f=sys.stderr)

            # Issue an mcplayout job to the worker pool
            nodes, amaf_map = outgoing.pop()
            ongoing.append((worker_pool.apply_async(mcplayout, (nodes[-1].pos, amaf_map, disp)), nodes))

        # Anything to store in the tree?  (We do this step out-of-order
        # picking up data from the previous round so that we don't stall
        # ready workers while we update the tree.)
        while incoming:
            score, amaf_map, owner_map_one, nodes = incoming.pop()
            tree_update(nodes, amaf_map, score, disp=disp)
            for c in range(W*W):
                owner_map[c] += owner_map_one[c]

        # Are any playouts finished yet?
        for job, nodes in ongoing:
            if not job.ready():
                continue
            # Yes! Queue them up for storing in the tree.
            score, amaf_map, owner_map_one = job.get()
            incoming.append((score, amaf_map, owner_map_one, nodes))
            ongoing.remove((job, nodes))

        # Early stop test
        best_wr = tree.best_move().winrate()
        if i > n*0.05 and best_wr > FASTPLAY5_THRES or i > n*0.2 and best_wr > FASTPLAY20_THRES:
            break

    for c in range(W*W):
        owner_map[c] = float(owner_map[c]) / i
    dump_subtree(tree)
    print_tree_summary(tree, i, f=sys.stderr)
    return tree.best_move()
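
The scheduling loop above reduces to a submit-and-poll pattern on apply_async results: keep at most n_workers playouts in flight, wait briefly on the oldest job, and harvest whichever jobs report ready(). A minimal standalone sketch of that pattern, with a dummy playout in place of mcplayout:

import random
import time
from multiprocessing import Pool, cpu_count


def playout(seed):
    # Stand-in for mcplayout(): CPU-bound work that returns a score.
    random.seed(seed)
    time.sleep(0.05)
    return random.random()


if __name__ == '__main__':
    n_workers = cpu_count()
    n_total = 20
    scores, ongoing, submitted = [], [], 0
    with Pool(processes=n_workers) as pool:
        while submitted < n_total or ongoing:
            # Keep the pool saturated without blocking on any single job.
            while submitted < n_total and len(ongoing) < n_workers:
                ongoing.append(pool.apply_async(playout, (submitted,)))
                submitted += 1
            ongoing[0].wait(0.01)
            for job in [j for j in ongoing if j.ready()]:
                scores.append(job.get())
                ongoing.remove(job)
    print(sum(scores) / len(scores))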
Beispiel #53
0
    start_page = 0
    end_page = 1
    opts, args = getopt.getopt(sys.argv[1:], "hs:e:")

    for cmd, arg in opts:
        if cmd in ("-s"):
            start_page = int(arg)
        if cmd in ("-e"):
            end_page = int(arg) + 1

    print("start_page", start_page)
    print("end_page", end_page)

    write_page_file(start_page, end_page)

    pic_list = []
    for i in range(start_page, end_page):
        pic_list += get_pic_url_list(i)

    print('Fetching finished, starting to download images...')

    start_time = time.time()
    pool = Pool(10)
    pool.map_async(download_pic, pic_list)
    pool.close()
    pool.join()

    print(error_page)
    print(f'Download done\n Elapsed: {time.time() - start_time} seconds')
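
Because the AsyncResult returned by map_async is discarded above, an exception in download_pic would pass silently once close() and join() return. A hedged variation of the same pattern (download_pic and the URL list here are stand-ins) that re-raises worker errors by calling get() on the result:

from multiprocessing import Pool


def download_pic(url):
    # Stand-in for the downloader used above.
    if not url.startswith('http'):
        raise ValueError('bad url: ' + url)
    return url


if __name__ == '__main__':
    pic_list = ['http://example.com/a.jpg', 'http://example.com/b.jpg']
    pool = Pool(10)
    result = pool.map_async(download_pic, pic_list)
    pool.close()
    pool.join()
    # get() re-raises any exception that occurred in a worker instead of
    # silently discarding it.
    print(result.get())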
Beispiel #54
0
def batch(frames,
          diameter,
          output=None,
          meta=None,
          processes=1,
          after_locate=None,
          **kwargs):
    """Locate Gaussian-like blobs of some approximate size in a set of images.

    Preprocess the image by performing a band pass and a threshold.
    Locate all peaks of brightness, characterize the neighborhoods of the peaks
    and take only those with given total brightness ("mass"). Finally,
    refine the positions of each peak.

    Parameters
    ----------
    frames : list (or iterable) of images
        The frames to process.
    diameter : odd integer or tuple of odd integers
        This may be a single number or a tuple giving the feature's
        extent in each dimension, useful when the dimensions do not have
        equal resolution (e.g. confocal microscopy). The tuple order is the
        same as the image shape, conventionally (z, y, x) or (y, x). The
        number(s) must be odd integers. When in doubt, round up.
    output : {None, trackpy.PandasHDFStore, SomeCustomClass}
        If None, return all results as one big DataFrame. Otherwise, pass
        results from each frame, one at a time, to the put() method
        of whatever class is specified here.
    meta : filepath or file object
        If specified, information relevant to reproducing this batch is saved
        as a YAML file, a plain-text machine- and human-readable format.
        By default, this is None, and no file is saved.
    processes : integer or "auto"
        The number of processes to use in parallel. If <= 1, multiprocessing is
        disabled. If "auto", the number returned by `os.cpu_count()`` is used.
    after_locate : function
        Specify a custom function to apply to the detected features in each
        processed frame. It must accept the following arguments:

        - ``frame_no``: an integer specifying the number of the current frame.
        - ``features``: a DataFrame containing the detected features.

        Furthermore it must return a DataFrame like ``features``.
    **kwargs :
        Keyword arguments that are passed to the wrapped `trackpy.locate`.
        Refer to its docstring for further details.

    Returns
    -------
    DataFrame([x, y, mass, size, ecc, signal])
        where mass means total integrated brightness of the blob,
        size means the radius of gyration of its Gaussian-like profile,
        and ecc is its eccentricity (0 is circular).

    See Also
    --------
    locate : performs location on a single image

    Notes
    -----
    This is a convenience function that wraps `trackpy.locate` (see its
    docstring for further details) and allows batch processing of multiple
    frames, optionally in parallel by using multiprocessing.
    """
    if "raw_image" in kwargs:
        raise KeyError("the argument `raw_image` musn't be in `kwargs`, it is "
                       "provided internally by `frames`")
    # Add required keyword argument
    kwargs["diameter"] = diameter

    if meta:
        # Gather meta information and save as YAML in current directory.
        try:
            source = frames.filename
        except AttributeError:
            source = None
        meta_info = dict(
            timestamp=pd.datetime.utcnow().strftime('%Y-%m-%d-%H%M%S'),
            trackpy_version=trackpy.__version__,
            source=source,
            **kwargs)
        if isinstance(meta, six.string_types):
            with open(meta, 'w') as file_obj:
                record_meta(meta_info, file_obj)
        else:
            # Interpret meta to be a file handle.
            record_meta(meta_info, meta)

    # Prepare wrapped function for mapping to `frames`
    curried_locate = partial(locate, **kwargs)

    # Handle & validate argument `processes`
    if processes == "auto":
        processes = None  # Is replaced with `os.cpu_count` in Pool
    elif not isinstance(processes, six.integer_types):
        raise TypeError("`processes` must either be an integer or 'auto', "
                        "was type {}".format(type(processes)))

    if processes is None or processes > 1:
        # Use multiprocessing
        pool = Pool(processes=processes)
        map_func = pool.imap
    else:
        pool = None
        map_func = map

    if after_locate is None:

        def after_locate(frame_no, features):
            return features

    try:
        all_features = []
        for i, features in enumerate(map_func(curried_locate, frames)):
            image = frames[i]
            if hasattr(image, 'frame_no') and image.frame_no is not None:
                frame_no = image.frame_no
                # If this works, locate created a 'frame' column.
            else:
                frame_no = i
                features['frame'] = i  # just counting iterations
            features = after_locate(frame_no, features)

            logger.info("Frame %d: %d features", frame_no, len(features))
            if len(features) > 0:
                # Store if features were found
                if output is None:
                    all_features.append(features)
                else:
                    output.put(features)
    finally:
        if pool:
            # Ensure correct termination of Pool
            pool.terminate()

    if output is None:
        if len(all_features) > 0:
            return pandas_concat(all_features).reset_index(drop=True)
        else:  # return empty DataFrame
            warnings.warn("No maxima found in any frame.")
            return pd.DataFrame(columns=list(features.columns) + ['frame'])
    else:
        return output
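
A hedged usage sketch of the batch function documented above, assuming trackpy and pims are installed; the glob path and the diameter/minmass values are illustrative only, and after_locate is the optional hook described in the docstring, which must return a DataFrame shaped like features.

import pims
import trackpy as tp


def keep_bright(frame_no, features):
    # Example after_locate hook: drop dim detections, keep the DataFrame shape.
    return features[features['mass'] > 100]


if __name__ == '__main__':
    # Illustrative path; any pims-readable image sequence works.
    frames = pims.open('example_frames/*.png')
    features = tp.batch(frames, diameter=11, minmass=50,
                        processes='auto', after_locate=keep_bright)
    print(features.head())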
Beispiel #55
0
def rankspider():
    pool = Pool(6)
    pool.map(get_rank_urls, RANK_URL)
Beispiel #56
0
    def __init__(self, function, n_processes=None):
        self.function = _MapFunctionClosure(function)
        self.pool = Pool(processes=n_processes)
Beispiel #57
0
            return False

    # Download the image data
    def download_img(self, img_urls):
        for url in img_urls:
            img_detail = self.get_page_index(url)
            self.save_img(img_detail.content, url)

    # Save the image to the 'photo' folder under the current directory
    def save_img(self, img_detail, url):
        filePath = "{0}\{1}\{2}.{3}".format(os.getcwd(), 'photo',
                                            md5(img_detail).hexdigest(), 'jpg')
        if filePath:
            with open(filePath, 'wb') as f:
                f.write(img_detail)
                f.close()
            print("正在下载:", url)


def run(num):
    spider = Spider(num, KEYWORD)
    data = spider.parse_page_index()
    urls = spider.get_urls(data)
    for url in urls:
        spider.get_img_parser(url)


if __name__ == '__main__':
    pool = Pool()
    pool.map(run, [i * 20 for i in range(START_NUM, END_NUM + 1)])
Beispiel #58
0
def tree_search(tree, n, owner_map, disp=False):

    if tree.children is None:
        tree.expand()

    n_workers = multiprocessing.cpu_count(
    ) if not disp else 1  # set to 1 when debugging
    global worker_pool
    if worker_pool is None:
        worker_pool = Pool(processes=n_workers)
    outgoing = []  # positions waiting for a playout
    incoming = []  # positions that finished evaluation
    ongoing = []  # currently ongoing playout jobs
    i = 0
    while i < n:
        if not outgoing and not (disp and ongoing):
            # Descend the tree so that we have something ready when a worker
            # stops being busy
            amaf_map = W * W * [0]
            nodes = tree_descend(tree, amaf_map, disp=disp)
            outgoing.append((nodes, amaf_map))

        if len(ongoing) >= n_workers:
            # Too many playouts running? Wait a bit...
            ongoing[0][0].wait(0.01 / n_workers)
        else:
            i += 1
            if i > 0 and i % REPORT_PERIOD == 0:
                print_tree_summary(tree, i, f=sys.stderr)

            # Issue an mcplayout job to the worker pool
            nodes, amaf_map = outgoing.pop()
            ongoing.append(
                (worker_pool.apply_async(mcplayout,
                                         (nodes[-1].pos, amaf_map, disp)),
                 nodes))

        # Anything to store in the tree?  (We do this step out-of-order
        # picking up data from the previous round so that we don't stall
        # ready workers while we update the tree.)
        while incoming:
            score, amaf_map, owner_map_one, nodes = incoming.pop()
            tree_update(nodes, amaf_map, score, disp=disp)
            for c in range(W * W):
                owner_map[c] += owner_map_one[c]

        # Are any playouts finished yet?
        for job, nodes in ongoing:
            if not job.ready():
                continue
            # Yes! Queue them up for storing in the tree.
            score, amaf_map, owner_map_one = job.get()
            incoming.append((score, amaf_map, owner_map_one, nodes))
            ongoing.remove((job, nodes))

        # Early stop test
        best_wr = tree.best_move().winrate()
        if i > n * 0.05 and best_wr > FASTPLAY5_THRES or i > n * 0.2 and best_wr > FASTPLAY20_THRES:
            break

    for c in range(W * W):
        owner_map[c] = float(owner_map[c]) / i
    dump_subtree(tree)
    print_tree_summary(tree, i, f=sys.stderr)
    return tree.best_move()
Beispiel #59
0
        dftr = pd.DataFrame({'id': ids, 'train': 'train'})
        tdftr = pd.DataFrame({'id': ids, 'train': 'test'})
        train, test = DataProcess.train_test_between_subject(
            gdata, pd.concat((dftr, tdftr)),
            [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
        DLogger.logger().debug("total points: " + str(get_total_pionts(train)))

        worker = GQL.get_instance(2, 1, {})
        train = DataProcess.merge_data(train)
        OptML.optimise(worker,
                       output_path,
                       train,
                       test,
                       global_iters=1000,
                       learning_rate=learning_rate)


if __name__ == '__main__':

    if len(sys.argv) == 2:
        n_proc = int(sys.argv[1])
    elif len(sys.argv) == 1:
        n_proc = 1
    else:
        raise Exception('invalid argument')

    p = Pool(n_proc)
    p.map(run_BD, range(len(configs)))
    p.close()  # no more tasks
    p.join()  # wrap up current tasks
Beispiel #60
0
    def __init__(self, function, n_process=None):
        self.function = function
        self.pool = Pool(processes=n_process)