Example #1
def determine_brats_postprocessing(folder_with_preds, folder_with_gt, postprocessed_output_dir, processes=8,
        thresholds=(0, 10, 50, 100, 200, 500, 750, 1000, 1500, 2500, 10000), replace_with=2):
    # find pairs
    nifti_gt = subfiles(folder_with_gt, suffix=".nii.gz", sort=True)

    p = Pool(processes)

    nifti_pred = subfiles(folder_with_preds, suffix='.nii.gz', sort=True)

    results = p.starmap_async(load_niftis_threshold_compute_dice, zip(nifti_gt, nifti_pred, [thresholds] * len(nifti_pred)))
    results = results.get()

    all_dc_per_threshold = {}
    for t in thresholds:
        all_dc_per_threshold[t] = np.array([i[1][t] for i in results])
        print(t, np.mean(all_dc_per_threshold[t]))

    means = [np.mean(all_dc_per_threshold[t]) for t in thresholds]
    best_threshold = thresholds[np.argmax(means)]
    print('best', best_threshold, means[np.argmax(means)])

    maybe_mkdir_p(postprocessed_output_dir)

    p.starmap(apply_brats_threshold, zip(nifti_pred, [postprocessed_output_dir]*len(nifti_pred), [best_threshold]*len(nifti_pred), [replace_with] * len(nifti_pred)))

    p.close()
    p.join()

    save_pickle((thresholds, means, best_threshold, all_dc_per_threshold), join(postprocessed_output_dir, "threshold.pkl"))
Example #2
def harvest(out_dir,
            existing_dir=0,
            start_id=0,
            stop_id=100000,
            verbose=False):
    g_verbose = verbose
    if verbose:
        print("Begin harvesting")
    # existing_dir: 0 -> abort if out_dir already exists, -1 -> wipe and recreate,
    # any other value -> reuse the existing directory
    if os.path.exists(out_dir):
        if existing_dir == 0:
            exit("Directory '" + out_dir + "' exists")
        elif not os.path.isdir(out_dir):
            exit("Error: '" + out_dir + "' is not a directory")
        elif existing_dir == -1:
            rmtree(out_dir)
            os.mkdir(out_dir)
    else:
        os.mkdir(out_dir)

    #Fetch records
    arg_list = [(i, out_dir, verbose) for i in range(start_id, stop_id)]
    mp = Pool(32)
    mp.starmap(fetch_write, arg_list, chunksize=1)
    mp.close()
    mp.join()
    if verbose:
        print("Harvesting complete")
Example #3
    def run(self, target_spacings, input_folder_with_cropped_npz, output_folder, data_identifier,
            num_threads=default_num_threads, force_separate_z=None):
        print("Initializing to run preprocessing")
        print("npz folder:", input_folder_with_cropped_npz)
        print("output_folder:", output_folder)
        list_of_cropped_npz_files = subfiles(input_folder_with_cropped_npz, True, None, ".npz", True)
        assert len(list_of_cropped_npz_files) != 0, "set list of files first"
        maybe_mkdir_p(output_folder)
        all_args = []
        num_stages = len(target_spacings)

        # we need to know which classes are present in this dataset so that we can precompute where these classes are
        # located. This is needed for oversampling foreground
        all_classes = load_pickle(join(input_folder_with_cropped_npz, 'dataset_properties.pkl'))['all_classes']

        for i in range(num_stages):
            output_folder_stage = os.path.join(output_folder, data_identifier + "_stage%d" % i)
            maybe_mkdir_p(output_folder_stage)
            spacing = target_spacings[i]
            for j, case in enumerate(list_of_cropped_npz_files):
                case_identifier = get_case_identifier_from_npz(case)
                args = spacing, case_identifier, output_folder_stage, input_folder_with_cropped_npz, force_separate_z, all_classes
                all_args.append(args)
        p = Pool(num_threads)
        p.starmap(self._run_internal, all_args)
        p.close()
        p.join()
Example #4
def ExtractAllEnvelopes(LPF=False, CUTOFF=100):
    # # In case you need to print numpy outputs:
    # numpy.set_printoptions(threshold=numpy.inf, suppress=True)
    TotalTime = time.time()

    # Get all the GFB.npy files under resources/f2cnn
    gfbFiles = glob.glob(join("resources", "f2cnn", "*", "*.GFB.npy"))

    # Check for missing files before touching gfbFiles[0] below
    if not gfbFiles:
        print(
            "ERROR: NO .GFB.npy FILES FOUND, PLEASE GENERATE FILTERED OUTPUTS")
        exit(-1)

    print(
        "\n###############################\nExtracting Envelopes from files in '{}'."
        .format(split(gfbFiles[0])[0]))
    if LPF:
        print("Using Low Pass Filtering with a cutoff at {}Hz".format(CUTOFF))
    else:
        print("Not using Low Pass Filtering")

    print(len(gfbFiles), ".GFB.npy files found")

    # Usage of multiprocessing, to reduce computing time
    proc = cpu_count()
    counter = Value('i', 0)
    multiproc_pool = Pool(processes=proc,
                          initializer=InitProcesses,
                          initargs=(counter, ))
    arguments = zip(gfbFiles, repeat(len(gfbFiles)), repeat(LPF),
                    repeat(CUTOFF))  # Pack all the arguments
    multiproc_pool.starmap(ExtractAndSaveEnvelope, arguments)

    print("Extracted Envelopes from all files.")
    print('              Total time:', time.time() - TotalTime)
    print('')
Example #5
def calculate_unperturbated_empiricals(
    default_vs30,
    extended_period,
    fsf,
    im_config,
    n_processes,
    sim_root,
    empirical_im_logger: Logger = get_basic_logger(),
):
    events = load_fault_selection_file(fsf)
    empirical_im_logger.debug(
        f"Loaded {len(events)} events from the fault selection file"
    )
    events = [
        name if count == 1 else get_realisation_name(name, 1)
        for name, count in events.items()
    ]
    tasks = create_event_tasks(
        events, sim_root, im_config, default_vs30, extended_period, empirical_im_logger
    )

    pool = Pool(min(n_processes, len(tasks)))
    empirical_im_logger.debug(f"Running empirical im calculations")
    pool.starmap(calculate_empirical, tasks)
    empirical_im_logger.debug(f"Empirical ims calculated")
Example #6
def process_experiment(_experiment, _overwrite=False):
    _arguments = [
        (_experiment, int(_series.split('_')[1]), _overwrite)
        for _series in paths.image_files(paths.serieses(_experiment))
    ]
    _p = Pool(CPUS_TO_USE)
    _p.starmap(process_series, _arguments)
    _p.close()
Example #7
def get_scores(dataset, res, gts, weights, n_threads=4):
    """
        :param dataset: ['flickrstyle', 'flickrstyle']
        :param res: ['candidate1', 'candidate2']
        :param gts: {0: ['sent1', 'sent2'], 1: ['sent3', 'sent4']}
        :param weights: {'cider': 0.5, 'bleu': 0.5}
        :return:
        """

    score = 0.

    if n_threads <= 0:      # single thread
        _dataset = dict(enumerate(dataset))
        if weights['cider'] > 0:
            score_cider = _compute_cider(_dataset, gts, res)
            score = score_cider + score
        if weights['bleu'] > 0:
            score_bleu4 = _compute_bleu(_dataset, gts, res)
            score = score_bleu4 + score
    else:                   # parallel
        def _get_chunk_index(n_samples, n_chunks):
            chunk_size = n_samples // n_chunks
            r = n_samples % n_chunks
            sizes = [chunk_size + 1 if i < r else chunk_size for i in range(n_chunks)]

            chunks = []
            i = 0
            for size in sizes:
                chunks.append(range(i, i + size))
                i += size
            return chunks

        global pool
        if pool is None:    # initialize thread pool, and initialize each thread
            pool = Pool(processes=n_threads, initializer=init_style_scorer, initargs=[_init_list])
        chunk_index = _get_chunk_index(n_samples=len(res), n_chunks=n_threads)
        chunked_args = []
        for i in range(n_threads):
            _dataset = {}
            _gts = {}
            _res = OrderedDict()
            for _i in chunk_index[i]:
                _dataset[_i] = dataset[_i]
                _gts[_i] = gts[_i]
                _res[_i] = res[_i]

            chunked_args.append([_dataset, _gts, _res])

        if weights['cider'] > 0:
            score_cider = pool.starmap(func=_compute_cider, iterable=chunked_args)
            score_cider = np.concatenate(score_cider)
            score = score_cider * weights['cider'] + score
        if weights['bleu'] > 0:
            score_bleu4 = pool.starmap(func=_compute_bleu, iterable=chunked_args)
            score_bleu4 = np.concatenate(score_bleu4)
            score = score_bleu4 * weights['bleu'] + score

    return score
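The threaded branch above relies on module-level helpers (_compute_cider, _compute_bleu, init_style_scorer, _init_list) that are not shown in this example. Below is a minimal, self-contained sketch of the same chunk-then-starmap-then-concatenate pattern; dummy_scorer and chunked_scores are hypothetical stand-ins, not part of the original code.

# Sketch only: a dummy scorer replaces the real CIDEr/BLEU helpers.
from multiprocessing import Pool

import numpy as np


def dummy_scorer(dataset, gts, res):
    # one score per candidate in this chunk
    return np.array([len(r) for r in res.values()])


def chunked_scores(dataset, gts, res, n_threads=2):
    keys = list(res.keys())
    chunks = [c for c in np.array_split(keys, n_threads) if len(c) > 0]
    chunked_args = [
        ({k: dataset[k] for k in c}, {k: gts[k] for k in c}, {k: res[k] for k in c})
        for c in chunks
    ]
    with Pool(n_threads) as pool:
        parts = pool.starmap(dummy_scorer, chunked_args)
    return np.concatenate(parts)


if __name__ == '__main__':
    dataset = {0: 'flickrstyle', 1: 'flickrstyle', 2: 'flickrstyle'}
    gts = {0: ['sent1'], 1: ['sent2'], 2: ['sent3']}
    res = {0: 'candidate1', 1: 'candidate2', 2: 'candidate3'}
    print(chunked_scores(dataset, gts, res))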
Example #8
def process_experiments(_experiments, _overwrite=False):
    _arguments = []
    for _tuple in load.experiments_serieses_as_tuples(_experiments):
        _experiment, _series_id = _tuple
        _arguments.append((_experiment, _series_id, _overwrite))

    _p = Pool(CPUS_TO_USE)
    _p.starmap(process_series, _arguments)
    _p.close()
Example #9
def apply_threshold_to_folder(folder_in, folder_out, threshold, replace_with, processes=24):
    maybe_mkdir_p(folder_out)
    niftis = subfiles(folder_in, suffix='.nii.gz', join=True)

    p = Pool(processes)
    p.starmap(apply_brats_threshold, zip(niftis, [folder_out]*len(niftis), [threshold]*len(niftis), [replace_with] * len(niftis)))

    p.close()
    p.join()
Example #10
def process_experiments(_experiments, _pairs=True):
    _tuples = []
    for _experiment in _experiments:
        _tuples += load.experiment_groups_as_tuples(_experiment)
    _p = Pool(CPUS_TO_USE)
    if _pairs:
        _answers = _p.starmap(process_group_pairs, _tuples)
    else:
        _answers = _p.starmap(process_group_single_cells, _tuples)
    _p.close()
Example #11
def compress_everything(output_base, num_processes=8):
    p = Pool(num_processes)
    tasks = subfolders(output_base, join=False)
    tasknames = [i.split('/')[-1] for i in tasks]
    args = []
    for t, tn in zip(tasks, tasknames):
        args.append((join(output_base, tn + ".zip"), join(output_base, t)))
    p.starmap(compress_folder, args)
    p.close()
    p.join()
Example #12
    def update(self, export='csv', path='out'):
        stock_codes = []
        for file in os.listdir(os.path.join(path, 'raw_data')):
            if not file.endswith('csv'):
                continue
            stock_code = file[:-4]
            stock_codes.append(stock_code)
        pool = Pool(10)
        params = [(code, path) for code in stock_codes]
        if export.lower() in ['csv']:
            pool.starmap(self.update_single_code, params)
Example #14
def crawl_companies_files(options,
                          workers_num=10,
                          include_companies=None,
                          from_date=None):
    """

    :param driver: the panthomjs or chromium driver with the current page
                   loaded. We use the driver to navigate through the
                   listing if needed
    :param workers_num:
    :param include_companies:
    :param from_date:
    :return:
    """

    companies_files = []
    pool = Pool(processes=workers_num)

    try:
        # Obtain the ccvm codes of all the listed companies
        ccvm_codes = [
            r.ccvm for r in BovespaCompany.objects.only(["ccvm"]).all()
        ]

        ccvm_codes = sorted(ccvm_codes)

        _logger.debug("Processing the files of {0} companies from {1}".format(
            len(ccvm_codes), "{0:%Y-%m-%d}".format(from_date)
            if from_date else "THE BEGINNING"))

        func_params = []
        for ccvm_code in ccvm_codes:
            if include_companies and ccvm_code not in include_companies:
                continue

            for doc_type in DOC_TYPES:
                func_params.append([ccvm_code, options, doc_type, from_date])

        # call_results = pool.starmap(obtain_company_files, func_params)
        pool.starmap(obtain_company_files, func_params)

        # Merge all the responses into one only list
        # companies_files += list(
        #    itertools.chain.from_iterable(call_results))

    except TimeoutError:
        print("Timeout error")
        traceback.print_exc()
        raise
    finally:
        pool.close()
        pool.join()
        pool.terminate()
Example #15
    def run(self,
            target_spacings,
            input_folder_with_cropped_npz,
            output_folder,
            data_identifier,
            num_threads=default_num_threads,
            force_separate_z=None):
        """

        :param target_spacings: list of lists [[1.25, 1.25, 5]]
        :param input_folder_with_cropped_npz: dim: c, x, y, z | npz_file['data'] np.savez_compressed(fname.npz, data=arr)
        :param output_folder:
        :param num_threads:
        :param force_separate_z: None
        :return:
        """
        print("Initializing to run preprocessing")
        print("npz folder:", input_folder_with_cropped_npz)
        print("output_folder:", output_folder)
        list_of_cropped_npz_files = subfiles(input_folder_with_cropped_npz,
                                             False, None, ".npz", True)
        #print("list_of_cropped_npz_files:",list_of_cropped_npz_files)
        if not os.path.isdir(output_folder):
            os.makedirs(output_folder)
        num_stages = len(target_spacings)
        if not isinstance(num_threads, (list, tuple, np.ndarray)):
            num_threads = [num_threads] * num_stages

        assert len(num_threads) == num_stages

        # we need to know which classes are present in this dataset so that we can precompute where these classes are
        # located. This is needed for oversampling foreground
        all_classes = load_pickle(input_folder_with_cropped_npz + "/" +
                                  'dataset_properties.pkl')['all_classes']

        for i in range(num_stages):
            all_args = []
            output_folder_stage = output_folder + "/" + data_identifier + "_stage%d" % i
            #print("preprocessing.run:output_folder_stage:",output_folder_stage)
            if not os.path.isdir(output_folder_stage):
                os.makedirs(output_folder_stage)
            #if not os.path.isdir(output_folder_stage):
            #os.makedirs(output_folder_stage)
            spacing = target_spacings[i]
            for j, case in enumerate(list_of_cropped_npz_files):
                case_identifier = get_case_identifier_from_npz(case)
                args = spacing, case_identifier, output_folder_stage, input_folder_with_cropped_npz, force_separate_z, all_classes
                all_args.append(args)
            p = Pool(num_threads[i])
            p.starmap(self._run_internal, all_args)
            p.close()
            p.join()
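The docstring above notes that each cropped case is stored with np.savez_compressed(fname.npz, data=arr), i.e. a single compressed array under the key 'data' with axes (c, x, y, z). A tiny sketch of that layout, with a made-up shape and filename:

# Sketch only: write and read one compressed case in the layout described above.
import numpy as np

arr = np.zeros((2, 64, 64, 32), dtype=np.float32)   # c, x, y, z
np.savez_compressed("case_0000.npz", data=arr)

loaded = np.load("case_0000.npz")["data"]
assert loaded.shape == (2, 64, 64, 32)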
Example #16
def convert_labels_back_to_BraTS_2018_2019_convention(input_folder: str, output_folder: str, num_processes: int = 12):
    """
    reads all prediction files (nifti) in the input folder, converts the labels back to BraTS convention and saves the
    result in output_folder
    :param input_folder:
    :param output_folder:
    :return:
    """
    maybe_mkdir_p(output_folder)
    nii = subfiles(input_folder, suffix='.nii.gz', join=False)
    p = Pool(num_processes)
    p.starmap(load_convert_save, zip(nii, [input_folder] * len(nii), [output_folder] * len(nii)))
    p.close()
    p.join()
Example #17
def main(ext):
    # arg_list is a list of tuples, each tuple is one call to mol_to_sdf
    #arg_list = [ (i["path"], i["filename"][:-4]) for i in find_files("/".join(os.path.abspath("").split("/")[:-1]) + "/datasets/dss_tox/DSSToxAll_20151019/ToxAll/", "mol$") ]
    arg_list = [(i["path"], i["filename"][:-(len(ext) + 1)])
                for i in find_files(
                    "/".join(os.path.abspath("").split("/")[:-1]) +
                    "/datasets/activity_cliffs/", ext)]
    mp = Pool(number_of_processes)
    #mp.starmap(mol_to_sdf, arg_list, chunksize=1)
    #mp.starmap(del_mol, arg_list, chunksize=1)
    #mp.starmap(del_xyz, arg_list, chunksize=1)
    mp.starmap(mol2_to_sdf, arg_list, chunksize=1)
    #mp.starmap(del_sdf, arg_list, chunksize=1)
    mp.close()
    mp.join()
Example #18
def crawl_listed_companies(options, workers_num=10):

    companies = []

    pool = Pool(processes=workers_num)

    try:
        func_params = []
        for letter in COMPANIES_LISTING_SEARCHER_LETTERS:
            func_params.append([letter, options])

        call_results = pool.starmap(
                update_listed_companies, func_params)

        # Merge all the responses into one only list
        companies += list(
            itertools.chain.from_iterable(call_results))

        return companies
    except TimeoutError:
        print("Timeout error")
        traceback.print_exc()
        raise
    finally:
        pool.close()
        pool.join()
        pool.terminate()
Example #19
def main(keyword, page):
    pool = Pool()
    # num = [x*10 for x in range(0, page)]
    num = [[keyword, p] for p in map(lambda x: x * 10, range(page))]
    numm = [[keyword, p] for p in range(page)]
    tmp_L=pool.starmap(baidu,num)
    for x in tmp_L:
        for a in x:
            _write(str(a))
    tmp_K=pool.starmap(_360,numm)
    for k in tmp_K:
        for t in k:
            print(t)
            _write(str(t))
    pool.close()
    pool.join()
Example #20
def parallel_cv_loop(func, cv, parallel=True):
    """
    Performs a parallel training loop over the cv train_idx and test_idxs.

    Example:
        - func will usually be a class instance that holds the df and label info; its __call__ method runs a single
          training loop given (train_idx, test_idx).
        - This will run func.__call__(train_idx, test_idx) for each idx pair in cv and return the results
          (a minimal usage sketch follows this example).

    Args:
        func (object): Callable (usually a class instance) that holds the data and labels and runs a single
                       training loop when called as func(train_idx, test_idx).
        cv (list): List of [(train_idx, test_idx), ...] pairs.
        parallel (bool): Set to False to run a plain for loop instead of a Pool (allows for debugging).

    Return:
        (list): A list of whatever func outputs for each cv idxs.
    """
    if parallel:
        pool = Pool(len(cv))
        results = pool.starmap(
            func, cv
        )
        pool.close()
    else:
        results = []
        for args in cv:
            results.append(func(*args))

    return results
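A minimal usage sketch for parallel_cv_loop, assuming the function above is in scope. MeanBaselineLoop and the random data are made up for illustration; a real __call__ would fit an actual model on train_idx and score it on test_idx.

# Sketch only: a picklable, module-level callable so parallel=True also works.
import numpy as np


class MeanBaselineLoop:
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __call__(self, train_idx, test_idx):
        # "train" = mean of the training targets, score = MSE on the test split
        prediction = self.y[train_idx].mean()
        return float(np.mean((self.y[test_idx] - prediction) ** 2))


if __name__ == '__main__':
    X = np.random.rand(100, 4)
    y = np.random.rand(100)
    cv = [(np.arange(0, 80), np.arange(80, 100)),
          (np.arange(20, 100), np.arange(0, 20))]
    # parallel=False runs a plain loop, which is handy for debugging
    scores = parallel_cv_loop(MeanBaselineLoop(X, y), cv, parallel=False)
    print(scores)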
Example #21
def main():
	getTournyIDs = []
	tournyIDs = []
	data = requests.get("https://majestic.battlefy.com/hearthstone-masters/tournaments?start={}T08:00:42.465Z&end={}T08:00:42.465Z".format(START_DATE, END_DATE)).json()
	for tourny in data:
		getTournyIDs.append(tourny['_id'])
	num_tourneys = len(getTournyIDs)
	for tourny in getTournyIDs:
		data = requests.get("https://majestic.battlefy.com/tournaments/{}/".format(tourny)).json()
		tournyIDs.append(data['stageIDs'][0])
	filepath = "csv.csv"
	print("Parsing {} tournaments using up to {} processes.".format(num_tourneys, NUM_PROCESSSES))
	p = Pool(processes = NUM_PROCESSSES)
	returnData = p.starmap(parseTournament, zip(tournyIDs, getTournyIDs, range(num_tourneys)))
	p.close()
	p.join()


	print("Parsing Complete\nStart CSV file write:")
	with open(filepath, "a", newline='\n', encoding='utf-8') as csvfile:
		if os.stat(filepath).st_size == 0:
			csvfile.write(",K,D,D,D") #TournyID, Name, Deck, Deck, Deck
		for processString in returnData:
			split = processString.split(",")
			for line in split:
				csvfile.write("{}".format(line))
				csvfile.write(",")
			

	print("Finish CSV File Write")
Example #22
def add_demographics_threaded(df, basedate):
    num_splits = 4
    dfs = np.array_split(df, num_splits)
    pool = Pool(processes=num_splits)
    names = ['_thread' + str(i) + '_' for i in range(num_splits)]
    basedate = [basedate] * num_splits
    sk_dfs = pool.starmap(add_demographics, zip(dfs, basedate, names))
    return pd.concat(sk_dfs)
Example #23
def run_asynch_test(function, dataset, iterations, poolsize=10):
    p = Pool(poolsize)
    iteration_list = list(range(0, iterations))
    dataset_list = list(repeat(dataset, iterations))
    results = p.starmap(function, zip(dataset_list, iteration_list))
    trained = [result[0] for result in results]
    random = [result[1] for result in results]
    return trained, random
Example #24
def evaluate_regions(folder_predicted: str, folder_gt: str, regions: dict, processes=default_num_threads):
    region_names = list(regions.keys())
    files_in_pred = subfiles(folder_predicted, suffix='.nii.gz', join=False)
    files_in_gt = subfiles(folder_gt, suffix='.nii.gz', join=False)
    have_no_gt = [i for i in files_in_pred if i not in files_in_gt]
    assert len(have_no_gt) == 0, "Some files in folder_predicted have no ground truth in folder_gt"
    have_no_pred = [i for i in files_in_gt if i not in files_in_pred]
    if len(have_no_pred) > 0:
        print("WARNING! Some files in folder_gt were not predicted (not present in folder_predicted)!")

    files_in_gt.sort()
    files_in_pred.sort()

    # run for all cases
    full_filenames_gt = [join(folder_gt, i) for i in files_in_pred]
    full_filenames_pred = [join(folder_predicted, i) for i in files_in_pred]

    p = Pool(processes)
    res = p.starmap(evaluate_case, zip(full_filenames_pred, full_filenames_gt, [list(regions.values())] * len(files_in_gt)))
    p.close()
    p.join()

    all_results = {r: [] for r in region_names}
    with open(join(folder_predicted, 'summary.csv'), 'w') as f:
        f.write("casename")
        for r in region_names:
            f.write(",%s" % r)
        f.write("\n")
        for i in range(len(files_in_pred)):
            f.write(files_in_pred[i][:-7])
            result_here = res[i]
            for k, r in enumerate(region_names):
                dc = result_here[k]
                f.write(",%02.4f" % dc)
                all_results[r].append(dc)
            f.write("\n")

        f.write('mean')
        for r in region_names:
            f.write(",%02.4f" % np.nanmean(all_results[r]))
        f.write("\n")
        f.write('median')
        for r in region_names:
            f.write(",%02.4f" % np.nanmedian(all_results[r]))
        f.write("\n")

        f.write('mean (nan is 1)')
        for r in region_names:
            tmp = np.array(all_results[r])
            tmp[np.isnan(tmp)] = 1
            f.write(",%02.4f" % np.mean(tmp))
        f.write("\n")
        f.write('median (nan is 1)')
        for r in region_names:
            tmp = np.array(all_results[r])
            tmp[np.isnan(tmp)] = 1
            f.write(",%02.4f" % np.median(tmp))
        f.write("\n")
Example #25
def FilterAllOrganisedFiles():
    TotalTime = time.time()

    # Get all the WAV files under resources
    # wavFiles = glob.glob(join("resources", "f2cnn", "*", "*.WAV"))
    wavFiles = glob.glob(os.path.join("resources", "f2cnn", "**", "*.WAV"))

    # Check for missing files before touching wavFiles[0] below
    if not wavFiles:
        print("NO WAV FILES FOUND, PLEASE ORGANIZE FILES")
        exit(-1)

    print(
        "\n###############################\nApplying FilterBank to files in '{}'."
        .format(os.path.split(wavFiles[0])[0]))

    print(len(wavFiles), "files found")

    # #### READING CONFIG FILE
    config = ConfigParser()
    config.read('configF2CNN.conf')
    framerate = config.getint('FILTERBANK', 'FRAMERATE')
    nchannels = config.getint('FILTERBANK', 'NCHANNELS')
    lowcutoff = config.getint('FILTERBANK', 'LOW_FREQ')
    # ##### PREPARATION OF FILTERBANK
    # CENTER FREQUENCIES ON ERB SCALE
    CENTER_FREQUENCIES = filters.centre_freqs(framerate, nchannels, lowcutoff)
    # Filter coefficient for a Gammatone filterbank
    FILTERBANK_COEFFICIENTS = filters.make_erb_filters(framerate,
                                                       CENTER_FREQUENCIES)

    # Usage of multiprocessing, to reduce computing time
    proc = cpu_count()
    counter = Value('i', 0)
    multiproc_pool = Pool(processes=proc,
                          initializer=InitProcesses,
                          initargs=(
                              FILTERBANK_COEFFICIENTS,
                              counter,
                          ))
    multiproc_pool.starmap(GammatoneFiltering,
                           zip(wavFiles, repeat(len(wavFiles))))

    print("Filtered and Saved all files.")
    print('                Total time:', time.time() - TotalTime)
    print('')
Example #26
def main(argv):
    # parse command line
    parser = argparse.ArgumentParser(description="scrape national wikipedia data on coronavirus",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-z', '--tgz', action="store_true", default=False,
                        help="compress result directory to a single tgz file")
    parser.add_argument('-t', '--timeout', action="store", default = 10, type=float,
                        help="http fetch timeout")
    parser.add_argument('-w', '--wikiprefix', action="store", 
                        default = "https://en.m.wikipedia.org",
                        help="URL prefix of localized wikipedia pages")
    parser.add_argument('-l', '--listurl', action="store",
                        default = "https://en.m.wikipedia.org/wiki/Template:2019%E2%80%9320_coronavirus_pandemic_data#covid19-container",
                        help="URL with list of countries/territories to fetch")
    parser.add_argument('-v', '--verbose', action="store_true", default=False,
                        help="print more info")
    parser.add_argument('-j', '--threads', action="store", default=128, type=int,
                        help="parallel download threads")
    args = parser.parse_args(argv)

    # create output directory
    outdir = slugify("output-" + datetime.now(tz=None).strftime("%d-%b-%Y (%H:%M:%S.%f)"),
                     replacements=[['%','_percent_'],[':','-']])
    os.mkdir(outdir)


    # fetch list of country pages in parallel
    # we get the country list from the table in args.listurl
    country_list = requests.get(args.listurl).text
    country_soup = BeautifulSoup(country_list, 'lxml')
    loc_tbl = country_soup.find('table')
    rows = loc_tbl.find_all('tr')

    # end the list of rows when we see class="sortbottom" in a <tr> tag
    idx = next(i for i,tr in enumerate(rows) if tr.has_attr('class') and 'sortbottom' in tr['class'])
    rows = rows[0:idx-1]

    # in order to parallelize, need to convert soup back to strings
    rows = [str(tr) for tr in rows]

    pool = Pool(args.threads)
    # each starmap task unpacks one argument tuple, so we pair every row with the same constant args via repeat()
    poolargs = [outdir, args.wikiprefix, args.timeout, args.verbose]
    results = pool.starmap(fetch_countries, zip(rows, repeat(poolargs)))
    # remove non-errors
    errors = list(filter(None, results))

    if args.tgz:
        outfile = outdir + '.tgz'
        os.system('tar czf ' + outfile + ' ' + outdir)
        os.system('rm -rf ' + outdir)
        if args.verbose:
            cprint('created output file ' + outfile)
    if len(errors) > 0:
        cprint("errors encountered fetching these: " + str(errors), 'red')
    else:
        print("all pages fetched successfully")
Example #27
def aggregate_simulation_empirical_im_permutations(
    fsf, n_processes, sim_root, version, logger: Logger = get_basic_logger()):
    events = load_fault_selection_file(fsf)
    logger.debug(f"Loaded {len(events)} events from the fault selection file")
    events = [
        name if count == 1 else get_realisation_name(name, 1)
        for name, count in events.items()
    ]
    worker_pool = Pool(n_processes)
    worker_pool.starmap(
        agg_emp_perms,
        [(
            pathlib.Path(get_empirical_dir(sim_root, event)),
            event,
            version,
            get_realisation_logger(logger, event).name,
        ) for event in events],
    )
Example #28
def match_object_ids(ra, dec, limit_angle='2 arcsec', name_order=None):
    """Get the id from Simbad for every object in a RA, Dec list."""
    # Perform it in parallel to handle the online query overhead
    func = partial(simbad_query_id,
                   name_order=name_order,
                   limit_angle=limit_angle)
    p = Pool(MAX_PARALLEL_QUERY)
    results = p.starmap(func, list(zip(ra, dec)))
    return results
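functools.partial is used above to pin the keyword arguments while starmap supplies each positional (ra, dec) pair. A runnable sketch of the same pattern, with dummy_query_id standing in for simbad_query_id:

# Sketch only: partial objects are picklable, so they can be sent to Pool workers.
from functools import partial
from multiprocessing import Pool


def dummy_query_id(ra, dec, limit_angle=None, name_order=None):
    return "obj({:.2f}, {:.2f}) within {}".format(ra, dec, limit_angle)


if __name__ == '__main__':
    ra = [10.0, 20.0, 30.0]
    dec = [-5.0, 0.0, 5.0]
    func = partial(dummy_query_id, limit_angle='2 arcsec', name_order=['MAIN_ID'])
    p = Pool(2)
    results = p.starmap(func, list(zip(ra, dec)))
    p.close()
    p.join()
    print(results)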
Example #29
    def evaluator(ref: ndarray,
                  deg: ndarray,
                  sr: int,
                  pool: Pool = None) -> float:
        length = len(ref)
        assert length == len(deg)

        return (sum(
            pool.starmap(func=func,
                         iterable=((r, d, sr)
                                   for (r, d) in zip(ref, deg)))) / length)
Example #30
def main():
    """Used to test the Archipelago class.

    Tests 3 random seeds between 0 and 65535 with weathering values of 1, 3, 5 and sea_level values between -20 and 32
    in steps of 4.
    """
    t = TicToc()
    t.tic()
    args = []
    for seed in random.sample(range(0, int("0xFFFF", 16)), 3):
        for weathering in [1, 3, 5]:
            for sea_level in range(-20, 32, 4):
                sea_level = sea_level / 100  # range() can't be used to generate a list of floats
                args.append([seed, weathering, sea_level])
    pool = Pool(multiprocessing.cpu_count())
    print("Total archipelagos being generated:", len(args))
    pool.starmap(test, args)
    pool.close()
    pool.join()
    t.toc()
    print("Total time elapsed (seconds): {0:.2f}".format(t.elapsed))
Example #31
def train_all_models_lgb_combined(combined_model_name, models_with_folds):
    X_all_combined = []
    y_all_combined = []

    requests = []
    results = []
    for model_with_folds in models_with_folds:
        for model_name, fold in model_with_folds:
            requests.append((model_name, fold))
            # results.append(load_one_model(requests[-1]))

    pool = Pool(40)
    with utils.timeit_context('load all data'):
        results = pool.starmap(load_train_data, requests)

    for model_with_folds in models_with_folds:
        X_combined = []
        y_combined = []
        for model_name, fold in model_with_folds:
            X, y, video_ids = results[requests.index((model_name, fold))]
            print(model_name, fold, X.shape)
            X_combined.append(X)
            y_combined.append(y)

        X_all_combined.append(np.row_stack(X_combined))
        y_all_combined.append(np.row_stack(y_combined))

    X = np.column_stack(X_all_combined)
    y = y_all_combined[0]

    print(X.shape, y.shape)

    y_cat = np.argmax(y, axis=1)
    print(X.shape, y.shape)
    print(np.unique(y_cat))

    with utils.timeit_context('fit'):
        param = {
            'num_leaves': 50,
            'objective': 'multiclass',
            'max_depth': 5,
            'learning_rate': .05,
            'max_bin': 300,
            'num_class': NB_CAT,
            'metric': ['multi_logloss']
        }
        model = lgb.train(param,
                          lgb.Dataset(X, label=y_cat),
                          num_boost_round=260)

    pickle.dump(
        model, open(f"../output/lgb_combined_{combined_model_name}.pkl", "wb"))
Example #32
    def _process(cls, spectrum, filter_spectrum, *args):
        """This private class method process spectrum with a given filter
        and parameters.

        .. note:: If no filter spectrum is provided, then it passes None as
                  filter spectrum point to all processes.

        This method uses multiprocessing.
        """
        if filter_spectrum:
            resampled_filter_spectrum = filter_spectrum.resample(spectrum)
            resampled_filter_lines = resampled_filter_spectrum.lines
        else:
            resampled_filter_lines = repeat(None, len(spectrum.lines))
        data = zip(spectrum.lines, resampled_filter_lines, repeat(args))
        p = Pool()
        y_values = p.starmap(cls._func, data)
        p.close()
        p.join()
        lines = zip(spectrum.x_values, y_values)
        return type(spectrum)(lines, interpolation=spectrum.interpolation)
Example #33
    article = Article(URL_PREFIX + "/wiki/Special:Export/Template:Periodic_table")
    categories = []
    params = []
    for row in article.get_table("table 1"):
        for key, value in row.items():
            segments = [segment.strip() for segment in value.split(";")]
            if len(segments) >= 7:
                if segments[5].lower() not in categories:
                    categories.append(segments[5].lower())
                params.append(
                    (
                        segments[1],
                        segments[7].replace(" ", "_") if len(segments) > 7 else segments[1].capitalize(),
                        ionization_energies,
                        element_names,
                        categories.index(segments[5].lower()),
                    )
                )

    pool = Pool(processes=multiprocessing.cpu_count() * 2)

    json_data = pool.starmap(parse, params)
    pool.close()
    pool.join()

    # Save

    with open(OUTPUT_JSON, "w+") as outfile:
        json.dump(json_data, outfile, sort_keys=True, indent=4, ensure_ascii=False)