def crawl(dataset_path, scenes, subsequence_length, num_workers=1):
    pool = Pool(num_workers)
    manager = Manager()

    count = len(scenes)
    progress = manager.Value('i', 0)

    samples = []

    if subsequence_length == 2:
        for scene_samples in pool.imap_unordered(
                partial(crawl_subprocess_short,
                        dataset_path=dataset_path,
                        count=count,
                        progress=progress), scenes):
            samples.extend(scene_samples)

    else:
        for scene_samples in pool.imap_unordered(
                partial(crawl_subprocess_long,
                        dataset_path=dataset_path,
                        count=count,
                        progress=progress,
                        subsequence_length=subsequence_length), scenes):
            samples.extend(scene_samples)

    random.shuffle(samples)

    return samples
Example #2
def t2():
    from gevent.pool import Pool  # noqa: E402

    p = Pool(10)
    run1 = [v for v in p.imap_unordered(echo, range(10))]
    run2 = [v for v in p.imap_unordered(echo, range(10))]
    run3 = [v for v in p.imap_unordered(echo, range(10))]
    run4 = [v for v in p.imap_unordered(echo, range(10))]

    print(run1 == run2 == run3 == run4)
    print(run1)
    print(run2)
Example #3
def t1():
    from multiprocessing.pool import Pool  # noqa: E402

    p = Pool(10)
    run1 = [v for v in p.imap_unordered(echo, range(10))]
    run2 = [v for v in p.imap_unordered(echo, range(10))]
    run3 = [v for v in p.imap_unordered(echo, range(10))]
    run4 = [v for v in p.imap_unordered(echo, range(10))]

    print(run1 == run2 == run3 == run4)
    print(run1)
    print(run2)
Example #4
def test_concurrent_processes(get_dict):
    pool = Pool(16)

    with get_dict() as storage:
        filename = storage.filename

    for _ in pool.imap_unordered(partial(insert_range, filename=filename),
                                 split_seq(range(10000), 1000)):
        pass

    for _ in pool.imap_unordered(partial(remove_range, filename=filename),
                                 split_seq(range(10000), 1000)):
        pass

    assert sorted(iter(get_dict())) == list()
Example #5
def SummaryMode(corpus, context_token_limit):
    for dataset in datasets:
        print 'Generating summaries for the %s set:' % dataset

        urls_filename = '%s/wayback_%s_urls.txt' % (corpus, dataset)
        urls = ReadUrls(urls_filename)

        p = Pool()
        story_lists = p.imap_unordered(
            GenerateMapper,
            izip(urls, repeat(corpus), repeat(context_token_limit)))

        progress_bar = ProgressBar(len(urls))
        for story in story_lists:
            if story is None:
                continue
            url_hash = Hashhex(story.url)
            with open('%s/summary/%s/%s.sent' % (corpus, dataset, url_hash),
                      'w') as f:
                f.write(story.content)
            with open('%s/summary/%s/%s.summ' % (corpus, dataset, url_hash),
                      'w') as f:
                f.write(''.join(
                    [highlight + ".\n" for highlight in story.highlights]))

            progress_bar.Increment()
Example #6
def cmd_null_minidump(pn):
    global DUMP
    print('Processing minidump:', pn)
    DUMP = Minidump(pn)

    # prepare args, i.e., mstr
    args = list()
    for mid in range(DUMP.mapping_cnt):
        m = DUMP.mappings[mid]
        args.append((mid, [], MapType.NONHEAP, m.is_writable))

    # multiprocessing for each mapping
    workers = Pool(NCPU)
    for (mid, s) in workers.imap_unordered(_null_mapping, args):
        m = DUMP.mappings[mid]
        if not s:
            continue
        for (va, sz) in list(DUMP.payloads):
            if va in m:
                s = s[:va - m.start] + DUMP.payloads[
                    (va, sz)] + s[va - m.start + sz:]
                del DUMP.payloads[(va, sz)]
        DUMP.write(m.stack_addr, s)
    # patch sparsely
    DUMP.save(pn, sparse=True)
    print('Finished:', pn)
Example #7
def index_owl(owl_file_paths, output_properties, dist):
    maximum_lines_per_file = 50000
    prefix, temp_files, temp_dir = separate_large_owl(owl_file_paths, maximum_lines_per_file)
    base_dir = os.path.join(os.getcwd(), dist)

    if os.path.exists(base_dir):
        rmtree(base_dir)

    for output_property in output_properties.values():
        os.mkdir(os.path.join(temp_dir, output_property))

    os.mkdir(base_dir)
    print(i18n_t('cmd.build_index.info_collecting_info'))
    try:
        p = Pool()
        with tqdm(total=len(temp_files)) as pbar:
            for _ in p.imap_unordered(output_process, ((prefix, temp_file, output_properties, temp_dir) for temp_file in temp_files)):
                pbar.update(1)
        for op in output_properties.values():
            join_process((base_dir, temp_dir, op))
        with open(os.path.join(base_dir, 'prefix.ttl'), 'w') as fp:
            fp.write(prefix)
    finally:
        rmtree(temp_dir)

    return base_dir
Example #8
def parse(document, pages, parse_refs=True,
        progress_monitor=NullProgressMonitor(),
        pool_size=DEFAULT_POOL_SIZE):
    progress_monitor.start('Parsing Pages', pool_size + 1)

    # Prepare input
    pages = [(page.local_url, page.url) for page in
            pages.values() if page.local_url is not None]
    pages_chunks = chunk_it(pages, pool_size)
    inputs = []
    for pages_chunk in pages_chunks:
        inputs.append((document.parser, document.pk, parse_refs, pages_chunk))

    # Close connection to allow the new processes to create their own.
    connection.close()

    # Split work
    progress_monitor.info('Sending {0} chunks to worker pool'
            .format(len(inputs)))
    pool = Pool(pool_size)
    for result in pool.imap_unordered(sub_process_parse, inputs, 1):
        progress_monitor.work('Parsed 1/{0} of the pages'.\
                format(pool_size), 1)

    # Word Count
    word_count = 0
    for page in document.pages.all():
        word_count += page.word_count
    document.word_count = word_count
    document.save()
    progress_monitor.work('Counted Total Words', 1)

    pool.close()
    progress_monitor.done()
Example #9
def main():
    input_folder = Path("/home/ardaduz/HDD/Downloads/tum-rgbd-raw")
    output_folder = Path("/media/ardaduz/T5/test/tumrgbd")

    input_directories = [
        input_folder / "rgbd_dataset_freiburg1_desk",
        input_folder / "rgbd_dataset_freiburg1_plant",
        input_folder / "rgbd_dataset_freiburg1_room",
        input_folder / "rgbd_dataset_freiburg1_teddy",
        input_folder / "rgbd_dataset_freiburg1_xyz",
        input_folder / "rgbd_dataset_freiburg2_desk",
        input_folder / "rgbd_dataset_freiburg2_metallic_sphere2",
        input_folder / "rgbd_dataset_freiburg2_xyz",
        input_folder / "rgbd_dataset_freiburg3_cabinet",
        input_folder / "rgbd_dataset_freiburg3_long_office_household",
        input_folder / "rgbd_dataset_freiburg3_nostructure_notexture_far",
        input_folder / "rgbd_dataset_freiburg3_nostructure_texture_far",
        input_folder / "rgbd_dataset_freiburg3_structure_notexture_far",
        input_folder / "rgbd_dataset_freiburg3_structure_texture_far",
        input_folder / "rgbd_dataset_freiburg3_teddy"]

    pool = Pool(6)
    for finished_scene in pool.imap_unordered(partial(process_scene, output_folder=output_folder), input_directories):
        print("finished", finished_scene)

    pool.close()
    pool.join()
Example #10
def create_vocab(inputs_path,
                 top_k=1000000000,
                 at_least=1,
                 pad="_PAD_",
                 unk="_UNK_",
                 processes=32):

    word_counts = {}
    pool = Pool(processes)

    for wc in pool.imap_unordered(_process_file, inputs_path.glob("*.json")):
        for k, v in wc.items():
            word_counts[k] = word_counts.get(k, 0) + v

    tokens_counts = sorted(word_counts.items(),
                           key=lambda x: x[1],
                           reverse=True)

    logging.info(" # Unique Words: {}".format(len(word_counts)))

    sorted_tokens_counts = [tc for tc in tokens_counts
                            if tc[1] >= at_least][:top_k]

    index2tokens = []
    if pad is not None:
        index2tokens.append(pad)
    if unk is not None:
        index2tokens.append(unk)
    index2tokens.extend([t for t, c in sorted_tokens_counts])
    tokens2index = {t: i for i, t in enumerate(index2tokens)}

    logging.info(" After filtering, # Unique Words: {}".format(
        len(tokens2index)))

    return Vocab(index2tokens, tokens2index, pad=pad, unk=unk)
Example #11
def query_all_tweets(query, content_collection, search, year=2017, month=1):
    """
    Queries *all* tweets in the history of twitter for the given query. This
    will run in parallel for each ~10 days.

    :param query: A twitter advanced search query.
    :return: A list of tweets.
    """

    limits = []
    while date(year=year, month=month, day=1) < date.today():
        nextmonth = month + 1 if month < 12 else 1
        nextyear = year + 1 if nextmonth == 1 else year

        for i in range(1, 26, 2):
            limits.append((date(year=year, month=month, day=i), date(year=year, month=month, day=i + 2)))
        limits.append((date(year=year, month=month, day=28), date(year=nextyear, month=nextmonth, day=1)))
        year, month = nextyear, nextmonth

    queries = ['{} since:{} until:{}'.format(query, since, until) for since, until in reversed(limits)]

    all_tweets = 0
    pool = Pool(20)
    try:
        for new_tweets in pool.imap_unordered(query_tweets_once, queries):
            all_tweets += len(new_tweets)
            insert_tweets(content_collection, query, search, new_tweets)
            if len(new_tweets) > 0:
                print("Got {} tweets ({} new) for {}.".format(all_tweets, len(new_tweets), new_tweets[0].timestamp))
            else:
                print("Got {} tweets ({} new).".format(all_tweets, len(new_tweets)))
    except KeyboardInterrupt:
        print("Program interrupted by user. Returning all tweets gathered so far.")
Example #12
def query_tweets(query, limit=None, begindate=dt.date(2006, 3, 21), enddate=dt.date.today(), poolsize=20, lang=''):
    no_days = (enddate - begindate).days
    if poolsize > no_days:
        # Since we are assigning each pool a range of dates to query,
        # the number of pools should not exceed the number of dates.
        poolsize = no_days
    dateranges = [begindate + dt.timedelta(days=elem) for elem in linspace(0, no_days, poolsize+1)]

    if limit:
        limit_per_pool = (limit // poolsize)+1
    else:
        limit_per_pool = None

    queries = ['{} since:{} until:{}'.format(query, since, until)
               for since, until in zip(dateranges[:-1], dateranges[1:])]

    all_tweets = []
    try:
        pool = Pool(poolsize)
        logger.info('queries: {}'.format(queries))
        try:
            for new_tweets in pool.imap_unordered(partial(query_tweets_once, limit=limit_per_pool, lang=lang), queries):
                all_tweets.extend(new_tweets)
                logger.info('Got {} tweets ({} new).'.format(
                    len(all_tweets), len(new_tweets)))
        except KeyboardInterrupt:
            logger.info('Program interrupted by user. Returning all tweets '
                         'gathered so far.')
    finally:
        pool.close()
        pool.join()

    return all_tweets
Example #13
def query_all_tweets(query, start_date, end_date):
    """
    Queries *all* tweets in the history of twitter for the given query. This
    will run in parallel for each ~30 days.

    :param query: A twitter advanced search query.
    :param start_date: Crawl start date, e.g. 20170101
    :param end_date: Crawl end date, e.g. 20171010
    :return: A list of tweets.
    """

    queries = get_all_query(query, start_date, end_date)

    pool = Pool(10)

    all_tweets = []

    try:
        for new_tweets in pool.imap_unordered(partial(query_tweets_once),
                                              queries):
            for new_tweet in new_tweets:
                all_tweets.append(new_tweet)

    except KeyboardInterrupt:
        logging.info("Program interrupted by user. Returning all tweets "
                     "gathered so far.")

    return sorted(all_tweets, reverse=True)
Example #14
def _run_with_multiprocessing(process, total_tiles, zoom_levels, multi, quiet,
                              debug):
    LOGGER.debug("run with multiprocessing")
    num_processed = 0
    LOGGER.info("run process using %s workers", multi)
    f = partial(_process_worker, process)
    with tqdm.tqdm(total=total_tiles, unit="tiles",
                   disable=(quiet or debug)) as pbar:
        for zoom in zoom_levels:
            process_tiles = process.get_process_tiles(zoom)
            pool = Pool(multi)
            try:
                for output in pool.imap_unordered(f,
                                                  process_tiles,
                                                  chunksize=1):
                    if output:
                        _write_worker(process, output)
                    pbar.update()
                    num_processed += 1
            except KeyboardInterrupt:
                LOGGER.info("Caught KeyboardInterrupt, terminating workers")
                pool.terminate()
                break
            except Exception:
                pool.terminate()
                raise
            finally:
                pool.close()
                pool.join()
                process_tiles = None
    LOGGER.info("%s tile(s) iterated", (str(num_processed)))
Example #15
def save_auroc(mode="bayes"):
	X_train, y_train, X_test, y_test = get_digits(0.5)

	cores = 4
	pool = Pool(processes=cores)
	estimates = {}
	for k in [20, 30]:
		estimates[k] = {}
		for digit_estimated in [0, 1, 2, 3]:
			condition = (y_train == digit_estimated)
			selected_digits = X_train[np.where(condition)]

			durations = []
			local_estimates = []
			if mode == "bayes":
				est_fun = threaded_estimates
			else:
				est_fun = threaded_estimates_EM

			for estimate, duration in pool.imap_unordered(est_fun, get_data(selected_digits, n=10, K=k)):
				local_estimates.append(estimate)

			estimates[k][digit_estimated] = average_of_estimates(local_estimates, 10)
			print(f"Done k={k}, digit #{digit_estimated}")
	with open(f"{mode}_auroc_allk.bin", "wb") as f:
		pickle.dump(estimates, f)
Example #16
def query_tweets(query, limit=None, begindate=dt.date(2006,3,21), enddate=dt.date.today(), poolsize=20, lang=''):
    no_days = (enddate - begindate).days
    if poolsize > no_days:
        # Since we are assigning each pool a range of dates to query,
        # the number of pools should not exceed the number of dates.
        poolsize = no_days
    dateranges = [begindate + dt.timedelta(days=elem) for elem in linspace(0, no_days, poolsize+1)]

    if limit:
        limit_per_pool = (limit // poolsize)+1
    else:
        limit_per_pool = None

    queries = ['{} since:{} until:{}'.format(query, since, until)
               for since, until in zip(dateranges[:-1], dateranges[1:])]

    all_tweets = []
    try:
        pool = Pool(poolsize)

        try:
            for new_tweets in pool.imap_unordered(partial(query_tweets_once, limit=limit_per_pool, lang=lang), queries):
                all_tweets.extend(new_tweets)
                logging.info("Got {} tweets ({} new).".format(
                    len(all_tweets), len(new_tweets)))
        except KeyboardInterrupt:
            logging.info("Program interrupted by user. Returning all tweets "
                         "gathered so far.")
    finally:
        pool.close()
        pool.join()

    return all_tweets
Example #17
def _run_with_multiprocessing(process, zoom_levels, multi, max_chunksize):
    logger.debug("run with multiprocessing")
    num_processed = 0
    total_tiles = process.count_tiles(min(zoom_levels), max(zoom_levels))
    logger.debug("run process on %s tiles using %s workers", total_tiles,
                 multi)
    f = partial(_process_worker, process)
    for zoom in zoom_levels:
        pool = Pool(multi, _worker_sigint_handler)
        try:
            for tile, message in pool.imap_unordered(
                    f,
                    process.get_process_tiles(zoom),
                    # set chunksize to between 1 and max_chunksize
                    chunksize=max_chunksize):
                num_processed += 1
                logger.debug("tile %s/%s finished", num_processed, total_tiles)
                yield dict(process_tile=tile, **message)
        except KeyboardInterrupt:
            logger.error("Caught KeyboardInterrupt, terminating workers")
            pool.terminate()
            raise
        except Exception:
            pool.terminate()
            raise
        finally:
            pool.close()
            pool.join()
    logger.debug("%s tile(s) iterated", (str(num_processed)))
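The _worker_sigint_handler initializer is not shown in this excerpt; a common pattern, and a plausible sketch of what it does, is to make worker processes ignore SIGINT so that a Ctrl-C is handled once by the parent (which then terminates the pool) rather than interrupting every child mid-task:

import signal

def _worker_sigint_handler():
    # Hypothetical sketch: workers ignore SIGINT so only the parent process
    # reacts to KeyboardInterrupt and can terminate the pool cleanly.
    signal.signal(signal.SIGINT, signal.SIG_IGN)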
Example #18
def sampled_choice_sets_agreement(choice_sets, num_threads, model, epsilon):
    """
    Optimize agreement for 500 randomly sampled choice sets.
    :param choice_sets: the choice sets to sample from
    :param num_threads: number of threads to use
    :param model: a fitted DiscreteChoiceModel with two agents
    :param epsilon: approximation parameter
    """

    filtered_choice_sets = [
        x for x in choice_sets if 1 < np.count_nonzero(x) <= 5
    ]

    choice_set_indices = np.random.choice(range(len(filtered_choice_sets)),
                                          500,
                                          replace=False)
    sampled_choice_sets = [
        tuple(np.nonzero(filtered_choice_sets[i])[0])
        for i in choice_set_indices
    ]

    pool = Pool(num_threads)
    helper_partial = partial(agreement_helper, model=model, epsilon=epsilon)
    results = []

    for result in tqdm(pool.imap_unordered(helper_partial,
                                           sampled_choice_sets),
                       total=len(sampled_choice_sets)):
        results.append(result)

    pool.close()
    pool.join()

    return results
Example #19
def create_masks(data_root_path='/data/SN7_buildings/train/',
                 result_path='/wdata/train_masks/'):

    if os.path.exists(result_path):
        shutil.rmtree(result_path)
    os.mkdir(result_path)
    ids = os.listdir(data_root_path)
    all_params = []
    for _id in tqdm(ids[:]):
        id_path = os.path.join(data_root_path, _id)
        if not os.path.isdir(id_path):
            continue
        sub_res_path = os.path.join(result_path, _id)
        os.mkdir(sub_res_path)
        labels_path = os.path.join(id_path, 'labels_match_pix')
        rasters_path = os.path.join(id_path, 'images')

        files = sorted(os.listdir(labels_path))
        files = [el for el in files if 'UDM' not in el]
        files = ['_'.join(el.split('.')[0].split('_')[:-1]) for el in files]
        params = [(el, labels_path, rasters_path, sub_res_path)
                  for el in files]
        all_params += params

    n_cpus = cpu_count()
    pool = Pool(n_cpus)
    for _ in tqdm(pool.imap_unordered(mask_fro_id, all_params),
                  total=len(all_params)):
        pass
Example #20
def work_with_database(cursor: sqlite3.Cursor, args: Namespace, pool: Pool,
                       dims: int) -> None:
    """
    a function-helper

    :param cursor: a cursor for a database to work with
    :param args: additional arguments
    :param pool: a multiprocessing pool
    :param dims: semigroups cardinality
    :returns:
    """
    try:
        create_table_if_not_exists(
            cursor,
            TABLE_NAME,
            ["output STRING", "errors STRING"],
        )
        with tqdm(total=args.number_of_tasks) as progress_bar:
            for output, errors in pool.imap_unordered(
                    partial(
                        table_completion,
                        dims,
                        args.mace_timeout,
                        args.mace_memory_mb,
                    ),
                    range(args.number_of_tasks),
            ):
                insert_values_into_table(cursor, TABLE_NAME, (output, errors))
                progress_bar.update()
    finally:
        pool.close()
        pool.join()
Example #21
def index_owl(owl_file_paths, output_properties, dist):
    prefix, temp_files, temp_dir = separate_large_owl(owl_file_paths)
    base_dir = os.path.join(os.getcwd(), dist)

    if os.path.exists(base_dir):
        rmtree(base_dir)

    for output_property in output_properties.values():
        os.mkdir(os.path.join(temp_dir, output_property))

    os.mkdir(base_dir)
    print('Collecting information from the split files...')
    try:
        p = Pool()
        with tqdm(total=len(temp_files)) as pbar:
            for _ in p.imap_unordered(
                    output_process,
                ((prefix, temp_file, output_properties, temp_dir)
                 for temp_file in temp_files)):
                pbar.update(1)
        for op in output_properties.values():
            join_process((base_dir, temp_dir, op))
        with open(os.path.join(base_dir, 'prefix.ttl'), 'w') as fp:
            fp.write(prefix)
    finally:
        rmtree(temp_dir)

    return base_dir
Example #22
def parallel_create_audio_data(audio_files: List[str],
                               sample_rate: int,
                               outfile: str,
                               mono=True,
                               max_seconds: Optional[int] = None):
    fn_args = []
    batch_size = 100

    for i in range(0, len(audio_files), batch_size):
        batch = audio_files[i:i + batch_size]
        if len(batch) == 0:
            continue

        batch_outfile = str(
            Path(outfile).parent / f"{Path(outfile).stem}.chunk.{i:05}.npz")
        if os.path.exists(batch_outfile):
            continue

        fn_args.append(
            (batch, sample_rate, batch_outfile, mono, max_seconds, False))

    pool = Pool()
    for _ in tqdm(pool.imap_unordered(wrapped_create_audio_data, fn_args),
                  total=len(fn_args)):
        pass
Example #23
def download_using_parallel_processing(concepts, processors):
    pool = Pool(processors)
    counter = 0
    for result in pool.imap_unordered(download_concept, concepts):
        counter += 1
        if counter % 100 == 0:
            print_log("  Count: " + str(counter))
Example #24
    def query_profile(self, profiles, poolsize=20):
        '''
        profiles: List
        Unique profiles to scrape from.

        poolsize: int
        Size of the pool. The bigger it is, the more browser instances are opened.

        logger (logger):
        Made mandatory here because of issues.
        '''

        url = "https://twitter.com/{}"
        no_profiles = len(profiles)

        if (poolsize > no_profiles):
            poolsize = no_profiles

        urls = [url.format(x) for x in profiles]
        all_profile = []

        pool = Pool(poolsize)

        try:
            for profile_data in pool.imap_unordered(partial(self.query_single_profile), urls):
                all_profile.append(profile_data)
                self.logger.info("Got {} profiles (1 new).".format(len(all_profile)))

        finally:
            pool.close()
            pool.join()

        return all_profile
Example #25
    def query_profile(self, profiles, poolsize=20):
        '''
        profiles: List
        Unique profiles to scrape from.

        poolsize: int
        Size of the pool. The bigger it is, the more browser instances are opened.
        '''

        url = "https://twitter.com/{}"
        no_profiles = len(profiles)

        if (poolsize > no_profiles):
            poolsize = no_profiles

        urls = [url.format(x) for x in profiles]
        all_profiles = []

        pool = Pool(poolsize)
        profile_received = 0

        try:
            for profile in pool.imap_unordered(
                    partial(self.query_single_profile), urls):
                profile_received = profile_received + 1
                all_profiles.append(profile)
                print("Got {} profiles (1 new).".format(profile_received))
        finally:
            pool.close()
            pool.join()

        return all_profiles
Example #26
def query_tweets(query, limit=None, begindate=dt.date(2017,1,1), enddate=dt.date.today(), poolsize=20, lang=''):
    no_days = (enddate - begindate).days
    stepsize = roundup(no_days,  poolsize)
    dateranges = [begindate + dt.timedelta(days=elem) for elem in range(0,no_days,stepsize)]
    dateranges.append(enddate)

    if limit:
        limit_per_pool = roundup(limit, poolsize)
    else:
        limit_per_pool = None

    queries = ['{} since:{} until:{}'.format(query, since, until)
               for since, until in zip(dateranges[:-1], dateranges[1:])]

    all_tweets = []
    try:
        pool = Pool(poolsize)

        try:
            for new_tweets in pool.imap_unordered(partial(query_tweets_once, limit=limit_per_pool, lang=lang), queries):
                all_tweets.extend(new_tweets)
                logging.info("Got {} tweets ({} new).".format(
                    len(all_tweets), len(new_tweets)))
        except KeyboardInterrupt:
            logging.info("Program interrupted by user. Returning all tweets "
                         "gathered so far.")
    finally:
        pool.close()
        pool.join()

    return all_tweets
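roundup is not defined in this excerpt; judging by how it is used for both the date step size and the per-pool limit, it presumably performs ceiling division. A minimal sketch under that assumption:

def roundup(numerator, denominator):
    # Hypothetical helper: ceiling integer division, so the generated date
    # ranges and per-pool limits cover the full span instead of truncating.
    return -(-numerator // denominator)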
Example #27
def validation_loss_grid_search(datasets, methods, update=False):
    lrs = [0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]
    wds = [0, 0.0001, 0.0005, 0.001, 0.005, 0.01]

    params = {(dataset, method, lr, wd)
              for dataset in datasets for lr in lrs for method in methods
              for wd in wds}

    results = dict()

    pool = Pool(THREADS)

    for args, losses in tqdm(pool.imap_unordered(
            validation_loss_grid_search_helper, params),
                             total=len(params)):
        results[args] = losses

    pool.close()
    pool.join()

    filename = f'{CONFIG_DIR}/validation_loss_lr_wd_settings.pickle'

    if update:
        with open(filename, 'rb') as f:
            old_results, old_datasets, old_methods, old_lrs, old_wds = pickle.load(
                f)

        old_results.update(results)
        results = old_results
        datasets = list(set(old_datasets).union(datasets))
        lrs = sorted(set(old_lrs).union(lrs))
        wds = sorted(set(old_wds).union(wds))

    with open(filename, 'wb') as f:
        pickle.dump((results, datasets, methods, lrs, wds), f)
Example #28
def learning_rate_grid_search(datasets, methods, update=False):
    lrs = [0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]
    params = {(dataset, method, lr)
              for dataset in datasets for lr in lrs for method in methods}

    results = dict()

    pool = Pool(THREADS)

    for args, loss in tqdm(pool.imap_unordered(
            learning_rate_grid_search_helper, params),
                           total=len(params)):
        results[args] = loss

    pool.close()
    pool.join()

    filename = f'{CONFIG_DIR}/learning_rate_settings.pickle'

    if update:
        with open(filename, 'rb') as f:
            old_results, old_lrs = pickle.load(f)

        old_results.update(results)
        results = old_results
        lrs = sorted(set(old_lrs).union(lrs))

    with open(filename, 'wb') as f:
        pickle.dump((results, lrs), f)
Example #29
def do_multiprocess(images,
                    measure,
                    num_processes,
                    is_rotation_invariant=False):
    rotations = None
    if is_rotation_invariant:
        rotations = dict()
        for path, img in images:
            rotations[path] = (rotate(img, 90), rotate(img,
                                                       180), rotate(img, 270))

    pool = Pool(num_processes)
    doer = Doer(measure, is_rotation_invariant, rotations)

    n_images = len(images)
    n_combinations = factorial(n_images) / (factorial(2) *
                                            factorial(n_images - 2))

    records = []
    with tqdm(total=n_combinations) as pbar:
        for record in tqdm(
                pool.imap_unordered(doer.do,
                                    list(itertools.combinations(images, 2)),
                                    chunksize=50)):
            records.append(record)
            pbar.update()

    return records
Example #30
def main():
    mp = Mpool(10)

    run1 = [a for a in mp.imap_unordered(echo, xrange(10))]
    run2 = [a for a in mp.imap_unordered(echo, xrange(10))]
    run3 = [a for a in mp.imap_unordered(echo, xrange(10))]
    run4 = [a for a in mp.imap_unordered(echo, xrange(10))]

    print(run1 == run2 == run3 == run4)

    gp = Gpool(10)
    run1 = [a for a in gp.imap_unordered(echo, xrange(10))]
    run2 = [a for a in gp.imap_unordered(echo, xrange(10))]
    run3 = [a for a in gp.imap_unordered(echo, xrange(10))]
    run4 = [a for a in gp.imap_unordered(echo, xrange(10))]

    print(run1 == run2 == run3 == run4)
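Mpool and Gpool are not defined in this excerpt; presumably they alias the two pool implementations being compared, along these lines:

# Assumed aliases for the two pool implementations being compared.
from multiprocessing.pool import Pool as Mpool
from gevent.pool import Pool as Gpool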
Example #31
def multiproc_eval(func):
    count = inputs.shape[0]
    global pool
    if pool is None:
        pool = Pool(processes=20)
    return list(
        tqdm(pool.imap_unordered(partial(eval, func), range(count)),
             total=count))
Example #32
def download_list(api_k, hash_list):
    global api_key
    if api_k:
        api_key = api_k
    files = json.load(open(hash_list))
    pool = Pool(os.cpu_count())
    for _ in tqdm.tqdm(pool.imap_unordered(download_file_by_hash, files),
                       total=len(files)):
        pass
Example #33
def main():
	
	# non deterministic process pool
	from multiprocessing.pool import Pool	
	p = Pool(10)	
	run1 = [a for a in p.imap_unordered(echo, xrange(10))]
	run2 = [a for a in p.imap_unordered(echo, xrange(10))]
	run3 = [a for a in p.imap_unordered(echo, xrange(10))]
	run4 = [a for a in p.imap_unordered(echo, xrange(10))]	
	print(run1, run2, run3, run4)	
	print(run1 == run2 == run3 == run4)
	
	
	# deterministic gevent pool
	from gevent.pool import Pool	
	p = Pool(10)	
	run1 = [a for a in p.imap_unordered(echo, xrange(10))]
	run2 = [a for a in p.imap_unordered(echo, xrange(10))]
	run3 = [a for a in p.imap_unordered(echo, xrange(10))]
	run4 = [a for a in p.imap_unordered(echo, xrange(10))]		
	print(run1, run2, run3, run4)	
	print(run1 == run2 == run3 == run4)
Example #34
def extract_all_plaintext(filenames, out_folder=PLAINTEXT_FOLDER):
    print "EXTRACTING PLAINTEXT FROM {0} FILES INTO {1}".format(len(filenames),out_folder)

    #Zip the filename input with the output folder
    tuple_input = zip(filenames, [out_folder]*len(filenames))

    pool = Pool(processes=util.CPU_COUNT)
    #pool = Pool(processes=1)
    num_tasks = len(filenames)
    for i, _ in enumerate(pool.imap_unordered(__extract_plaintext_as_tuple, tuple_input), 1):
        sys.stderr.write('\rdone {0:%}'.format(i / float(num_tasks)))
    pool.close()

    print "\nDONE"
Example #35
def StoreMode(corpus):
    for dataset in datasets:
        print "Storing news stories for the %s set:" % dataset
        urls_filename = "%s/wayback_%s_urls.txt" % (corpus, dataset)
        urls = ReadUrls(urls_filename)

        p = Pool()
        stories = p.imap_unordered(StoreMapper, izip(urls, repeat(corpus)))

        progress_bar = ProgressBar(len(urls))
        for story in stories:
            if story:
                WriteStory(story, corpus)

            progress_bar.Increment()
Example #36
def run(config_uri, app_name=None, username=None, types=(), batch_size=500, processes=None):
    # multiprocessing.get_context is Python 3 only.
    from multiprocessing import get_context
    from multiprocessing.pool import Pool

    # Loading app will have configured from config file. Reconfigure here:
    logging.getLogger('snovault').setLevel(logging.DEBUG)

    testapp = internal_app(config_uri, app_name, username)
    connection = testapp.app.registry[CONNECTION]
    uuids = [str(uuid) for uuid in connection.__iter__(*types)]
    transaction.abort()
    logger.info('Total items: %d' % len(uuids))

    pool = Pool(
        processes=processes,
        initializer=initializer,
        initargs=(config_uri, app_name, username),
        context=get_context('forkserver'),
    )

    all_results = []
    try:
        for result in pool.imap_unordered(worker, batched(uuids, batch_size), chunksize=1):
            results = result['results']
            errors = sum(error for item_type, path, update, error in results)
            updated = sum(update for item_type, path, update, error in results)
            logger.info('Batch: Updated %d of %d (errors %d)' %
                        (updated, len(results), errors))
            all_results.extend(results)
    finally:
        pool.terminate()
        pool.join()

    def result_item_type(result):
        # Ensure we always return a string
        return result[0] or ''

    for item_type, results in itertools.groupby(
            sorted(all_results, key=result_item_type), key=result_item_type):
        results = list(results)
        errors = sum(error for item_type, path, update, error in results)
        updated = sum(update for item_type, path, update, error in results)
        logger.info('Collection %s: Updated %d of %d (errors %d)' %
                    (item_type, updated, len(results), errors))
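The batched helper passed to imap_unordered above is not part of this excerpt; one plausible sketch, yielding lists of at most batch_size uuids per worker call, is:

def batched(iterable, size):
    # Hypothetical sketch: yield successive lists of at most `size` items,
    # so each worker invocation indexes one batch of uuids.
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == size:
            yield batch
            batch = []
    if batch:
        yield batch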
Example #37
def GenerateMode(corpus, context_token_limit):
  for dataset in datasets:
    print 'Generating questions for the %s set:' % dataset

    urls_filename = '%s/wayback_%s_urls.txt' % (corpus, dataset)
    urls = ReadUrls(urls_filename)

    p = Pool()
    question_context_lists = p.imap_unordered(
        GenerateMapper, izip(urls, repeat(corpus), repeat(context_token_limit)))

    progress_bar = ProgressBar(len(urls))
    for question_context_list in question_context_lists:
      if question_context_list:
        for question_context in question_context_list:
          WriteQuestionContext(question_context, corpus, dataset)

      progress_bar.Increment()
Example #38
    label_indeces = load_labels()
    raw_features = load_raw_features()
    print "Loaded {0} features".format(len(raw_features))

    print "Grouping prevectors by base_url"
    sites = {}
    site_labels = {}
    for dp in data_points:
        if dp['base_url'] not in sites:
            sites[dp['base_url']] = {}
            site_labels[dp['base_url']] = dp['label']

        sites[dp['base_url']][dp['offset']] = {"code": dp['code'], "content_ssdeep": dp['content_ssdeep']}

    print "Vectorizing {0} base urls".format(len(sites))
    labels = []
    names = []
    vectors = []
    pool = Pool(processes=cpu_count(), initializer=preload_process, initargs=(sites,))
    for vector, site in pool.imap_unordered(compute_vectors, sites.keys()):
        if site_labels[site] in labels_to_ignore:
            continue
        vectors.append(vector)
        labels.append(site_labels[site])
        names.append(site)
        print "Vector for {0} completed".format(site)

    with open("raw_feature_vectors.json", "w") as f:
        json.dump({"labels": labels, "names": names, "vectors": vectors}, f)
Example #39
def main(force_reanalyze=False, include_hidden=False,
         dry_run=False, gain_type='auto',
         jobs=default_job_count(),
         quiet=False, verbose=False,
         *music_directories
         ):
    """Add replaygain tags to your music files."""
    if quiet:
        logging.basicConfig(level=logging.WARN)
    elif verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    # Some pesky functions used below will catch KeyboardInterrupts
    # inappropriately, so install an alternate handler that bypasses
    # KeyboardInterrupt instead.
    def signal_handler(sig, frame):
        print "Canceled."
        os.kill(os.getpid(), signal.SIGTERM)
    original_handler = signal.signal(signal.SIGINT, signal_handler)

    track_class = RGTrack
    if dry_run:
        logging.warn('This script is running in "dry run" mode, so no files will actually be modified.')
        track_class = RGTrackDryRun
    if len(music_directories) == 0:
        logging.error("You did not specify any music directories or files. Exiting.")
        sys.exit(1)

    logging.info("Searching for music files in the following directories:\n%s", "\n".join(music_directories),)
    tracks = [ track_class(f) for f in get_all_music_files(music_directories, ignore_hidden=(not include_hidden)) ]

    # Filter out tracks for which we can't get the length
    for t in tracks[:]:
        try:
            len(t)
        except Exception:
            logging.error("Track %s appears to be invalid. Skipping.", t.filename)
            tracks.remove(t)

    if len(tracks) == 0:
        logging.error("Failed to find any tracks in the directories you specified. Exiting.")
        sys.exit(1)
    track_sets = RGTrackSet.MakeTrackSets(tracks)

    # Remove the earlier bypass of KeyboardInterrupt
    signal.signal(signal.SIGINT, original_handler)

    logging.info("Beginning analysis")
    handler = TrackSetHandler(force=force_reanalyze, gain_type=gain_type)

    # For display purposes, calculate how much granularity is required
    # to show visible progress at each update
    total_length = sum(len(ts) for ts in track_sets)
    min_step = min(len(ts) for ts in track_sets)
    places_past_decimal = max(0,int(math.ceil(-math.log10(min_step * 100.0 / total_length))))
    update_string = '%.' + str(places_past_decimal) + 'f%% done'

    import gst
    pool = None
    try:
        if jobs == 1:
            # Sequential
            handled_track_sets = imap(handler, track_sets)
        else:
            # Parallel
            pool = Pool(jobs)
            handled_track_sets = pool.imap_unordered(handler,track_sets)
        processed_length = 0
        percent_done = 0
        for ts in handled_track_sets:
            processed_length = processed_length + len(ts)
            percent_done = 100.0 * processed_length / total_length
            logging.info(update_string, percent_done)
        logging.info("Analysis complete.")
    except KeyboardInterrupt:
        if pool is not None:
            logging.debug("Terminating process pool")
            pool.terminate()
            pool = None
        raise
    finally:
        if pool is not None:
            logging.debug("Closing transcode process pool")
            pool.close()
    if dry_run:
        logging.warn('This script ran in "dry run" mode, so no files were actually modified.')
    pass
Example #40
    def imap_unordered(self, func, iterable, chunksize=1):
        """
        Override multiprocessing.Pool.imap_unordered() so that it logs the full
        exception stack trace from the child process.
        """
        return Pool.imap_unordered(self, LogExceptions(func), iterable, chunksize)
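LogExceptions is not shown in this excerpt; a minimal sketch of such a wrapper (the class name comes from the snippet, everything else is an assumption) logs the full traceback inside the worker process and then re-raises so the parent still sees the failure:

import logging
import traceback

class LogExceptions:
    # Hypothetical picklable wrapper around the worker callable.
    def __init__(self, func):
        self.func = func

    def __call__(self, *args, **kwargs):
        try:
            return self.func(*args, **kwargs)
        except Exception:
            # Log the full traceback in the child process, where it is still
            # available, before the exception is sent back to the parent.
            logging.error(traceback.format_exc())
            raise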
Example #41
import time

def echo(i):
    time.sleep(0.001)
    return i

from multiprocessing.pool import Pool
p = Pool(10)

print [a for a in p.imap_unordered(echo, xrange(10))]
print [a for a in p.imap_unordered(echo, xrange(10))]
print [a for a in p.imap_unordered(echo, xrange(10))]
print [a for a in p.imap_unordered(echo, xrange(10))]

# ^ Is this distribution random ?

from gevent.pool import Pool

p = Pool(10)

print [a for a in p.imap_unordered(echo, xrange(10))]
print [a for a in p.imap_unordered(echo, xrange(10))]
print [a for a in p.imap_unordered(echo, xrange(10))]
print [a for a in p.imap_unordered(echo, xrange(10))]

Example #42
    import inspect
    from scipy.stats import f_oneway
    from statsmodels.stats.multicomp import pairwise_tukeyhsd
    from collections import defaultdict, Counter
    from multiprocessing.pool import Pool
    from itertools import cycle

    completed = []
    container = defaultdict(list)   
    pool = Pool()

    sim_list = [func for name,func in inspect.getmembers(simulations, inspect.isfunction) if name.startswith('sim_')]

    try:
        print('press CTRL-c to stop generating samples')
        it = pool.imap_unordered(f, cycle(sim_list))
        
        while 1:
            sim, result = it.next(timeout=SIMULATION_TIMEOUT)
            completed.append(sim)
            sys.stdout.write('.')
            for p, wins in result.items():
                container[p].append( (sim, wins) )
            
    except KeyboardInterrupt:
        pool.close()
        print('stopping all simulations...')
    finally:
        pool.terminate()
        pool.join()
Example #43
from operator import attrgetter
from collections import namedtuple
from itertools import chain
from multiprocessing.pool import Pool

from jinja2 import Environment, FileSystemLoader
import requests
import feedparser

import config

SearchResult = namedtuple('SearchResult', ['title', 'url'])

pool = Pool(5)

if __name__ == '__main__':
    feeds = pool.imap_unordered(feedparser.parse, config.SEARCH_FEEDS)
    entries = chain.from_iterable(map(attrgetter('entries'), feeds))
    unique_entries = dict((v['link'], v) for v in entries).values()
    results = [SearchResult(entry.title, entry.link) for entry in unique_entries]

    if results:
        env = Environment(autoescape=True, loader=FileSystemLoader('templates'))
        template = env.get_template('notification.html')
        email_msg = template.render(title=config.EMAIL_SUBJECT, results=results)

        requests.post(config.MAILGUN_URL,
                      auth=("api", config.MAILGUN_KEY),
                      data={
                          "from": config.MAILGUN_EMAIL_SENDER,
                          "to": config.SEND_NOTIFICATIONS_TO,
                          "subject": config.EMAIL_SUBJECT,
Example #44
import os
import time

def echo(i):
    time.sleep(0.001)
    print os.getpid()
    return i

# Non Deterministic Process Pool

from multiprocessing.pool import Pool

p = Pool(10)
run1 = [a for a in p.imap_unordered(echo, xrange(10))]
run2 = [a for a in p.imap_unordered(echo, xrange(10))]
run3 = [a for a in p.imap_unordered(echo, xrange(10))]
run4 = [a for a in p.imap_unordered(echo, xrange(10))]

print( run1 == run2 == run3 == run4 )
print
print
# Deterministic Gevent Pool

from gevent.pool import Pool

p = Pool(10)
run1 = [a for a in p.imap_unordered(echo, xrange(10))]
run2 = [a for a in p.imap_unordered(echo, xrange(10))]
run3 = [a for a in p.imap_unordered(echo, xrange(10))]
run4 = [a for a in p.imap_unordered(echo, xrange(10))]
Example #45
def task(pid):
    print('Starting task %d' % (pid,))
    time.sleep(random.randint(0,5))
    print('Finished task %d' % (pid,))
    return pid**2

p = Pool(processes=5)

#result = p.apply(task, [1])

#async_result = p.apply_async(task,[1])
#print async_result.ready()
#result = async_result.get()
#print result

#mapresult = p.map(task,xrange(0,10))
#print mapresult

#async_mapresult = p.map_async(task,xrange(0,10))
#print async_mapresult.ready()
#result = async_mapresult.get()
#print result

#imapresult = p.imap(task,xrange(0,10))
#for result in imapresult:
#    print result

imapresult_unordered = p.imap_unordered(task,xrange(0,10))
for result in imapresult_unordered:
    print result