Example #1
    def serviceChaos(self, maximumTime, minimumTime):
        try:
            dic = ServicesParser.getServiceDic()
            pool = Pool(processes=10)
            while 1:
                t = self.getRandomInt(maximumTime - minimumTime) + minimumTime
                self.countDown(t)
                serviceDic = self.updateServicesState()
                componentList = serviceDic.keys()

                runningList, stopList = self.generateServiceList(serviceDic)
                ran = random.randint(0, 1)
                address = ''
                randomComponent = ''
                jobList = []
                cmd = ''
                if ran:
                    index = self.getRandomInt(len(runningList))
                    if index != -1:
                        cmd = ' stop'
                        randomComponent = runningList[index]
                        print("About to stop: " + randomComponent)
                        addressList = serviceDic[randomComponent].keys()
                        address = addressList[self.getRandomInt(
                            len(addressList))]
                    else:
                        logger.info("Doing nothing")
                        continue
                else:
                    index = self.getRandomInt(len(stopList))
                    if index != -1:
                        cmd = ' start'
                        randomComponent = stopList[index]
                        print("About to start: " + randomComponent)
                        addressList = serviceDic[randomComponent].keys()
                        address = addressList[self.getRandomInt(
                            len(addressList))]
                    else:
                        logger.info("Doing nothing")
                        continue

                for service in dic[randomComponent]:
                    if service != "" and cmd != '':
                        jobList.append([
                            address, service, cmd, 'unknown', randomComponent
                        ])
                logger.info(jobList)
                pool.imap(doJob, jobList)

        except KeyboardInterrupt:
            print("Stopping chaos...")
            sys.exit(0)
        except Exception as inst:
            logger.error(str(inspect.stack()[0][3]))
            logger.info('calling func : ' + str(inspect.stack()[1][3]) +
                        '() from ' + str(inspect.stack()[1][1]))
            logger.error(inst.args)
            sys.exit(0)
Example #2
    def serviceChaos(self, maximumTime, minimumTime):
        try:
            dic = ServicesParser.getServiceDic()
            pool = Pool(processes=10)
            while 1:
                t = self.getRandomInt(maximumTime - minimumTime) + minimumTime
                self.countDown(t)
                serviceDic = self.updateServicesState()
                componentList = serviceDic.keys()

                runningList, stopList = self.generateServiceList(serviceDic)
                print "runningList : ", runningList
                print "stopList : " , stopList
                ran = random.randint(0, 1)
                address = ''
                randomComponent = ''
                jobList = []
                cmd = ''
                if ran:
                    index = self.getRandomInt(len(runningList))
                    if index != -1:
                        cmd = ' stop'
                        randomComponent = runningList[index]
                        print("About to stop: " + randomComponent)
                        addressList = serviceDic[randomComponent].keys()
                        address = addressList[self.getRandomInt(len(addressList))]
                    else:
                        logger.info("Doing nothing")
                        continue
                else:
                    index = self.getRandomInt(len(stopList))
                    if index != -1:
                        cmd = ' start'
                        randomComponent = stopList[index]
                        print("About to start: " + randomComponent)
                        addressList = serviceDic[randomComponent].keys()
                        address = addressList[self.getRandomInt(len(addressList))]
                    else:
                        logger.info("Doing nothing")
                        continue

                for service in dic[randomComponent]:
                    if service != "" and cmd != '':
                        jobList.append([address, service, cmd, 'unknown', randomComponent])
                logger.info(jobList)
                pool.imap(doJob, jobList)

        except KeyboardInterrupt:
            print("Stopping chaos...")
            sys.exit(0)
        except Exception as inst:
            logger.error(str(inspect.stack()[0][3]))
            logger.info('calling func : ' + str(inspect.stack()[1][3]) + '() from ' + str(inspect.stack()[1][1]))
            logger.error(inst.args)
            sys.exit(0)
Example #3
        def wrapper(iterable, *args, **kwargs):
            def starfunc(iterable):
                return func(iterable, *args, **kwargs)

            try:
                iter(iterable)
            except TypeError:
                return func(iterable, *args, **kwargs)
            if thread_count is None:
                current_thread_count = MAX_THREADS
            else:
                current_thread_count = set_threads(thread_count,
                                                   set_global=False)
            with multiprocessing.pool.ThreadPool(current_thread_count) as pool:
                if return_results:
                    results = []
                    for result in progress_callback(
                            pool.imap(starfunc, iterable),
                            total=len(iterable),
                            include_progress_callback=include_progress_callback
                    ):
                        results.append(result)
                    return results
                else:
                    for result in progress_callback(
                            pool.imap_unordered(starfunc, iterable),
                            total=len(iterable),
                            include_progress_callback=include_progress_callback
                    ):
                        pass
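Example #3 above is only the inner function of a decorator factory: func, thread_count, return_results, include_progress_callback, progress_callback, MAX_THREADS and set_threads all come from an enclosing scope that is not shown. A simplified, self-contained sketch of that pattern (hypothetical names, no progress reporting) might look like:

import multiprocessing.pool

MAX_THREADS = 8  # assumed default; the real value comes from the enclosing module

def parallelized(thread_count=None):
    """Hypothetical decorator factory in the spirit of the wrapper above."""
    def decorator(func):
        def wrapper(iterable, *args, **kwargs):
            def starfunc(item):
                return func(item, *args, **kwargs)
            try:
                iter(iterable)
            except TypeError:
                # Not iterable: fall back to a single direct call.
                return func(iterable, *args, **kwargs)
            with multiprocessing.pool.ThreadPool(thread_count or MAX_THREADS) as pool:
                return list(pool.imap(starfunc, iterable))
        return wrapper
    return decorator

@parallelized(thread_count=4)
def double(x):
    return 2 * x

# double([1, 2, 3]) -> [2, 4, 6]; double(5) -> 10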
Example #4
def parallelCCompile(
    self,
    sources,
    output_dir=None,
    macros=None,
    include_dirs=None,
    debug=0,
    extra_preargs=None,
    extra_postargs=None,
    depends=None,
):
    """Build sources in parallel.

    Reference link:
    http://stackoverflow.com/questions/11013851/speeding-up-build-process-with-distutils
    Monkey-patch for parallel compilation.
    """
    # those lines are copied from distutils.ccompiler.CCompiler directly
    macros, objects, extra_postargs, pp_opts, build = self._setup_compile(
        output_dir, macros, include_dirs, sources, depends, extra_postargs)
    cc_args = self._get_cc_args(pp_opts, debug, extra_preargs)
    # parallel code
    import multiprocessing.pool

    def _single_compile(obj):
        try:
            src, ext = build[obj]
        except KeyError:
            return
        self._compile(obj, src, ext, cc_args, extra_postargs, pp_opts)

    # convert to list, imap is evaluated on-demand
    pool = multiprocessing.pool.ThreadPool()
    list(pool.imap(_single_compile, objects))
    return objects
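The docstring above calls this a monkey-patch. Following the referenced StackOverflow recipe, one plausible way to install it (a sketch, not shown in the original example) is to overwrite CCompiler.compile before setup() runs:

import distutils.ccompiler

# Install the monkey-patch before calling setup(); from then on every
# extension build goes through the thread pool above instead of the serial loop.
distutils.ccompiler.CCompiler.compile = parallelCCompile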
Example #5
def parallelCCompile(self,
                     sources,
                     output_dir=None,
                     macros=None,
                     include_dirs=None,
                     debug=0,
                     extra_preargs=None,
                     extra_postargs=None,
                     depends=None):
    # those lines are copied from distutils.ccompiler.CCompiler directly
    macros, objects, extra_postargs, pp_opts, build = self._setup_compile(
        output_dir, macros, include_dirs, sources, depends, extra_postargs)
    cc_args = self._get_cc_args(pp_opts, debug, extra_preargs)
    # parallel code
    N = 2 * multiprocessing.cpu_count()  # number of parallel compilations

    def _single_compile(obj):
        try:
            src, ext = build[obj]
        except KeyError:
            return
        newcc_args = cc_args
        if _platform == "darwin":
            if src.endswith('.cpp'):
                newcc_args = cc_args + ["-stdlib=libc++"
                                        ] + ["-mmacosx-version-min=10.14"]
        self._compile(obj, src, ext, newcc_args, extra_postargs, pp_opts)

    # convert to list, imap is evaluated on-demand
    pool = multiprocessing.pool.ThreadPool(N)
    list(pool.imap(_single_compile, objects))
    return objects
Example #6
def main(args, session):
    if args.reanalyze_recordings:
        logging.info('Deleting all sonogram analyses')
        session.query(SonogramAnalysis).delete()

    logging.info('Fetching all recordings for selected species')
    recordings = session.query(Recording)\
        .join(Species, Species.scientific_name == Recording.scientific_name)\
        .join(SelectedSpecies)\
        .filter(Recording.sonogram_url_small != None, # pylint: disable=singleton-comparison
                Recording.sonogram_url_small != '',
                ~Recording.sonogram_analysis.has())\
        .all()

    logging.info('Analyzing recordings')
    # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python#35134329
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    with multiprocessing.pool.Pool(args.analysis_jobs) as pool:
        signal.signal(signal.SIGINT, original_sigint_handler)

        for recording_id, sonogram_quality in progress.percent(
                pool.imap(_analyze, [(r.recording_id, r.sonogram_url_small) for r in recordings]),
                len(recordings)):
            session.add(SonogramAnalysis(
                recording_id=recording_id,
                sonogram_quality=sonogram_quality))
            session.commit()
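The SIGINT handling above follows the linked StackOverflow recipe: the workers inherit SIG_IGN, so Ctrl-C only reaches the parent, which can then tear the pool down cleanly. A stripped-down standalone sketch of the same pattern, with a hypothetical square worker:

import multiprocessing.pool
import signal

def square(x):  # hypothetical worker
    return x * x

def run():
    # Workers inherit whatever SIGINT handler is installed when the pool
    # starts, so ignore SIGINT first and restore it in the parent afterwards.
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    with multiprocessing.pool.Pool(4) as pool:
        signal.signal(signal.SIGINT, original_sigint_handler)
        try:
            for result in pool.imap(square, range(20)):
                print(result)
        except KeyboardInterrupt:
            pool.terminate()

if __name__ == '__main__':
    run()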
Example #7
def parallelCCompile(self,
                     sources,
                     output_dir=None,
                     macros=None,
                     include_dirs=None,
                     debug=0,
                     extra_preargs=None,
                     extra_postargs=None,
                     depends=None):
    # those lines are copied from distutils.ccompiler.CCompiler directly
    macros, objects, extra_postargs, pp_opts, build = self._setup_compile(
        output_dir, macros, include_dirs, sources, depends, extra_postargs)
    cc_args = self._get_cc_args(pp_opts, debug, extra_preargs)
    # parallel code
    N = 2 * multiprocessing.cpu_count()  # number of parallel compilations

    def _single_compile(obj):
        try:
            src, ext = build[obj]
        except KeyError:
            return
        self._compile(obj, src, ext, cc_args, extra_postargs, pp_opts)

    # convert to list, imap is evaluated on-demand
    pool = multiprocessing.pool.ThreadPool(N)
    list(pool.imap(_single_compile, objects))
    return objects
Example #8
def parallelCCompile(self, sources, output_dir=None, macros=None,
                     include_dirs=None, debug=0, extra_preargs=None,
                     extra_postargs=None, depends=None):

    # those lines are copied from distutils.ccompiler.CCompiler directly
    macros, objects, extra_postargs, pp_opts, build = self._setup_compile(
        output_dir, macros, include_dirs, sources, depends, extra_postargs)

    cc_args = self._get_cc_args(pp_opts, debug, extra_preargs)
    # parallel code
    nthreads = 16  # number of parallel compilations
    import multiprocessing.pool

    def _single_compile(obj):
        try:
            src, ext = build[obj]
        except KeyError:
            return
        self._compile(obj, src, ext, cc_args, extra_postargs, pp_opts)

    # convert to list, imap is evaluated on-demand
    multiprocessing.log_to_stderr()
    with multiprocessing.pool.ThreadPool(nthreads) as pool:
        it = pool.imap(_single_compile, objects)
        list(it)

    # list(multiprocessing.pool.ThreadPool(nthreads).imap(
    #     _single_compile, objects))

    return objects
Example #9
def parallelCCompile(self, sources, output_dir=None, macros=None, include_dirs=None, debug=0, extra_preargs=None, extra_postargs=None, depends=None):
    # those lines are copied from distutils.ccompiler.CCompiler directly
    macros, objects, extra_postargs, pp_opts, build = self._setup_compile(output_dir, macros, include_dirs, sources, depends, extra_postargs)
    cc_args = self._get_cc_args(pp_opts, debug, extra_preargs)
    # parallel code
    N = 2*multiprocessing.cpu_count()# number of parallel compilations
    try:
        # On Unix-like platforms attempt to obtain the total memory in the
        # machine and limit the number of parallel jobs to the number of Gbs
        # of RAM (to avoid killing smaller platforms like the Pi)
        mem = os.sysconf('SC_PHYS_PAGES') * os.sysconf('SC_PAGE_SIZE') # bytes
    except (AttributeError, ValueError):
        # Couldn't query RAM; don't limit parallelism (it's probably a well
        # equipped Windows / Mac OS X box)
        pass
    else:
        mem = max(1, int(round(mem / 1024 ** 3))) # convert to Gb
        N = min(mem, N)
    def _single_compile(obj):
        try: src, ext = build[obj]
        except KeyError: return
        newcc_args = cc_args
        if _platform == "darwin":
          if src.endswith('.cpp'):
            newcc_args = cc_args + ["-stdlib=libc++"]
        self._compile(obj, src, ext, newcc_args, extra_postargs, pp_opts)
    # convert to list, imap is evaluated on-demand
    pool = multiprocessing.pool.ThreadPool(N)
    list(pool.imap(_single_compile,objects))
    return objects
Example #10
def update_node_list(RequestHandler):
    threading.Timer(60*60, update_node_list, args=[RequestHandler]).start() # called every hour
    RequestHandler.lock.acquire()
    node_list = copy.deepcopy(RequestHandler.node_list)
    pool = multiprocessing.pool.ThreadPool(5)

    try:
        for node in range(0, len(node_list)):
            if node_list[node].id == RequestHandler.our_node.id:
                our_index = node
            node_list[node].path = "/node_list"
            node_list[node].method = "GET"

        pool_output = pool.imap(send_request, node_list)

        lst = []
        counter = 0
        for x in pool_output:
            if x is not None and our_index != counter:
                lst.append(RequestHandler.node_list[counter])
            counter += 1
    
        lst.append(RequestHandler.node_list[our_index])
        RequestHandler.node_list = lst
    except:
        pass
        #raise

    pool.close()
    pool.join()
    RequestHandler.lock.release()
Example #11
def main(args, session):
    logging.info('Deleting existing xeno-canto recordings')
    session.query(Recording).filter(Recording.source == 'xc').delete()

    fetcher = Fetcher(cache_group='xc_api',
                      pool_size=args.recording_load_jobs,
                      clear_cache=args.clear_recordings_cache)
    query = XcQuery({'nr': f'{args.start_xc_id}-{args.end_xc_id}'}, fetcher)
    first_page = query.fetch_page(1)
    num_pages = first_page['numPages']
    num_recordings = int(first_page['numRecordings'])
    logging.info(f'Found {num_pages} pages, {num_recordings} recordings')
    with multiprocessing.pool.ThreadPool(args.recording_load_jobs) as pool:
        for page in progress.percent(
                itertools.chain([first_page],
                                pool.imap(query.fetch_page,
                                          range(2, num_pages + 1))),
                num_pages):
            try:
                # Allow replacements in case the API shifts pages around
                # (it seems to do that, probably when new recordings are
                # added during the run).
                recordings = [_parse_recording(r) for r in page['recordings']]
                session.bulk_save_objects_with_replace(recordings)
            except Exception:
                logging.error(
                    f'Error parsing page:\n{json.dumps(page, indent="  ")}',
                    exc_info=True)
                raise
Example #12
def main(args, session):
    if args.debug_recording_ids:
        logging.info('Loading specified recordings')
        recording_ids = args.debug_recording_ids.split(',')
        recordings = session.query(Recording)\
            .filter(Recording.recording_id.in_(recording_ids))\
            .all()
        for recording in recordings:
            logging.info(f'Processing recording {recording.recording_id}')
            trim_recording(recording,
                           skip_if_exists=False,
                           skip_write=True,
                           debug_otsu_threshold=args.debug_otsu_threshold,
                           debug_utterances=args.debug_utterances)
        return

    logging.info('Loading selected recordings')
    selected_recordings = session.query(Recording).join(
        SelectedRecording).all()

    logging.info('Fetching and trimming recordings')
    # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python#35134329
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    with multiprocessing.pool.Pool(args.trim_recordings_process_jobs) as pool:
        signal.signal(signal.SIGINT, original_sigint_handler)
        for _output_file_name in progress.percent(
                pool.imap(_process_recording,
                          [([selected_recording], {
                              'skip_if_exists': not args.retrim_recordings
                          }) for selected_recording in selected_recordings]),
                len(selected_recordings)):
            pass
Example #14
def as_bulk_resolve(candidates, threads=50):
    """
    Resolve a list of IPs to AS information.

    Returns a map of each result as a tuple of (ASN, owner) keyed to
    its candidate.  Returns None if no ASN could be found or (ASN,
    None) if an ASN was found but no owner is available.

    WARNING: This function will create a pool of up to 'threads'
    threads.
    """

    result = {}

    if not candidates:
        return result

    pool = multiprocessing.pool.ThreadPool(
        processes=min(len(candidates), threads))

    for ip, as_ in pool.imap(
            __asresolve__,
            candidates,
            chunksize=1):
        result[ip] = as_
    pool.close()
    return result
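A hedged usage sketch, relying only on what the docstring above promises (the __asresolve__ helper itself is not shown):

# Hypothetical call with documentation addresses; each value is
# (ASN, owner), (ASN, None), or None, as the docstring above describes.
as_map = as_bulk_resolve(['192.0.2.1', '198.51.100.7'], threads=10)
for ip, as_info in as_map.items():
    print(ip, as_info)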
Example #15
def query_search_engine(url_list):
    pool = multiprocessing.pool.ThreadPool(processes=40)
    # results = set()
    for qname in pool.imap(worker, url_list, chunksize=1):
        # results.add(qname)
        pass
    pool.close()
Example #16
def mpstarimap(func, job, **kwargs):
    job = ((func, args, kwargs) for args in job)
    pool = multiprocessing.Pool()
    for result in pool.imap(mpimap_wrapper, job):
        error = result['error']
        if error:
            raise ChildException('%r\n%s' % (error, result['traceback']))
        yield result['value']
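mpimap_wrapper and ChildException are not shown above. A minimal sketch consistent with the keys the loop reads ('error', 'traceback', 'value'), assuming each job is an argument tuple that gets star-applied:

import traceback

class ChildException(Exception):
    """Raised in the parent when a job failed in a worker process."""

def mpimap_wrapper(job):
    # Hypothetical helper: unpack (func, args, kwargs), run the call inside
    # the worker, and report failures as data so the parent can re-raise
    # them together with the child traceback.
    func, args, kwargs = job
    try:
        return {'error': None, 'traceback': None, 'value': func(*args, **kwargs)}
    except Exception as exc:
        return {'error': exc, 'traceback': traceback.format_exc(), 'value': None}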
Example #17
def retrieve_topics_use_inferer(paragraphs):
    # singleton
    if not hasattr(retrieve_topics_use_inferer, 'pool'):
        pool = multiprocessing.pool.Pool(processes=10)
        setattr(retrieve_topics_use_inferer, 'pool', pool)
    pool = getattr(retrieve_topics_use_inferer, 'pool')

    return list(pool.imap(infer_topics_for_paragraph, tqdm(paragraphs)))
Example #18
 def imap(self, f, s, chunksize=1):
     key = id(f)
     _FUNCTIONS[key] = f
     f = PicklableAndCallable(id(f))
     pool = multiprocessing.Pool(self.size, self._initWorkerProcess)
     for result in pool.imap(f, s, chunksize=chunksize):
         yield result
     del _FUNCTIONS[key]
     pool.close()
Example #20
    def all_reports(self, parallelism=None):
        """
        Returns an iterable of reports for all files in the database.
        """

        if parallelism is None:
            parallelism = multiprocessing.cpu_count() + 1
        pool = multiprocessing.pool.ThreadPool(processes=parallelism)

        all_names = sorted(set(list(self.found_names) + list(self.hashes.keys()) + list(self.other_data.keys())))
        return pool.imap(lambda file: self.report_for_file(file), all_names)
Example #21
def load_data(
    database_file,
    repo,
    package_names,
    skip_defaults,
):
    metric_parsers = get_metric_parsers_from_args(package_names, skip_defaults)

    with sqlite3.connect(database_file) as db:
        metric_mapping = get_metric_mapping(db)

        repo_parser = RepoParser(repo)

        with repo_parser.repo_checked_out():
            previous_sha = get_previous_sha(db)
            commits = repo_parser.get_commits(since_sha=previous_sha)

            # If there is nothing to check gtfo
            if len(commits) == 1 and previous_sha is not None:
                return

            # Maps metric_name to a running value
            metric_values = collections.defaultdict(int)

            # Grab the state of our metrics at the last place
            compare_commit = None
            if previous_sha is not None:
                compare_commit = commits[0]
                metric_values.update(
                    get_metric_values(
                        db,
                        compare_commit.sha,
                    ))
                commits = commits[1:]

            mp_args = six.moves.zip(
                [compare_commit] + commits,
                commits,
                itertools.repeat(repo_parser),
                itertools.repeat(metric_parsers),
            )
            pool = multiprocessing.pool.Pool(15)
            for commit, metrics in six.moves.zip(
                    commits,
                    pool.imap(_get_metrics_inner, mp_args),
            ):
                increment_metric_values(metric_values, metrics)
                insert_metric_values(
                    db,
                    metric_values,
                    metric_mapping,
                    commit,
                )
                insert_metric_changes(db, metrics, metric_mapping, commit)
Example #22
def render_parallel(
    paths: Iterable[pathlib.Path],
    *,
    pool: Optional[multiprocessing.pool.Pool],
    **kwargs,
):
    if not pool:
        return match.render(paths, **kwargs)

    return itertools.chain.from_iterable(
        pool.imap(functools.partial(render, **kwargs), ((p,) for p in paths))
    )
Example #23
def main(end_id, num_threads):
    # map the list of lines into a list of result dicts
    pool = MyPool(12)

    with open('story_list', 'r') as id_file:
        indlist = id_file.readline().strip("]").strip("[").split(', ')
    resultlist = list(
        tqdm(pool.imap(get_check_story, indlist), total=len(indlist)))
    #resultlist = pool.map(get_check_story, indlist)
    pool.close()
    with open('resultlist', 'w') as result:
        result.write(str(resultlist))
Example #24
def download_user_submissions(user_handle, destination, session_id):
    session = get_session(session_id)

    submissions = get_user_ok_submissions(user_handle)

    pool = multiprocessing.pool.ThreadPool(4)

    func = partial(get_source_file, session=session)

    for source_file in pool.imap(func, submissions):
        filepath = save_source_file(destination, source_file)
        if filepath:
            print 'DONE', filepath
Example #25
def parse_sam_in_threads(remap_csv, nthreads):
    """ Call parse_sam() in multiple processes.

    Launch a multiprocessing pool, walk through the iterator, and then be sure
    to close the pool at the end.
    """
    pool = Pool(processes=nthreads)
    try:
        reads = pool.imap(parse_sam, iterable=matchmaker(remap_csv), chunksize=100)
        for read in reads:
            yield read
    finally:
        pool.close()
        pool.join()
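Because parse_sam_in_threads is a generator, the finally block (and therefore pool.close()/pool.join()) only runs once the caller exhausts or closes it. A hypothetical consumption sketch:

# Hypothetical usage; remap_csv is whatever matchmaker() expects upstream.
for read in parse_sam_in_threads(remap_csv, nthreads=4):
    print(read)
# Only here, after the generator is exhausted, has the pool been closed and joined.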
Example #26
    def all_reports(self, parallelism=None):
        """
        Returns an iterable of reports for all files in the database.
        """

        if parallelism is None:
            parallelism = multiprocessing.cpu_count() + 1
        pool = multiprocessing.pool.ThreadPool(processes=parallelism)

        all_names = sorted(
            set(
                list(self.found_names) + list(self.hashes.keys()) +
                list(self.other_data.keys())))
        return pool.imap(lambda file: self.report_for_file(file), all_names)
Example #27
def parallelCCompile(self, sources, output_dir=None, macros=None, include_dirs=None, debug=0, extra_preargs=None, extra_postargs=None, depends=None):
    # those lines are copied from distutils.ccompiler.CCompiler directly
    macros, objects, extra_postargs, pp_opts, build = self._setup_compile(output_dir, macros, include_dirs, sources, depends, extra_postargs)
    cc_args = self._get_cc_args(pp_opts, debug, extra_preargs)
    # parallel code
    N = 2*multiprocessing.cpu_count()# number of parallel compilations
    def _single_compile(obj):
        try: src, ext = build[obj]
        except KeyError: return
        self._compile(obj, src, ext, cc_args, extra_postargs, pp_opts)
    # convert to list, imap is evaluated on-demand
    pool = multiprocessing.pool.ThreadPool(N)
    list(pool.imap(_single_compile,objects))
    return objects
Example #28
def resolve_dns(url_list):
    """Given a list of hosts, return dict that maps qname to
    returned rdata records.
    """
    response_dict = collections.defaultdict(list)
    # create pool for querys but cap max number of threads
    pool = multiprocessing.pool.ThreadPool(
        processes=min(len(url_list) * 3, 60))
    # run for all combinations of hosts and qnames
    for qname, rdatalist in pool.imap(worker,
                                      itertools.product(url_list, ('A', 'NS')),
                                      chunksize=1):
        response_dict[qname].extend(rdatalist)
    pool.close()
    return response_dict
Example #29
def load_all_features(root: Path, only_valid: bool, args) -> Dict[str, np.ndarray]:
    features_path = root.joinpath('features.npz')  # type: Path
    coords = utils.load_coords()
    pred_paths = list(root.glob('*-pred.npy'))
    get_id = lambda p: int(p.name.split('-')[0])
    if only_valid:
        valid_ids = {int(p.stem) for p in utils.labeled_paths()}
        pred_paths = [p for p in pred_paths if get_id(p) in valid_ids]
    if args.limit:
        pred_paths = pred_paths[:args.limit]
    if not args.new_features and features_path.exists():
        print('Loading features...')
        data = dict(np.load(str(features_path)))
        clf_features_path = root.joinpath('clf_features.npz')
        if clf_features_path.exists():
            clf_features = np.load(str(clf_features_path))['xs']
            data['xs'] = np.concatenate([data['xs'], clf_features], axis=2)
            for i in range(clf_features.shape[2]):
                feature_name = 'clf-{}'.format(i)
                ALL_FEATURE_NAMES.append(feature_name)
                FEATURE_NAMES.append(feature_name)
        print('done.')
        ids = [get_id(p) for p in pred_paths]
        assert set(ids) == set(data['ids'][0])
        return data
    print('{} total'.format(len(pred_paths)))
    data = {k: [[] for _ in range(utils.N_CLASSES)]
            for k in ['ids', 'scales', 'xs', 'ys']}
    blob_data = {k: [[] for _ in range(utils.N_CLASSES)]
                 for k in ['blobs', 'blob_ids']}
    with multiprocessing.pool.Pool(processes=24) as pool:
        for id, scale, xs, ys, blobs, blob_ids in tqdm.tqdm(
                pool.imap(partial(load_xs_ys, coords=coords), pred_paths, chunksize=2),
                total=len(pred_paths)):
            for cls in range(utils.N_CLASSES):
                data['ids'][cls].extend([id] * len(ys[cls]))
                data['scales'][cls].extend([scale] * len(ys[cls]))
                data['xs'][cls].extend(xs[cls])
                data['ys'][cls].extend(ys[cls])
                blob_data['blobs'][cls].append((id, scale, blobs[cls]))
                blob_data['blob_ids'][cls].extend(blob_ids[cls])
    data = {k: np.array(v, dtype=np.int32 if k in {'ids', 'ys'} else np.float32)
            for k, v in data.items()}
    with features_path.open('wb') as f:
        np.savez(f, **data)
    with root.joinpath('blobs.pkl').open('wb') as f:
        pickle.dump(blob_data, f)
    return data
Example #30
def load_data(
        database_file,
        repo,
        package_names,
        skip_defaults,
):
    metric_parsers = get_metric_parsers_from_args(package_names, skip_defaults)

    with sqlite3.connect(database_file) as db:
        metric_mapping = get_metric_mapping(db)

        repo_parser = RepoParser(repo)

        with repo_parser.repo_checked_out():
            previous_sha = get_previous_sha(db)
            commits = repo_parser.get_commits(since_sha=previous_sha)

            # If there is nothing to check gtfo
            if len(commits) == 1 and previous_sha is not None:
                return

            # Maps metric_name to a running value
            metric_values = collections.defaultdict(int)

            # Grab the state of our metrics at the last place
            compare_commit = None
            if previous_sha is not None:
                compare_commit = commits[0]
                metric_values.update(get_metric_values(
                    db, compare_commit.sha,
                ))
                commits = commits[1:]

            mp_args = six.moves.zip(
                [compare_commit] + commits,
                commits,
                itertools.repeat(repo_parser),
                itertools.repeat(metric_parsers),
            )
            pool = multiprocessing.pool.Pool(15)
            for commit, metrics in six.moves.zip(
                    commits, pool.imap(_get_metrics_inner, mp_args),
            ):
                increment_metric_values(metric_values, metrics)
                insert_metric_values(
                    db, metric_values, metric_mapping, commit,
                )
                insert_metric_changes(db, metrics, metric_mapping, commit)
Example #31
def parse_sam_in_threads(remap_csv, nthreads):
    """ Call parse_sam() in multiple processes.

    Launch a multiprocessing pool, walk through the iterator, and then be sure
    to close the pool at the end.
    """
    pool = Pool(processes=nthreads)
    try:
        reads = pool.imap(parse_sam,
                          iterable=matchmaker(remap_csv),
                          chunksize=100)
        for read in reads:
            yield read
    finally:
        pool.close()
        pool.join()
Example #32
def parallelCCompile(
    self,
    sources,
    output_dir=None,
    macros=None,
    include_dirs=None,
    debug=0,
    extra_preargs=None,
    extra_postargs=None,
    depends=None,
):
    """Build sources in parallel.

    Reference link:
    http://stackoverflow.com/questions/11013851/speeding-up-build-process-with-distutils
    Monkey-patch for parallel compilation.
    """
    # those lines are copied from distutils.ccompiler.CCompiler directly
    macros, objects, extra_postargs, pp_opts, build = self._setup_compile(
        output_dir, macros, include_dirs, sources, depends, extra_postargs
    )
    cc_args = self._get_cc_args(pp_opts, debug, extra_preargs)

    if NGRAPH_PYTHON_DEBUG in ["TRUE", "ON", True]:
        try:
            # pybind11 is much more verbose without -DNDEBUG
            self.compiler.remove("-DNDEBUG")
            self.compiler.remove("-O2")
            self.compiler_so.remove("-DNDEBUG")
            self.compiler_so.remove("-O2")
        except (AttributeError, ValueError):
            pass
    # parallel code
    import multiprocessing.pool

    def _single_compile(obj):
        try:
            src, ext = build[obj]
        except KeyError:
            return
        self._compile(obj, src, ext, cc_args, extra_postargs, pp_opts)

    # convert to list, imap is evaluated on-demand
    pool = multiprocessing.pool.ThreadPool()
    list(pool.imap(_single_compile, objects))
    return objects
Example #33
def resolve_dns_parallel(site, resolver_ips):
    """ Given a list of resolvers ips and a site, the function send DNS query to each on of them in parallel and
    returns the ips it got from all of them.
    """
    ip_list = []
    pool = multiprocessing.pool.ThreadPool(processes=AMOUNT_OF_RESOLVERS)
    args_for_worker = [(site, resolver_ip) for resolver_ip in resolver_ips]
    # args_for_worker = np.array(([site]*len(resolver_ips), resolver_ips)).T
    try:
        for ip in pool.imap(
                worker,
                args_for_worker):
            ip_list.extend(ip)
        pool.close()
        return ip_list
    except Exception:
        print("pool exception")
Example #34
def main():
    max2exp = int(math.log(LIMIT) / math.log(2))
    max5exp = int(math.log(LIMIT) / math.log(5))
    multipliers = sorted(n for i in range(max2exp + 1)
                         for j in range(max5exp + 1)
                         if 1 < (n := 2**i * 5**j) <= LIMIT)
    total = 0
    with multiprocessing.pool.Pool() as pool:
        for n, order in pool.imap(
                multiplicative_order,
                (n for n in range(3, LIMIT, 2) if n % 5 != 0),
                1000,
        ):
            if n % 100_000 == 1:
                print(f"progress: {n:,}")
            i = bisect.bisect_right(multipliers, LIMIT // n)
            total += order * (i + 1)
Example #35
def resolve_dns(url_list):
    """
    Given a list of hosts, return dict that maps qname to
    returned rdata records.
    """
    response_dict = collections.defaultdict(list)
    # create pool for queries but cap max number of threads
    pool = multiprocessing.pool.ThreadPool(
        processes=min(len(url_list) * 3, 60))
    for qname, rdatalist in pool.imap(
            worker,
            itertools.product(
                url_list,
                ('A', 'AAA', 'PTR', 'CNAME', 'MX', 'NS', 'TXT', 'SOA')),
            chunksize=1):
        response_dict[qname].extend(rdatalist)
    pool.close()
    #print response_dict
    return response_dict
Example #36
def process_wiki_dump(source, target, wiki_name, processes=None):
    if processes is None:
        processes = max(1, multiprocessing.cpu_count() - 1)
    #print(processes)

    with open(source, 'r', encoding='utf-8') as dump_file, \
         open(target, 'w', encoding='utf-8') as out_file:

        page_generator = extract_pages(dump_file,wiki_name, filter_namespaces=set(['0']))

        #for title, text, pageid in page_generator:
        #    sentences, title, pageid = process_page(title, text, pageid)
        #    for sentence in sentences:
        #        out_file.write(sentence + '\n')

        with multiprocessing.Pool(processes) as pool:
            for group in utils.chunkize(page_generator, chunksize=10 * processes, maxsize=1):
                for sentences, title, pageid in pool.imap(process_page, group):
                    for sentence in sentences:
                        out_file.write(sentence + '\n')
Example #37
def esi_data_by_ids(name, ids, tqdm=notqdm):
    def get_op(id):
        kwargs = {name + '_id': id}

        return esi.op['get_universe_{n}s_{n}_id'.format(n=name)](**kwargs)

    ops = [get_op(id) for id in ids]

    pool = multiprocessing.pool.ThreadPool(5)

    d = {
        id: json.loads(data)
        for id, data in zip(
            ids,
            list(
                tqdm(pool.imap(handle_request, ops, chunksize=1),
                     desc='Loading {}s'.format(name),
                     total=len(ops))))
    }

    return d
Example #38
def dns_bulk_resolve(candidates, reverse=False, ip_version=None, threads=50):
    """
    Resolve a list of host names to IPs or, if reverse is true, IPs to
    host names.  Return a map of each result keyed to its candidate.

    WARNING: This function will create a pool of up to 'threads'
    threads.
    """

    # This is based loosely on http://stackoverflow.com/a/34377198

    if reverse and ip_version is not None:
        raise ValueError("Unable to force IP version when reverse-resolving")

    if ip_version is None:
        ip_version = 4
    __check_ip_version__(ip_version)

    result = {}

    if len(candidates) == 0:
        return result

    # Work around a bug in 2.6
    # TODO: Get rid of this when 2.6 is no longer in the picture.
    if not hasattr(threading.current_thread(), "_children"):
        threading.current_thread()._children = weakref.WeakKeyDictionary()

    pool = multiprocessing.pool.ThreadPool(
        processes=min(len(candidates), threads) )

    candidate_args = [ (candidate, ip_version) for candidate in candidates ]

    for ip, name in pool.imap(
        __reverser__ if reverse else __forwarder__,
        candidate_args,
        chunksize=1):
        result[ip] = name
    pool.close()
    return result
Example #39
    def updateServicesState(self):
        try:
            hostList = self.getHostList()
            dic = ServicesParser.getServiceDic()
            pool = Pool(processes=10)
            jobList = []
            for host in hostList:
                for service in dic[host.getComponent()]:
                    if service != "":
                        jobList.append([
                            host.getAddress(), service, ' status', 'unknown',
                            host.getComponent()
                        ])

            # jobList : [ipaddress, service_name, command : status, service status(default unknown), component]
            it = pool.imap(doJob, jobList)

            serviceStateDic = {}
            for item in it:
                if item[4] not in serviceStateDic:
                    serviceStateDic[item[4]] = {item[0]: item[3]}
                else:
                    if item[0] in serviceStateDic[item[4]]:
                        if item[3] == 'stop':
                            serviceStateDic[item[4]][item[0]] = 'stop'
                        elif item[3] == 'unknown':
                            serviceStateDic[item[4]][item[0]] = 'unknown'
                    else:
                        serviceStateDic[item[4]][item[0]] = item[3]

            return serviceStateDic

        except KeyboardInterrupt:
            print("Stopping chaos...")
            sys.exit(0)
        except Exception as inst:
            logger.error(str(inspect.stack()[0][3]))
            logger.info('calling func : ' + str(inspect.stack()[1][3]) +
                        '() from ' + str(inspect.stack()[1][1]))
            logger.error(inst.args)
Example #40
    def updateServicesState(self):
        try:
            hostList = self.getHostList()
            dic = ServicesParser.getServiceDic()
            pool = Pool(processes=10)
            jobList = []
            for host in hostList:
                for service in dic[host.getComponent()]:
                    if service != "":
                        jobList.append([host.getAddress(), service, ' status', 'unknown', host.getComponent()])

            # jobList : [ipaddress, service_name, command : status, service status(default unknown), component]
            it = pool.imap(doJob, jobList)

            serviceStateDic = {}
            for item in it:
                if item[4] not in serviceStateDic:
                    serviceStateDic[item[4]] = {item[0]: item[3]}
                else:
                    if item[0] in serviceStateDic[item[4]]:
                        if item[3] == 'stop':
                            serviceStateDic[item[4]][item[0]] = 'stop'
                        elif item[3] == 'unknown':
                            serviceStateDic[item[4]][item[0]] = 'unknown'
                    else:
                        serviceStateDic[item[4]][item[0]] = item[3]

            return serviceStateDic

        except KeyboardInterrupt:
            print("Stopping chaos...")
            sys.exit(0)
        except Exception as inst:
            logger.error(str(inspect.stack()[0][3]))
            logger.info('calling func : ' + str(inspect.stack()[1][3]) + '() from ' + str(inspect.stack()[1][1]))
            logger.error(inst.args)
Example #41
def as_bulk_resolve(candidates, threads=50):
    """
    Resolve a list of IPs to AS information.

    Returns a map of each result as a tuple of (ASN, owner) keyed to
    its candidate.  Returns None if no ASN could be found or (ASN,
    None) if an ASN was found but no owner is available.

    WARNING: This function will create a pool of up to 'threads'
    threads.
    """

    result = {}

    pool = multiprocessing.pool.ThreadPool(
        processes=min(len(candidates), threads) )

    for ip, as_ in pool.imap(
        __asresolve__,
        candidates,
        chunksize=1):
        result[ip] = as_
    pool.close()
    return result
Example #42
def run():
    parser = argparse.ArgumentParser(
        description='Efficiently download player meta data from NFL.com. Note '
                    'that each invocation of this program guarantees at least '
                    '32 HTTP requests to NFL.com',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    aa = parser.add_argument
    aa('--json-update-file', type=str, default=None,
       help='When set, the file provided will be updated in place with new '
            'meta data from NFL.com. If this option is not set, then the '
            '"players.json" file that comes with nflgame will be updated '
            'instead.')
    aa('--simultaneous-reqs', type=int, default=3,
       help='The number of simultaneous HTTP requests sent to NFL.com at a '
            'time. Set this lower if you are worried about hitting their '
            'servers.')
    aa('--full-scan', action='store_true',
       help='Forces a full scan of nflgame player data since 2009. Typically, '
            'this is only done when starting with a fresh JSON player '
            'database. But it can be useful to re-scan all of the players if '
            'past errors went ignored and data is missing. The advantage of '
            'using this option over starting fresh is that an existing '
            '(gsis_id <-> profile_id) mapping can be used for the majority of '
            'players, instead of querying NFL.com for the mapping all over '
            'again.')
    aa('--no-block', action='store_true',
       help='When set, this program will exit with an error instead of '
            'displaying a prompt to continue. This is useful when calling '
            'this program from another script. The idea here is not to block '
            'indefinitely if something goes wrong and the program wants to '
            'do a fresh update.')
    aa('--phase', default=None, choices=['PRE', 'REG', 'POST'],
       help='Force the update to use the given phase of the season.')
    aa('--year', default=None, type=int,
       help='Force the update to use nflgame players from a specific year.')
    aa('--week', default=None, type=int,
       help='Force the update to use nflgame players from a specific week.')
    args = parser.parse_args()

    if args.json_update_file is None:
        args.json_update_file = nflgame.player._player_json_file
    teams = [team[0] for team in nflgame.teams]
    pool = multiprocessing.pool.ThreadPool(args.simultaneous_reqs)

    # Before doing anything laborious, make sure we have write access to
    # the JSON database.
    if not os.access(args.json_update_file, os.W_OK):
        eprint('I do not have write access to "%s".' % args.json_update_file)
        eprint('Without write access, I cannot update the player database.')
        sys.exit(1)

    # Fetch the initial mapping of players.
    metas, reverse = initial_mappings(args)
    if len(metas) == 0:
        if args.no_block:
            eprint('I want to do a full update, but I have been told to\n'
                   'exit instead of asking if you want to continue.')
            sys.exit(1)

        eprint("nflgame doesn't know about any players.")
        eprint("Updating player data will require several thousand HTTP HEAD "
               "requests to NFL.com.")
        eprint("It is strongly recommended to find the 'players.json' file "
               "that comes with nflgame.")
        eprint("Are you sure you want to continue? [y/n] ", end='')
        answer = raw_input()
        if answer[0].lower() != 'y':
            eprint("Quitting...")
            sys.exit(1)

    # Accumulate errors as we go. Dump them at the end.
    errors = []

    # Now fetch a set of players that aren't in our mapping already.
    # Restrict the search to the current week if we have a non-empty mapping.
    if len(metas) == 0 or args.full_scan:
        eprint('Loading players in games since 2009, this may take a while...')
        players = {}

        # Grab players one game a time to avoid obscene memory requirements.
        for _, schedule in nflgame.sched.games.itervalues():
            # If the game is too far in the future, skip it...
            if nflgame.live._game_datetime(schedule) > nflgame.live._now():
                continue
            g = nflgame.game.Game(schedule['eid'])
            for pid, name in players_from_games(metas, [g]):
                players[pid] = name
        eprint('Done.')
    else:
        year, week = nflgame.live.current_year_and_week()
        phase = nflgame.live._cur_season_phase
        if args.phase is not None:
            phase = args.phase
        if args.year is not None:
            year = args.year
        if args.week is not None:
            week = args.week

        eprint('Loading games for %s %d week %d' % (phase, year, week))
        games = nflgame.games(year, week, kind=phase)
        players = dict(players_from_games(metas, games))

    # Find the profile ID for each new player.
    if len(players) > 0:
        eprint('Finding (profile id -> gsis id) mapping for players...')

        def fetch(t):  # t[0] is the gsis_id and t[1] is the gsis name
            return t[0], t[1], profile_url(t[0])
        for i, t in enumerate(pool.imap(fetch, players.items()), 1):
            gid, name, purl = t
            pid = profile_id_from_url(purl)

            progress(i, len(players))
            if purl is None or pid is None:
                errors.append('Could not get profile URL for (%s, %s)'
                              % (gid, name))
                continue

            assert gid not in metas
            metas[gid] = {'gsis_id': gid, 'gsis_name': name,
                          'profile_url': purl, 'profile_id': pid}
            reverse[pid] = gid
        progress_done()

    # Get the soup for each team roster.
    eprint('Downloading team rosters...')
    roster = []

    def fetch(team):
        return team, roster_soup(team)
    for i, (team, soup) in enumerate(pool.imap(fetch, teams), 1):
        progress(i, len(teams))

        if soup is None:
            errors.append('Could not get roster for team %s' % team)
            continue

        tbodys = soup.find(id='result').find_all('tbody')

        for row in tbodys[len(tbodys)-1].find_all('tr'):
            try:
                roster.append(meta_from_soup_row(team, row))
            except Exception:
                errors.append(
                    'Could not get player info from roster row:\n\n%s\n\n'
                    'Exception:\n\n%s\n\n'
                    % (row, traceback.format_exc()))
    progress_done()

    # Find the gsis identifiers for players that are in the roster but haven't
    # recorded a statistic yet. (i.e., Not in nflgame play data.)
    purls = [r['profile_url']
             for r in roster if r['profile_id'] not in reverse]
    if len(purls) > 0:
        eprint('Fetching GSIS identifiers for players not in nflgame...')

        def fetch(purl):
            return purl, gsis_id(purl)
        for i, (purl, gid) in enumerate(pool.imap(fetch, purls), 1):
            progress(i, len(purls))

            if gid is None:
                errors.append('Could not get GSIS id at %s' % purl)
                continue
            reverse[profile_id_from_url(purl)] = gid
        progress_done()

    # Now merge the data from `rosters` into `metas` by using `reverse` to
    # establish the correspondence.
    for data in roster:
        gsisid = reverse.get(data['profile_id'], None)
        if gsisid is None:
            errors.append('Could not find gsis_id for %s' % data)
            continue
        merged = dict(metas.get(gsisid, {}), **data)
        merged['gsis_id'] = gsisid
        metas[gsisid] = merged

    # Finally, try to scrape meta data for players who aren't on a roster
    # but have recorded a statistic in nflgame.
    gids = [(gid, meta['profile_url'])
            for gid, meta in metas.iteritems()
            if 'full_name' not in meta and 'profile_url' in meta]
    if len(gids):
        eprint('Fetching meta data for players not on a roster...')

        def fetch(t):
            gid, purl = t
            resp, content = new_http().request(purl, 'GET')
            if resp['status'] != '200':
                if resp['status'] == '404':
                    return gid, purl, False
                else:
                    return gid, purl, None
            return gid, purl, content
        for i, (gid, purl, html) in enumerate(pool.imap(fetch, gids), 1):
            progress(i, len(gids))
            more_meta = meta_from_profile_html(html)
            if not more_meta:
                # If more_meta is False, then it was a 404. Not our problem.
                if more_meta is None:
                    errors.append('Could not fetch HTML for %s' % purl)
                continue
            metas[gid] = dict(metas[gid], **more_meta)
        progress_done()

    assert len(metas) > 0, "Have no players to add... ???"
    with open(args.json_update_file, 'w+') as fp:
        json.dump(metas, fp, indent=4, sort_keys=True,
                  separators=(',', ': '))

    if len(errors) > 0:
        eprint('\n')
        eprint('There were some errors during the download. Usually this is a')
        eprint('result of an HTTP request timing out, which means the')
        eprint('resulting "players.json" file is probably missing some data.')
        eprint('An appropriate solution is to re-run the script until there')
        eprint('are no more errors (or when the errors are problems on ')
        eprint('NFL.com side.)')
        eprint('-' * 79)
        eprint(('\n' + ('-' * 79) + '\n').join(errors))
Example #43
    def generate_tgen_sources(self, ext):
        if not self.build_tgen and os.path.isdir('./terraingen/gensrc'):
            return

        # get Server.exe if we don't have it already
        server_path = os.path.join(os.getcwd(), 'data', 'Server.exe')
        if os.path.isfile(server_path):
            with open(server_path, 'rb') as fp:
                server_data = fp.read()
        else:
            from urllib.request import urlopen
            print('Fetching tgen files...')
            server_data = urlopen('http://cuwo.org/get_executable.php').read()
            print('Done.')

        from terraingen.converter import convert
        print('Generating sources for tgen...')
        converter = convert(server_data)
        sources = [os.path.relpath(src) for src in converter.get_sources()]
        print('Generated %s source files.' % len(sources))

        print('Building static tgen library (this may take a while)')
        is_msvc = self.compiler.compiler_type == 'msvc'
        includes = converter.get_includes(is_msvc)
        extra_args = []
        if is_msvc:
            extra_args += ['/wd4102', '/EHsc', '/MP', '/arch:SSE2']
        else:
            extra_args += ['-w', '-fPIC', '-g0', '-march=native']

        class compile_state:
            index = 0

        # make a parallel build
        def compile_single(source):
            p = (compile_state.index * 100) // len(sources)
            p = '%02d' % p
            compile_state.index += 1
            sys.stdout.write('[%s%%] %s\n' % (p, os.path.basename(source)))
            sys.stdout.flush()
            return self.compiler.compile([source], output_dir=self.build_temp,
                                         macros=macros, include_dirs=includes,
                                         debug=self.debug,
                                         extra_postargs=extra_args)[0]

        spawn._spawn_nt = silent_spawn_nt
        old = log.set_threshold(log.WARN)
        if self.no_parallel:
            workers = 1
        else:
            workers = max(1, multiprocessing.cpu_count() - 1)
        pool = multiprocessing.pool.ThreadPool(workers)

        # convert to list, imap is evaluated on-demand
        objects = list(pool.imap(compile_single, sources))
        spawn._spawn_nt = _spawn_nt

        if os.name == 'nt':
            rsp_path = os.path.join(self.build_temp, 'link.rsp')
            fp = open(rsp_path, 'wb')
            data = ' '.join(objects).replace('\\', '/')
            fp.write(data.encode('utf-8'))
            fp.close()
            objects = ['@' + rsp_path]

        if is_msvc:
            objects += ['/NOLOGO']

        old_force = self.compiler.force
        self.compiler.force = True
        self.compiler.create_static_lib(objects, 'tgen',
                                        output_dir=os.path.relpath(lib_dir),
                                        debug=self.debug)
        self.compiler.force = old_force
        log.set_threshold(old)
Example #44
def headhunter( filename, keys, defaults=[], **kw ):
    #TODO: BENCHMARK! Nchunks, filesize
    #TODO: OPTIMIZE?
    #TODO:  Read first N keys from multi-ext header???
    #WARNING:  No error handling implemented!  Use with discretion!
    
    '''Fast extraction of keyword-value pairs from FITS header(s) using multiprocessing and memory mapping.
    
    Parameters
    ----------
    filename:   str
        file to search
    keys:       sequence
        keywords to search for
    defaults:   sequence
        optional defaults to substitute in case of missing keywords
    
    Keywords
    --------
    Nchunks:            int;    default 25
        Number of chunks to split the file into.  Tweaking this number can yield faster computation
        times depending on how many cores you have.
    with_parent:        bool;   default False
        whether the key values from the parent header should be kept in the results
    return_type:        str;   options: 'raw', 'dict', 'list', 'array'
        How the results should be merged:
        raw -->         raw matched strings are returned.
        dict -->        return dict of lists keyed on keywords
        list -->        return tuple of lists
        array -->       return 2D array of data values

    Returns
    -------
    dict of lists / list of str depending on the value of `return_type`     '''
    
    #print( filename )
    
    Nchunks     = kw.get( 'Nchunks', 25 )
    with_parent = kw.get( 'with_parent', False )
    return_type = kw.get( 'return_type', 'list' )
    
    assert return_type in ('raw', 'dict', 'list', 'array')
    
    if isinstance( keys, re._pattern_type ):
        matcher = keys
    else:
        if isinstance( keys, str ):
            keys = keys,
        matcher = matchmaker( *keys )
    
    chunksize = max(1, os.path.getsize(filename) // Nchunks )
    
    pool = Pool( initializer=init, initargs=[filename, matcher] )
    raw = pool.imap(extractor, getchunks(filename, chunksize) )  #chunksize=10 (can this speed things up??)
    pool.close()
    pool.join()
    
    #concatenate the list of lists into single list (a for loop is *the* fastest way of doing this!!)
    results = []
    for r in raw: 
        results.extend(r)
    
    if not with_parent:         #without parent header values (this is needed for the superheadhunter)
        ix = results.index( end_str )
        results = results[ix+1:]
    
    return merger( results, keys, defaults, return_type )