Code Example #1
File: form10k.py Project: yunjianyang/edgar-10k-mda
    def download(self, index_path, txt_dir):
        # Save to txt dir
        self.txt_dir = txt_dir
        if not os.path.exists(self.txt_dir):
            os.makedirs(self.txt_dir)

        # Count Total Urls to Process
        with open(index_path, 'r') as fin:
            num_urls = sum(1 for line in fin)

        def iter_path_generator(index_path):

            with open(index_path, 'r') as fin:
                reader = csv.reader(fin,
                                    delimiter=',',
                                    quotechar='\"',
                                    quoting=csv.QUOTE_ALL)
                for url_idx, row in enumerate(reader, 1):
                    form_type, company_name, cik, date_filed, filename = row
                    url = os.path.join(SEC_GOV_URL,
                                       filename).replace("\\", "/")
                    yield (url_idx, url)

        def download_job(obj):
            url_idx, url = obj

            fname = '_'.join(url.split('/')[-2:])

            fname, ext = os.path.splitext(fname)
            htmlname = fname + '.html'

            text_path = os.path.join(self.txt_dir, fname + '.txt')

            if os.path.exists(text_path):
                print("Already exists, skipping {}...".format(url))
                sys.stdout.write("\033[K")
            else:
                print("Total: {}, Downloading & Parsing: {}...".format(
                    num_urls, url_idx))
                sys.stdout.write("\033[K")

                r = requests.get(url)
                try:
                    # Parse html with Beautiful Soup
                    soup = BeautifulSoup(r.content, "html.parser")
                    text = soup.get_text("\n")

                    # Process Text
                    text = self._process_text(text)
                    text_path = os.path.join(self.txt_dir, fname + '.txt')

                    # Write to file
                    with codecs.open(text_path, 'w', encoding='utf-8') as fout:
                        fout.write(text)
                except BaseException as e:
                    print("{} parsing failed: {}".format(url, e))

        ncpus = cpu_count() if cpu_count() <= 8 else 8
        pool = ProcessPool(ncpus)
        pool.map(download_job, iter_path_generator(index_path))
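
For reference, a minimal, self-contained sketch of the cpu-capped ProcessPool pattern used above; the square worker is a hypothetical stand-in for download_job.

from pathos.helpers import cpu_count
from pathos.multiprocessing import ProcessPool

def square(x):
    # Stand-in worker; the real example downloads and parses a filing.
    return x * x

if __name__ == '__main__':
    # Cap the pool at 8 workers, as in the excerpt above.
    ncpus = cpu_count() if cpu_count() <= 8 else 8
    pool = ProcessPool(ncpus)
    print(pool.map(square, range(10)))
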
Code Example #2
File: __init__.py Project: pombredanne/p_tqdm
def p_uimap(function, *arrays, num_cpus=None):
    """Returns an iterator for a parallel unordered map with a progress bar.

    Args:
        function: The function to apply to each element
            of the given arrays.
        arrays: One or more arrays of the same length
            containing the data to be mapped.
        num_cpus: The number of cpus to use in parallel.
            If an int, uses that many cpus.
            If a float, uses that proportion of cpus.
            If None, uses all available cpus.
    Returns:
        An iterator which will apply the function
        to each element of the given arrays in
        parallel with a progress bar. The results
        may be in any order.
    """

    if num_cpus is None:
        num_cpus = cpu_count()
    elif type(num_cpus) == float:
        num_cpus = int(round(num_cpus * cpu_count()))

    iterator = tqdm(Pool(num_cpus).uimap(function, *arrays),
                    total=len(arrays[0]))

    return iterator
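
A short usage sketch for p_uimap, assuming the p_tqdm package is installed; the add worker and the input lists are illustrative.

from p_tqdm import p_uimap

def add(a, b):
    return a + b

if __name__ == '__main__':
    # Results stream back as they finish, so the order is not guaranteed.
    for result in p_uimap(add, [1, 2, 3], [10, 20, 30], num_cpus=2):
        print(result)
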
Code Example #3
    def __init__(self,
                 *,
                 extractor: Extractor,
                 normalizer_nld: Optional[NormalizerNLD] = None,
                 normalizer_gsf: Optional[NormalizerGSF] = None,
                 normalizer_simultan: Optional[NormalizerSimultan] = None,
                 path: Optional[Union[str, Path]] = 'saved_run/normalizers',
                 regenerate: bool = False):
        """
        Args:
            extractor (Extractor): Extractor instance
            normalizer_nld (NormalizerNLD, optional): NormalizerNLD instance
            normalizer_gsf (NormalizerGSF, optional): NormalizerGSF instance
            normalizer_simultan (NormalizerSimultan, optional):
                NormalizerSimultan instance
        """
        super().__init__(regenerate)
        self.extractor = extractor

        self.normalizer_nld = copy.deepcopy(normalizer_nld)
        self.normalizer_gsf = copy.deepcopy(normalizer_gsf)

        self.normalizer_simultan = copy.deepcopy(normalizer_simultan)

        self.nprocesses: int = cpu_count() - 1 if cpu_count() > 1 else 1

        self.res: Optional[List[ResultsNormalized]] = None

        if path is None:
            self.path = None
        else:
            self.path = Path(path)
            self.path.mkdir(exist_ok=True, parents=True)
Code Example #4
def _parallel(ordered: bool, function: Callable, *iterables: Iterable, **kwargs: Any) -> Generator:
    """Returns a generator for a parallel map with a progress bar.

    Arguments:
        ordered(bool): True for an ordered map, false for an unordered map.
        function(Callable): The function to apply to each element of the given Iterables.
        iterables(Tuple[Iterable]): One or more Iterables containing the data to be mapped.

    Returns:
        A generator which will apply the function to each element of the given Iterables
        in parallel in order with a progress bar.
    """

    # Extract num_cpus
    num_cpus = kwargs.pop('num_cpus', None)

    # Determine num_cpus
    if num_cpus is None:
        num_cpus = cpu_count()
    elif type(num_cpus) == float:
        num_cpus = int(round(num_cpus * cpu_count()))

    # Determine length of tqdm (equal to length of shortest iterable)
    length = min(len(iterable) for iterable in iterables if isinstance(iterable, Sized))

    # Create parallel generator
    map_type = 'imap' if ordered else 'uimap'
    pool = Pool(num_cpus)
    map_func = getattr(pool, map_type)

    for item in tqdm(map_func(function, *iterables), total=length, **kwargs):
        yield item

    pool.clear()
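
To make the fractional num_cpus handling concrete, a small sketch using the public p_imap wrapper from p_tqdm, which reaches the private _parallel shown above; the double worker is illustrative.

from p_tqdm import p_imap

def double(x):
    return 2 * x

if __name__ == '__main__':
    # num_cpus=0.5 takes the float branch above and uses half of the detected cpus.
    for y in p_imap(double, range(8), num_cpus=0.5):
        print(y)
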
Code Example #5
File: __init__.py Project: Varal7/p_tqdm
def _parallel(ordered, function, *arrays, **kwargs):
    """Returns an iterator for a parallel map with a progress bar.

    Arguments:
        ordered(bool): True for an ordered map, false for an unordered map.
        function(function): The function to apply to each element
            of the given arrays.
        arrays(tuple): One or more arrays of the same length
            containing the data to be mapped. If a non-list
            variable is passed, it will be repeated a number
            of times equal to the lengths of the list(s). If only
            non-list variables are passed, the function will be
            performed num_iter times.
        num_cpus(int): The number of cpus to use in parallel.
            If an int, uses that many cpus.
            If a float, uses that proportion of cpus.
            If None, uses all available cpus.
        num_iter(int): If only non-list variables are passed, the
            function will be performed num_iter times on
            these variables. Default: 1.

    Returns:
        An iterator which will apply the function
        to each element of the given arrays in
        parallel in order with a progress bar.
    """

    # Convert tuple to list
    arrays = list(arrays)

    # Extract kwargs
    num_cpus = kwargs.pop('num_cpus', None)
    num_iter = kwargs.pop('num_iter', 1)

    # Determine num_cpus
    if num_cpus is None:
        num_cpus = cpu_count()
    elif type(num_cpus) == float:
        num_cpus = int(round(num_cpus * cpu_count()))

    # Determine num_iter when at least one list is present
    if any([type(array) == list for array in arrays]):
        num_iter = max([len(array) for array in arrays if type(array) == list])

    # Convert single variables to lists
    # and confirm lists are same length
    for i, array in enumerate(arrays):
        if type(array) != list:
            arrays[i] = [array for _ in range(num_iter)]
        else:
            assert len(array) == num_iter

    # Create parallel iterator
    map_type = 'imap' if ordered else 'uimap'
    iterator = tqdm(getattr(Pool(num_cpus), map_type)(function, *arrays),
                    total=num_iter,
                    **kwargs)

    return iterator
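
The scalar-broadcasting rule described in the docstring (non-list arguments repeated to match the list arguments) can be sketched on its own, independent of p_tqdm:

def broadcast(*arrays, num_iter=1):
    # Repeat non-list arguments so every argument ends up with the same length.
    arrays = list(arrays)
    if any(isinstance(a, list) for a in arrays):
        num_iter = max(len(a) for a in arrays if isinstance(a, list))
    for i, a in enumerate(arrays):
        if not isinstance(a, list):
            arrays[i] = [a] * num_iter
        else:
            assert len(a) == num_iter, "list arguments must share a length"
    return arrays

print(broadcast([1, 2, 3], 10))  # [[1, 2, 3], [10, 10, 10]]
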
Code Example #6
def mlp():
    if not os.path.exists(tar_dir):
        os.makedirs(tar_dir)

    iterator = glob(os.path.join(src_dir, '*.txt'))

    ncpus = cpu_count() if cpu_count() <= 8 else 8
    pool = ProcessPool(ncpus)
    pool.map(preprocess_job, iterator)
Code Example #7
File: mp_map.py Project: XiaohaoYang/pathos
def mp_map(function, sequence, *args, **kwds):
    '''extend python's parallel map function to multiprocessing

Inputs:
    function  -- target function
    sequence  -- sequence to process in parallel

Additional Inputs:
    nproc     -- number of 'local' cpus to use  [default = 'autodetect']
    type      -- processing type ['blocking', 'non-blocking', 'unordered']
    threads   -- if True, use threading instead of multiprocessing
    '''
    processes = cpu_count()
    proctype = 'blocking'
    threads = False
    if kwds.has_key('nproc'):
        processes = kwds['nproc']
        kwds.pop('nproc')
        # provide a default that is not a function call
        if processes == None: processes = cpu_count()
    if kwds.has_key('type'):
        proctype = kwds['type']
        kwds.pop('type')
    if kwds.has_key('threads'):
        threads = kwds['threads']
        kwds.pop('threads')
    # remove all the junk kwds that are added due to poor design!
    if kwds.has_key('nnodes'): kwds.pop('nnodes')
    if kwds.has_key('nodes'): kwds.pop('nodes')
    if kwds.has_key('launcher'): kwds.pop('launcher')
    if kwds.has_key('mapper'): kwds.pop('mapper')
    if kwds.has_key('queue'): kwds.pop('queue')
    if kwds.has_key('timelimit'): kwds.pop('timelimit')
    if kwds.has_key('scheduler'): kwds.pop('scheduler')
    if kwds.has_key('ncpus'): kwds.pop('ncpus')
    if kwds.has_key('servers'): kwds.pop('servers')

    if proctype in ['blocking']:
        if not threads:
            return mp.map(function,sequence,*args,**kwds)
        else:
            return tp.map(function,sequence,*args,**kwds)
    elif proctype in ['unordered']:
        if not threads:
            return mp.uimap(function,sequence,*args,**kwds)
        else:
            return tp.uimap(function,sequence,*args,**kwds)
    elif proctype in ['non-blocking', 'ordered']:
        if not threads:
            return mp.imap(function,sequence,*args,**kwds)
        else:
            return tp.imap(function,sequence,*args,**kwds)
    # default
    if not threads:
        return mp.map(function,sequence,*args,**kwds)
    else:
        return tp.map(function,sequence,*args,**kwds)
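
The three proctype branches map onto the pool methods shown below. This is a rough sketch using pathos pools directly; mp and tp here stand in for the module-level process and thread pools the excerpt assumes.

from pathos.pools import ProcessPool, ThreadPool

def cube(x):
    return x ** 3

if __name__ == '__main__':
    mp, tp = ProcessPool(2), ThreadPool(2)
    print(mp.map(cube, range(4)))            # 'blocking': a list, in order
    print(list(mp.imap(cube, range(4))))     # 'non-blocking' / 'ordered': lazy iterator
    print(sorted(mp.uimap(cube, range(4))))  # 'unordered': results in completion order
    print(tp.map(cube, range(4)))            # threads=True routes to the thread pool
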
Code Example #8
def mp_map(function, sequence, *args, **kwds):
    '''extend python's parallel map function to multiprocessing

Inputs:
    function  -- target function
    sequence  -- sequence to process in parallel

Additional Inputs:
    nproc     -- number of 'local' cpus to use  [default = 'autodetect']
    type      -- processing type ['blocking', 'non-blocking', 'unordered']
    threads   -- if True, use threading instead of multiprocessing
    '''
    processes = cpu_count()
    proctype = 'blocking'
    threads = False
    if kwds.has_key('nproc'):
        processes = kwds['nproc']
        kwds.pop('nproc')
        # provide a default that is not a function call
        if processes == None: processes = cpu_count()
    if kwds.has_key('type'):
        proctype = kwds['type']
        kwds.pop('type')
    if kwds.has_key('threads'):
        threads = kwds['threads']
        kwds.pop('threads')
    # remove all the junk kwds that are added due to poor design!
    if kwds.has_key('nnodes'): kwds.pop('nnodes')
    if kwds.has_key('nodes'): kwds.pop('nodes')
    if kwds.has_key('launcher'): kwds.pop('launcher')
    if kwds.has_key('mapper'): kwds.pop('mapper')
    if kwds.has_key('queue'): kwds.pop('queue')
    if kwds.has_key('timelimit'): kwds.pop('timelimit')
    if kwds.has_key('scheduler'): kwds.pop('scheduler')
    if kwds.has_key('ncpus'): kwds.pop('ncpus')
    if kwds.has_key('servers'): kwds.pop('servers')

    if proctype in ['blocking']:
        if not threads:
            return mp.map(function, sequence, *args, **kwds)
        else:
            return tp.map(function, sequence, *args, **kwds)
    elif proctype in ['unordered']:
        if not threads:
            return mp.uimap(function, sequence, *args, **kwds)
        else:
            return tp.uimap(function, sequence, *args, **kwds)
    elif proctype in ['non-blocking', 'ordered']:
        if not threads:
            return mp.imap(function, sequence, *args, **kwds)
        else:
            return tp.imap(function, sequence, *args, **kwds)
    # default
    if not threads:
        return mp.map(function, sequence, *args, **kwds)
    else:
        return tp.map(function, sequence, *args, **kwds)
Code Example #9
    def extract(self):
        def text_gen(txt_dir):
            # Yields markup & name
            for fname in os.listdir(txt_dir):
                if not fname.endswith('.txt'):
                    continue
                yield fname

        def parsing_job(fname):
            print("Parsing: {}".format(fname))
            # Read text
            filepath = os.path.join(self.txt_dir, fname)
            with codecs.open(filepath, 'rb', encoding='utf-8') as fin:
                text = fin.read()

            name, ext = os.path.splitext(fname)
            # Parse MDA part

            msg = ""
            mda, end = self.parse_mda(text)
            # Parse second time if first parse results in index
            if mda and len(mda.encode('utf-8')) < 1000:
                mda, _ = self.parse_mda(text, start=end)

            if mda:  # Has value
                msg = "SUCCESS"
                mda_path = os.path.join(self.mda_dir, name + '.mda')
                with codecs.open(mda_path, 'w', encoding='utf-8') as fout:
                    fout.write(mda)
            else:
                msg = msg if mda else "MDA NOT FOUND"
            print("{},{}".format(name, msg))
            return name + '.txt', msg  #

        ncpus = cpu_count() if cpu_count() <= 8 else 8
        pool = ProcessPool(ncpus)

        _start = time.time()
        parsing_failed = pool.map( parsing_job, \
                                   text_gen(self.txt_dir) )
        _end = time.time()

        print("MDA parsing time taken: {} seconds.".format(_end - _start))

        # Write failed parsing list
        count = 0
        parsing_log = 'parsing.log'
        with open(parsing_log, 'w') as fout:
            print("Writing parsing results to {}".format(parsing_log))
            for name, msg in parsing_failed:
                fout.write('{},{}\n'.format(name, msg))
                if msg != "SUCCESS":
                    count = count + 1

        print("Number of failed text:{}".format(count))
Code Example #10
    def __init__(self, raw: Optional[Matrix] = None,
                 bg: Optional[Matrix] = None,
                 bg_ratio: float = 1,
                 path: Optional[Union[str, Path]] = 'saved_run/ensemble'):
        """ Sets up attributes and loads a saved ensemble if provided.

        Args:
            raw: The model matrix to perturb. If a background is provided,
                this is the "prompt+bg" matrix.
            bg: Background matrix to subtract.
            bg_ratio: Prompt is obtained by `raw - bg_ratio * bg`. Defaults to
                1. This is the case for equal time gate length of `prompt+bg`
                and `bg`.
            path: The path where to save the ensemble. If set,
                the ensemble will try to load from the path, but will
                fail *silently* if it is unable to. It is recommended to call
                load([path]) explicitly.
        """
        self.raw: Optional[Matrix] = raw
        self.bg: Optional[Matrix] = bg
        self.bg_ratio: Optional[float] = bg_ratio
        self.prompt_w_bg: Optional[Matrix] = raw

        self.unfolder: Optional[Callable[[Matrix], Matrix]] = None
        self.first_generation_method: \
            Optional[Callable[[Matrix], Matrix]] = None
        self.size = 0
        self.regenerate = False
        self.action_prompt_w_bg = Action('matrix')
        self.action_bg = Action('matrix')
        self.action_raw = Action('matrix')
        self.action_unfolded = Action('matrix')
        self.action_firstgen = Action('matrix')

        self.std_raw: Optional[Matrix] = None
        self.std_unfolded: Optional[Matrix] = None
        self.std_firstgen: Optional[Matrix] = None

        self.raw_ensemble: Optional[Matrix] = None
        self.unfolded_ensemble: Optional[Matrix] = None
        self.firstgen_ensemble: Optional[Matrix] = None

        self.seed: int = 987654
        self.nprocesses: int = cpu_count()-1 if cpu_count() > 1 else 1

        if path is None:
            self.path = None
        else:
            self.path = Path(path)
            self.path.mkdir(exist_ok=True, parents=True)

        self.raw.state = "raw"
Code Example #11
    def __init__(self, *args, **kwds):
        """\nNOTE: if number of nodes is not given, will autodetect processors
        """
        hasnodes = 'nodes' in kwds
        arglen = len(args)
        if 'ncpus' in kwds and (hasnodes or arglen):
            msg = "got multiple values for keyword argument 'ncpus'"
            raise TypeError(msg)
        elif hasnodes:  #XXX: multiple try/except is faster?
            if arglen:
                msg = "got multiple values for keyword argument 'nodes'"
                raise TypeError(msg)
            kwds['ncpus'] = kwds.pop('nodes')
        elif arglen:
            kwds['ncpus'] = args[0]
        self.__nodes = kwds.get('ncpus', cpu_count())

        # Create an identifier for the pool
        self._id = kwds.get('id', None)  #'pool'
        if self._id is None:
            self._id = self.__nodes

        # Create a new server if one isn't already initialized
        self._serve()
        return
Code Example #12
    def upload_to_es(self, parallel=True, reindex_nested_features=False):
        """
        Loops through each file, loads in dictionaries, combines, and uploads to Elasticsearch

        inputs:
            reindex_nested_features (bool): optional boolean to reindex features for nested search
        """
        if not parallel:
            for x in self.fulltext:
                if reindex_nested_features:
                    self.upload_single_file_with_reindex(x)
                else:
                    self.upload_single_file(x)
        else:
            # start multithreading pool
            processors = cpu_count()
            pool = pp.ProcessPool(processors)

            if reindex_nested_features:
                r = list(
                    tqdm(pool.imap(self.upload_single_file_with_reindex,
                                   self.fulltext),
                         total=len(self.fulltext)))
            else:
                r = list(
                    tqdm(pool.imap(self.upload_single_file, self.fulltext),
                         total=len(self.fulltext)))

            pool.close()
            pool.join()
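
A compact sketch of the pool.imap plus tqdm progress pattern used here, with a dummy worker in place of the uploader.

import pathos.pools as pp
from pathos.helpers import cpu_count
from tqdm import tqdm

def work(item):
    # Stand-in for upload_single_file.
    return item * item

if __name__ == '__main__':
    items = list(range(100))
    pool = pp.ProcessPool(cpu_count())
    results = list(tqdm(pool.imap(work, items), total=len(items)))
    pool.close()
    pool.join()
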
Code Example #13
File: parallel.py Project: Michael0x2a/pathos
 def _serve(self, nodes=None, servers=None): #XXX: is a STATE method; use id
     """Create a new server if one isn't already initialized""" 
     # get nodes and servers in form used by pp.Server
     if nodes is None: nodes = self.nodes #XXX: autodetect must be explicit
     if nodes in ['*']: nodes = 'autodetect'
     if servers is None:
         servers = tuple(sorted(self.__servers)) # no servers is ()
     elif servers in ['*', 'autodetect']: servers = ('*',)
     # if no server, create one
     _pool = __STATE.get(self._id, None)
     if not _pool:
         _pool = pp.Server(ppservers=servers)
     # convert to form returned by pp.Server, then compare
     _auto = [('*',)] if _pool.auto_ppservers else []
     _servers = sorted(_pool.ppservers + _auto)
     _servers = tuple(':'.join((str(i) for i in tup)) for tup in _servers)
     if servers != _servers: #XXX: assume servers specifies ports if desired
         _pool = pp.Server(ppservers=servers)
     # convert to form returned by pp.Server, then compare
     _nodes = cpu_count() if nodes=='autodetect' else nodes
     if _nodes != _pool.get_ncpus():
         _pool.set_ncpus(nodes) # allows ncpus=0
     # set (or 'repoint') the server
     __STATE[self._id] = _pool
     # set the 'self' internals
     self.__nodes = None if nodes in ['autodetect'] else nodes
     self.__servers = servers
     return _pool
Code Example #14
    def write_image_name_on_images_in_dir_parallel(img_files,
                                                   input_folder_path,
                                                   output_folder_path,
                                                   override=False,
                                                   xy=(0, 0),
                                                   text_color=(255, 255, 255),
                                                   font_size=40,
                                                   background_color=(0, 0, 0),
                                                   number_gpus=6):
        logger.info('write_image_name_on_images_in_dir_parallel: ...')
        logger.info('Using ' + str(number_gpus) + ' of ' + str(cpu_count()) + ' CPUs')

        from Utility.List_Extension import ListExtension
        chunks = ListExtension.split_list_in_n_parts(img_files, number_gpus)

        with ProcessingPool() as pool:

            results = []
            for gpu_id in range(number_gpus):
                result = pool.apipe(
                    ImageFileHandler.write_image_name_on_images_in_dir_sequential,
                    *[chunks[gpu_id], input_folder_path, output_folder_path, override, xy, text_color, font_size, background_color]
                )
                results.append(result)

            # Collect the asynchronous calls
            for result in results:
                result.get()
        logger.info('write_image_name_on_images_in_dir_parallel: Done')
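
The apipe calls above submit work asynchronously and return handles that are collected later with get(). A stripped-down sketch of that pattern; the chunks and the label worker are made up.

from pathos.multiprocessing import ProcessingPool

def label(chunk, tag):
    return ["%s:%s" % (tag, item) for item in chunk]

if __name__ == '__main__':
    chunks = [[1, 2], [3, 4], [5, 6]]
    with ProcessingPool() as pool:
        # One asynchronous job per chunk; get() blocks until each result is ready.
        handles = [pool.apipe(label, chunk, "worker") for chunk in chunks]
        for handle in handles:
            print(handle.get())
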
Code Example #15
    def __init__(self, model, step_finish, args = None, split = 0, buffer = 6,\
                 recombine = None, recombine_args = None, verbose = False, \
                 boundary_pass = 1):

        self.model = self.grid_adjust(model)
        #add in function to finish steps
        self.step_finish = step_finish
        self.step_args = args

        #Get the number of CPUs unless user specified
        if split == 0:
            self.ncpus = cpu_count()
        else:
            self.ncpus = split

        #create the number of process available
        #self.pool = ProcessPool(nodes = self.ncpus)
        self.buffer = buffer

        self.multi_models = collections.OrderedDict()
        #dictionary to track when all steps on each processor complete
        self.sync_status = collections.OrderedDict()
        #print (self.pool)

        #add ability for user to deconflict
        self.boundary_pass = boundary_pass

        if recombine == None:
            self.recombine = self.recombine_default
            self.recombine_args = recombine_args
        else:
            self.recombine = recombine
            self.recombine_args = recombine_args

        self.verbose = verbose
Code Example #16
 def _serve(self,
            nodes=None,
            servers=None):  #XXX: is a STATE method; use id
     """Create a new server if one isn't already initialized"""
     # get nodes and servers in form used by pp.Server
     if nodes is None: nodes = self.nodes  #XXX: autodetect must be explicit
     if nodes in ['*']: nodes = 'autodetect'
     if servers is None:
         servers = tuple(sorted(self.__servers))  # no servers is ()
     elif servers in ['*', 'autodetect']:
         servers = ('*', )
     # if no server, create one
     _pool = __STATE.get(self._id, None)
     if not _pool:
         _pool = pp.Server(ppservers=servers)
     # convert to form returned by pp.Server, then compare
     _auto = [('*', )] if _pool.auto_ppservers else []
     _servers = sorted(_pool.ppservers + _auto)
     _servers = tuple(':'.join((str(i) for i in tup)) for tup in _servers)
     if servers != _servers:  #XXX: assume servers specifies ports if desired
         _pool = pp.Server(ppservers=servers)
     # convert to form returned by pp.Server, then compare
     _nodes = cpu_count() if nodes == 'autodetect' else nodes
     if _nodes != _pool.get_ncpus():
         _pool.set_ncpus(nodes)  # allows ncpus=0
     # set (or 'repoint') the server
     __STATE[self._id] = _pool
     # set the 'self' internals
     self.__nodes = None if nodes in ['autodetect'] else nodes
     self.__servers = servers
     return _pool
Code Example #17
File: run.py Project: Pierek/DownstreamSender
def main_jagcat():
    if os.path.exists('log\\' + GeneralConfig.ENV + '_' +
                      GeneralConfig.TOPIC_PREFIX + '_' +
                      GeneralConfig.ROWCOUNT_LOG_FILE):
        os.remove('log\\' + GeneralConfig.ENV + '_' +
                  GeneralConfig.TOPIC_PREFIX + '_' +
                  GeneralConfig.ROWCOUNT_LOG_FILE)

    pool = ProcessPool(nodes=cpu_count() - 1 or 1)

    pool.amap(send_part_meta, [PartMetaConfig.TOPIC],
              [PartMetaConfig.KAFKA_KEY])
    pool.amap(send_intray, [IntrayConfig.TOPIC], [IntrayConfig.KAFKA_KEY])
    pool.amap(send_description, [DescriptionConfig.TOPIC],
              [DescriptionConfig.KAFKA_KEY])
    pool.amap(send_feature, [FeatureConfig.TOPIC], [FeatureConfig.KAFKA_KEY])
    pool.amap(send_feature_family, [FeatureFamilyConfig.TOPIC],
              [FeatureFamilyConfig.KAFKA_KEY])
    pool.amap(send_hierarchy, [HierarchyConfig.TOPIC],
              [HierarchyConfig.KAFKA_KEY])
    pool.amap(send_hierarchy_illustration, [HierarchyIllustrationConfig.TOPIC],
              [HierarchyIllustrationConfig.KAFKA_KEY])
    pool.amap(send_hierarchy_usage, [HierarchyUsageConfig.TOPIC],
              [HierarchyUsageConfig.KAFKA_KEY])
    pool.amap(send_section_callout, [SectionCalloutConfig.TOPIC],
              [SectionCalloutConfig.KAFKA_KEY])
    pool.amap(send_section_part_usage, [SectionPartUsageConfig.TOPIC],
              [SectionPartUsageConfig.KAFKA_KEY])
    pool.amap(send_vin, [VinConfig.TOPIC], [VinConfig.KAFKA_KEY])

    pool.close()
    pool.join()
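
Each amap call above submits work without blocking and returns an asynchronous result; close() and join() then wait for the workers to finish. A minimal sketch with a made-up sender:

from pathos.helpers import cpu_count
from pathos.pools import ProcessPool

def send(topic, key):
    return "sent %s with key %s" % (topic, key)

if __name__ == '__main__':
    pool = ProcessPool(nodes=cpu_count() - 1 or 1)
    future = pool.amap(send, ["topic-a"], ["key-a"])  # non-blocking submit
    pool.close()
    pool.join()
    print(future.get())  # ['sent topic-a with key key-a']
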
Code Example #18
File: run.py Project: Pierek/DownstreamSender
def main():
    if os.path.exists('log\\' + GeneralConfig.ENV + '_' +
                      GeneralConfig.TOPIC_PREFIX + '_' +
                      GeneralConfig.ROWCOUNT_LOG_FILE):
        os.remove('log\\' + GeneralConfig.ENV + '_' +
                  GeneralConfig.TOPIC_PREFIX + '_' +
                  GeneralConfig.ROWCOUNT_LOG_FILE)

    pool = ProcessPool(nodes=cpu_count() - 1 or 1)

    # pool.amap(send_aftermarket_part, [AftermarketPartConfig.TOPIC], [AftermarketPartConfig.KAFKA_KEY])
    # pool.amap(send_description, [DescriptionConfig.TOPIC], [DescriptionConfig.KAFKA_KEY])
    # pool.amap(send_engineering_part, [EngineeringPartConfig.TOPIC], [EngineeringPartConfig.KAFKA_KEY])
    # pool.amap(send_engineering_part_function, [EngineeringPartFunctionConfig.TOPIC], [EngineeringPartFunctionConfig.KAFKA_KEY])
    # pool.amap(send_engineering_part_usage, [EngineeringPartUsageConfig.TOPIC], [EngineeringPartUsageConfig.KAFKA_KEY])
    # pool.amap(send_feature, [FeatureConfig.TOPIC], [FeatureConfig.KAFKA_KEY])
    # pool.amap(send_feature_family, [FeatureFamilyConfig.TOPIC], [FeatureFamilyConfig.KAFKA_KEY])
    pool.amap(send_hierarchy, [HierarchyConfig.TOPIC],
              [HierarchyConfig.KAFKA_KEY])
    # pool.amap(send_hierarchy_illustration, [HierarchyIllustrationConfig.TOPIC], [HierarchyIllustrationConfig.KAFKA_KEY])
    # pool.amap(send_hierarchy_usage, [HierarchyUsageConfig.TOPIC], [HierarchyUsageConfig.KAFKA_KEY])
    # pool.amap(send_section_callout, [SectionCalloutConfig.TOPIC], [SectionCalloutConfig.KAFKA_KEY])
    # pool.amap(send_section_part_usage, [SectionPartUsageConfig.TOPIC], [SectionPartUsageConfig.KAFKA_KEY])
    # pool.amap(send_supersession, [SupersessionConfig.TOPIC], [SupersessionConfig.KAFKA_KEY])
    # pool.amap(send_intray, [IntrayConfig.TOPIC], [IntrayConfig.KAFKA_KEY])
    # pool.amap(send_vin, [VinConfig.TOPIC], [VinConfig.KAFKA_KEY])

    pool.close()
    pool.join()
Code Example #19
File: form10k.py Project: tgarutti/edgar-10k-sa
    def download(self, index_path):

        def iter_path_generator(index_path):
            with open(index_path,'r') as fin:
                reader = csv.reader(fin,delimiter=',',quotechar='\"',quoting=csv.QUOTE_ALL)
                for row in reader:
                    form_type, company_name, cik, date_filed, filename = row
                    url = os.path.join(SEC_GOV_URL,filename)
                    yield url

        def download_job(url):
            fname = '_'.join(url.split('/')[-2:])

            fname, ext = os.path.splitext(fname)
            htmlname = fname + '.html'

            text_path = os.path.join(self.txt_dir,fname + '.txt')

            if os.path.exists(text_path):
                print("Already exists, skipping {}".format(url))
            else:
                print("Downloading & Parsing {}".format(url))

                r = requests.get(url)
                try:
                    # Parse html with Beautiful Soup
                    soup = BeautifulSoup( r.content, "html.parser" )
                    text = soup.get_text("\n")

                    # Process Text
                    text = self._process_text(text)

                    text_path = os.path.join(self.txt_dir,fname + '.txt')
                    # Write to file
                    with codecs.open(text_path,'w',encoding='utf-8') as fout:
                        fout.write(text)
                except BaseException as e:
                    print("{} parsing failed: {}".format(url,e))


        ncpus = cpu_count() if cpu_count() <= 8 else 8;
        pool = ProcessPool( ncpus )
        pool.map( download_job,
                    iter_path_generator(index_path) )
Code Example #20
File: parallel.py Project: Michael0x2a/pathos
 def _equals(self, server):
     "check if the server is compatible"
     if not server:
         return False
     _nodes = cpu_count() if self.__nodes is None else self.__nodes
     if _nodes != server.get_ncpus():
         return False
     _auto = [('*',)] if server.auto_ppservers else []
     _servers = sorted(server.ppservers + _auto)
     _servers = [':'.join((str(i) for i in tup)) for tup in _servers]
     return sorted(self.__servers) == _servers
Code Example #21
 def _equals(self, server):
     "check if the server is compatible"
     if not server:
         return False
     _nodes = cpu_count() if self.__nodes is None else self.__nodes
     if _nodes != server.get_ncpus():
         return False
     _auto = [('*', )] if server.auto_ppservers else []
     _servers = sorted(server.ppservers + _auto)
     _servers = [':'.join((str(i) for i in tup)) for tup in _servers]
     return sorted(self.__servers) == _servers
Code Example #22
File: multiprocessing.py Project: XiaohaoYang/pathos
    def __init__(self, *args, **kwds):
        """\nNOTE: if number of nodes is not given, will autodetect processors
        """
        hasnodes = kwds.has_key('nodes'); arglen = len(args)
        if kwds.has_key('ncpus') and (hasnodes or arglen):
            msg = "got multiple values for keyword argument 'ncpus'"
            raise TypeError, msg
        elif hasnodes: #XXX: multiple try/except is faster?
            if arglen:
                msg = "got multiple values for keyword argument 'nodes'"
                raise TypeError, msg
            kwds['ncpus'] = kwds.pop('nodes')
        elif arglen:
            kwds['ncpus'] = args[0]
        self.__nodes = kwds.get('ncpus', cpu_count())

        # Create a new server if one isn't already initialized
        if not __STATE['pool'] or self.__nodes != cpu_count():
            __STATE['pool'] = Pool(self.__nodes)
        return
Code Example #23
    def run_qaqc(self, las_paths):
        if self.config.multiprocess:
            p = pp.ProcessPool(max(int(ph.cpu_count() / 2), 1))
            num_las = len(las_paths)
            for _ in tqdm(p.imap(self.run_qaqc_checks_multiprocess, las_paths), total=num_las, ascii=True):
                pass

            p.close()
            p.join()
            p.clear()
        else:
            self.run_qaqc_checks(las_paths)
Code Example #24
def avaliacao(populacao):
    x = valores(populacao)
    n = len(populacao)
    def steps(k):
        sequence = x[k, :]
        t =  lm.move(startpoint, sequence=sequence)
        return t


    peso = None
    ncpu = cpu_count()
    with Pool(ncpu) as pool:
        peso = array(pool.map(steps, range(n)))

    return peso
Code Example #25
def test_pathos_pp_callable () :
    """Test parallel processnig with pathos: ParallelPool  
    """
    logger = getLogger("ostap.test_pathos_pp_callable")         
    if not pathos :
        logger.error ( "pathos is not available" )
        return
    
    logger.info ('Test job submission with %s' %  pathos ) 
    
    if DILL_PY3_issue : 
        logger.warning ("test is disabled (DILL/ROOT/PY3 issue)" )
        return

    ## logger.warning ("test is disabled for UNKNOWN REASON")
    ## return

    from pathos.helpers import cpu_count
    ncpus = cpu_count  ()
    
    from pathos.pools import ParallelPool as Pool 

    pool = Pool ( ncpus )   
    logger.info ( "Pool is %s" %  ( type ( pool ).__name__ ) )

    pool.restart ( True ) 


    mh   = MakeHisto() 
    jobs = pool.uimap ( mh.process ,  [  ( i , n )  for  ( i , n ) in enumerate ( inputs ) ] )
    
    result = None 
    for h in progress_bar ( jobs , max_value = len ( inputs ) ) :
        if not result  : result = h
        else           : result.Add ( h )

    pool.close ()
    pool.join  ()
    pool.clear ()
    
    logger.info ( "Histogram is %s" % result.dump ( 80 , 10 )  )
    logger.info ( "Entries  %s/%s" % ( result.GetEntries() , sum ( inputs ) ) ) 
    
    with wait ( 1 ) , use_canvas ( 'test_pathos_pp_callable' ) : 
        result.draw (   ) 

    return result 
Code Example #26
File: pp.py Project: darylchang/CS224U-Project
 def _clear(self): #XXX: should be STATE method; use id
     """Remove server with matching state"""
     _pool = __STATE['server']
     if not _pool:
         return
     # convert to form returned by pp.Server, then compare
     _nodes = cpu_count() if self.__nodes is None else self.__nodes
     if _nodes != _pool.get_ncpus():
         return
     _auto = [('*',)] if _pool.auto_ppservers else []
     _servers = sorted(_pool.ppservers + _auto)
     _servers = [':'.join((str(i) for i in tup)) for tup in _servers]
     if sorted(self.__servers) != _servers:
         return
     # it's the 'same' (better to check _pool.secret?)
     __STATE['server'] = None
     return #_pool
Code Example #27
File: pp.py Project: lelou13/CS224U-Project
 def _clear(self):  #XXX: should be STATE method; use id
     """Remove server with matching state"""
     _pool = __STATE['server']
     if not _pool:
         return
     # convert to form returned by pp.Server, then compare
     _nodes = cpu_count() if self.__nodes is None else self.__nodes
     if _nodes != _pool.get_ncpus():
         return
     _auto = [('*', )] if _pool.auto_ppservers else []
     _servers = sorted(_pool.ppservers + _auto)
     _servers = [':'.join((str(i) for i in tup)) for tup in _servers]
     if sorted(self.__servers) != _servers:
         return
     # it's the 'same' (better to check _pool.secret?)
     __STATE['server'] = None
     return  #_pool
Code Example #28
    def avaliacao(self, populacao):

        n = len(populacao)

        def steps(k):
            individuo = populacao[k, :]
            obj = self.funcao_objetivo(individuo)
            return obj

        ncpu = cpu_count()
        pool = ProcessPool(nodes=ncpu)
        pesos = array(pool.map(steps, range(n)))
        pool.close()
        pool.join()
        pool.clear()
        shutdown()

        return pesos
Code Example #29
File: launchers.py Project: hpparvi/pyina
    def __init__(self, *args, **kwds):
        """\nNOTE: if number of nodes is not given, will try to grab the number
of nodes from the associated scheduler, and failing that, will count the local cpus.
If workdir is not given, will default to scheduler's workdir or $WORKDIR.
If scheduler is not given, will default to only run on the current node.
If pickle is not given, will attempt to minimally use TemporaryFiles.

For more details, see the docstrings for the "map" method, or the man page
for the associated launcher (e.g. mpirun, mpiexec).
        """
        Mapper.__init__(self, *args, **kwds)
        self.scatter = bool(kwds.get('scatter', False)) #XXX: hang w/ nodes=1 ?
       #self.nodes = kwds.get('nodes', None)
        if not len(args) and not kwds.has_key('nodes'):
            if self.scheduler:
                self.nodes = self.scheduler.nodes
            else:
                self.nodes = cpu_count()
        return
Code Example #30
    def __init__(self, *args, **kwds):
        """\nNOTE: if number of nodes is not given, will autodetect processors
        """
        hasnodes = kwds.has_key('nodes')
        arglen = len(args)
        if kwds.has_key('ncpus') and (hasnodes or arglen):
            msg = "got multiple values for keyword argument 'ncpus'"
            raise TypeError, msg
        elif hasnodes:  #XXX: multiple try/except is faster?
            if arglen:
                msg = "got multiple values for keyword argument 'nodes'"
                raise TypeError, msg
            kwds['ncpus'] = kwds.pop('nodes')
        elif arglen:
            kwds['ncpus'] = args[0]
        self.__nodes = kwds.get('ncpus', cpu_count())

        # Create a new server if one isn't already initialized
        self._serve()
        return
Code Example #31
File: mp_pathos.py Project: mazurov/ostap
 def __init__( self, ncpus='autodetect', ppservers=None , silent = False ) :
     
     if ncpus == 'autodetect' :
         from pathos.helpers import cpu_count
         self.ncpus = cpu_count()
     else :                     self.ncpus = ncpus
     if ppservers :
         self._ppservers = ppservers
         self.sessions   =  [ ppServer(srv) for srv in ppservers ]
         self.ppservers  = tuple ( [ i.local_server for i in self.sessions ] )
         from pathos.parallel import ParallelPool as PPPool            
         self.pool       = PPPool( ncpus = self.ncpus , ppservers=self.ppservers)
         self.mode       = 'cluster'
         from pathos.parallel import stats as pp_stats
         self.pp_stats   = pp_stats
     else :
         from pathos.multiprocessing import ProcessPool as MPPool
         self.pool = MPPool(self.ncpus)
         self.mode = 'multicore'
     self.stats  = {}
     self.silent = True if silent else  False 
Code Example #32
    def __init__(self, model, step_finish, args = None, split = 0, buffer = 6,\
                 recombine = None,recombine_args = None, verbose = False, \
                 boundary_pass = 1):

        #Creates an instance of the model changing the Mesa space to the
        #distributed space
        self.model = self.grid_adjust(model)
        #add in function to finish steps
        self.step_finish = step_finish
        #optional arguments for step_finish function
        self.step_args = args

        #Get the number of CPUs unless user specified
        if split == 0:
            self.ncpus = cpu_count()
        else:
            self.ncpus = split

        #create the number of process available
        self.pool = ProcessPool(nodes=self.ncpus)
        self.pipes = self.pipe_setup(self.ncpus)
        #buffer size
        self.buffer = buffer
        self.multi_models = collections.OrderedDict()
        #dictionary to track when all steps on each processor complete
        self.sync_status = collections.OrderedDict()
        #add ability for user to deconflict
        self.boundary_pass = boundary_pass

        #Either use the default recombine function or the user adds their own
        if recombine == None:
            self.recombine = self.recombine_default
            self.recombine_args = recombine_args
        else:
            self.recombine = recombine
            self.recombine_args = recombine_args
        #if True tells the user what is being recombined and what is being
        #ignored
        self.verbose = verbose
Code Example #33
    def __init__(self, *args, **kwds):
        """\nNOTE: if number of nodes is not given, will autodetect processors
        """
        hasnodes = 'nodes' in kwds; arglen = len(args)
        if 'ncpus' in kwds and (hasnodes or arglen):
            msg = "got multiple values for keyword argument 'ncpus'"
            raise TypeError(msg)
        elif hasnodes: #XXX: multiple try/except is faster?
            if arglen:
                msg = "got multiple values for keyword argument 'nodes'"
                raise TypeError(msg)
            kwds['ncpus'] = kwds.pop('nodes')
        elif arglen:
            kwds['ncpus'] = args[0]
        self.__nodes = kwds.get('ncpus', cpu_count())

        # Create an identifier for the pool
        self._id = 'pool'

        # Create a new server if one isn't already initialized
        self._serve()
        return
Code Example #34
def test_pathos_mp_function () :
    """Test parallel processnig with pathos: ProcessPool
    """
    logger = getLogger("ostap.test_pathos_mp_function")
    if not pathos :
        logger.error ( "pathos is not available" )
        return 
    
    logger.info ('Test job submission with %s' %  pathos ) 
    
    if DILL_PY3_issue : 
        logger.warning ("test is disabled (DILL/ROOT/PY3 issue)" )
        return
    
    from pathos.helpers import cpu_count
    ncpus = cpu_count  ()
    
    from pathos.pools import ProcessPool as Pool

    pool = Pool ( ncpus )
    logger.info ( "Pool is %s" % ( type ( pool ).__name__ ) )

    with pool_context   ( pool ) : 
        
        jobs = pool.uimap ( make_histo ,  zip ( count() , inputs ) )
        
        result = None 
        for h in progress_bar ( jobs , max_value = len ( inputs ) ) :
            if not result  : result = h
            else           : result.Add ( h )
                
    logger.info ( "Histogram is %s" % result.dump ( 80 , 10 )  )
    logger.info ( "Entries  %s/%s" % ( result.GetEntries() , sum ( inputs ) ) ) 
    
    with wait ( 1 ) , use_canvas ( 'test_pathos_mp_function' ) : 
        result.draw (   ) 

    return result 
Code Example #35
    def get_returns(tickers, folder=None, freq='d', fromdate='2000-01-01',
                    todate='2018-12-31', forward_bars=None, data_col=None,
                    is_log=False, is_debug=False, is_multiprocess=False):
        if data_col is None:
            data_col = 'Adj Close'

        res = {}
        tqdm, ascii = get_tqdm()
        logger = get_logger()

        logger.info('Loading Yahoo Labels...')

        def _get_return(ticker, res=res, folder=folder, freq=freq,
                        fromdate=fromdate, todate=todate, is_debug=is_debug,
                        forward_bars=forward_bars, data_col=data_col,
                        is_log=is_log):
            if is_debug:
                logger = get_logger()
                logger.info('Loading %s' % ticker)
            res[ticker] = YahooProcessor._get_return(ticker, folder, freq,
                                                     fromdate, todate,
                                                     forward_bars, data_col,
                                                     is_log)
            return ticker, res[ticker]

        if is_multiprocess:
            logger.info('Initialized Multiprocess To Get Returns...')
            with Pool(cpu_count()) as p:
                res_pool = list(tqdm(p.imap(_get_return, tickers),
                                total=len(tickers), ascii=ascii))
            res = {item[0]: item[1] for item in res_pool}

        else:
            list(tqdm(map(_get_return, tickers), total=len(tickers),
                 ascii=ascii))

        return res
Code Example #36
File: constant.py Project: slabodkin/immuno-probs
def set_num_threads(value=ph.cpu_count()):
    """Sets and updates the global NUM_THREADS variable.

    Parameters
    ----------
    value : int, optional
        The number of threads the program is allowed to use (default: max available threads).

    Raises
    ------
    TypeError
        When the NUM_THREADS global variable is not an integer.
    ValueError
        When the NUM_THREADS global variable is smaller than 1.

    """
    if not isinstance(value, int):
        raise TypeError("The NUM_THREADS variable needs to be of type integer",
                        value)
    if value < 1:
        raise ValueError(
            "The NUM_THREADS variable needs to be higher than zero", value)
    else:
        CONFIG_DATA.set('COMMON', 'NUM_THREADS', str(value))
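
The same validation can be sketched without the immuno-probs config object, to show what the TypeError and ValueError branches guard against:

from pathos.helpers import cpu_count

def validate_num_threads(value=None):
    # None falls back to the detected cpu count, mirroring the default above.
    value = cpu_count() if value is None else value
    if not isinstance(value, int):
        raise TypeError("The NUM_THREADS variable needs to be of type integer", value)
    if value < 1:
        raise ValueError("The NUM_THREADS variable needs to be higher than zero", value)
    return value

print(validate_num_threads())   # detected cpu count
print(validate_num_threads(4))  # 4
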
Code Example #37
def check_nodes(pool, state):
    new_pool = type(pool)

    nodes = cpu_count()
    if nodes < 2: return
    half = nodes//2

    res = pool.map(squared, range(2))
    assert res == [0, 1]
    pool.close()

    # doesn't create a new pool... IS IT BETTER IF IT DOES?
    pool = new_pool()
    try:
        pool.map(squared, range(2))
    except PoolClosedError:
        pass
    else:
        raise AssertionError
    
    # creates a new pool (nodes are different)
    def nnodes(pool):
        return getattr(pool, '_'+new_pool.__name__+'__nodes')
    old_nodes = nnodes(pool)
    pool = new_pool(nodes=half)
    new_nodes = nnodes(pool)
    if isinstance(pool, ParallelPool):
        print('SKIPPING: new_pool check for ParallelPool')#FIXME
    else:
        res = pool.map(squared, range(2))
        assert res == [0, 1]
        assert new_nodes < old_nodes

    pool.close()
    try:
        pool.map(squared, range(2))
    except PoolClosedError:
        pass
    else:
        raise AssertionError

    # creates a new pool (nodes are different)
    pool = new_pool()
    if isinstance(pool, ParallelPool):
        print('SKIPPING: new_pool check for ParallelPool')#FIXME
    else:
        res = pool.map(squared, range(2))
        assert res == [0, 1]
    pool.close()
    # doesn't create a new pool... IS IT BETTER IF IT DOES?
    pool = new_pool()
    try:
        pool.map(squared, range(2))
    except PoolClosedError:
        pass
    else:
        raise AssertionError

    assert len(state) == 1
    pool.clear()
    assert len(state) == 0
    pool = new_pool()
    res = pool.map(squared, range(2))
    assert res == [0, 1]
    assert len(state) == 1
    pool.terminate()
    assert len(state) == 1
    pool.clear()
    assert len(state) == 0
    return
Code Example #38
         _HOLD = []
    return
def _debug(boolean):
    """if True, print debuging info and save temporary files after pickling"""
    if boolean:
        log.setLevel(logging.DEBUG)
        _save(True)
    else:
        log.setLevel(logging.WARN)
        _save(False)
    return


_pid = '.' + str(os.getpid()) + '.'
defaults = {
    'nodes' : str(cpu_count()),
    'program' : which_strategy(lazy=True) or 'ezscatter.py', # serialize to tempfile
    'mpirun' : which_mpirun() or 'mpiexec',
    'python' : which_python(lazy=True) or 'python',
    'progargs' : '',

    'outfile' : 'results%sout' % _pid,
    'errfile' : 'errors%sout' % _pid,
    'jobfile' : 'job%sid' % _pid,

    'scheduler' : '',
    'timelimit' : '00:02',
    'queue' : 'normal',

    'workdir' : '.'
    }