def get_intersection_buffers(roads, road_bounds, intersection_buffer_units, tile_max_units):
    """Buffer all road intersections and union the overlapping buffers.

    :param roads: List of shapely geometries representing road segments
    :param road_bounds: Bounding box of the roads shapefile
    :param intersection_buffer_units: Number of units to use for buffer radius
    :param tile_max_units: Maximum number of units for each side of a tile
    :returns: Result of ``unary_union`` over the buffered intersections
    """
    # As an optimization, the road network is divided up into a grid of tiles,
    # and intersections are calculated within each tile.
    def roads_per_tile_iter():
        """Yield the list of roads for every tile that holds more than one road."""
        min_x, min_y, max_x, max_y = road_bounds
        bounds_width = max_x - min_x
        bounds_height = max_y - min_y
        # Always keep at least one division per axis: a bounding box with zero
        # width or height would otherwise produce 0 divisions and the tile size
        # computation below would divide by zero.
        x_divisions = max(1, int(ceil(bounds_width / tile_max_units)))
        y_divisions = max(1, int(ceil(bounds_height / tile_max_units)))
        tile_width = bounds_width / x_divisions
        tile_height = bounds_height / y_divisions

        # Create a spatial index for roads to efficiently match up roads to tiles
        logger.info('Generating spatial index for intersections')
        roads_index = rtree.index.Index()
        for idx, road in enumerate(roads):
            roads_index.insert(idx, road.bounds)

        logger.info('Number of tiles: {}'.format(x_divisions * y_divisions))
        for x_offset in range(x_divisions):
            for y_offset in range(y_divisions):
                road_ids_in_tile = roads_index.intersection([
                    min_x + x_offset * tile_width,
                    min_y + y_offset * tile_height,
                    min_x + (1 + x_offset) * tile_width,
                    min_y + (1 + y_offset) * tile_height
                ])
                roads_in_tile = [roads[road_id] for road_id in road_ids_in_tile]
                # A single road cannot intersect anything within its tile.
                if len(roads_in_tile) > 1:
                    yield roads_in_tile

    # Allocate one worker per core, and parallelize the discovery of intersections
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    try:
        tile_intersections = pool.imap(get_intersections, roads_per_tile_iter())
        pool.close()
        pool.join()
    except BaseException:
        # Do not leave worker processes behind if the wait is interrupted.
        pool.terminate()
        raise

    logger.info('Buffering intersections')
    # Note: tile_intersections is a list of multipoints (which is a list of points).
    # itertools.chain.from_iterable flattens the list into a list of single points.
    buffered_intersections = [
        intersection.buffer(intersection_buffer_units)
        for intersection in itertools.chain.from_iterable(tile_intersections)
    ]

    # If intersection buffers overlap, union them to treat them as one
    logger.info('Performing unary union on buffered intersections')
    return unary_union(buffered_intersections)
def main(argv, _raven_client=None, _stats_client=None):
    """Parse ``argv`` and, when ``--create`` is given, generate datamap tiles.

    Without ``--create`` the argparse help text is printed instead.
    Any exception raised while generating is reported through the raven
    client and then re-raised.
    """
    # run for example via:
    # bin/location_map --create --upload --datamaps=/path/to/datamaps/ \
    #   --output=ichnaea/content/static/tiles/
    parser = argparse.ArgumentParser(
        prog=argv[0], description='Generate and upload datamap tiles.')
    option_table = (
        ('--create', dict(action='store_true', help='Create tiles?')),
        ('--upload', dict(action='store_true', help='Upload tiles to S3?')),
        ('--concurrency',
         dict(default=2, help='How many concurrent processes to use?')),
        ('--datamaps', dict(help='Directory of the datamaps tools.')),
        ('--output', dict(help='Optional directory for output files.')),
    )
    for flag, spec in option_table:
        parser.add_argument(flag, **spec)

    args = parser.parse_args(argv[1:])
    if not args.create:  # pragma: no cover
        parser.print_help()
        return

    conf = read_config()
    db_url = conf.get('database', 'rw_url')
    raven_client = configure_raven(
        conf.get('sentry', 'dsn'),
        transport='sync', _client=_raven_client)
    stats_client = configure_stats(conf, _client=_stats_client)
    bucketname = conf.get('assets', 'bucket').strip('/')

    upload = bool(args.upload)

    # CPU count is the fallback when no concurrency value was supplied.
    concurrency = billiard.cpu_count()
    if args.concurrency:
        concurrency = int(args.concurrency)

    datamaps = os.path.abspath(args.datamaps) if args.datamaps else ''
    output = os.path.abspath(args.output) if args.output else None

    try:
        with stats_client.timed('datamaps', tags=['func:main']):
            generate(db_url, bucketname, raven_client, stats_client,
                     upload=upload,
                     concurrency=concurrency,
                     datamaps=datamaps,
                     output=output)
    except Exception:  # pragma: no cover
        raven_client.captureException()
        raise
def setup_instance(self, queues=None, ready_callback=None,
                   pidfile=None, include=None, **kwargs):
    """Prepare instance state, emit ``worker_init`` and apply the boot steps.

    :param queues: forwarded to ``setup_queues``.
    :param ready_callback: stored as ``self.ready_callback``; falls back to
        ``self.on_consumer_ready`` when falsy.
    :param pidfile: stored as ``self.pidfile``.
    :param include: forwarded to ``setup_includes``.
    :param kwargs: passed to ``setup_defaults`` and to the namespace's
        ``apply``.
    """
    self.pidfile = pidfile
    self.setup_defaults(kwargs, namespace='celeryd')
    self.setup_queues(queues)
    self.setup_includes(include)

    # Set default concurrency
    if not self.concurrency:
        try:
            detected = cpu_count()
        except NotImplementedError:
            detected = 2
        self.concurrency = detected

    # Options
    self.loglevel = mlevel(self.loglevel)
    if ready_callback:
        self.ready_callback = ready_callback
    else:
        self.ready_callback = self.on_consumer_ready
    self.use_eventloop = self.should_use_eventloop()

    signals.worker_init.send(sender=self)

    # Initialize boot steps
    self.pool_cls = _concurrency.get_implementation(self.pool_cls)
    self.components = []
    namespace_hooks = dict(
        on_start=self.on_start,
        on_close=self.on_close,
        on_stopped=self.on_stopped,
    )
    self.namespace = Namespace(app=self.app, **namespace_hooks)
    self.namespace.apply(self, **kwargs)
def setup_instance(self, queues=None, ready_callback=None,
                   pidfile=None, include=None, **kwargs):
    """Prepare instance state, emit ``worker_init`` and apply the boot steps.

    Calls ``self.app.loader.init_worker()`` before defaults, queues and
    includes are set up.  ``self.namespace`` is assigned whatever
    ``Namespace(app=self.app).apply(self, **kwargs)`` returns.
    """
    self.pidfile = pidfile
    self.app.loader.init_worker()
    self.setup_defaults(kwargs, namespace='celeryd')
    self.setup_queues(queues)
    self.setup_includes(include)

    # Set default concurrency
    if not self.concurrency:
        try:
            detected = cpu_count()
        except NotImplementedError:
            detected = 2
        self.concurrency = detected

    # Options
    self.loglevel = mlevel(self.loglevel)
    if ready_callback:
        self.ready_callback = ready_callback
    else:
        self.ready_callback = self.on_consumer_ready
    self.use_eventloop = self.should_use_eventloop()

    signals.worker_init.send(sender=self)

    # Initialize boot steps
    self.pool_cls = _concurrency.get_implementation(self.pool_cls)
    self.components = []
    boot_namespace = Namespace(app=self.app)
    self.namespace = boot_namespace.apply(self, **kwargs)
def __init__(self, g_pool):
    """Set up empty export lists, directory buffers and worker slots.

    Source and destination directory buffers (512 bytes each) both start at
    the expanded ``~/Desktop`` path; one worker slot is reserved per CPU.
    """
    super(Batch_Exporter, self).__init__()
    self.g_pool = g_pool

    self.exports, self.new_exports, self.active_exports = [], [], []

    desktop = os.path.expanduser('~/Desktop')
    self.destination_dir = create_string_buffer(desktop, 512)
    self.source_dir = create_string_buffer(desktop, 512)

    self.run = False
    self.workers = [None] * cpu_count()
    message = (
        "Using a maximum of %s CPUs to process visualizations in parallel..."
        % cpu_count()
    )
    logger.info(message)
def __init__(
    self,
    filename: str,
    index: str,
    name: str,
    es_host: str,
    es_port: int,
    bulk_size: int = 1000,
    cpu_count: int = cpu_count() - 1,
):
    """Store the parsing options and verify that Elasticsearch answers a ping.

    :raises ValueError: when ``Elasticsearch.ping()`` returns a falsy value.
    """
    # Input file and target identifiers.
    self.filename = filename
    self.index = index
    self.name = name

    # Processing parameters.
    self.bulk_size = bulk_size
    self.cpu_count = cpu_count

    self.folder_path = self.filename + "__tmp"
    self.offset_stateagentinspector = None
    self.es_info = {"host": es_host, "port": es_port}

    # Containers populated elsewhere in the class.
    self.filelist: Dict[str, Tuple[str, int]] = {}
    self.ioc_alerts: Dict[str, Any] = {}
    self.exd_alerts: List[Mapping[str, str]] = []
    self.generic_items: Dict[str, Any] = {}

    es_client = Elasticsearch([self.es_info])
    reachable = es_client.ping()
    if not reachable:
        raise ValueError("Connection failed")

    logging.debug(f"[MAIN] Start parsing {self.filename}.")
    logging.debug(
        f"[MAIN] Pushing on {self.name} index and {self.index} timeline")
def setup_instance(
    self, queues=None, ready_callback=None, pidfile=None,
    include=None, use_eventloop=None, **kwargs
):
    """Prepare instance state, emit ``worker_init`` and apply the boot steps.

    :param use_eventloop: explicit event-loop choice; when ``None`` the
        result of ``self.should_use_eventloop()`` is used instead.
    :param kwargs: stored as ``self.options`` and passed to the namespace's
        ``apply``.
    """
    self.pidfile = pidfile
    self.setup_queues(queues)
    self.setup_includes(include)

    # Set default concurrency
    if not self.concurrency:
        try:
            detected = cpu_count()
        except NotImplementedError:
            detected = 2
        self.concurrency = detected

    # Options
    self.loglevel = mlevel(self.loglevel)
    if ready_callback:
        self.ready_callback = ready_callback
    else:
        self.ready_callback = self.on_consumer_ready

    # this connection is not established, only used for params
    self._conninfo = self.app.connection()
    if use_eventloop is None:
        self.use_eventloop = self.should_use_eventloop()
    else:
        self.use_eventloop = use_eventloop
    self.options = kwargs

    signals.worker_init.send(sender=self)

    # Initialize bootsteps
    self.pool_cls = _concurrency.get_implementation(self.pool_cls)
    self.steps = []
    self.on_init_namespace()
    namespace_hooks = dict(
        on_start=self.on_start,
        on_close=self.on_close,
        on_stopped=self.on_stopped,
    )
    self.namespace = self.Namespace(app=self.app, **namespace_hooks)
    self.namespace.apply(self, **kwargs)
def main(argv, _raven_client=None, _bucketname=None):
    """Parse ``argv`` and, when ``--create`` is given, generate datamap tiles.

    The bucket name comes from ``_bucketname`` or, when that is falsy, from
    ``settings("asset_bucket")``; surrounding slashes are stripped.
    Without ``--create`` the argparse help text is printed instead.
    """
    # run for example via:
    # bin/location_map --create --upload \
    #   --output=ichnaea/content/static/tiles/
    parser = argparse.ArgumentParser(
        prog=argv[0], description="Generate and upload datamap tiles.")
    option_table = (
        ("--create", dict(action="store_true", help="Create tiles?")),
        ("--upload", dict(action="store_true", help="Upload tiles to S3?")),
        ("--concurrency",
         dict(default=2, help="How many concurrent processes to use?")),
        ("--output", dict(help="Optional directory for output files.")),
    )
    for flag, spec in option_table:
        parser.add_argument(flag, **spec)

    args = parser.parse_args(argv[1:])
    if not args.create:
        parser.print_help()
        return

    raven_client = configure_raven(
        transport="sync", tags={"app": "datamap"}, _client=_raven_client)
    configure_stats()

    bucketname = _bucketname or settings("asset_bucket")
    if bucketname:
        bucketname = bucketname.strip("/")

    upload = bool(args.upload)

    # CPU count is the fallback when no concurrency value was supplied.
    concurrency = billiard.cpu_count()
    if args.concurrency:
        concurrency = int(args.concurrency)

    output = os.path.abspath(args.output) if args.output else None

    try:
        with METRICS.timer("datamaps", tags=["func:main"]):
            generate(
                bucketname,
                raven_client,
                upload=upload,
                concurrency=concurrency,
                output=output,
            )
    except Exception:
        raven_client.captureException()
        raise
def stream(
    self, subreddit: str, start_time: int, end_time: int
) -> Iterator[List[Dict[str, Any]]]:
    """Yield one list of ``praw_by_id`` results per batch of ids.

    Batches of ids come from ``query_pushshift(subreddit, start_time,
    end_time)``.  Each batch is mapped over a fresh process pool (one worker
    per CPU, set up with ``initializer``); result order within a batch is
    not guaranteed because ``imap_unordered`` is used.
    """
    for id_iter in query_pushshift(subreddit, start_time, end_time):
        with Pool(cpu_count(), initializer) as workers:
            batch = list(workers.imap_unordered(praw_by_id, id_iter))
        # Yield only after the pool has been released so the worker
        # processes do not sit idle while the consumer handles the batch.
        yield batch
def main(argv, _raven_client=None, _stats_client=None, _bucketname=None):
    """Parse ``argv`` and, when ``--create`` is given, generate datamap tiles.

    The bucket name comes from ``_bucketname`` or, when that is falsy, from
    ``ASSET_BUCKET``; surrounding slashes are stripped.
    Without ``--create`` the argparse help text is printed instead.
    """
    # run for example via:
    # bin/location_map --create --upload \
    #   --output=ichnaea/content/static/tiles/
    parser = argparse.ArgumentParser(
        prog=argv[0], description='Generate and upload datamap tiles.')
    option_table = (
        ('--create', dict(action='store_true', help='Create tiles?')),
        ('--upload', dict(action='store_true', help='Upload tiles to S3?')),
        ('--concurrency',
         dict(default=2, help='How many concurrent processes to use?')),
        ('--output', dict(help='Optional directory for output files.')),
    )
    for flag, spec in option_table:
        parser.add_argument(flag, **spec)

    args = parser.parse_args(argv[1:])
    if not args.create:  # pragma: no cover
        parser.print_help()
        return

    raven_client = configure_raven(transport='sync', _client=_raven_client)
    stats_client = configure_stats(_client=_stats_client)

    bucketname = _bucketname
    if not _bucketname:  # pragma: no cover
        bucketname = ASSET_BUCKET
    if bucketname:
        bucketname = bucketname.strip('/')

    upload = bool(args.upload)

    # CPU count is the fallback when no concurrency value was supplied.
    concurrency = billiard.cpu_count()
    if args.concurrency:
        concurrency = int(args.concurrency)

    output = os.path.abspath(args.output) if args.output else None

    try:
        with stats_client.timed('datamaps', tags=['func:main']):
            generate(bucketname, raven_client, stats_client,
                     upload=upload,
                     concurrency=concurrency,
                     output=output)
    except Exception:  # pragma: no cover
        raven_client.captureException()
        raise
def __init__(self, hostname=None, purge=False, beat=False,
             queues=None, include=None, app=None, pidfile=None,
             autoscale=None, autoreload=False, no_execv=False,
             no_color=None, **kwargs):
    """Store the command-line style options on the instance.

    ``autoscale`` is a ``"max,min"`` string (``min`` optional, default 0);
    ``queues`` and ``include`` may be comma-separated strings and are split
    into lists.  When ``include`` is non-empty its entries are appended to
    ``app.conf.CELERY_INCLUDE``.
    """
    self.app = app = app_or_default(app or self.app)
    self.hostname = hostname or socket.gethostname()

    # this signal can be used to set up configuration for
    # workers by name.
    signals.celeryd_init.send(sender=self.hostname, instance=self,
                              conf=self.app.conf)

    self.setup_defaults(kwargs, namespace='celeryd')
    if not self.concurrency:
        try:
            detected = cpu_count()
        except NotImplementedError:
            detected = 2
        self.concurrency = detected

    self.purge = purge
    self.beat = beat
    self.use_queues = queues if queues is not None else []
    self.queues = None
    self.include = include
    self.pidfile = pidfile
    self.autoscale = None
    self.autoreload = autoreload
    self.no_color = no_color
    self.no_execv = no_execv

    if autoscale:
        upper, _, lower = autoscale.partition(',')
        self.autoscale = [int(upper), int(lower) if lower else 0]

    self._isatty = isatty(sys.stdout)

    # ``None`` is passed through unchanged; otherwise colour is enabled
    # exactly when ``no_color`` is falsy.
    color_enabled = None if no_color is None else not no_color
    self.colored = app.log.colored(self.logfile, enabled=color_enabled)

    if isinstance(self.use_queues, basestring):
        self.use_queues = self.use_queues.split(',')
    if self.include:
        if isinstance(self.include, basestring):
            self.include = self.include.split(',')
        configured = tuple(app.conf.CELERY_INCLUDE)
        app.conf.CELERY_INCLUDE = configured + tuple(self.include)
    self.loglevel = mlevel(self.loglevel)
def __init__(self, hostname=None, discard=False, embed_clockservice=False,
             queues=None, include=None, app=None, pidfile=None,
             autoscale=None, autoreload=False, **kwargs):
    """Store the command-line style options on the instance.

    Calls ``self.die`` when ``embed_clockservice`` is requested on Windows
    and when ``self.loglevel`` is not a known level.  ``autoscale`` is a
    ``"max,min"`` string (``min`` optional, default 0); ``queues`` and
    ``include`` may be comma-separated strings and are split into lists.
    """
    self.app = app = app_or_default(app or self.app)
    self.hostname = hostname or socket.gethostname()

    # this signal can be used to set up configuration for
    # workers by name.
    signals.celeryd_init.send(sender=self.hostname, instance=self,
                              conf=self.app.conf)

    self.setup_defaults(kwargs, namespace="celeryd")
    if not self.concurrency:
        try:
            detected = cpu_count()
        except NotImplementedError:
            detected = 2
        self.concurrency = detected

    self.discard = discard
    self.embed_clockservice = embed_clockservice
    if self.app.IS_WINDOWS and self.embed_clockservice:
        self.die("-B option does not work on Windows.  "
                 "Please run celerybeat as a separate service.")

    self.use_queues = queues if queues is not None else []
    self.queues = None
    self.include = include if include is not None else []
    self.pidfile = pidfile
    self.autoscale = None
    self.autoreload = autoreload

    if autoscale:
        upper, _, lower = autoscale.partition(",")
        self.autoscale = [int(upper), int(lower) if lower else 0]

    self._isatty = isatty(sys.stdout)
    self.colored = app.log.colored(self.logfile)

    if isinstance(self.use_queues, basestring):
        self.use_queues = self.use_queues.split(",")
    if isinstance(self.include, basestring):
        self.include = self.include.split(",")

    try:
        self.loglevel = mlevel(self.loglevel)
    except KeyError:
        # Only the string keys of LOG_LEVELS are offered as valid choices.
        level_names = "|".join(
            name for name in LOG_LEVELS.keys()
            if isinstance(name, basestring))
        self.die("Unknown level %r. Please use one of %s." % (
            self.loglevel, level_names))
def main():
    """Parse the command line and push a .mans file into Elasticsearch.

    Prints the usage line when any of ``--name``, ``--index``, ``--es_host``
    or ``--es_port`` is missing.

    :returns: ``False`` when building or running ``MansToEs`` raises an
        ``Exception`` (the traceback is logged); ``True`` otherwise,
        including the usage-only path.
    """
    parser = argparse.ArgumentParser(
        description="Push .mans information in Elasticsearch index",
        prog="MANS to ES")

    # Required parameters
    parser.add_argument("--filename", dest="filename",
                        help="Path of the .mans file")
    parser.add_argument("--name", dest="name", help="Timeline name")
    parser.add_argument("--index", dest="index", help="ES index name")
    parser.add_argument("--es_host", dest="es_host", help="ES host")
    parser.add_argument("--es_port", dest="es_port", help="ES port")

    # Optional parameters to increase performances
    parser.add_argument(
        "--cpu_count",
        dest="cpu_count",
        default=cpu_count() - 1,
        help="cpu count",
        type=int,
    )
    parser.add_argument(
        "--bulk_size",
        dest="bulk_size",
        default=1000,
        help="Bulk size for multiprocessing parsing and upload",
        type=int,
    )

    parser.add_argument("--version", dest="version", action="version",
                        version="%(prog)s 1.5")
    args = parser.parse_args()

    if not all([args.name, args.index, args.es_port, args.es_host]):
        parser.print_usage()
        return True

    try:
        mte = MansToEs(
            args.filename,
            args.index,
            args.name,
            args.es_host,
            args.es_port,
            args.bulk_size,
            args.cpu_count,
        )
        mte.run()
        logging.debug("[MAIN] Operation Completed [✔✔✔]")
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and
        # SystemExit must propagate instead of being logged as parse errors.
        logging.exception("Error parsing .mans")
        return False
    return True
def praw_memes(self, verbose: bool) -> Iterator[List[redditData]]:
    """Yield one filtered list of ``praw_by_id`` results per batch of ids.

    Batches come from ``self.query_pushshift()``.  Each batch is mapped over
    a fresh process pool (one worker per CPU, set up with ``initializer``).
    Falsy results and entries whose ``"username"`` equals the string
    ``"None"`` are dropped.

    :param verbose: wrap the result iterator in ``tqdm`` when true.
    """
    for ids in self.query_pushshift():
        with cast(mpPool, Pool(cpu_count(), initializer)) as workers:
            fetched = workers.imap_unordered(praw_by_id, ids)
            if verbose:
                fetched = tqdm(fetched)
            memes: list[Union[redditData, None]] = list(fetched)
        # The batch is fully materialized above, so yield only after the
        # pool has been released rather than holding workers across it.
        yield [
            meme for meme in memes
            if meme and meme["username"] != "None"
        ]
def main(argv, _raven_client=None, _stats_client=None, _bucketname=None):
    """Parse ``argv`` and, when ``--create`` is given, generate datamap tiles.

    The bucket name comes from ``_bucketname`` or, when that is falsy, from
    ``ASSET_BUCKET``; surrounding slashes are stripped.
    Without ``--create`` the argparse help text is printed instead.
    """
    # run for example via:
    # bin/location_map --create --upload \
    #   --output=ichnaea/content/static/tiles/
    parser = argparse.ArgumentParser(
        prog=argv[0], description='Generate and upload datamap tiles.')
    option_table = (
        ('--create', dict(action='store_true', help='Create tiles?')),
        ('--upload', dict(action='store_true', help='Upload tiles to S3?')),
        ('--concurrency',
         dict(default=2, help='How many concurrent processes to use?')),
        ('--output', dict(help='Optional directory for output files.')),
    )
    for flag, spec in option_table:
        parser.add_argument(flag, **spec)

    args = parser.parse_args(argv[1:])
    if not args.create:  # pragma: no cover
        parser.print_help()
        return

    raven_client = configure_raven(
        transport='sync', _client=_raven_client)
    stats_client = configure_stats(_client=_stats_client)

    bucketname = _bucketname
    if not _bucketname:  # pragma: no cover
        bucketname = ASSET_BUCKET
    if bucketname:
        bucketname = bucketname.strip('/')

    upload = bool(args.upload)

    # CPU count is the fallback when no concurrency value was supplied.
    concurrency = billiard.cpu_count()
    if args.concurrency:
        concurrency = int(args.concurrency)

    output = os.path.abspath(args.output) if args.output else None

    try:
        with stats_client.timed('datamaps', tags=['func:main']):
            generate(bucketname, raven_client, stats_client,
                     upload=upload,
                     concurrency=concurrency,
                     output=output)
    except Exception:  # pragma: no cover
        raven_client.captureException()
        raise
def get_intersection_buffers(roads, road_bounds, intersection_buffer_units, tile_max_units):
    """Buffer all road intersections and union the overlapping buffers.

    :param roads: List of shapely geometries representing road segments
    :param road_bounds: Bounding box of the roads shapefile
    :param intersection_buffer_units: Number of units to use for buffer radius
    :param tile_max_units: Maximum number of units for each side of a tile
    :returns: Result of ``unary_union`` over the buffered intersections
    """
    # As an optimization, the road network is divided up into a grid of tiles,
    # and intersections are calculated within each tile.
    def roads_per_tile_iter():
        """Yield the list of roads for every tile that holds more than one road."""
        min_x, min_y, max_x, max_y = road_bounds
        bounds_width = max_x - min_x
        bounds_height = max_y - min_y
        # Always keep at least one division per axis: a bounding box with zero
        # width or height would otherwise produce 0 divisions and the tile size
        # computation below would divide by zero.
        x_divisions = max(1, int(ceil(bounds_width / tile_max_units)))
        y_divisions = max(1, int(ceil(bounds_height / tile_max_units)))
        tile_width = bounds_width / x_divisions
        tile_height = bounds_height / y_divisions

        # Create a spatial index for roads to efficiently match up roads to tiles
        logger.info('Generating spatial index for intersections')
        roads_index = rtree.index.Index()
        for idx, road in enumerate(roads):
            roads_index.insert(idx, road.bounds)

        logger.info('Number of tiles: {}'.format(x_divisions * y_divisions))
        for x_offset in range(x_divisions):
            for y_offset in range(y_divisions):
                road_ids_in_tile = roads_index.intersection([
                    min_x + x_offset * tile_width,
                    min_y + y_offset * tile_height,
                    min_x + (1 + x_offset) * tile_width,
                    min_y + (1 + y_offset) * tile_height
                ])
                roads_in_tile = [roads[road_id] for road_id in road_ids_in_tile]
                # A single road cannot intersect anything within its tile.
                if len(roads_in_tile) > 1:
                    yield roads_in_tile

    # Allocate one worker per core, and parallelize the discovery of intersections
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    try:
        tile_intersections = pool.imap(get_intersections, roads_per_tile_iter())
        pool.close()
        pool.join()
    except BaseException:
        # Do not leave worker processes behind if the wait is interrupted.
        pool.terminate()
        raise

    logger.info('Buffering intersections')
    # Note: tile_intersections is a list of multipoints (which is a list of points).
    # itertools.chain.from_iterable flattens the list into a list of single points.
    buffered_intersections = [
        intersection.buffer(intersection_buffer_units)
        for intersection in itertools.chain.from_iterable(tile_intersections)
    ]

    # If intersection buffers overlap, union them to treat them as one
    logger.info('Performing unary union on buffered intersections')
    return unary_union(buffered_intersections)
def __init__(
    self, hostname=None, purge=False, beat=False, queues=None,
    include=None, app=None, pidfile=None, autoscale=None,
    autoreload=False, no_execv=False, no_color=None, **kwargs
):
    """Store the command-line style options on the instance.

    ``autoscale`` is a ``"max,min"`` string (``min`` optional, default 0);
    ``queues`` and ``include`` may be comma-separated strings and are split
    into lists.  When ``include`` is non-empty its entries are appended to
    ``app.conf.CELERY_INCLUDE``.
    """
    self.app = app = app_or_default(app or self.app)
    self.hostname = hostname or socket.gethostname()

    # this signal can be used to set up configuration for
    # workers by name.
    signals.celeryd_init.send(
        sender=self.hostname, instance=self, conf=self.app.conf)

    self.setup_defaults(kwargs, namespace="celeryd")
    if not self.concurrency:
        try:
            detected = cpu_count()
        except NotImplementedError:
            detected = 2
        self.concurrency = detected

    self.purge = purge
    self.beat = beat
    self.use_queues = queues if queues is not None else []
    self.queues = None
    self.include = include
    self.pidfile = pidfile
    self.autoscale = None
    self.autoreload = autoreload
    self.no_color = no_color
    self.no_execv = no_execv

    if autoscale:
        upper, _, lower = autoscale.partition(",")
        self.autoscale = [int(upper), int(lower) if lower else 0]

    self._isatty = isatty(sys.stdout)

    # ``None`` is passed through unchanged; otherwise colour is enabled
    # exactly when ``no_color`` is falsy.
    color_enabled = None if no_color is None else not no_color
    self.colored = app.log.colored(self.logfile, enabled=color_enabled)

    if isinstance(self.use_queues, basestring):
        self.use_queues = self.use_queues.split(",")
    if self.include:
        if isinstance(self.include, basestring):
            self.include = self.include.split(",")
        configured = tuple(app.conf.CELERY_INCLUDE)
        app.conf.CELERY_INCLUDE = configured + tuple(self.include)
    self.loglevel = mlevel(self.loglevel)
def run(self):
    """Run every (video, metric function, classifier) combination in workers.

    One job is queued per combination of ``self._videos``, ``self._funcs``
    and ``self._classifiers``.  ``mp.cpu_count()`` processes running
    ``train_classifier`` are started; one result is collected per job, then
    one ``None`` is put on the task queue per process.

    :returns: list of the values read from the results queue.
    """
    tasks = mp.Queue()
    results = mp.JoinableQueue()
    worker_args = (tasks, results)
    # n_procs = min(mp.cpu_count(), len(self._videos))
    n_procs = mp.cpu_count()

    all_jobs = []
    for video_gt_pair in self._videos:
        gt, fp = video_gt_pair[0], video_gt_pair[1]
        for func in self._funcs:
            func_name, func_ptr = func[0], func[1]
            for classifier in self._classifiers:
                params = {
                    'gt_path': gt,
                    'video_path': fp,
                    'metric_func': func_ptr,
                    'init': False
                }
                params.update(self._experiment_args)
                params['classifier'] = classifier(metric=func_name)
                log.info("Params ({}): {}".format(id(params), params))
                all_jobs.append((params, self._experiment))

    for job in all_jobs:
        tasks.put(job)

    for _ in range(n_procs):
        mp.Process(target=train_classifier, args=worker_args).start()

    # Collect exactly one result per queued job.
    interim = []
    for _ in all_jobs:
        interim.append(results.get())
        results.task_done()

    # One ``None`` per started process.
    for _ in range(n_procs):
        tasks.put(None)

    results.join()
    tasks.close()
    results.close()
    return interim
def __init__(self, g_pool):
    """Set up empty export lists, directory buffers and worker slots.

    Source and destination directory buffers (512 bytes each) both start at
    the expanded ``~/Desktop`` path; one worker slot is reserved per CPU.
    """
    super(Batch_Exporter, self).__init__()
    self.g_pool = g_pool

    self.exports, self.new_exports, self.active_exports = [], [], []

    desktop = os.path.expanduser('~/Desktop')
    self.destination_dir = create_string_buffer(desktop, 512)
    self.source_dir = create_string_buffer(desktop, 512)

    self.run = False
    self.workers = [None] * cpu_count()
    message = (
        "Using a maximum of %s CPUs to process visualizations in parallel..."
        % cpu_count()
    )
    logger.info(message)
def __init__(self, *args, **options):
    """Read concurrency, RAM-database and cache settings, then init the base.

    ``concurrency`` comes from ``options`` (default 0); ``None`` or a
    negative value is replaced by ``cpu_count()``.  ``settings`` values take
    precedence for ``TEST_RUNNER_RAMDB`` and ``LOCAL_CACHE`` when those
    attributes exist.
    """
    requested = options.get('concurrency', 0)
    if requested is None or int(requested) < 0:
        requested = cpu_count()
    self.concurrency = int(requested)

    self.stream = sys.stderr
    self.original_stream = self.stream

    # Fall back to the ``ramdb`` option when the setting is absent.
    self.ramdb = getattr(
        settings, 'TEST_RUNNER_RAMDB', options.get('ramdb', ''))

    cache_name = getattr(settings, 'LOCAL_CACHE', 'local_cache')
    self.ramdb_saves = os.path.join(os.getcwd(), cache_name)

    super(DiscoverRoadRunner, self).__init__(*args, **options)
def setup_instance(self, queues=None, ready_callback=None, pidfile=None,
                   include=None, use_eventloop=None, exclude_queues=None,
                   **kwargs):
    """Prepare instance state, emit ``worker_init`` and apply the blueprint.

    :param include: converted with ``str_to_list`` before being passed to
        ``setup_includes``.
    :param use_eventloop: explicit event-loop choice; when ``None`` the
        result of ``self.should_use_eventloop()`` is used instead.
    :param exclude_queues: forwarded to ``setup_queues`` with ``queues``.
    :param kwargs: stored as ``self.options`` and passed to the blueprint's
        ``apply``.
    """
    self.pidfile = pidfile
    self.setup_queues(queues, exclude_queues)
    self.setup_includes(str_to_list(include))

    # Set default concurrency
    if not self.concurrency:
        try:
            detected = cpu_count()
        except NotImplementedError:
            detected = 2
        self.concurrency = detected

    # Options
    self.loglevel = mlevel(self.loglevel)
    if ready_callback:
        self.ready_callback = ready_callback
    else:
        self.ready_callback = self.on_consumer_ready

    # this connection won't establish, only used for params
    self._conninfo = self.app.connection_for_read()
    if use_eventloop is None:
        self.use_eventloop = self.should_use_eventloop()
    else:
        self.use_eventloop = use_eventloop
    self.options = kwargs

    signals.worker_init.send(sender=self)

    # Initialize bootsteps
    self.pool_cls = _concurrency.get_implementation(self.pool_cls)
    self.steps = []
    self.on_init_blueprint()
    # Create the startup blueprint
    blueprint_hooks = dict(
        on_start=self.on_start,
        on_close=self.on_close,
        on_stopped=self.on_stopped,
    )
    self.blueprint = self.Blueprint(
        steps=self.app.steps['worker'], **blueprint_hooks)
    # Create the blueprint's steps
    self.blueprint.apply(self, **kwargs)
def __init__(
    self,
    mode: str,
    filename: str,
    index: str = None,
    sketch_id: int = None,
    sketch_name: str = None,
    sketch_description: str = None,
    timeline_name: str = None,
    es_host: str = None,
    es_port: int = None,
    bulk_size: int = 1000,
    cpu_count: int = cpu_count() - 1,
):
    """Store the parsing options and check the selected backend.

    In ``"elastic"`` mode Elasticsearch is pinged.  In ``"timesketch"`` mode
    the sketch is fetched by ``sketch_id`` or, failing that, created from
    ``sketch_name``/``sketch_description``; with neither given
    ``self.sketch`` stays ``None``.

    :raises ValueError: in elastic mode when the ping returns a falsy value.
    """
    # Input file and target identifiers.
    self.mode = mode
    self.filename = filename
    self.index = index
    self.sketch = None
    self.timeline_name = timeline_name

    # Processing parameters.
    self.bulk_size = bulk_size
    self.cpu_count = cpu_count

    self.folder_path = self.filename + "__tmp"
    self.offset_stateagentinspector = None
    self.es_info = {"host": es_host, "port": es_port}

    # Containers populated elsewhere in the class.
    self.filelist: Dict[str, Tuple[str, int]] = {}
    self.ioc_alerts: Dict[str, Any] = {}
    self.exd_alerts: List[Mapping[str, str]] = []
    self.generic_items: Dict[str, Any] = {}

    # initial check, es is up or timesketch conf are ok
    if self.mode == "elastic":
        es_client = Elasticsearch([self.es_info])
        reachable = es_client.ping()
        if not reachable:
            raise ValueError("Connection failed")
    elif self.mode == "timesketch":
        ts_client = config.get_client()
        if sketch_id:
            self.sketch = ts_client.get_sketch(int(sketch_id))
        elif sketch_name:
            self.sketch = ts_client.create_sketch(
                sketch_name, sketch_description)

    logging.debug(f"[MAIN] Start parsing {self.filename} [{self.mode}].")
def __init__(self, targets, ports=range(65536), threads=100, timeout=3,
             proxy_ip=["127.0.0.1", "127.0.0.1"], proxy_port=[80, 80]):
    """Build one set of ``Scan`` objects per target and the shared logs.

    :param targets: sequence of scan targets; one ``Scan`` of each flavour
        is created per entry.
    :param ports: ports handed to every ``Scan``.
    :param threads: thread count handed to every ``Scan``.
    :param timeout: timeout handed to every ``Scan``.
    :param proxy_ip: two-element sequence; element 0 is used for the
        "secure" scans and element 1 for the "unsecure" scans, while the
        whole sequence is handed to the plain scanners.
    :param proxy_port: two-element sequence, used like ``proxy_ip``.
    """
    self._targets_ = targets
    self._ports_ = ports
    self._threads_ = threads
    self._timeout_ = timeout
    # The default lists live on the function object and are shared by every
    # call; store copies so mutating one instance's proxy settings cannot
    # leak into other instances.
    self._proxy_ip_ = (
        list(proxy_ip) if isinstance(proxy_ip, list) else proxy_ip)
    self._proxy_port_ = (
        list(proxy_port) if isinstance(proxy_port, list) else proxy_port)

    self._worker_pool_ = []
    self._worker_count_ = cpu_count()
    self._job_len_ = len(targets)

    self._scanners_ = [
        Scan(target, self._ports_, self._threads_, self._timeout_,
             self._proxy_ip_, self._proxy_port_)
        for target in self._targets_
    ]
    self._scan_secure_ = [
        Scan(target, self._ports_, self._threads_, self._timeout_,
             self._proxy_ip_[0], self._proxy_port_[0])
        for target in self._targets_
    ]
    self._scan_unsecure_ = [
        Scan(target, self._ports_, self._threads_, self._timeout_,
             self._proxy_ip_[1], self._proxy_port_[1])
        for target in self._targets_
    ]

    # Manager-backed dicts so the logs can be shared between processes.
    self._manager_ = billiard.Manager()
    self._log_ = self._manager_.dict()
    self._proxy_log_ = self._manager_.dict()
    self._total_runtime_ = 0
def setup_instance(self, queues=None, ready_callback=None, pidfile=None,
                   include=None, use_eventloop=None, **kwargs):
    """Prepare instance state, emit ``worker_init`` and apply the boot steps.

    :param use_eventloop: explicit event-loop choice; when ``None`` the
        result of ``self.should_use_eventloop()`` is used instead.
    :param kwargs: passed to ``setup_defaults``, stored as ``self.options``
        and passed to the namespace's ``apply``.
    """
    self.pidfile = pidfile
    self.setup_defaults(kwargs, namespace='celeryd')
    self.setup_queues(queues)
    self.setup_includes(include)

    # Set default concurrency
    if not self.concurrency:
        try:
            detected = cpu_count()
        except NotImplementedError:
            detected = 2
        self.concurrency = detected

    # Options
    self.loglevel = mlevel(self.loglevel)
    if ready_callback:
        self.ready_callback = ready_callback
    else:
        self.ready_callback = self.on_consumer_ready

    # this connection is not established, only used for params
    self._conninfo = self.app.connection()
    if use_eventloop is None:
        self.use_eventloop = self.should_use_eventloop()
    else:
        self.use_eventloop = use_eventloop
    self.options = kwargs

    signals.worker_init.send(sender=self)

    # Initialize bootsteps
    self.pool_cls = _concurrency.get_implementation(self.pool_cls)
    self.steps = []
    self.on_init_namespace()
    namespace_hooks = dict(
        on_start=self.on_start,
        on_close=self.on_close,
        on_stopped=self.on_stopped,
    )
    self.namespace = self.Namespace(app=self.app, **namespace_hooks)
    self.namespace.apply(self, **kwargs)
def setup_instance(self, queues=None, ready_callback=None, pidfile=None,
                   include=None, use_eventloop=None, exclude_queues=None,
                   **kwargs):
    """Prepare instance state, emit ``worker_init`` and apply the blueprint.

    :param include: converted with ``str_to_list`` before being passed to
        ``setup_includes``.
    :param use_eventloop: explicit event-loop choice; when ``None`` the
        result of ``self.should_use_eventloop()`` is used instead.
    :param exclude_queues: forwarded to ``setup_queues`` with ``queues``.
    :param kwargs: stored as ``self.options`` and passed to the blueprint's
        ``apply``.
    """
    self.pidfile = pidfile
    self.setup_queues(queues, exclude_queues)
    self.setup_includes(str_to_list(include))

    # Set default concurrency
    if not self.concurrency:
        try:
            detected = cpu_count()
        except NotImplementedError:
            detected = 2
        self.concurrency = detected

    # Options
    self.loglevel = mlevel(self.loglevel)
    if ready_callback:
        self.ready_callback = ready_callback
    else:
        self.ready_callback = self.on_consumer_ready

    # this connection won't establish, only used for params
    self._conninfo = self.app.connection_for_read()
    if use_eventloop is None:
        self.use_eventloop = self.should_use_eventloop()
    else:
        self.use_eventloop = use_eventloop
    self.options = kwargs

    signals.worker_init.send(sender=self)

    # Initialize bootsteps
    self.pool_cls = _concurrency.get_implementation(self.pool_cls)
    self.steps = []
    self.on_init_blueprint()
    blueprint_hooks = dict(
        on_start=self.on_start,
        on_close=self.on_close,
        on_stopped=self.on_stopped,
    )
    self.blueprint = self.Blueprint(
        steps=self.app.steps['worker'], **blueprint_hooks)
    self.blueprint.apply(self, **kwargs)
'PORT': 5432  # in memory post
}
}

CELERY_TASK_ALWAYS_EAGER = True
CELERY_TASK_EAGER_PROPAGATES = True

SOUTH_TESTS_MIGRATE = False

#DATABASES = {
    #'default': {
        #'ENGINE': 'django.db.backends.sqlite3',
        #'NAME': ':memory:'
    #}
#}

# Parallelism for the mixnet equals the CPU count reported by billiard.
ZEUS_MIXNET_NR_PARALLEL = billiard.cpu_count()
ZEUS_MIXNET_NR_ROUNDS = 16

# Value of the ZEUS_TESTS_VERBOSE environment variable (a string) when it is
# set, otherwise False.
ZEUS_ELECTION_STREAM_HANDLER = os.environ.get("ZEUS_TESTS_VERBOSE", False)

EMAIL_SUBJECT_PREFIX = 'Zeus System Message: '


def mkdir_p(path):
    """Create directory ``path`` via ``os.makedirs``, tolerating an existing one.

    An ``OSError`` whose errno is ``EEXIST`` is ignored when ``path`` is an
    existing directory; every other ``OSError`` is re-raised.
    """
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
def default_concurrency():
    """Return the default concurrency: ``multiprocessing.cpu_count()``."""
    available_cpus = multiprocessing.cpu_count()
    return available_cpus
def main():
    """Parse the command line and run ``MansToEs`` in the selected mode.

    ``elastic`` mode needs ``--index``, ``--es_host`` and ``--es_port``.
    ``timesketch`` mode needs ``--timeline_name`` and at least one, but not
    all three, of ``--sketch_id``, ``--sketch_name`` and
    ``--sketch_description``.

    :returns: ``True`` after a run, ``False`` when the mode's options are
        invalid (the sub-command help is printed).
    """
    parser = argparse.ArgumentParser(
        description="Push .mans information in ElasticSearch index",
        prog="MANS to ES")

    # Required parameters
    parser.add_argument("--filename", dest="filename", required=True,
                        help="Path of the .mans file")

    # TimeSketch parameters
    timesketch = argparse.ArgumentParser(add_help=False)
    timesketch_options = (
        ("--sketch_id", "sketch_id", "TimeSketch Sketch id"),
        ("--sketch_name", "sketch_name", "TimeSketch Sketch name"),
        ("--sketch_description", "sketch_description",
         "TimeSketch Sketch description"),
        ("--timeline_name", "timeline_name", "TimeSketch Timeline Name"),
    )
    for flag, dest, text in timesketch_options:
        timesketch.add_argument(flag, dest=dest, help=text)

    # Elastic parameters
    elastic = argparse.ArgumentParser(add_help=False)
    elastic_options = (
        ("--index", "index", "ElasticSearch Index name"),
        ("--es_host", "es_host", "ElasticSearch host"),
        ("--es_port", "es_port", "ElasticSearch port"),
    )
    for flag, dest, text in elastic_options:
        elastic.add_argument(flag, dest=dest, help=text)

    sp = parser.add_subparsers(required=True, dest="mode")
    sp_elastic = sp.add_parser("elastic", parents=[elastic],
                               help="Save data in elastic")
    sp_timesketch = sp.add_parser("timesketch", parents=[timesketch],
                                  help="Save data in TimeSketch")

    # Optional parameters to increase performances
    parser.add_argument(
        "--cpu_count",
        dest="cpu_count",
        default=cpu_count() - 1,
        help="cpu count",
        type=int,
    )
    parser.add_argument(
        "--bulk_size",
        dest="bulk_size",
        default=1000,
        help="Bulk size for multiprocessing parsing and upload",
        type=int,
    )
    parser.add_argument("--version", dest="version", action="version",
                        version="%(prog)s 1.7")

    args = parser.parse_args()

    if args.mode == "elastic":
        if not (args.index and args.es_port and args.es_host):
            sp_elastic.print_help()
            return False
        mte = MansToEs(
            mode=args.mode,
            filename=args.filename,
            index=args.index,
            es_host=args.es_host,
            es_port=args.es_port,
            bulk_size=args.bulk_size,
            cpu_count=args.cpu_count,
        )
        mte.run()
        return True
    elif args.mode == "timesketch":
        sketch_fields = [
            args.sketch_id, args.sketch_name, args.sketch_description
        ]
        none_given = not any(sketch_fields)
        all_given = all(sketch_fields)
        if none_given or all_given or not args.timeline_name:
            sp_timesketch.print_help()
            return False
        mte = MansToEs(
            mode=args.mode,
            filename=args.filename,
            sketch_id=args.sketch_id,
            sketch_name=args.sketch_name,
            sketch_description=args.sketch_description,
            timeline_name=args.timeline_name,
            bulk_size=args.bulk_size,
            cpu_count=args.cpu_count,
        )
        mte.run()
        return True
def run_exp_for_all_classifiers(save_dir=DIR_CLASSIFIERS, parallel=True):
    """
    Runs all the saved classifiers that are located in save_dir.

    Entries whose name ends in ".csv" are ignored. When nothing is left to
    run, a message is logged and the function returns; otherwise, once the
    current batch has been handled, it calls itself again to pick up
    anything that appeared in the meantime.

    parallel, if True, will use the multiprocessing module to run multiple
    experiments at the same time: classifiers are grouped by the file name
    of their video, and each group is one task for a worker process.
    At present, however, this is broken due to the way in which Python
    processes match up to C-lib extensions. In this case, OpenCV just kinda
    dies when processing is attempted in this manner. Currently investigating
    a fix -- until then, just run linear or via threads.
    """
    # List from the same directory the classifiers are loaded from below
    # (the listing used to ignore save_dir and always read DIR_CLASSIFIERS).
    classifiers = [
        name
        for name in EXPClassifierHandler.get_all_saved_classifiers(save_dir)
        if not name.endswith(".csv")
    ]
    if not classifiers:
        log.info("No more experiments to run, exiting.")
        return

    if parallel:
        # Map video file name -> [(classifier identifier, saved file name)]
        videos_to_classifiers = {}
        for c in classifiers:
            clf = load_saved_classifier(save_dir + c)
            file_name = clf.video_path.split("/")[-1]
            videos_to_classifiers.setdefault(file_name, []).append(
                (clf.identifier, c)
            )

        # So now we've mapped video_file: [classifiers], multiproc by k
        tasks = mp.Queue()
        results = mp.JoinableQueue()
        worker_args = (tasks, results, save_dir)
        n_tasks = len(videos_to_classifiers)
        n_procs = min(mp.cpu_count(), n_tasks)

        for these_classifiers in videos_to_classifiers.values():
            tasks.put(these_classifiers)

        delegator = EXPClassifierHandler.run_exp_from_mp_queue
        for _ in range(n_procs):
            mp.Process(target=delegator, args=worker_args).start()

        # Wait for one result per task; the payloads themselves are not used.
        for _ in range(n_tasks):
            results.get()
            results.task_done()

        # One None per worker -- presumably the sentinel on which
        # run_exp_from_mp_queue stops; confirm against that method.
        for _ in range(n_procs):
            tasks.put(None)

        results.join()
        tasks.close()
        results.close()
    else:
        for c in classifiers:
            EXPClassifierHandler.run_exp_for_classifier(c, save_dir)

    # Maybe by the time we get here more will be waiting... keep going
    EXPClassifierHandler.run_exp_for_all_classifiers(save_dir, parallel)
event_ids, event_dicts = zip(*event_tuples) with Pool(parallel_jobs(n)) as pool: event_bytes = pool.map(event_from_dict, event_dicts) events = pool.map(bytes_to_event, event_bytes) args = zip(*(events, [list_name] * len(events))) dicts = pool.starmap(extract_attr, args) args = zip(*(dicts, [field] * len(dicts), [value] * len(dicts))) matches = pool.starmap(find_first_by, args) for i, detection in enumerate(matches): if detection: return detection, event_ids[i], events[i].correlations next_id = decrement_id(event_ids[-1]) return None, None, None def find_first_by(dicts, field, value): for d in dicts: if d[field] == value: return d return None def event_from_dict(x): return next(iter(x.values())) parallel_jobs = lambda x: min(x, cpu_count())
def describe(df, bins=10, check_correlation=True, correlation_threshold=0.9, correlation_overrides=None, check_recoded=False, pool_size=multiprocessing.cpu_count(), **kwargs):
    """Generates a dict containing summary statistics for a given dataset stored as a pandas `DataFrame`.

    Used as is, it will output its content as an HTML report in a Jupyter notebook.

    Parameters
    ----------
    df : DataFrame
        Data to be analyzed
    bins : int
        Number of bins in histogram.
        The default is 10.
    check_correlation : boolean
        Whether or not to check correlation.
        It's `True` by default.
    correlation_threshold: float
        Threshold to determine if the variable pair is correlated.
        The default is 0.9.
    correlation_overrides : list
        Variable names not to be rejected because they are correlated.
        There is no variable in the list (`None`) by default.
    check_recoded : boolean
        Whether or not to check recoded correlation (memory heavy feature).
        Since it's an expensive computation it can be activated for small datasets.
        `check_correlation` must be true to enable this check.
        It's `False` by default.
    pool_size : int
        Number of workers in the process pool.
        The default is equal to the number of CPU.
        With a value of 1 no pool is created and variables are described
        in the current process.
    **kwargs
        Extra keyword arguments forwarded, together with `bins`, to the
        per-variable describe function.

    Returns
    -------
    dict
        Containing the following keys:
            * table: general statistics on the dataset
            * variables: summary statistics for each variable
            * freq: frequency table
            * correlations: the Pearson and Spearman correlation matrices

    Raises
    ------
    TypeError
        If `df` is not a pandas `DataFrame`.
    ValueError
        If `df` is empty.

    Notes:
    ------
        * The section dedicated to check the correlation should be externalized
    """

    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    try:
        # reset matplotlib style before use
        # Fails in matplotlib 1.4.x so plot might look bad
        matplotlib.style.use("default")
    except Exception:
        # Best effort only; do not swallow KeyboardInterrupt/SystemExit.
        pass

    matplotlib.style.use(resource_filename(__name__, "pandas_profiling.mplstyle"))

    # Clearing the cache before computing stats
    base.clear_cache()

    if not pd.Index(np.arange(0, len(df))).equals(df.index):
        # Treat index as any other column
        df = df.reset_index()

    kwargs.update({'bins': bins})
    local_multiprocess_func = partial(multiprocess_func, **kwargs)

    # Describe all variables in a univariate way
    if pool_size == 1:
        ldesc = {col: s for col, s in map(local_multiprocess_func, df.iteritems())}
    else:
        pool = multiprocessing.Pool(pool_size)
        try:
            ldesc = {col: s for col, s in pool.map(local_multiprocess_func, df.iteritems())}
        finally:
            # Always release the workers, even when a task raised.
            pool.close()
            pool.join()

    # Get correlations
    dfcorrPear = df.corr(method="pearson")
    dfcorrSpear = df.corr(method="spearman")

    # Check correlations between variable
    if check_correlation is True:
        # TODO: corr(x,y) > 0.9 and corr(y,z) > 0.9 does not imply corr(x,z) > 0.9
        # If x~y and y~z but not x~z, it would be better to delete only y
        # Better way would be to find out which variable causes the highest
        # increase in multicollinearity.
        for x, corr_x in dfcorrPear.iterrows():
            if correlation_overrides and x in correlation_overrides:
                continue

            for y, corr_xy in corr_x.iteritems():
                # Only look at the entries that come before the diagonal.
                if x == y:
                    break

                if corr_xy > correlation_threshold:
                    ldesc[x] = pd.Series(['CORR', y, corr_xy], index=['type', 'correlation_var', 'correlation'])

        if check_recoded:
            categorical_variables = [(name, data) for (name, data) in df.iteritems() if base.get_vartype(data) == 'CAT']
            for (name1, data1), (name2, data2) in itertools.combinations(categorical_variables, 2):
                if correlation_overrides and name1 in correlation_overrides:
                    continue

                confusion_matrix = pd.crosstab(data1, data2)
                # Every row falls on the diagonal of the cross-tabulation.
                if confusion_matrix.values.diagonal().sum() == len(df):
                    ldesc[name1] = pd.Series(['RECODED', name2], index=['type', 'correlation_var'])

    # Convert ldesc to a DataFrame
    # Row labels: union of all per-variable index labels, visiting the
    # shortest indexes first and keeping first-seen order.
    names = []
    ldesc_indexes = sorted([x.index for x in ldesc.values()], key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)
    variable_stats = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
    variable_stats.columns.names = df.columns.names

    # General statistics
    table_stats = {}

    table_stats['n'] = len(df)
    table_stats['nvar'] = len(df.columns)
    table_stats['total_missing'] = variable_stats.loc['n_missing'].sum() / (table_stats['n'] * table_stats['nvar'])

    # Duplicates are counted over the columns whose type is supported.
    variable_stats_by_var = variable_stats.transpose()
    supported_columns = variable_stats_by_var[variable_stats_by_var.type != base.S_TYPE_UNSUPPORTED].index.tolist()
    table_stats['n_duplicates'] = df.duplicated(subset=supported_columns).sum() if len(supported_columns) > 0 else 0

    memsize = df.memory_usage(index=True).sum()
    table_stats['memsize'] = formatters.fmt_bytesize(memsize)
    table_stats['recordsize'] = formatters.fmt_bytesize(memsize / table_stats['n'])

    # Start every type counter at 0 so that types absent from the data
    # still have an entry, then overwrite with the observed counts.
    table_stats.update({k: 0 for k in ("NUM", "DATE", "CONST", "CAT", "UNIQUE", "CORR", "RECODED", "BOOL", "UNSUPPORTED")})
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))
    table_stats['REJECTED'] = table_stats['CONST'] + table_stats['CORR'] + table_stats['RECODED']

    return {
        'table': table_stats,
        'variables': variable_stats.T,
        'freq': {k: (base.get_groupby_statistic(df[k])[0] if variable_stats[k].type != base.S_TYPE_UNSUPPORTED else None) for k in df.columns},
        'correlations': {'pearson': dfcorrPear, 'spearman': dfcorrSpear}
    }
def run_exp_for_all_classifiers(save_dir=DIR_CLASSIFIERS, parallel=True):
    """
    Runs all the saved classifiers that are located in save_dir.

    Entries whose name ends in ".csv" are ignored. When nothing is left to
    run, a message is logged and the function returns; otherwise, once the
    current batch has been handled, it calls itself again to pick up
    anything that appeared in the meantime.

    parallel, if True, will use the multiprocessing module to run multiple
    experiments at the same time: classifiers are grouped by the file name
    of their video, and each group is one task for a worker process.
    At present, however, this is broken due to the way in which Python
    processes match up to C-lib extensions. In this case, OpenCV just kinda
    dies when processing is attempted in this manner. Currently investigating
    a fix -- until then, just run linear or via threads.
    """
    # List from the same directory the classifiers are loaded from below
    # (the listing used to ignore save_dir and always read DIR_CLASSIFIERS).
    classifiers = [
        name
        for name in EXPClassifierHandler.get_all_saved_classifiers(save_dir)
        if not name.endswith(".csv")
    ]
    if not classifiers:
        log.info("No more experiments to run, exiting.")
        return

    if parallel:
        # Map video file name -> [(classifier identifier, saved file name)]
        videos_to_classifiers = {}
        for c in classifiers:
            clf = load_saved_classifier(save_dir + c)
            file_name = clf.video_path.split("/")[-1]
            videos_to_classifiers.setdefault(file_name, []).append(
                (clf.identifier, c)
            )

        # So now we've mapped video_file: [classifiers], multiproc by k
        tasks = mp.Queue()
        results = mp.JoinableQueue()
        worker_args = (tasks, results, save_dir)
        n_tasks = len(videos_to_classifiers)
        n_procs = min(mp.cpu_count(), n_tasks)

        for these_classifiers in videos_to_classifiers.values():
            tasks.put(these_classifiers)

        delegator = EXPClassifierHandler.run_exp_from_mp_queue
        for _ in range(n_procs):
            mp.Process(target=delegator, args=worker_args).start()

        # Wait for one result per task; the payloads themselves are not used.
        for _ in range(n_tasks):
            results.get()
            results.task_done()

        # One None per worker -- presumably the sentinel on which
        # run_exp_from_mp_queue stops; confirm against that method.
        for _ in range(n_procs):
            tasks.put(None)

        results.join()
        tasks.close()
        results.close()
    else:
        for c in classifiers:
            EXPClassifierHandler.run_exp_for_classifier(c, save_dir)

    # Maybe by the time we get here more will be waiting... keep going
    EXPClassifierHandler.run_exp_for_all_classifiers(save_dir, parallel)