Example #1
    def __init__(self, spider):
        Process.__init__(self)
        settings = get_project_settings()
        self.crawler = Crawler(spider.__class__, settings)
        self.crawler.signals.connect(reactor.stop,
                                     signal=signals.spider_closed)
        self.spider = spider
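This snippet shows only __init__; a minimal sketch of how such a Process subclass is typically completed and driven, assuming Scrapy >= 1.0 (the class name CrawlerWorker and the usage lines are illustrative and not part of the original example):

from multiprocessing import Process

from scrapy import signals
from scrapy.crawler import Crawler
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor


class CrawlerWorker(Process):
    def __init__(self, spider):
        Process.__init__(self)
        settings = get_project_settings()
        self.crawler = Crawler(spider.__class__, settings)
        self.crawler.signals.connect(reactor.stop,
                                     signal=signals.spider_closed)
        self.spider = spider

    def run(self):
        # Executed in the child process: Crawler.crawl() builds the spider
        # from spider.__class__ and schedules the crawl, then the reactor
        # blocks until spider_closed triggers reactor.stop().
        self.crawler.crawl()
        reactor.run()


# Hypothetical usage: one crawl per child process.
# worker = CrawlerWorker(MySpider())
# worker.start()
# worker.join()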
Example #2
    def __init__(self, spider):
        Process.__init__(self)
        os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'crawler.gov.gov.settings')
        settings = get_project_settings()
        self.crawler = Crawler(spider.__class__, settings)
        self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        self.spider = spider
Example #3
def _get_24h_price_ticker_data(jobs,
                               logger,
                               exchange_class,
                               schema_class,
                               symbol=None,
                               pairs=None):
    socketio = None
    if TickerSettings.enable_socket_io:
        socketio = SocketIO(message_queue=BROKER_URL)

    symbol_or_pairs = '-'.join(symbol) if symbol else 'PAIRS'

    p = Process(name='{} {}'.format(exchange_class.__name__, symbol_or_pairs),
                target=_process,
                args=(
                    logger,
                    socketio,
                    exchange_class,
                    schema_class,
                    symbol,
                    pairs,
                ))
    jobs.append(
        dict(job=p,
             timeout=s.TIMEOUT_PER_SYMBOL_REQUEST
             if symbol else s.TIMEOUT_PER_SYMBOLS_REQUEST))
    p.start()
Example #4
    def __init__(self, spider):
        Process.__init__(self)
        settings = get_project_settings()
        self.crawler = CrawlerProcess(settings)
        # self.crawler.configure()
        # self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        self.spider = spider
Example #5
def _get_24h_price_ticker_data(jobs,
                               logger,
                               exchange_class,
                               schema_class,
                               symbol=None,
                               pairs=None,
                               *_,
                               **kwargs):
    socketio = None

    if kwargs["enable_messaging"]:
        socketio = SocketIO(message_queue=BROKER_URL)

    symbol_or_pairs = "-".join(symbol) if symbol else "PAIRS"

    p = Process(
        name="{} {}".format(exchange_class.__name__, symbol_or_pairs),
        target=_process,
        args=(
            logger,
            socketio,
            exchange_class,
            schema_class,
            symbol,
            pairs,
        ),
    )
    jobs.append(
        dict(
            job=p,
            timeout=s.TIMEOUT_PER_SYMBOL_REQUEST
            if symbol else s.TIMEOUT_PER_SYMBOLS_REQUEST,
        ))
    p.start()
Example #6
    def __init__(self, spider, accumulator):
        Process.__init__(self)
        settings = get_project_settings()
        self.crawler = Crawler(spider.__class__, settings)
        self.crawler.signals.connect(self.gather_results,
                                     signal=signals.item_passed)
        self.crawler.signals.connect(reactor.stop,
                                     signal=signals.spider_closed)
        self.spider = spider
        self.accumulator = accumulator
Example #7
    def run_proxy_scan(self, safe_flag):

        self.set_total_runtime(time.time())

        self.set_proxy_log(self._manager_.dict())

        i = 0
        while i < self.get_job_len():
            self._worker_pool_ = []
            for _ in range(self._worker_count_):
                if i >= self.get_job_len():
                    break

                if safe_flag:
                    p = Process(target=self._scan_secure_[i].run_proxy,
                                args=(self.get_proxy_log(), True))
                else:
                    p = Process(target=self._scan_unsecure_[i].run_proxy,
                                args=(self.get_proxy_log(), True))

                p.start()
                self._worker_pool_.append(p)

                i += 1

            for p in self._worker_pool_:
                p.join()

        self.set_total_runtime(time.time() - self.get_total_runtime())
        return self.get_proxy_log()
Example #8
def update(self):
    logger = self.get_logger()
    jobs = []

    try:
        for coin_or_token, struct in s.EXCHANGES_AND_PAIRS_OF_REFERENCE.items():

            if "market_depth" not in struct:
                continue

            quote = struct["pair"]
            exchange = struct["market_depth"]

            _method = globals()[f"_process_{exchange}"]

            p = Process(
                name=f"{coin_or_token}-{quote}",
                target=_method,
                args=(
                    logger,
                    (
                        coin_or_token,
                        quote,
                    ),
                ),
            )

            jobs.append(dict(job=p, timeout=s.ORDER_BOOK))

            p.start()

        for j in jobs:
            j["job"].join(timeout=j["timeout"])

    except Exception as error:
        _terminate_running_jobs(logger, jobs)
        logger.error("order_book error: {}".format(str(error)))
        self.update_state(state=states.FAILURE, meta=str(error))
        raise Ignore()
    finally:
        _terminate_running_jobs(logger, jobs)
Example #9
def process(text: str) -> list:
    # Pipe(False) returns a one-way pipe: the first end is receive-only
    # (used below to collect results), the second is send-only (handed to
    # each worker process).
    pipe_return, pipe_receive = multiprocessing.Pipe(False)
    cpu_count = multiprocessing.cpu_count()
    process_list = []
    str_parts = text.split('_')
    for i in range(cpu_count):
        try:
            executor = Process(target=str_target_executor,
                               args=(i, str_parts[i], pipe_receive))
            process_list.append(executor)
            executor.start()
        except IndexError:
            continue

    last_list = []
    for _ in process_list:
        index, str_part = pipe_return.recv()
        last_list.append((index, str_part))

    for executor in process_list:
        executor.join()

    print(last_list)
    last_list.sort(key=lambda a: a[0])
    print(' '.join(map(lambda a: a[1], last_list)))
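The worker function str_target_executor is not shown above; a hypothetical stand-in that matches how the example uses the pipe (each worker sends back a tuple of its index and the processed chunk through the write end):

def str_target_executor(index, str_part, conn):
    # Illustrative worker: do some per-chunk work (here, upper-casing) and
    # send (index, result) back so the parent can re-order the pieces.
    conn.send((index, str_part.upper()))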
Example #10
def single_crawling_task(page_num, spider_idx):
    spider = spiders[spider_idx]
    crawl_spider.page_num = page_num
    manager = Manager()
    return_dic = manager.dict()
    cc = CustomCrawler()
    proc = Process(
        target=cc.crawling_start, 
        args=(
            get_scrapy_settings(),
            spider,
            spider.__name__[:spider.__name__.find('Spider')].lower(),
            return_dic,
        )
    )
    try_time = 0
    is_success = False
    while try_time < 2 and not is_success:
        try:
            proc.start()
            proc.join()
            is_success = True
        except Exception as e:
            print(e)
            try_time += 1
    if try_time == 2:
        raise Exception("사이트에 연결하지 못했습니다. {spider.__name__}")
    return dict(return_dic)
Example #11
    def run_full_scan(self):
        """
        Run a complete scan of a list of IPs over the entire port range.
        """

        self.set_total_runtime(time.time())

        i = 0
        while i < self.get_job_len():
            self._worker_pool_ = []
            for _ in range(self.get_worker_count()):
                if i >= self.get_job_len():
                    break

                p = Process(target=self._scanners_[i].run,
                            args=(self.get_log(), ))
                p.start()
                self._worker_pool_.append(p)

                i += 1

            for p in self._worker_pool_:
                p.join()

        self.set_total_runtime(time.time() - self.get_total_runtime())
        return self.get_log()
Example #12
def analyze_input_wrapper_in_process(uuid, box, task, actions_list,
                                     process_wait_time):
    ret = False
    chrome_output, screen_recorder = None, None
    with kill_vm_on_enter_and_exit(uuid, box):
        p = Process(target=analyze_input_in_vm, args=(uuid, box, actions_list))
        p.start()
        p.join(process_wait_time)
        if p.is_alive():
            log_string(uuid, "Process time out, PID: {}".format(p.pid), "Red")
            pProcess(p.pid).kill()
        else:
            if p.exitcode == True:
                log_string(uuid, "custom_task finished successfully", "Green")
                ret = True
            else:
                log_string(
                    uuid, "custom_task finished with errors, exit code False",
                    "Red")
    return ret
Example #13
def test_dummy_in_process(uuid, box, process_wait_time):
    ret = False
    with kill_vm_and_keys_on_enter_and_exit(uuid, box):
        p = Process(target=test_dummy, args=(uuid, box))
        p.start()
        p.join(process_wait_time)
        if p.is_alive():
            log_string(uuid, "Process time out, PID: {}".format(p.pid), "Red")
            pProcess(p.pid).kill()
        else:
            if p.exitcode == True:
                log_string(uuid, "test_dummy_in_process finished successfully",
                           "Green")
                ret = True
            else:
                log_string(
                    uuid,
                    "test_dummy_in_process finished with errors, exit code False",
                    "Red")
    return ret
Example #14
def run_spider(spider, settings, kwargs=None):
    def f(q):
        try:
            # configure_logging(settings)
            runner = CrawlerRunner(settings)
            if kwargs is not None:
                deferred = runner.crawl(spider, **kwargs)
            else:
                deferred = runner.crawl(spider)
            deferred.addBoth(lambda _: reactor.stop())
            reactor.run()
            q.put(None)
        except Exception as e:
            q.put(e)

    q = Queue()
    p = Process(target=f, args=(q, ))
    p.start()
    result = q.get()
    p.join()

    if result is not None:
        raise result
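A hedged usage sketch for run_spider (the spider class and URL below are illustrative, not part of the original snippet): each call runs the crawl in a fresh child process with its own Twisted reactor, and an exception raised in the child is re-raised in the caller via the Queue.

import scrapy
from scrapy.utils.project import get_project_settings


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com"]

    def parse(self, response):
        yield {"title": response.css("title::text").get()}


if __name__ == "__main__":
    # Assumes run_spider from the example above is defined in this module.
    run_spider(QuotesSpider, get_project_settings())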
Example #15
    def __init__(self, website_id, force=False):
        Process.__init__(self)
        settings = get_project_settings()
        self.crawler = CrawlerProcess(settings)
        self.website_id = website_id
        self.force = force
Example #16
    def __init__(self, website_id):
        Process.__init__(self)
        settings = get_project_settings()
        self.crawler = CrawlerProcess(settings)
        self.website_id = website_id
Example #17
def create_static_mix(static_mix_id):
    """
    Task to create static mix and write to appropriate storage backend.
    :param static_mix_id: The id of the StaticMix to be processed
    """
    # Mark as in progress
    try:
        static_mix = StaticMix.objects.get(id=static_mix_id)
    except StaticMix.DoesNotExist:
        # Does not exist, perhaps due to stale task
        print('StaticMix does not exist')
        return
    static_mix.status = TaskStatus.IN_PROGRESS
    static_mix.save()

    try:
        # Get paths
        directory = os.path.join(settings.MEDIA_ROOT, settings.SEPARATE_DIR,
                                 static_mix_id)
        filename = get_valid_filename(static_mix.formatted_name()) + '.mp3'
        rel_media_path = os.path.join(settings.SEPARATE_DIR, static_mix_id,
                                      filename)
        rel_path = os.path.join(settings.MEDIA_ROOT, rel_media_path)
        rel_path_dir = os.path.join(settings.MEDIA_ROOT, settings.SEPARATE_DIR,
                                    static_mix_id)

        pathlib.Path(directory).mkdir(parents=True, exist_ok=True)
        separator = get_separator(static_mix.separator,
                                  static_mix.separator_args,
                                  static_mix.bitrate, settings.CPU_SEPARATION)

        parts = {
            'vocals': static_mix.vocals,
            'drums': static_mix.drums,
            'bass': static_mix.bass,
            'other': static_mix.other
        }

        # Non-local filesystems like S3/Azure Blob do not support source_path()
        is_local = settings.DEFAULT_FILE_STORAGE == 'api.storage.FileSystemStorage'
        path = static_mix.source_path() if is_local else static_mix.source_url()

        if not settings.CPU_SEPARATION:
            # For GPU separation, do separation in separate process.
            # Otherwise, GPU memory is not automatically freed afterwards
            process_eval = Process(target=separator.create_static_mix,
                                   args=(parts, path, rel_path))
            process_eval.start()
            try:
                process_eval.join()
            except SoftTimeLimitExceeded as e:
                # Kill process if user aborts task
                process_eval.terminate()
                raise e
        else:
            separator.create_static_mix(parts, path, rel_path)

        # Check file exists
        if os.path.exists(rel_path):
            static_mix.status = TaskStatus.DONE
            if is_local:
                # File is already on local filesystem
                static_mix.file.name = rel_media_path
            else:
                # Need to copy local file to S3/Azure Blob/etc.
                with open(rel_path, 'rb') as raw_file:
                    content_file = ContentFile(raw_file.read())
                content_file.name = filename
                static_mix.file = content_file
                # Remove local file
                os.remove(rel_path)
                # Remove empty directory
                os.rmdir(rel_path_dir)
            static_mix.save()
        else:
            raise Exception('Error writing to file')
    except FileNotFoundError as error:
        print(error)
        print('Please make sure you have FFmpeg and FFprobe installed.')
        static_mix.status = TaskStatus.ERROR
        static_mix.error = str(error)
        static_mix.save()
    except SoftTimeLimitExceeded:
        print('Aborted!')
    except Exception as error:
        print(error)
        static_mix.status = TaskStatus.ERROR
        static_mix.error = str(error)
        static_mix.save()
Example #18
def create_dynamic_mix(dynamic_mix_id):
    """
    Task to create dynamic mix and write to appropriate storage backend.
    :param dynamic_mix_id: The id of the audio track model (DynamicMix) to be processed
    """
    # Mark as in progress
    try:
        dynamic_mix = DynamicMix.objects.get(id=dynamic_mix_id)
    except DynamicMix.DoesNotExist:
        # Does not exist, perhaps due to stale task
        print('DynamicMix does not exist')
        return
    dynamic_mix.status = TaskStatus.IN_PROGRESS
    dynamic_mix.save()

    try:
        # Get paths
        directory = os.path.join(settings.MEDIA_ROOT, settings.SEPARATE_DIR,
                                 dynamic_mix_id)
        rel_media_path = os.path.join(settings.SEPARATE_DIR, dynamic_mix_id)
        file_prefix = get_valid_filename(dynamic_mix.formatted_prefix())
        file_suffix = dynamic_mix.formatted_suffix()
        rel_path = os.path.join(settings.MEDIA_ROOT, rel_media_path)

        pathlib.Path(directory).mkdir(parents=True, exist_ok=True)
        separator = get_separator(dynamic_mix.separator,
                                  dynamic_mix.separator_args,
                                  dynamic_mix.bitrate,
                                  settings.CPU_SEPARATION)

        # Non-local filesystems like S3/Azure Blob do not support source_path()
        is_local = settings.DEFAULT_FILE_STORAGE == 'api.storage.FileSystemStorage'
        path = dynamic_mix.source_path() if is_local else dynamic_mix.source_url()

        # Do separation
        if not settings.CPU_SEPARATION:
            # For GPU separation, do separation in separate process.
            # Otherwise, GPU memory is not automatically freed afterwards
            process_eval = Process(target=separator.separate_into_parts,
                                   args=(path, rel_path))
            process_eval.start()
            try:
                process_eval.join()
            except SoftTimeLimitExceeded as e:
                # Kill process if user aborts task
                process_eval.terminate()
                raise e
        else:
            separator.separate_into_parts(path, rel_path)

        # Check all parts exist
        if exists_all_parts(rel_path):
            rename_all_parts(rel_path, file_prefix, file_suffix)
            dynamic_mix.status = TaskStatus.DONE
            if is_local:
                save_to_local_storage(dynamic_mix, rel_media_path, file_prefix,
                                      file_suffix)
            else:
                save_to_ext_storage(dynamic_mix, rel_path, file_prefix,
                                    file_suffix)
        else:
            raise Exception('Error writing to file')
    except FileNotFoundError as error:
        print(error)
        print('Please make sure you have FFmpeg and FFprobe installed.')
        dynamic_mix.status = TaskStatus.ERROR
        dynamic_mix.error = str(error)
        dynamic_mix.save()
    except SoftTimeLimitExceeded:
        print('Aborted!')
    except Exception as error:
        print(error)
        dynamic_mix.status = TaskStatus.ERROR
        dynamic_mix.error = str(error)
        dynamic_mix.save()
Example #19
    def run(self):
        p = Process(target=self._crawl)
        p.start()
        p.join()
Example #20
    def _process_harvest_response(self, next_response: bytes) -> int:
        """ Processes the harvest response content

        While the last response is being processed, the next one is already loaded to decrease run time

        Args:
            next_response (bytes): The response as bytes
        Returns:
             number_found_entries (int): The amount of found metadata records in this response
        """
        xml_response = xml_helper.parse_xml(next_response)
        if xml_response is None:
            csw_logger.error(
                "Response is no valid xml. catalogue: {}, startPosition: {}, maxRecords: {}"
                .format(self.metadata.title, self.start_position,
                        self.max_records_per_request))
            # Abort!
            self.start_position = 0
            return

        md_metadata_entries = xml_helper.try_get_element_from_xml(
            "//" + GENERIC_NAMESPACE_TEMPLATE.format("MD_Metadata"),
            xml_response) or []
        next_record_position = int(
            xml_helper.try_get_attribute_from_xml_element(
                xml_response,
                "nextRecord",
                "//" + GENERIC_NAMESPACE_TEMPLATE.format("SearchResults"),
            ))
        self.start_position = next_record_position

        # Fetch found identifiers in parent process, so self.deleted_metadata can be edited easily
        for md_identifier in md_metadata_entries:
            id = xml_helper.try_get_text_from_xml_element(
                md_identifier,
                ".//" + GENERIC_NAMESPACE_TEMPLATE.format("fileIdentifier") +
                "/" + GENERIC_NAMESPACE_TEMPLATE.format("CharacterString"))
            try:
                self.deleted_metadata.remove(id)
            except KeyError:
                pass

        # Delete response to free memory
        del xml_response

        # Process response via multiple processes
        t_start = time()
        num_processes = int(cpu_count() / 2)
        num_processes = num_processes if num_processes >= 1 else 1
        index_step = int(len(md_metadata_entries) / num_processes)
        start_index = 0
        end_index = 0
        self.resource_list = md_metadata_entries
        process_list = []
        for i in range(0, num_processes):
            if index_step < 1:
                end_index = -1
            else:
                end_index += index_step
            p = Process(target=self._create_metadata_from_md_metadata,
                        args=(start_index, end_index))
            start_index += index_step
            process_list.append(p)
        # Close all connections to force each process to create a new one for itself
        connections.close_all()
        execute_threads(process_list)

        csw_logger.debug(
            "Harvesting '{}': runtime for {} metadata parsing: {}s ####".
            format(self.metadata.title, self.max_records_per_request,
                   time() - t_start))
        return len(md_metadata_entries)
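execute_threads is a project-specific helper that is not shown here; despite its name it receives Process objects, and presumably just starts them and waits for all of them to finish. A hypothetical stand-in, purely for illustration:

def execute_threads(thread_list):
    # Start every worker, then block until all of them have finished.
    for worker in thread_list:
        worker.start()
    for worker in thread_list:
        worker.join()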