Example #1
def clone_with_timeout(src: str, dest: str, clone_func: Callable[[], None],
                       timeout: float) -> None:
    """Clone a repository with timeout.

    Args:
        src: clone source
        dest: clone destination
        clone_func: callable that does the actual cloning
        timeout: timeout in seconds
    """
    errors: Queue = Queue()
    process = Process(target=_clone_task, args=(clone_func, errors))
    process.start()
    process.join(timeout)

    if process.is_alive():
        process.terminate()
        # Give it literally a second (in successive steps of 0.1 second),
        # then kill it.
        # Can't use `process.join(1)` here, billiard appears to be bugged
        # https://github.com/celery/billiard/issues/270
        killed = False
        for _ in range(10):
            time.sleep(0.1)
            if not process.is_alive():
                break
        else:
            killed = True
            os.kill(process.pid, signal.SIGKILL)
        raise CloneTimeout(src, timeout, killed)

    if not errors.empty():
        raise CloneFailure(src, dest, errors.get())
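The `_clone_task` helper used as the `Process` target is not part of this snippet. A minimal sketch, assuming it only has to run the callable and report failures back through the queue (the error format is an assumption):

def _clone_task(clone_func: Callable[[], None], errors: Queue) -> None:
    """Run the clone callable and report any exception via the error queue."""
    try:
        clone_func()
    except Exception as exc:
        # Hypothetical error format; the real helper may send a full traceback instead.
        errors.put(str(exc))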
Example #2
    def _reflash(self, path):
        """
        This will execute the upgrade operation in another process,
        because the SSH connection may hang indefinitely while reflashing
        and would block the program; setting a timeout on `exec_command`
        doesn't seem to take effect, so at least we can stop the process
        using `subprocess.join(timeout=self.UPGRADE_TIMEOUT)`.
        """
        def upgrade(conn, path, timeout):
            conn.connect()
            conn.exec_command('sysupgrade -v -c {0}'.format(path),
                              timeout=timeout)
            conn.close()
        subprocess = Process(
            target=upgrade,
            args=[self, path, self.UPGRADE_TIMEOUT]
        )
        subprocess.start()
        self.log('Upgrade operation in progress...')
        subprocess.join(timeout=self.UPGRADE_TIMEOUT)
        self.log('SSH connection closed, will wait {0} seconds before '
                 'attempting to reconnect...'.format(self.SLEEP_TIME))
        sleep(self.SLEEP_TIME)
        # kill the subprocess if it has hung
        if subprocess.is_alive():
            subprocess.terminate()
            subprocess.join()
Example #3
    def _reflash(self, path):
        """
        this will execute the upgrade operation in another process
        because the SSH connection may hang indefinitely while reflashing
        and would block the program; setting a timeout to `exec_command`
        doesn't seem to take effect on some OpenWRT versions
        so at least we can stop the process using
        `subprocess.join(timeout=self.UPGRADE_TIMEOUT)`
        """
        self.disconnect()
        command = self.get_upgrade_command(path)

        def upgrade(conn, path, timeout):
            conn.connect()
            conn.exec_command(command, timeout=timeout)
            conn.disconnect()

        subprocess = Process(target=upgrade,
                             args=[self, path, self.UPGRADE_TIMEOUT])
        subprocess.start()
        self.log('Upgrade operation in progress...')
        subprocess.join(timeout=self.UPGRADE_TIMEOUT)
        self.log(
            f'SSH connection closed, will wait {self.RECONNECT_DELAY} seconds before '
            'attempting to reconnect...')
        sleep(self.RECONNECT_DELAY)
        # kill the subprocess if it has hung
        if subprocess.is_alive():
            subprocess.terminate()
            subprocess.join()
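Both `_reflash` variants rely on the same general pattern: run a call that may block forever inside a child process, wait with `join(timeout)`, and terminate the child if it is still alive. A standalone sketch of that pattern, with illustrative names:

import multiprocessing


def run_with_timeout(func, args, timeout):
    """Run func(*args) in a child process and stop it after `timeout` seconds."""
    child = multiprocessing.Process(target=func, args=args)
    child.start()
    child.join(timeout)
    if child.is_alive():
        child.terminate()  # ask the child to stop (SIGTERM on Unix)
        child.join()       # reap it so no zombie process is left behind
        return False       # the call timed out
    return True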
Example #4
    def test_set_pdeathsig(self):
        return_pid = Value('i')
        p = Process(target=parent_task, args=(return_pid,))
        p.start()
        sleep(3)  # wait for setting pdeathsig
        p.terminate()
        sleep(3)  # wait for process termination
        with pytest.raises(psutil.NoSuchProcess):
            proc = psutil.Process(return_pid.value)
Example #5
    def test_set_pdeathsig(self):
        return_pid = Value('i')
        p = Process(target=parent_task, args=(return_pid,))
        p.start()
        sleep(3)  # wait for setting pdeathsig
        p.terminate()
        sleep(3)  # wait for process termination
        with pytest.raises(psutil.NoSuchProcess):
            proc = psutil.Process(return_pid.value)
Example #6
    def test_set_pdeathsig(self):
        success = "done"
        q = Queue()
        p = Process(target=parent_task, args=(q, success))
        p.start()
        child_proc = psutil.Process(q.get(timeout=3))
        try:
            p.terminate()
            assert q.get(timeout=3) == success
        finally:
            child_proc.terminate()
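The `parent_task` target used by these tests is not shown. On Linux the parent-death signal is set with `prctl(PR_SET_PDEATHSIG, ...)`; the following is a hypothetical sketch of a parent/child pair that the queue-based variant above could exercise (the first two variants would write the child's pid into the shared `Value` instead). The body is an assumption, not the library's actual test helper:

import ctypes
import signal
import sys
from multiprocessing import Process, Queue

PR_SET_PDEATHSIG = 1  # from <sys/prctl.h>, Linux only


def _child_task(q, success):
    # Ask the kernel to deliver SIGTERM to this process when its parent dies.
    ctypes.CDLL("libc.so.6", use_errno=True).prctl(PR_SET_PDEATHSIG, signal.SIGTERM)

    def on_term(signum, frame):
        q.put(success)  # tell the test we noticed the parent's death
        sys.exit(0)

    signal.signal(signal.SIGTERM, on_term)
    signal.pause()  # block until the parent dies and SIGTERM arrives


def parent_task(q, success):
    child = Process(target=_child_task, args=(q, success))
    child.start()
    q.put(child.pid)  # the test looks this pid up with psutil
    child.join()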
Example #7
    def crawl_endpoint_to_file(
            self,
            ip_address=None,
            port=None,
            hostname=None,
            use_ssl=False,
            use_sni=False,
            start_urls=[],
            in_separate_process=True,
    ):
        """
        Start crawling the given endpoint using the given list of URLs and write the results to
        a local file.
        :param ip_address: The IP address to crawl.
        :param port: The port where the application resides.
        :param hostname: The hostname to submit alongside all requests to the remote endpoint.
        :param use_ssl: Whether or not to use SSL to connect to the remote web service.
        :param use_sni: Whether or not to use SNI to connect to the remote web service.
        :param start_urls: A list of URLs to start crawling from.
        :param in_separate_process: Whether or not to spawn off a separate process for the crawl. This
        enables us to call this method multiple times in the same process, as a Twisted reactor can only
        be started and stopped once per process.
        :return: A tuple containing (1) the string containing the local file path where crawling
        results are stored and (2) a ScrapyResultWrapper configured to process the contents of the file.
        """
        temp_file_path = FilesystemHelper.get_temporary_file_path()
        local_file_path = "%s-%s-%s:%s" % (temp_file_path, self.bot_name, ip_address, port)
        spider_kwargs = {
            "input_ip_address": ip_address,
            "input_start_urls": start_urls,
            "input_file_path": local_file_path,
            "input_hostname": hostname,
            "input_use_ssl": use_ssl,
            "input_use_sni": use_sni,
            "input_port": port,
        }
        pipeline_settings = self.__get_local_storage_item_pipeline()
        requested_hostname = hostname if hostname is not None else ip_address
        settings = self.get_scrapy_settings(item_pipeline=pipeline_settings, hostname=requested_hostname)
        crawling_config = {
            "spider_kwargs": spider_kwargs,
            "settings": settings,
        }
        if in_separate_process:
            process = Process(target=self.__crawl, kwargs=crawling_config)
            process.start()
            process.join()
            process.terminate()
        else:
            self.__crawl(**crawling_config)
        return local_file_path, ScrapyResultWrapper.from_file(local_file_path)
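The `__crawl` method invoked in the child process is not included in the excerpt. Given the docstring's note that a Twisted reactor can only be started once per process, it presumably wraps Scrapy's `CrawlerProcess`; a rough sketch under that assumption, with `EndpointSpider` standing in for the project's real spider class:

    # assumes: from scrapy.crawler import CrawlerProcess
    def __crawl(self, spider_kwargs=None, settings=None):
        """Run one blocking crawl; one Twisted reactor per (child) process."""
        process = CrawlerProcess(settings)
        process.crawl(EndpointSpider, **spider_kwargs)  # EndpointSpider is assumed, not shown above
        process.start()  # starts the Twisted reactor and blocks until the crawl finishes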
Example #8
class LongCalculation(QProgressDialog):
    """
    Multiprocessing based worker for mesh and eigenvalue calculations.

    This is necessary to make sure GUI is not blocked while mesh is built,
    or when eigenvalue calculations are performed.

    Transformations do not need as much time, unless there is one implemented
    without numpy vectorized coordinate calculations.
    """

    res = None

    def __init__(self, fun, args, postprocess, job):
        """ Build multiprocessing queues and start worker. """
        super(LongCalculation, self).__init__(job, "Cancel", 0, 0)
        self.setModal(True)
        self.input = Queue()
        self.output = Queue()
        self.input.put((fun, args, postprocess))
        self.proc = Process(target=worker, args=(self.input, self.output))
        self.proc.start()
        self.timer = QTimer()
        self.timer.timeout.connect(self.update)
        self.timer.start(10)

    def update(self):
        """ Check if worker is done, and close dialog. """
        try:
            out = self.output.get(block=False)
            if isinstance(out, basestring):
                self.setLabelText(out)
                return
            if out is None:
                self.done(0)
                return
            self.res = out
            self.timer.stop()
            self.proc.join()
            del self.proc
            self.done(1)
        except:
            pass

    def cleanUp(self):
        """ Kill the running processes if cancelled/failed. """
        if self.proc:
            while self.proc.is_alive():
                self.proc.terminate()
            del self.proc
        self.timer.stop()
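The module-level `worker` target is not included above. Judging from `update()`, it is expected to emit progress strings, then either the final result or `None` on failure; a minimal sketch along those lines (purely an assumption about the missing function):

def worker(input_queue, output_queue):
    """Run one (fun, args, postprocess) job and report the outcome."""
    fun, args, postprocess = input_queue.get()
    try:
        output_queue.put("Calculating...")     # progress text shown by setLabelText()
        result = fun(*args)
        output_queue.put(postprocess(result))  # final result; update() stores it in self.res
    except Exception:
        output_queue.put(None)                 # None makes update() close the dialog with done(0)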
Example #9
def main(param):
    KILLING = False

    KILLING2 = False
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    data = setParam(param=param)
    if __name__ == "__main__":
        printParam(data)

    logger.critical(
        f"######################### NEW JOB: {data.folder} #########################"
    )

    # file_handler_job = logging.FileHandler(os.path.join(data.outputDir, 'log'))
    # file_handler_job.setLevel(logging.INFO)
    # file_handler_job.setFormatter(formatter)
    # logger.addHandler(file_handler_job)

    processes = []
    p = Process(target=processMaster, args=(data, ), name="MASTER")
    p.start()
    processes.append(p)
    logger.info('[i] MASTER started')

    for n in range(data.workers):
        p = Process(target=processWorker, args=(data, ), name="WORKER")
        p.start()
        processes.append(p)
        logger.info(f'[i] WORKER {n+1}/{data.workers} started')

    p = Process(target=processSink, args=(data, ), name="SINK")
    p.start()
    processes.append(p)
    logger.info(f'[i] Sink started')
    ksObj = KillSwitch(data)

    def killswitch(a, b):
        ksObj()
        ksObj()

    signal.signal(signal.SIGINT, killswitch)
    emails, nmbPgsScraped = Producer(data).workYouBastard()

    logger.info("[i] Finalising, \t")
    for p in processes:
        p.terminate()
    logger.info('[i] Done')

    return emails, nmbPgsScraped
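Note how SIGINT is set to `SIG_IGN` before the master, workers and sink are spawned, and the `killswitch` handler is only installed afterwards: with the default fork start method the children inherit the ignore disposition, so Ctrl-C is handled once, in the parent. A condensed sketch of that pattern, where plain `terminate()` calls stand in for the snippet's `KillSwitch`:

import signal
import time
from multiprocessing import Process


def _serve_forever():
    while True:
        time.sleep(1)  # placeholder for the real worker loop


def spawn_workers(n):
    # Children started now inherit SIG_IGN and therefore ignore Ctrl-C themselves.
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    workers = [Process(target=_serve_forever) for _ in range(n)]
    for w in workers:
        w.start()

    def on_sigint(signum, frame):
        for w in workers:
            w.terminate()
        raise KeyboardInterrupt

    # Only the parent reacts to Ctrl-C and shuts the pool down itself.
    signal.signal(signal.SIGINT, on_sigint)
    return workers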
Example #10
    def run(self, jobs):
        '''Start the Scrapy engine, and execute all jobs.  Return consolidated results
        in a single list.

        Parms:
          jobs ([Job]) - one or more Job objects to be processed.

        Returns:
          List of objects yielded by the spiders after all jobs have run.
        '''
        if not isinstance(jobs, collections.Iterable):
            jobs = [jobs]
        self.validate(jobs)

        p = Process(target=self._crawl, args=[jobs])
        p.start()
        p.join()
        p.terminate()
Example #11
    def run(self, jobs):
        '''Start the Scrapy engine, and execute all jobs.  Return consolidated results
        in a single list.

        Parms:
          jobs ([Job]) - one or more Job objects to be processed.

        Returns:
          List of objects yielded by the spiders after all jobs have run.
        '''
        if not isinstance(jobs, collections.Iterable):
            jobs = [jobs]
        self.validate(jobs)

        p = Process(target=self._crawl, args=[jobs])
        p.start()
        p.join()
        p.terminate()

        return self.results.get()
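`self.results` is not defined in either excerpt; the `return self.results.get()` in this version suggests a queue that `_crawl` fills from the child process. A sketch of how `_crawl` might do that (the queue attribute, `self.settings`, and the `Job` fields are assumptions):

    # assumes: from scrapy import signals
    #          from scrapy.crawler import CrawlerProcess
    def _crawl(self, jobs):
        """Run every job in this (child) process and ship the scraped items back."""
        items = []
        crawler_process = CrawlerProcess(self.settings)

        def collect(item, response, spider):
            items.append(item)

        for job in jobs:
            crawler = crawler_process.create_crawler(job.spider)  # Job.spider is assumed
            crawler.signals.connect(collect, signal=signals.item_scraped)
            crawler_process.crawl(crawler, **job.payload)         # Job.payload is assumed
        crawler_process.start()   # blocks until every crawl has finished
        self.results.put(items)   # queue created in __init__, read back by run()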
Example #12
class CrawlProcess():

    _model = Task
    __instance = None
    count = 0

    @staticmethod
    def get_instance():
        """ Static access method. """
        print(CrawlProcess.__instance)
        if CrawlProcess.__instance is None:
            CrawlProcess()
        return CrawlProcess.__instance

    def __init__(self):
        if CrawlProcess.__instance is not None:
            raise Exception("This class is a singleton!")
        else:
            CrawlProcess.__instance = self
            #self.process = Process(target=CrawlProcess.crawl)
            self.process = None
            self.crawler_process = None
            self.task = None
            self.q = Queue()
            self.parent_conn, self.child_conn = Pipe()



    #@classmethod
    #def crawl(cls, q):
    #@classmethod
    #def crawl(cls, process, q):
    @classmethod
    def crawl(cls, q, conn):
        print()
        print()
        print('***************************************************************************************')
        print('crawl')

        def close(spider, reason):
            print(f'{multiprocessing.current_process().name}: *!!CLOSE')
            write_in_a_file('CrawlerProcess.signal.close', {'reason': reason}, 'task.txt')
            t = Task.objects.get_latest_crawler_task()
            d = datetime.today()
            t.description = f'spider closed with count: {CrawlProcess.count} at {str(d)}'
            t.result = CrawlProcess.count
            t.save()

        def open(spider):
            print(f'{multiprocessing.current_process().name}: *!!OPEN')
            try:
                name = spider.name
            except:
                name = str(spider)
            write_in_a_file('CrawlerProcess.signal.open', {'spider': name}, 'task.txt')
            CrawlProcess.count = 0
            try:
                t = Task.objects.get_latest_crawler_task()
                t.name = str(process.pid)
                t.save()
            except Exception as e:
                t.name = e
                t.save()
            #q.put_nowait()
            print()


        def scraped(item, response, spider):
            print(f'{multiprocessing.current_process().name}: *!!SCRAPED')

            print()
            CrawlProcess.count = CrawlProcess.count + 1
            n = CrawlProcess.count
            write_in_a_file('CrawlerProcess.signal.scraped_item', {'response': response, 'count': n}, 'task.txt')
            try:
                q.get_nowait()
                q.put_nowait(n)
            except:
                q.put_nowait(n)

        def stopped(*args, **kwargs):
            write_in_a_file('CrawlerProcess.signal.stopped', {'args': args, 'kwargs': kwargs}, 'task.txt')

        def error(*args, **kwargs):
            write_in_a_file('CrawlerProcess.signal.error', {'args': args, 'kwargs': kwargs}, 'task.txt')

        def send_by_pipe(item):
            try:
                conn.send(item)
                #conn.close()
            except Exception as e:
                write_in_a_file('CrawlProcess._crawl: error conn.send', {'conn error': e}, 'debug.txt')

        process = CrawlerProcess(get_project_settings())
        write_in_a_file('CrawlProcess.crawl: first', {'crawler_process': str(process), 'dir process': dir(process)},
                        'debug.txt')
        send_by_pipe(process)
        write_in_a_file('CrawlProcess.crawl: second', {'crawler_process': str(process), 'dir process': dir(process)},'debug.txt')
        process.crawl(InfoempleoSpider())
        write_in_a_file('CrawlProcess.crawl: third', {'crawler_process': str(process), 'dir process': dir(process)},'debug.txt')
        crawler = Crawler(InfoempleoSpider())
        crawler.signals.connect(open, signal=signals.spider_opened)
        crawler.signals.connect(scraped, signal=signals.item_scraped)
        crawler.signals.connect(close, signal=signals.spider_closed)
        crawler.signals.connect(stopped, signal=signals.engine_stopped)
        crawler.signals.connect(error, signal=signals.spider_error)

        write_in_a_file('CrawlProcess.crawl: before', {'crawler_process': str(process),'dir process': dir(process)},'debug.txt')

        process.crawl(crawler)
        write_in_a_file('CrawlProcess.crawl: after', {'crawler_process': str(process), 'dir process': dir(process)}, 'debug.txt')

        process.start()
        write_in_a_file('CrawlProcess._crawl: process started', {'crawler_process': str(process), 'dir process': dir(process)}, 'debug.txt')

        print('***************************************************************************************')
        print(f'CrawlerProcess: {process}')
        print(dir(process))
        print('***************************************************************************************')
        print()
        print()
        write_in_a_file('CrawlProcess.crawl', {'CrawlerProcess': str(process), 'dir(CrawlerProcess)': dir(process)}, 'task.txt')
        process.join()
        write_in_a_file('CrawlProcess.crawl: process.join', {}, 'task.txt')
        write_in_a_file('CrawlProcess.crawl: process.join', {}, 'spider.txt')

        print('Crawler Process has Finished!!!!!')



    @classmethod
    def crawl2(cls, q):
        while CrawlProcess.count < 15:
           # print(f'doing something: {CrawlProcess.count}')
            CrawlProcess.count = CrawlProcess.count + 1
            n = CrawlProcess.count
            try:
                q.get_nowait()
            except:
                pass
            q.put(n)
            if CrawlProcess.count % 5 == 0:
               # print(f'qsize: {q.qsize()}')
                time.sleep(5)


    def _clear_queue(self):
        while not self.q.empty():
            self.q.get_nowait()



    def _init_process(self, user):
        print(f'CrawlerProcess.init_process')
        self.q.put_nowait(0)
        self.process = Process(target=CrawlProcess.crawl, args=(self.q, self.child_conn,))
        self.task = Task(user=user, state=Task.STATE_PENDING, type=Task.TYPE_CRAWLER)



    def _start_process(self):
        print(f'CrawlerProcess._start_process')
        self.init_datetime = timezone.now()  # Before create the task
        self.process.start()
        self.task.pid = self.process.pid
        write_in_a_file('CrawlProcess._start_process: process started', {'pid': self.process.pid}, 'debug.txt')
        self.task.state = Task.STATE_RUNNING
        self.task.save()
        self.crawler_process = self.parent_conn.recv()
        write_in_a_file('CrawlProcess._start_process: conn.recv', {'crawler_process':str(self.crawler_process), 'dir crawler_process':dir(self.crawler_process)}, 'debug.txt')
        write_in_a_file('CrawlProcess._start_process', {'CrawlerProcess': str(self.crawler_process), 'dir(CrawlerProcess)': dir(self.crawler_process)},'task.txt')


    def _reset_process(self, state=Task.STATE_FINISHED):
        print(f'CrawlerProcess._reset_process({state})')
        try:
            self.process.terminate()
            write_in_a_file('_reset_process terminated (from stop)', {'is_running': self.process.is_alive()}, 'debug.txt')
            self.task.result = CrawlProcess.count
            self.task.state = state
            self.task.save()
            self.process.join()  # ! IMPORTANT after .terminate -> .join
            write_in_a_file('_reset_process joinned (from stop)', {'is_running': self.process.is_alive()}, 'debug.txt')
        except:
            pass
        try:
            self.result = self.q.get_nowait()
        except Exception as e:
            pass
        self._clear_queue()


    def _update_process(self):
        print('CrawlerProcess._update_process')
        print(f'process is alive: {self.process and self.process.is_alive()}')
        if self.process and not self.process.is_alive():
            self._reset_process()


    def start(self, user=None, **kwargs):
        """
        If the process is not alive, it either has not been started yet or has already finished,
        so the stored data is saved and the process is executed.
        If the process is alive, nothing is done.

        :param user: The user that makes the request
        :param kwargs:
        :return:
        """
        print(f'self.q.empty(): {self.q.empty()}')
        print(f'self.q.qsize(): {self.q.qsize()}')

        if not self.is_scrapping():
            if self.task and (self.task.state == Task.STATE_RUNNING):
                self._reset_process()
            self._init_process(user)
            self._start_process()

    def stop(self):
        print(f'CrawlProcess.stop')
        self._reset_process(Task.STATE_INCOMPLETE)
       # self.crawler_process.stop()
        #self.crawler_process.join()


    def join(self):
        self.process.join()

    def get_actual_task(self):
        self._update_process()
        return self.task

    def get_latest_task(self):
        last_task = Task.objects.get_latest_crawler_task()
        # If the latest task from the db is in STATE_RUNNING and is not the current task, it is an incomplete task...
        # ...and its state has to be updated
        is_an_incomplete_task = (
                last_task and
                last_task.state == Task.STATE_RUNNING and
                (not self.task or self.task.pk != last_task.pk)
        )
        if is_an_incomplete_task:
            last_task.state = Task.STATE_INCOMPLETE
            last_task.save()
        return last_task

    def is_scrapping(self):
        print(CrawlProcess.is_scrapping)
        if self.process:
            return self.process.is_alive()
        else:
            return False

    def _get_scraped_jobs(self):
        latest_task = Task.objects.get_latest_crawler_task()
        return Job.objects.filter(Q(created_at__gte=latest_task.created_at) | Q(updated_at__gte=latest_task.created_at))

    def get_scraped_items_number(self):
        print()
        print('!!!! CrawlProcess.get_scraped_items_number');print();
        count = CrawlProcess.count
        try:
            print(self.q)
            #print(f'CrawlProcess.count: {CrawlProcess.count}')
            #print(f'qsize: {self.q.qsize()}')
            count = self.q.get(block=True, timeout=5)
            CrawlProcess.count = count
            print(f'q.count: {count}')
        except Exception as e:
            print(f'get_scraped_items_number')
           # save_error(e, {'count': count})
        return count


    def get_scraped_items_percentage(self):
        # Compute the total using the items scraped by the previous task
        count = self.get_scraped_items_number()
        task = Task.objects.get_latest_finished_crawler_task()
        if task:
            old_result = task.result or 20000
        else:
            old_result = 20000

        if count < old_result:
            total = old_result
        else:
            total = count
        db_count = self._get_scraped_jobs().count()

        try:
            percentage = round(db_count/total, 2)
        except:
            percentage = 0

        if percentage >= 0.95 and self.is_scrapping():
            percentage = 0.95

        return percentage
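For context, code driving this singleton would presumably use it along these lines; a usage sketch only, with hypothetical Django views built on the methods shown above:

# assumes: from django.http import JsonResponse
def start_crawl_view(request):
    crawler = CrawlProcess.get_instance()
    crawler.start(user=request.user)  # no-op if a crawl is already running
    return JsonResponse({'started': True})


def crawl_progress_view(request):
    crawler = CrawlProcess.get_instance()
    return JsonResponse({
        'running': crawler.is_scrapping(),
        'progress': crawler.get_scraped_items_percentage(),
    })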