Example 1
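All of the examples below appear to assume that `tenum` is an alias for Python's `threading.enumerate`, i.e. an import along these lines:

    from threading import enumerate as tenum
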
	def run(self):
		main = None
		for t in tenum():
			if t.name == 'MainThread':
				main = t
				break

		if not main:
			print('Main thread not found')
			return
		
		# Autosave loop: once a second, persist the data, but only if its checksum changed.
		while self.alive and main and main.is_alive():
			if time.time() - self.last_save > 1:
				current_checksum = self.hash()
				if not self.last_checksum or self.last_checksum != current_checksum:
					save_memory_db()
					self.last_checksum = current_checksum
				self.last_save = time.time()
			time.sleep(0.025)
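
Scanning `tenum()` for a thread named 'MainThread' works, but since Python 3.4 the standard library exposes the main thread directly, so the lookup loop above could be reduced to one line:

    import threading

    main = threading.main_thread()  # the same thread the name scan finds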
Example 2
    def consumer_worker(self):
        db = self._connect_to_db()
        do_commit = False
        with self.lock:
            urls_last = self.urls_crawled

        while not self.crawl_running.is_set():
            ts = time()
            with self.lock:
                # All queues drained and no worker busy: the crawl is complete.
                if self.url_queue.empty() and self.data_queue.empty() and not any(
                        i.is_set() for i in self.worker_status):
                    self.crawl_running.set()
                    self.crawl_completed.set()
                    break
            after_lock = time() - ts
            try:
                # print(f"Queue size: {self.data_queue.qsize()}")
                wait_before = time()
                # A timeout is needed here (the value is assumed); a bare get()
                # blocks forever and the queue.Empty handler below could never run.
                response = self.data_queue.get(timeout=30)
                wait_after = time() - wait_before
            except queue.Empty:
                print("Consumer thread timed out")
                self.crawl_running.set()
                self.crawl_timed_out.set()
                for t in tenum():
                    if "worker-" in t.name:
                        self.url_queue.put("END")
                break

            response_to_data_time = 0
            if isinstance(response, dict):
                data = response
            else:
                before = time()
                data = self.response_to_data(response)
                response_to_data_time = time() - before

            crawl_data = data['data']

            before_insert = time()
            new, updated = db.insert_new_data(crawl_data)
            after_insert = time() - before_insert

            before_gui = time()
            with self.lock:
                self.urls_crawled += len(updated) + len(new)
                self.urls_total += len(new)
            if self.gui_mode:
                if new or updated:
                    self.add_to_gui_queue(new + updated)
            after_gui = time() - before_gui

            before_links = time()
            extracted_links = data.get("links", []) + data.get(
                "hreflang_links", []) + data.get(
                    "canonical_links", []) + data.get("pagination_links", [])
            after_links = 0
            after_inlink = 0

            if extracted_links:
                new_urls = db.get_new_urls(extracted_links)

                if new_urls:
                    db.insert_new_urls(new_urls)
                    self.add_to_url_queue(new_urls)
                after_links = time() - before_links

                inlink_before = time()
                if "unique_inlinks" in self.settings.get("CRAWL_ITEMS", ""):
                    db.insert_inlinks(extracted_links, data['url'])
                after_inlink = time() - inlink_before

            with self.lock:
                # Commit in batches of 100 crawled URLs.
                if self.urls_crawled - urls_last >= 100:
                    do_commit = True
                    urls_last = self.urls_crawled

            after_commit = 0
            before_commit = time()
            if do_commit:
                db.commit()
                do_commit = False
                after_commit = time() - before_commit

            # print(f"Iteration took {time() - ts:.2f} sec | waited
            # {wait_after:.2f} sec | response_to_data
            # {response_to_data_time:.2f} sec | insert took {after_insert:.2f}
            # sec | commit took {after_commit:.2f} | links took
            # {after_links:.2f}| inlinks took {after_inlink:.2f} sec | gui took
            # {after_gui:.2f} | locked for {after_lock:.2f} secs")

        # Outside while loop, wrap things up
        self.crawl_running.set()

        # Empty our URL Queue first
        with self.url_queue.mutex:
            self.url_queue.queue.clear()
        # Signal our waiting workers that they are done for today
        for _ in range(int(self.settings["THREADS"])):
            self.url_queue.put("END")

        # Always commit to db at the very end
        db.commit()
        db.close()

        self.session.close()
        print("Consumer thread finished")
Example 3
    def wait_for_threads(self):
        ts = tenum()
        for t in ts:
            if "worker-" in t.name:
                t.join()
        print("All workers joined ...")
Example 4
	def run(self, *args, **kwargs):
		main = None
		for t in tenum():
			if t.name == 'MainThread':
				main = t
				break

		if not main:
			print('Main thread not found')
			return

		self.cmd = shlex.split(self.raw_cmd)
		self.exec_dir = f'{self.cwd}/{os.path.basename(self.cmd[0])}_workingdir'

		# Resolve a bare command name to an absolute path using which.
		if not self.cmd[0].startswith('/'):
			o = sys_command(f'/usr/bin/which {self.cmd[0]}').strip()
			self.cmd[0] = o.decode('UTF-8')

		if not os.path.isdir(self.exec_dir):
			os.makedirs(self.exec_dir)

		if self.start_callback: self.start_callback(self, *args, **kwargs)
		
		self.status = 'running'
		old_dir = os.getcwd()
		os.chdir(self.exec_dir)
		self.pid, child_fd = pty.fork()
		if not self.pid: # Child process
			# Replace the child's process image with the target command
			os.execv(self.cmd[0], self.cmd)
		os.chdir(old_dir)

		poller = epoll()
		poller.register(child_fd, EPOLLIN | EPOLLHUP)

		self.alive = True
		last_trigger_pos = 0
		while self.alive and main and main.is_alive():
			for fileno, event in poller.poll(0.1):
				try:
					output = os.read(child_fd, 8192).strip()
					self.trace_log += output
				except OSError:
					self.alive = False
					break

				lower = output.lower()
				broke = False
				if 'events' in self.kwargs:
					for trigger in list(self.kwargs['events']):
						if trigger.lower() in self.trace_log[last_trigger_pos:].lower():
							trigger_pos = self.trace_log[last_trigger_pos:].lower().find(trigger.lower())

							# Advance past the matched trigger so it cannot fire twice.
							last_trigger_pos += trigger_pos + len(trigger)
							os.write(child_fd, self.kwargs['events'][trigger])
							del self.kwargs['events'][trigger]
							broke = True
							break

					if broke:
						continue

					## Add an exit trigger once all events have fired:
					if len(self.kwargs['events']) == 0:
						# Look for a shell prompt (']$') near the end of the output.
						if b']$' in self.trace_log[-7:].lower():
							self.alive = False
							break

		self.status = 'done'
		self.alive = False

		try:
			self.exit_code = os.waitpid(self.pid, 0)[1]
		except ChildProcessError:
			# The child was already reaped; its exit status is unrecoverable.
			self.exit_code = 1

		self.ended = time.time()
		with open(f'{self.cwd}/trace.log', 'wb') as fh:
			fh.write(self.trace_log)

		if self.callback:
			self.callback(self, *self.args, **self.kwargs)

		if self.exit_code != 0:
			print(f'Process {self.cmd[0]} has exited with {self.exit_code}.')
			print(self.trace_log)

		return self.exit_code
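
Stripped of its callback and trigger machinery, the core of this runner is `pty.fork()` plus an `epoll` read loop. A minimal, Linux-only sketch of just that core (it relies on `os.read` raising `OSError` once the child exits; `/bin/echo` stands in for the real command):

    import os
    import pty
    from select import EPOLLHUP, EPOLLIN, epoll

    pid, child_fd = pty.fork()
    if pid == 0:
        # Child: replace this process image with the target command.
        os.execv('/bin/echo', ['/bin/echo', 'hello from the pty'])

    poller = epoll()
    poller.register(child_fd, EPOLLIN | EPOLLHUP)

    output = b''
    while True:
        if not poller.poll(0.1):
            continue
        try:
            output += os.read(child_fd, 8192)
        except OSError:  # EIO: the child has exited and closed the pty
            break

    exit_code = os.waitpid(pid, 0)[1]
    print(output.decode(), exit_code)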
Example 5
    def notify_crawl_workers_to_stop(self) -> None:
        """ Notifies all crawl workers to stop by inserting an END element into the URL queue."""

        for t in tenum():
            if 'worker-' in t.name:
                self.url_queue.put('END')
Example 6
    def wait_for_workers(self) -> None:
        """Waits for all worker threads to join/finish."""
        for t in tenum():
            if 'worker-' in t.name:
                t.join()
        print('All workers joined ...')
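
A caveat both join helpers sidestep with their name filter: `threading.enumerate()` includes the calling thread, and `Thread.join()` raises `RuntimeError` when a thread attempts to join itself. A name-agnostic variant of the same idea:

    import threading
    from threading import enumerate as tenum

    for t in tenum():
        if t is not threading.current_thread():
            t.join(timeout=5)  # timeout so a lingering daemon thread cannot hang us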