def run(self):
    # Assumes: import time; from threading import enumerate as tenum
    main = None
    for t in tenum():
        if t.name == 'MainThread':
            main = t
            break

    if not main:
        print('Main thread not existing')
        return

    # Poll frequently, but save at most once per second and only when the
    # in-memory database checksum has actually changed.
    while self.alive and main and main.is_alive():
        if time.time() - self.last_save > 1:
            current_checksum = self.hash()
            if not self.last_checksum or self.last_checksum != current_checksum:
                save_memory_db()
                self.last_checksum = current_checksum
            self.last_save = time.time()
        time.sleep(0.025)
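A minimal usage sketch for the saver loop above, assuming `tenum` is an alias for `threading.enumerate` and that the enclosing class (called `Saver` here, a hypothetical name) is a `threading.Thread` subclass exposing the `alive`, `last_save` and `last_checksum` attributes used in `run()`:

from threading import enumerate as tenum  # the tenum() used throughout these snippets

saver = Saver()            # hypothetical Thread subclass owning the run() above
saver.alive = True
saver.last_save = 0.0
saver.last_checksum = None
saver.start()              # autosaves at most once per second while MainThread lives
# ... application work ...
saver.alive = False        # ask the loop to stop; it also stops once MainThread exits
saver.join()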
def consumer_worker(self):
    # Assumes: import queue; from time import time; from threading import enumerate as tenum
    db = self._connect_to_db()
    do_commit = False

    with self.lock:
        urls_last = self.urls_crawled

    while not self.crawl_running.is_set():
        ts = time()

        # If both queues are drained and no worker is busy, the crawl is complete.
        with self.lock:
            if self.url_queue.empty() and self.data_queue.empty() and all(
                    [not i.is_set() for i in self.worker_status]):
                self.crawl_running.set()
                self.crawl_completed.set()
                break
        after_lock = time() - ts

        try:
            # print(f"Queue size: {self.data_queue.qsize()}")
            wait_before = time()
            # A timeout is required here so the queue.Empty handler below can ever
            # fire; the 30 second value is illustrative.
            response = self.data_queue.get(timeout=30)
            wait_after = time() - wait_before
        except queue.Empty:
            print("Consumer thread timed out")
            self.crawl_running.set()
            self.crawl_timed_out.set()
            # Tell every waiting worker to shut down.
            for t in tenum():
                if "worker-" in t.name:
                    self.url_queue.put("END")
            break

        response_to_data_time = 0
        if isinstance(response, dict):
            data = response
        else:
            before = time()
            data = self.response_to_data(response)
            response_to_data_time = time() - before

        crawl_data = data['data']

        before_insert = time()
        new, updated = db.insert_new_data(crawl_data)
        after_insert = time() - before_insert

        before_gui = time()
        with self.lock:
            self.urls_crawled += len(updated) + len(new)
            self.urls_total += len(new)
            if self.gui_mode:
                if new or updated:
                    self.add_to_gui_queue(new + updated)
        after_gui = time() - before_gui

        before_links = time()
        extracted_links = data.get("links", []) + data.get(
            "hreflang_links", []) + data.get(
            "canonical_links", []) + data.get("pagination_links", [])
        after_links = 0
        after_inlink = 0
        if len(extracted_links) > 0:
            new_urls = db.get_new_urls(extracted_links)
            if len(new_urls) > 0:
                db.insert_new_urls(new_urls)
                self.add_to_url_queue(new_urls)
            after_links = time() - before_links

            inlink_before = time()
            if "unique_inlinks" in self.settings.get("CRAWL_ITEMS", ""):
                db.insert_inlinks(extracted_links, data['url'])
            after_inlink = time() - inlink_before

        # Batch commits: only flush to disk every 100 crawled URLs.
        with self.lock:
            if self.urls_crawled - urls_last >= 100:
                do_commit = True
                urls_last = self.urls_crawled

        after_commit = 0
        before_commit = time()
        if do_commit:
            db.commit()
            do_commit = False
            after_commit = time() - before_commit

        # print(f"Iteration took {time() - ts:.2f} sec | waited {wait_after:.2f} sec | "
        #       f"response_to_data {response_to_data_time:.2f} sec | insert took {after_insert:.2f} sec | "
        #       f"commit took {after_commit:.2f} | links took {after_links:.2f} | inlinks took {after_inlink:.2f} sec | "
        #       f"gui took {after_gui:.2f} | locked for {after_lock:.2f} secs")

    # Outside while loop, wrap things up
    self.crawl_running.set()

    # Empty our URL Queue first
    with self.url_queue.mutex:
        self.url_queue.queue.clear()

    # Add signals for our waiting workers that they are done for today
    for _ in range(int(self.settings["THREADS"])):
        self.url_queue.put("END")

    # Always commit to db at the very end
    db.commit()
    db.close()
    self.session.close()
    print("Consumer thread finished")
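For the name-based checks above ("worker-" in t.name) and the "END" sentinel to line up, the URL workers have to be started with matching thread names. A minimal wiring sketch, assuming a hypothetical crawl_worker method that pulls URLs from url_queue until it receives "END":

import threading

def start_crawl(self):
    # Hypothetical wiring: one consumer plus THREADS URL workers named "worker-<i>",
    # so that the tenum() name checks and the "END" sentinel in consumer_worker() match.
    threading.Thread(target=self.consumer_worker, name='consumer').start()
    for i in range(int(self.settings['THREADS'])):
        threading.Thread(target=self.crawl_worker, name=f'worker-{i}').start()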
def wait_for_threads(self):
    ts = tenum()
    for t in ts:
        if "worker-" in t.name:
            t.join()
    print("All workers joined ...")
def run(self, *args, **kwargs):
    # Assumes: import os, pty, shlex, time
    #          from select import epoll, EPOLLIN, EPOLLHUP
    #          from threading import enumerate as tenum
    main = None
    for t in tenum():
        if t.name == 'MainThread':
            main = t
            break

    if not main:
        print('Main thread not existing')
        return

    self.cmd = shlex.split(self.raw_cmd)
    self.exec_dir = f'{self.cwd}/{os.path.basename(self.cmd[0])}_workingdir'

    # Resolve relative binaries to an absolute path via `which`.
    if self.cmd[0][0] != '/':
        o = sys_command('/usr/bin/which {}'.format(self.cmd[0])).strip()
        self.cmd[0] = o.decode('UTF-8')

    if not os.path.isdir(self.exec_dir):
        os.makedirs(self.exec_dir)

    if self.start_callback:
        self.start_callback(self, *args, **kwargs)

    self.status = 'running'
    old_dir = os.getcwd()
    os.chdir(self.exec_dir)
    self.pid, child_fd = pty.fork()
    if not self.pid:
        # Child process: replace it with the command we want to run.
        os.execv(self.cmd[0], self.cmd)
    os.chdir(old_dir)

    poller = epoll()
    poller.register(child_fd, EPOLLIN | EPOLLHUP)

    self.alive = True
    last_trigger_pos = 0
    while self.alive and main and main.is_alive():
        for fileno, event in poller.poll(0.1):
            try:
                output = os.read(child_fd, 8192).strip()
                self.trace_log += output
            except OSError:
                self.alive = False
                break

            lower = output.lower()
            broke = False
            if 'events' in self.kwargs:
                # Each trigger maps to the bytes written back to the pty as soon
                # as the trigger string shows up in the accumulated output.
                for trigger in list(self.kwargs['events']):
                    if trigger.lower() in self.trace_log[last_trigger_pos:].lower():
                        trigger_pos = self.trace_log[last_trigger_pos:].lower().find(trigger.lower())
                        last_trigger_pos = trigger_pos
                        os.write(child_fd, self.kwargs['events'][trigger])
                        del self.kwargs['events'][trigger]
                        broke = True
                        break

                if broke:
                    continue

                ## Adding an exit trigger:
                # Once every event has been answered, treat a trailing ']$'
                # (a shell prompt) as the signal that the process is done.
                if len(self.kwargs['events']) == 0:
                    if b']$' in self.trace_log[-7:].lower():
                        self.alive = False
                        break

    self.status = 'done'
    self.alive = False

    try:
        self.exit_code = os.waitpid(self.pid, 0)[1]
    except ChildProcessError:
        try:
            self.exit_code = os.waitpid(child_fd, 0)[1]
        except ChildProcessError:
            self.exit_code = 1

    self.ended = time.time()
    with open(f'{self.cwd}/trace.log', 'wb') as fh:
        fh.write(self.trace_log)

    if self.callback:
        self.callback(self, *self.args, **self.kwargs)

    if self.exit_code != 0:
        print(f'Process {self.cmd[0]} has exited with {self.exit_code}.')
        print(self.trace_log)

    return self.exit_code
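A hedged usage sketch for the pty runner above: the events keyword maps byte-string triggers, searched for in trace_log, to the bytes written back to the child when each trigger appears. The class name SysCommandWorker and its constructor signature are assumptions for illustration only; raw_cmd, cwd and the callbacks are expected to come from that constructor.

# Hypothetical usage: answer an interactive prompt through the events mechanism.
# Triggers and responses are bytes, since trace_log and os.write() operate on bytes.
worker = SysCommandWorker(
    'passwd root',
    events={
        b'New password': b'SuperSecret\n',
        b'Retype new password': b'SuperSecret\n',
    },
)
exit_code = worker.run()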
def notify_crawl_workers_to_stop(self) -> None:
    """Notifies all crawl workers to stop by inserting an END element into the URL queue."""
    for t in tenum():
        if 'worker-' in t.name:
            self.url_queue.put('END')
def wait_for_workers(self) -> None:
    """Waits for all worker threads to join/finish."""
    for t in tenum():
        if 'worker-' in t.name:
            t.join()
    print('All workers joined ...')
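Taken together, a shutdown path would typically signal the workers before joining them. A minimal sketch, assuming the owning object also exposes the crawl_running event used in consumer_worker(); the stop() name is hypothetical:

def stop(self):
    # Hypothetical shutdown sequence built from the two helpers above.
    self.crawl_running.set()              # stop the consumer loop
    self.notify_crawl_workers_to_stop()   # queue one 'END' per live worker thread
    self.wait_for_workers()               # block until every worker has joined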