from requests_html import HTMLSession

# Shared module-level state: LINKS_QUEUE (a queue.Queue), SCANNED_LINKS (a set),
# locker (a threading.Lock), BAD_PARTS (URL substrings to skip) and the Page
# model are defined elsewhere in the module.
def worker(domain):
    while True:
        # Optional shutdown logic: stop the worker once the queue has
        # stayed empty for ten seconds.
        # if LINKS_QUEUE.qsize() == 0:
        #     sleep(10)
        #     if LINKS_QUEUE.qsize() == 0:
        #         break
        #     continue
        url = LINKS_QUEUE.get()
        SCANNED_LINKS.add(url)

        # Fetch the page; skip it on any network error or non-200 status.
        try:
            with HTMLSession() as session:
                resp = session.get(url)
                assert resp.status_code == 200
        except Exception as e:
            print(e, type(e))
            continue

        # Pull the <title> and the first <h1>, with fallbacks when missing.
        try:
            page_title = resp.html.xpath('//title')[0].text
        except IndexError:
            page_title = 'Not Found'
        try:
            page_h1 = resp.html.xpath('//h1')[0].text
        except IndexError:
            page_h1 = 'Not Found'

        Page.create(url=url, title=page_title, h1=page_h1)
        print('[OK]', url)

        # Append to the CSV under a lock so threads don't interleave writes.
        with locker:
            with open('results.csv', 'a') as f:
                f.write(f'{url}\t{page_title}\t{page_h1}\n')

        # Enqueue same-domain links we haven't seen and that aren't blacklisted.
        for link in resp.html.absolute_links:
            link = link.split('#')[0]  # drop the #fragment
            if domain not in link:
                continue
            if link in SCANNED_LINKS:
                continue
            if any(part in link for part in BAD_PARTS):
                continue
            LINKS_QUEUE.put(link)
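The worker assumes a handful of shared objects at module level. Below is a minimal sketch of that setup, assuming a peewee-backed Page model; the crawl() entry point, NUM_WORKERS, and the exact BAD_PARTS values are illustrative assumptions, not from the original:

from queue import Queue
from threading import Lock, Thread

from peewee import CharField, Model, SqliteDatabase

db = SqliteDatabase('pages.db')

class Page(Model):
    url = CharField()
    title = CharField()
    h1 = CharField()

    class Meta:
        database = db

LINKS_QUEUE = Queue()   # URLs waiting to be scanned
SCANNED_LINKS = set()   # URLs already taken off the queue
locker = Lock()         # serializes writes to results.csv
BAD_PARTS = ['mailto:', 'tel:', '.pdf']  # assumed skip-list contents

NUM_WORKERS = 10  # assumed thread count

def crawl(start_url, domain):
    db.create_tables([Page])
    LINKS_QUEUE.put(start_url)
    for _ in range(NUM_WORKERS):
        Thread(target=worker, args=(domain,), daemon=True).start()

Daemon threads let the process exit even though worker() loops forever, which is why the commented-out shutdown logic can stay disabled.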
import json
import logging

import requests

# A method of the crawler class; the Page model, get_page_source(),
# self.normalize() and self.parse_page() are defined elsewhere in the module.
def get_page(self, url):
    normalized_url = self.normalize(url)
    if normalized_url not in self.pages:
        # Cheap HEAD request first, just to learn the content type.
        headers = requests.head(url)
        content_type = headers.headers.get('content-type', '')
        if "text/html" in content_type:
            try:
                page = requests.get(url)
            except Exception as e:
                # Record the failed page with an empty link list and move on.
                logging.error(f"Requests get exception: {e}")
                Page.create(id=self.id, url=normalized_url,
                            status=headers.status_code,
                            content_type=content_type,
                            links=json.dumps([]))
                self.pages[normalized_url] = None  # mark as visited
                self.id += 1
                return
            logging.debug(f"Got {url} [{page.status_code}]")

            # Prefer the Selenium-rendered source; fall back to the raw
            # response body if the browser fails.
            try:
                page_content = get_page_source(url)
            except Exception as e:
                logging.error(f"Got selenium error: [{e}]")
                page_content = page.content

            links = [self.normalize(link)
                     for link in self.parse_page(page_content)]
            Page.create(id=self.id, url=normalized_url,
                        status=page.status_code,
                        content_type=content_type,
                        links=json.dumps(links))
            self.pages[normalized_url] = None  # mark as visited
            for link in links:
                if link not in self.pages:
                    self.queue.add(link)
        else:
            # Non-HTML resource: store it without fetching the body.
            logging.debug(f"Add {url} with content_type: {content_type}")
            Page.create(id=self.id, url=normalized_url,
                        status=headers.status_code,
                        content_type=content_type,
                        links=json.dumps([]))
            self.pages[normalized_url] = None  # don't HEAD it again
        self.id += 1
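get_page() delegates to three helpers that the excerpt doesn't show. Below is a hypothetical sketch of what they might look like, assuming get_page_source() wraps headless Chrome via Selenium and parse_page() uses BeautifulSoup; the exact normalization rules and the self.base_url attribute are assumptions:

from urllib.parse import urldefrag, urljoin

from bs4 import BeautifulSoup
from selenium import webdriver

def get_page_source(url):
    # Render the page in headless Chrome so JavaScript-inserted links
    # are visible, then return the resulting HTML.
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        return driver.page_source
    finally:
        driver.quit()

class Crawler:
    def normalize(self, url):
        # Strip the #fragment and trailing slash so equivalent URLs
        # collapse to one key in self.pages (assumed rules).
        url, _ = urldefrag(url)
        return url.rstrip('/')

    def parse_page(self, page_content):
        # Collect absolute hrefs from the fetched HTML; self.base_url is
        # an assumed attribute holding the site root.
        soup = BeautifulSoup(page_content, 'html.parser')
        return [urljoin(self.base_url, a['href'])
                for a in soup.find_all('a', href=True)]

Starting a fresh browser per URL keeps the sketch simple; a real crawler would reuse one driver across requests to avoid the startup cost.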