# Standard-library and third-party imports used by the snippets in this section.
# db, urlhelper, crawler, CaseInsensitiveDict, allowed_mimes and the process_*
# handlers are project-local names assumed to be defined elsewhere in the project.
import io
import json
import re

import bs4


def process(url, request, response, data):
    request = CaseInsensitiveDict.fromdict(json.loads(request))
    response = CaseInsensitiveDict.fromdict(json.loads(response))

    # check that the response carries a valid content-type
    m = re.match(r"^\s*([\w-]+/[\w-]+)", response.get("Content-Type", "text/html"))
    if not m or m.group(1) not in allowed_mimes:
        return

    # figure out what kind of data the response holds and which
    # function will handle it
    if m.group(1) == "text/html":
        func = process_html
        link_type = 1
    elif m.group(1).startswith("image/"):
        func = process_img
        link_type = 2
    elif m.group(1) == "text/plain":
        func = process_plain
        link_type = 3

    # save the link itself first (without the extra details), so the
    # references created later have something to point to
    link_url = urlhelper.Url.parse(url)
    with db.scope():
        this_domain_id = db.save_domain(link_url.domain)
        this_url_id = db.save_url(this_domain_id, link_url.path, m.group(1), link_type)

    kwarg = {"link_url": link_url,
             "domain_id": this_domain_id,
             "url_id": this_url_id}

    # call the handler specific to this URL
    func(url, request, response, data, **kwarg)
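# `allowed_mimes` and `process_plain` are referenced by process() but not shown in
# this section. A minimal sketch of what they could look like, purely for
# illustration (the whitelist values and the handler body are assumptions, not
# taken from the original code):
allowed_mimes = {"text/html", "text/plain",
                 "image/png", "image/jpeg", "image/gif"}

def process_plain(url, request, response, data, **kwarg):
    # hypothetical plain-text handler: decode the body and store it with no
    # title, description or HTML version
    text = str(data, "utf-8", "ignore")
    with db.scope():
        db.save_document(kwarg["url_id"], None, None, text, None)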
def test_add_url(self):
    with db.scope():
        domain_id = db.save_domain('google.com')
        url_id = db.save_url(domain_id, '/google+/index.aspx?user=main#2')
        self.assertIsInstance(url_id, int)
def test_add_domain(self):
    with db.scope():
        self.assertIsInstance(db.save_domain('google.com'), int)
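# The two test methods above presumably live inside a unittest.TestCase subclass;
# a minimal sketch of that scaffolding, with a hypothetical class name (the real
# test module is not shown in this section):
import unittest

class DatabaseTests(unittest.TestCase):
    # test_add_domain and test_add_url, defined above, would be methods here
    pass

if __name__ == "__main__":
    unittest.main()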
def process_html(url, request, response, data, **kwarg):
    data = str(data, "utf-8", "ignore")
    try:
        soup = bs4.BeautifulSoup(data)
    except Exception:
        return

    # remove the scripts, they only get in the way
    for script in soup.find_all("script"):
        script.decompose()

    link_url = kwarg["link_url"]
    this_domain_id = kwarg["domain_id"]
    this_url_id = kwarg["url_id"]

    # save and create a reference for every link on this page
    imgs = soup.find_all("img", src=True)
    links = soup.find_all("a", href=True)
    with db.scope():
        for img in imgs:
            img_url = urlhelper.Url.parse(img.get("src"), url)
            img_title = img.get("title")
            domain_id = db.save_domain(img_url.domain)
            url_id = db.save_url(domain_id, img_url.path, None, 2)
            db.associate(this_url_id, url_id, img_title)

        for link in links:
            # skip non-http(s) schemes such as mailto: or javascript:
            m = re.match(r"\s*(\w+):", link.get("href"))
            if m and m.group(1) not in ("http", "https"):
                continue
            link_text = get_text(link).strip()
            link_url = urlhelper.Url.parse(link.get("href"), url)
            domain_id = db.save_domain(link_url.domain)
            url_id = db.save_url(domain_id, link_url.path, None, None)
            db.associate(this_url_id, url_id, link_text)

        # save the page headers, skipping those that are mostly link text
        hs = soup.find_all("h1")
        hs += soup.find_all("h2")
        hs += soup.find_all("h3")
        hs += soup.find_all("h4")
        hs += soup.find_all("h5")
        hs += soup.find_all("h6")
        for hx in hs:
            if not hx.a or (len(hx.get_text()) > 0 and
                            len(hx.a.get_text()) / len(hx.get_text()) < 0.3):
                header_text = get_text(hx).strip()
                db.save_header(this_url_id, header_text)

        # collect the visible text, again skipping elements that are mostly links
        output = io.StringIO()
        outputHtml = io.StringIO()
        text_elements = crawler.readtext(soup)
        for el in text_elements:
            if isinstance(el, bs4.NavigableString):
                outputHtml.write(str(el) + "\n")
                output.write(el)
            elif not el.a or (len(el.get_text()) > 0 and
                              len(el.a.get_text()) / len(el.get_text()) < 0.3):
                outputHtml.write(str(el) + "\n")
                output.write(get_text(el))

        # pick the title: Open Graph first, then Twitter Cards, then the plain
        # meta/title tags
        og_title = soup.find("meta", attrs={"property": "og:title"})
        if og_title:
            title = og_title.get("content")
        else:
            twitter_title = soup.find("meta", attrs={"name": "twitter:title"})
            if twitter_title:
                title = twitter_title.get("content")
            else:
                main_title = soup.find("meta", attrs={"name": "title"})
                if main_title:
                    title = main_title.get("content")
                else:
                    title = get_text(soup.title)

        # same priority order for the description
        og_description = soup.find("meta", attrs={"property": "og:description"})
        if og_description:
            description = og_description.get("content")
        else:
            twitter_description = soup.find("meta", attrs={"name": "twitter:description"})
            if twitter_description:
                description = twitter_description.get("content")
            else:
                main_description = soup.find("meta", attrs={"name": "description"})
                if main_description:
                    description = main_description.get("content")
                else:
                    description = None

        try:
            print("HTML:", this_url_id, url)
        except Exception:
            pass

        db.save_document(this_url_id, title, description,
                         re.sub(" +", " ", output.getvalue()),
                         outputHtml.getvalue())
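# `get_text` is used throughout process_html but not defined in this section. A
# minimal sketch, assuming it simply wraps BeautifulSoup's get_text() with a None
# guard and whitespace normalization (the real helper may do more):
def get_text(element):
    if element is None:
        return ""
    return " ".join(element.get_text().split())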