Esempio n. 1
0
def get_top_headers(domain):
	"""Endpoint: return the top headers stored for *domain*.

	Returns {'success': headers} when something was found, {} otherwise.
	"""
	with db.scope():
		headers = db.get_top_headers(domain)
	if headers:
		return {'success': headers}
	return {}
Esempio n. 2
0
def get_domain_links(domain):
	"""Endpoint: return the links recorded for *domain*.

	Returns {'success': links} when something was found, {} otherwise.
	"""
	with db.scope():
		found = db.get_links(domain)
	if found:
		return {'success': found}
	return {}
Esempio n. 3
0
def get_text(urlId):
	"""Endpoint: return the extracted document for the url id.

	Returns {'success': document} when one exists, {} otherwise.
	"""
	with db.scope():
		doc = db.get_url_document(urlId)
	if doc:
		return {'success': doc}
	return {}
Esempio n. 4
0
def getwebsiteinfo(domain):
	"""Endpoint: assemble a summary of *domain* — outgoing/incoming link
	neighbours plus link/image counts.

	Returns {} when the domain is unknown; the original implicitly
	returned None in that case, inconsistent with the sibling endpoints.
	"""
	with db.scope():
		# canonicalise the name; falsy result means the domain is unknown
		domain = db.correct_name(domain)
		if not domain:
			return {}

		linksTo = db.get_link_to(domain)
		linkedBy = db.get_linked_by(domain)

		info = {
			'domain': domain,
			'linksTo': {},
			'linkedBy': {},
			'details': {
				'links': db.get_count_local_links(domain),
				'foreignLinks': db.get_count_foreign_links(domain),
				'images': db.get_count_images(domain),
				'mainCategories': ['Geral']
			}
		}

		# annotate each neighbour with its own link count
		for link in linksTo:
			info['linksTo'][link] = db.get_count_local_links(link)

		for link in linkedBy:
			info['linkedBy'][link] = db.get_count_foreign_links(link)

		return info
Esempio n. 5
0
def process(url, request, response, data):
    """Dispatch a fetched URL to the handler for its Content-Type.

    Parses the serialized request/response headers, filters by mime type,
    saves the bare link first (so handlers can reference it by id), then
    delegates to process_html / process_img / process_plain.
    """
    request = CaseInsensitiveDict.fromdict(json.loads(request))
    response = CaseInsensitiveDict.fromdict(json.loads(response))

    # skip responses whose media type we do not handle
    # (raw string: "\s"/"\w" in a plain literal are invalid escapes)
    m = re.match(r"^\s*([\w-]+/[\w-]+)", response.get("Content-Type", "text/html"))
    if not m or m.group(1) not in allowed_mimes:
        return

    mime = m.group(1)

    # pick the handler function and the numeric link type for this payload
    if mime == "text/html":
        func = process_html
        link_type = 1
    elif mime.startswith("image/"):
        func = process_img
        link_type = 2
    elif mime == "text/plain":
        func = process_plain
        link_type = 3
    else:
        # allowed_mimes and the branches above should agree; bail out
        # instead of raising NameError on an unhandled-but-allowed type
        return

    # persist the link itself (without content details) so the handlers
    # can create the necessary cross-references
    link_url = urlhelper.Url.parse(url)
    with db.scope():
        this_domain_id = db.save_domain(link_url.domain)
        this_url_id = db.save_url(this_domain_id, link_url.path, mime, link_type)

    kwarg = {"link_url": link_url, "domain_id": this_domain_id, "url_id": this_url_id}

    # hand off to the type-specific handler
    func(url, request, response, data, **kwarg)
Esempio n. 6
0
def get_image(urlId):
	"""Endpoint: serve the stored image blob for *urlId* as JPEG.

	Aborts with 404 when no blob is stored for that id.
	"""
	response.content_type = 'image/jpeg'
	with db.scope():
		image_blob = db.get_image(urlId)
	if image_blob:
		return image_blob
	abort(404, 'Imagem inexistente.')
Esempio n. 7
0
def process_img(url, request, response, data, **kwarg):
    """Resize a fetched image to a 140x140 thumbnail and store it.

    kwarg must carry "url_id": the id of the row already saved for this url.
    """
    this_url_id = kwarg["url_id"]

    # best-effort diagnostics: the url may contain characters the console
    # encoding cannot represent, so output errors are deliberately ignored.
    # Narrowed from a bare except, which also swallowed KeyboardInterrupt.
    try:
        print("IMAGE:", this_url_id, url)
    except Exception:
        pass

    # resize returns a falsy value when the data is not a usable image
    blob = images.resize(140, 140, data)
    if blob:
        with db.scope():
            db.save_image(this_url_id, blob)
Esempio n. 8
0
def get_domain_images(name):
	"""Endpoint: return every stored image reference for the domain *name*."""
	with db.scope():
		return {'success': db.get_domain_images(name)}
Esempio n. 9
0
def get_url_images(urlId):
	"""Endpoint: return every stored image reference for the url id."""
	with db.scope():
		return {'success': db.get_images(urlId)}
Esempio n. 10
0
def typeaheaddomain(query):
	"""Endpoint: return domain-name suggestions matching *query*.

	The with-block always returns, so the trailing ``return {}`` of the
	original was unreachable dead code and has been removed.
	"""
	with db.scope():
		result = db.querydomain(query)
		return {'options': result}
Esempio n. 11
0
	def test_add_url(self):
		"""save_url should return an integer id for a freshly saved url."""
		with db.scope():
			new_domain_id = db.save_domain('google.com')
			new_url_id = db.save_url(new_domain_id, '/google+/index.aspx?user=main#2')
			self.assertIsInstance(new_url_id, int)
Esempio n. 12
0
	def test_add_domain(self):
		"""save_domain should return the new domain's integer id."""
		with db.scope():
			domain_id = db.save_domain('google.com')
			self.assertIsInstance(domain_id, int)
Esempio n. 13
0
def _meta_content(soup, attr, value):
    """Return the ``content`` of ``<meta {attr}="{value}">``, or None when the
    tag is absent or has no content attribute."""
    tag = soup.find("meta", attrs={attr: value})
    return tag.get("content") if tag else None


def process_html(url, request, response, data, **kwarg):
    """Extract links, images, headers, title/description and readable text
    from an HTML page and persist everything via ``db``.

    kwarg must carry "url_id" (id of the row already saved for this page);
    "link_url" and "domain_id" are accepted but unused here (the originals
    were read into locals that were never referenced).
    """
    data = str(data, "utf-8", "ignore")
    try:
        soup = bs4.BeautifulSoup(data)
    except Exception:
        # unparseable markup: nothing useful can be extracted
        # (narrowed from a bare except, which also caught KeyboardInterrupt)
        return

    # scripts only pollute the extracted text — drop them
    for script in soup.find_all("script"):
        script.decompose()

    this_url_id = kwarg["url_id"]

    # save and cross-reference every image and link found on this page
    imgs = soup.find_all("img", src=True)
    links = soup.find_all("a", href=True)
    with db.scope():
        for img in imgs:
            img_url = urlhelper.Url.parse(img.get("src"), url)
            img_title = img.get("title")

            domain_id = db.save_domain(img_url.domain)
            url_id = db.save_url(domain_id, img_url.path, None, 2)
            db.associate(this_url_id, url_id, img_title)

        for link in links:
            # skip non-http(s) schemes (mailto:, javascript:, ...);
            # raw string for the regex escapes
            m = re.match(r"\s*(\w+):", link.get("href"))
            if m and m.group(1) not in ("http", "https"):
                continue

            link_text = get_text(link).strip()
            target_url = urlhelper.Url.parse(link.get("href"), url)

            domain_id = db.save_domain(target_url.domain)
            url_id = db.save_url(domain_id, target_url.path, None, None)
            db.associate(this_url_id, url_id, link_text)

        # collect h1..h6 elements
        hs = []
        for level in range(1, 7):
            hs += soup.find_all("h%d" % level)

        for hx in hs:
            # keep a header unless 30%+ of its text comes from a nested link
            if not hx.a or len(hx.get_text()) > 0 and len(hx.a.get_text()) / len(hx.get_text()) < 0.3:
                header_text = get_text(hx).strip()
                db.save_header(this_url_id, header_text)

        output = io.StringIO()
        outputHtml = io.StringIO()
        text_elements = crawler.readtext(soup)

        for el in text_elements:
            if isinstance(el, bs4.NavigableString):
                outputHtml.write(str(el) + "\n")
                output.write(el)
            elif not el.a or len(el.get_text()) > 0 and len(el.a.get_text()) / len(el.get_text()) < 0.3:
                # same "mostly links" filter used for headers above
                outputHtml.write(str(el) + "\n")
                output.write(get_text(el))

        # title: prefer OpenGraph, then Twitter card, then <meta name=title>,
        # finally the <title> element. Unlike the original, a meta tag that
        # exists but lacks a content attribute falls through to the next
        # candidate instead of yielding None.
        title = (_meta_content(soup, "property", "og:title")
                 or _meta_content(soup, "name", "twitter:title")
                 or _meta_content(soup, "name", "title")
                 or get_text(soup.title))

        # description: same precedence, defaulting to None
        description = (_meta_content(soup, "property", "og:description")
                       or _meta_content(soup, "name", "twitter:description")
                       or _meta_content(soup, "name", "description"))

        # best-effort diagnostics; the console encoding may reject the url
        try:
            print("HTML:", this_url_id, url)
        except Exception:
            pass

        db.save_document(this_url_id, title, description,
                         re.sub(" +", " ", output.getvalue()),
                         outputHtml.getvalue())