Example #1
    def test_get_base_url(self):
        baseurl = 'https://example.org'

        text = u"""\
            <html>\
            <head><title>Dummy</title><base href='http://example.org/something' /></head>\
            <body>blahablsdfsal&amp;</body>\
            </html>"""
        self.assertEqual(get_base_url(text, baseurl), 'http://example.org/something')

        # relative url with absolute path
        text = u"""\
            <html>\
            <head><title>Dummy</title><base href='/absolutepath' /></head>\
            <body>blahablsdfsal&amp;</body>\
            </html>"""
        self.assertEqual(get_base_url(text, baseurl), 'https://example.org/absolutepath')

        # no scheme url
        text = """\
            <html>\
            <head><title>Dummy</title><base href='//noscheme.com/path' /></head>\
            <body>blahablsdfsal&amp;</body>\
            </html>"""
        self.assertEqual(get_base_url(text, baseurl), 'https://noscheme.com/path')
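The tests above appear to exercise w3lib.html.get_base_url (the import is not shown, so that origin is an assumption). A minimal standalone sketch of the same three cases:

from w3lib.html import get_base_url  # assumed import

# An absolute <base> href wins outright.
page = "<html><head><base href='http://example.org/something' /></head></html>"
print(get_base_url(page, 'https://example.org'))  # http://example.org/something

# An absolute-path href is joined against the page URL.
page = "<html><head><base href='/absolutepath' /></head></html>"
print(get_base_url(page, 'https://example.org'))  # https://example.org/absolutepath

# A scheme-relative href inherits the page URL's scheme.
page = "<html><head><base href='//noscheme.com/path' /></head></html>"
print(get_base_url(page, 'https://example.org'))  # https://noscheme.com/path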
Example #2
    def test_get_base_url(self):
        baseurl = u'https://example.org'

        text = u"""\
            <html>\
            <head><title>Dummy</title><base href='http://example.org/something' /></head>\
            <body>blahablsdfsal&amp;</body>\
            </html>"""
        self.assertEqual(get_base_url(text, baseurl), 'http://example.org/something')
        self.assertEqual(get_base_url(text, baseurl.encode('ascii')), 'http://example.org/something')
Example #3
def extraer_informacion_web():
    pp = pprint.PrettyPrinter(indent=2)
    r = requests.get(
        'https://www.casadellibro.com/libro-los-renglones-torcidos-de-hollywood/9788412094749/11187413'
    )

    base_url = get_base_url(r.text, r.url)
    data = extruct.extract(r.text, base_url=base_url)

    schema = data['json-ld']

    soup = BeautifulSoup(r.text, 'lxml')

    imagen = soup.find('img', {'class': 'product-image'})

    descripcion = soup.find('div', {'class': 'hidden-sm-and-down'})
    desc = descripcion.text

    desc = desc.replace("Ver más", '').strip()

    if "CRÍTICAS" in desc:
        desc = desc.split("CRÍTICAS")
        desc = desc[0]
    if "Resumen" in desc:
        desc = desc.split("Resumen")
        desc = desc[1]
    obj = {}
    obj['desc'] = desc
    obj['imagen'] = imagen['data-src']
    obj['schema'] = schema
    pp.pprint(obj)
    print(type(obj))
Example #4
def get_metadata(html: bytes, url: str):
    """Fetch JSON-LD structured data."""
    metadata = extruct.extract(html,
                               base_url=get_base_url(html, url),
                               syntaxes=['json-ld'],
                               uniform=True)['json-ld']
    return metadata
Example #5
def get_jsons(url):
    r = requests.get(url)
    base_url = get_base_url(r.text, r.url)
    data = extruct.extract(r.text, base_url=base_url)
    if 'json-ld' not in data:
        raise Exception('No json-ld data found')
    return data['json-ld']
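A usage sketch for get_jsons; the URL is hypothetical, and the page must embed at least one JSON-LD block or the function raises:

if __name__ == '__main__':
    # Hypothetical product page with embedded JSON-LD.
    for item in get_jsons('https://example.org/product'):
        print(item.get('@type'), '-', item.get('name'))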
Example #6
def get_json_ld_description(url):

	description = None
	error_code = None

	## TODO: Get detail on the errors

	try:
		r = requests.get(url, timeout=5)
	except requests.exceptions.RequestException:
		error_code = '404'
		return description, error_code

	try:
		base_url = get_base_url(r.text, r.url)
	except Exception:
		error_code = '500'
		return description, error_code

	try:
		data = extruct.extract(r.text, base_url=base_url, syntaxes=['json-ld'])
	except Exception:
		error_code = '500'
		return description, error_code

	jl = data['json-ld']

	for l in jl:
		if l.get('description',None):
			description = l['description']

	return description, '200'
Example #7
def get_opengraph_description(url):

	description = None
	error_code = None

	## TODO: Get detail on the errors

	try:
		r = requests.get(url, timeout=5)
	except requests.exceptions.RequestException:
		error_code = '404'
		return description, error_code

	try:
		base_url = get_base_url(r.text, r.url)
	except Exception:
		error_code = '500'
		return description, error_code

	try:
		data = extruct.extract(r.text, base_url=base_url, syntaxes=['opengraph'])
	except Exception:
		error_code = '500'
		return description, error_code

	og = data['opengraph']

	for o in og:
		if o.get('properties',None):
			for i in o['properties']:
				if i[0] == 'og:description':
					description = i[1]

	return description, '200'
Example #8
def get_base_url(response):
    """Return the base url of the given response, joined with the response url"""
    if response not in _baseurl_cache:
        text = response.body_as_unicode()[0:4096]
        _baseurl_cache[response] = html.get_base_url(text, response.url,
                                                     response.encoding)
    return _baseurl_cache[response]
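The _baseurl_cache this helper reads and writes is defined at module level and is not part of the snippet. A sketch of that setup, assuming a weak-key mapping so cached entries are garbage-collected together with their responses:

import weakref

# Assumed module-level cache for the helper above.
_baseurl_cache = weakref.WeakKeyDictionary()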
Example #9
    def recursive_page_scrape(self, page):
        """
        Crawls across website and scrapes recipes as discovered.
        The navigation is recursive, scraping each page it comes upon and all the links
        on that page.
        """
        # Mark as being read
        print_message = f"Recipes Found: {len(self.scraped_recipes)} | Scraping {page}"
        print(print_message.ljust(200, " "), end="\r", flush=True)
        self.link_library[page] = 1
        self.sdb.update_one({"_id": self.source},
                            {"$set": {
                                "link_library": self.link_library
                            }})

        # Scrape it, if it errors out it won't be tried again
        r = requests.get(page, headers=self.headers)
        soup = BeautifulSoup(r.content, "html.parser")
        base_url = get_base_url(r.text, r.url)
        data = extruct.extract(r.text, base_url=base_url)

        # Recipe
        self.scrape_recipe(data, page)

        # Look for all links on page
        for link_string in soup.findAll(
                'a', attrs={'href': re.compile("^https://")}):
            link = clean_url(link_string.get('href'), self.utm_pages)
            if check_link(link, self.domain):
                if link not in self.link_library.keys():
                    wait()
                    self.recursive_page_scrape(link)
Example #10
def get_json_ld_headline(url):

	headline = None
	error_code = None

	## TODO: Get detail on the errors

	try:
		r = requests.get(url, timeout=5)
	except requests.exceptions.RequestException:
		error_code = '404'
		return headline, error_code

	try:
		base_url = get_base_url(r.text, r.url)
	except Exception:
		error_code = '500'
		return headline, error_code

	try:
		data = extruct.extract(r.text, base_url=base_url, syntaxes=['json-ld'])
	except Exception:
		error_code = '500'
		return headline, error_code

	jl = data['json-ld']

	for l in jl:
		if l.get('headline',None):
			headline = l['headline']
			headline = (headline[:197] + '...') if len(headline) > 197 else headline


	return headline, '200'
Example #11
    def _extract_links(self,
                       response_text,
                       response_url,
                       response_encoding,
                       base_url=None):
        def clean_text(text):
            return replace_escape_chars(
                remove_tags(text.decode(response_encoding))).strip()

        def clean_url(url):
            clean_url = ''
            try:
                clean_url = urljoin(
                    base_url,
                    replace_entities(clean_link(
                        url.decode(response_encoding))))
            except ValueError:
                pass
            return clean_url

        if base_url is None:
            base_url = get_base_url(response_text, response_url,
                                    response_encoding)

        links_text = linkre.findall(response_text)
        return [
            Link(clean_url(url).encode(response_encoding), clean_text(text))
            for url, _, text in links_text
        ]
Example #13
def transformed_response_body(
        response: Response,
        html_transform: Callable[[BeautifulSoup, str, ProxyUrl], None],
        proxy_url: ProxyUrl) -> Tuple[bool, bytes]:

    body = response.body or b''
    content_type = (response.headers or {}).get('content-type', '')
    if content_type.startswith('text/html'):
        encoding = http_content_type_encoding(content_type)
        try:
            base_url = get_base_url(body, response.url, encoding)
        except UnicodeDecodeError:
            base_url = response.url
        soup = BeautifulSoup(body, 'lxml', from_encoding=encoding)
        html_transform(
            soup, base_url=base_url, proxy_url=proxy_url)
        head = soup.find('head')
        if head:
            head.append(soup.new_tag('meta', charset='utf8'))
        return True, soup.encode('utf8')
    elif content_type.startswith('text/css'):
        css_source = body.decode('utf8', 'ignore')
        return (False, process_css(
            css_source, base_uri=response.url, proxy_url=proxy_url)
                .encode('utf8'))
    else:
        return False, body
Example #14
    def extract_urls(self, html: str, url: str) -> List[Tuple[float, str]]:
        """
        Extract all URLs from html, return a list of (score, url) pairs.
        """
        sel = parsel.Selector(html)
        base_url = get_base_url(html[:4096], url)
        return self._extract_urls(html, url, sel, base_url)
Example #15
def basic_recipe_from_opengraph(html: str, url: str) -> dict:
    base_url = get_base_url(html, url)
    data = extruct.extract(html, base_url=base_url)
    try:
        properties = data["opengraph"][0]["properties"]
    except (KeyError, IndexError):
        return None

    return {
        "name": og_field(properties, "og:title"),
        "description": og_field(properties, "og:description"),
        "image": og_field(properties, "og:image"),
        "recipeYield": "",
        # FIXME: If recipeIngredient is an empty list, mongodb's data verification fails.
        "recipeIngredient": ["Could not detect ingredients"],
        # FIXME: recipeInstructions is allowed to be empty but message this is added for user sanity.
        "recipeInstructions": [{
            "text": "Could not detect instructions"
        }],
        "slug": slugify(og_field(properties, "og:title")),
        "orgURL": og_field(properties, "og:url"),
        "categories": [],
        "tags": og_fields(properties, "og:article:tag"),
        "dateAdded": None,
        "notes": [],
        "extras": [],
    }
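The og_field and og_fields helpers used above are not shown in the snippet. A minimal sketch of what they plausibly look like, assuming properties is the list of (name, value) pairs that extruct's opengraph syntax returns:

def og_field(properties, field):
    # Return the first value whose property name matches, else None.
    return next((value for name, value in properties if name == field), None)

def og_fields(properties, field):
    # Return every value for the given property name (e.g. repeated og:article:tag).
    return [value for name, value in properties if name == field]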
Example #16
    def __init__(self, url, test=False):
        self.format = None
        self.testing_mode = test
        self.data = {}

        if test:  # when testing, we load a file
            with url:
                r = url.read()
                data = extruct.extract(
                    r,
                    base_url=
                    "https://www.allrecipes.com/recipe/133948/four-cheese-margherita-pizza/",
                    syntaxes=SYNTAXES,
                    uniform=True,
                )
        else:
            r = requests.get(url, headers=HEADERS)
            data = extruct.extract(
                r.text,
                base_url=get_base_url(r.text, r.url),
                syntaxes=SYNTAXES,
                uniform=True,
            )

        for syntax in SYNTAXES:
            for item in data.get(syntax, []):
                if ("@context" in item and item["@context"] == SCHEMA_ORG_HOST
                        and "@type" in item
                        and item["@type"].lower() == SCHEMA_NAME.lower()):
                    self.format = syntax
                    self.data = item
                    return
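The module-level constants this constructor relies on (SYNTAXES, HEADERS, SCHEMA_ORG_HOST, SCHEMA_NAME) are not part of the snippet; the values below are assumptions for illustration only:

# Hypothetical values for the constants referenced above.
SYNTAXES = ["json-ld", "microdata", "opengraph"]
HEADERS = {"User-Agent": "Mozilla/5.0"}  # placeholder UA string
SCHEMA_ORG_HOST = "https://schema.org"
SCHEMA_NAME = "Recipe"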
Example #17
  def perform(kls, inputs):
    urls = inputs['target:url']
    data = None  # stays None if every URL fails or the list is empty
    for url in urls:
      try:
        r = requests.get(url)
        base_url = get_base_url(r.text, r.url)
        data = extruct.extract(r.text, base_url=base_url, syntaxes=['json-ld'])['json-ld']
        tree = Tree(data)
        break
      except Exception:
        data = None

    return dict(
      **{
        'metric:30': {
          'answer': 1.0 if data else 0.0,
          'comment': 'jsonld was found and properly parsed' if data else 'jsonld could not be parsed',
        },
      },
      **{
        key: {
          'answer': 1.0 if attr else 0.0,
          'comment': attr if attr else 'json-ld %s not found' % (' '.join(to_schema[key])),
        } if key.startswith('metric:') else attr
        for key, attr in zip(
          to_schema.keys(),
          map(
            bind(get_json_ld_attr, tree),
            to_schema.values()
          )
        )
      } if data else {key: {} for key in to_schema.keys()}
    )
Example #18
    def get_base_url(self):
        """Return the base url of the given response, joined with the response url"""
        if self._response not in self._baseurl_cache:
            text = self._response.text[0:4096]
            self._baseurl_cache[self._response] = html.get_base_url(
                text, self._response.url, self._response.encoding)

        return self._baseurl_cache[self._response]
Example #19
def get_base_url(response):
    """Return the base url of the given response, joined with the response url"""
    if response not in _baseurl_cache:
        text = response.text[0:4096]
        text = html.remove_comments(text, response.encoding)
        _baseurl_cache[response] = html.get_base_url(text, response.url,
                                                     response.encoding)
    return _baseurl_cache[response]
Example #20
def main():
    pp = pprint.PrettyPrinter(indent=2)
    url = 'https://www.imdb.com/title/tt4574334/?ref_=fn_al_tt_1'
    r = requests.get(url)
    base_url = get_base_url(r.text, r.url)
    data = extruct.extract(r.text, base_url=base_url)

    pp.pprint(data)
Example #21
def get_metadata(html, url):
    """Fetch JSON-LD structured data."""
    metadata = extruct.extract(
        html,
        base_url=get_base_url(html, url),
        syntaxes=['json-ld'],
    )['json-ld'][0]
    return metadata
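A usage sketch for the helper above; the URL is hypothetical, and the page must embed at least one JSON-LD block or the trailing [0] raises IndexError:

import requests

r = requests.get('https://example.org/article')  # hypothetical URL
meta = get_metadata(r.text, r.url)
print(meta.get('headline'))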
Example #22
    def test_no_scheme_url(self):
        baseurl = "https://example.org"
        text = b"""\
            <html>\
            <head><title>Dummy</title><base href='//noscheme.com/path' /></head>\
            <body>blahablsdfsal&amp;</body>\
            </html>"""
        self.assertEqual(get_base_url(text, baseurl), "https://noscheme.com/path")
Example #24
    def test_relative_url_with_absolute_path(self):
        baseurl = 'https://example.org'
        text = u"""\
            <html>\
            <head><title>Dummy</title><base href='/absolutepath' /></head>\
            <body>blahablsdfsal&amp;</body>\
            </html>"""
        self.assertEqual(get_base_url(text, baseurl), 'https://example.org/absolutepath')
Example #26
    def test_attributes_before_href(self):
        baseurl = u'https://example.org'

        text = u"""\
            <html>\
            <head><title>Dummy</title><base id='my_base_tag' href='http://example.org/something' /></head>\
            <body>blahablsdfsal&amp;</body>\
            </html>"""
        self.assertEqual(get_base_url(text, baseurl), 'http://example.org/something')
Example #27
    def test_tag_name(self):
        baseurl = 'https://example.org'

        text = """\
            <html>\
            <head><title>Dummy</title><basefoo href='http://example.org/something' /></head>\
            <body>blahablsdfsal&amp;</body>\
            </html>"""
        self.assertEqual(get_base_url(text, baseurl), 'https://example.org')
Example #28
def get_metadata(html: str, url: str):
    """Fetch JSON-LD structured data."""
    metadata = extruct.extract(html,
                               base_url=get_base_url(html, url),
                               syntaxes=['json-ld'],
                               uniform=True)['json-ld']
    if bool(metadata) and isinstance(metadata, list):
        metadata = metadata[0]
    return metadata
Example #29
    def test_attributes_before_href(self):
        baseurl = "https://example.org"

        text = """\
            <html>\
            <head><title>Dummy</title><base id='my_base_tag' href='http://example.org/something' /></head>\
            <body>blahablsdfsal&amp;</body>\
            </html>"""
        self.assertEqual(get_base_url(text, baseurl), "http://example.org/something")
Example #30
    def test_get_base_url_utf8(self):
        baseurl = u'https://example.org'

        text = u"""
            <html>
            <head><title>Dummy</title><base href='http://example.org/snowman\u2368' /></head>
            <body>blahablsdfsal&amp;</body>
            </html>"""
        self.assertEqual(get_base_url(text, baseurl),
                         'http://example.org/snowman%E2%8D%A8')
Example #31
    def _check_external_identifier(self, url):
        session = requests.Session()
        session.headers.update({'User-Agent': self.config.get('user_agent')})
        r = session.get(url, timeout=30)
        base_url = get_base_url(r.text, r.url)
        data = extruct.extract(r.text, base_url=base_url, uniform=True)
        return {
            'data': self.schemaorg_normalizer.normalize_from_extruct(data),
            'timestamp': time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
        }
Example #32
    def test_get_base_url_latin1_percent(self):
        # non-UTF-8 percent-encoded character sequences are left untouched
        baseurl = "https://example.org"

        text = """
            <html>
            <head><title>Dummy</title><base href='http://example.org/sterling%a3' /></head>
            <body>blahablsdfsal&amp;</body>
            </html>"""
        self.assertEqual(get_base_url(text, baseurl), "http://example.org/sterling%a3")
def get_jsons(url):
    headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
    r = requests.get(url, headers=headers)
    if not r.ok:
        return []
    base_url = get_base_url(r.text, r.url)
    data = extruct.extract(r.text, base_url=base_url)
    if 'json-ld' not in data:
        raise Exception('No json-ld data found')
    return data['json-ld']
Example #35
def extractJSON(url, write_data=False):
    r = requests.get(url)
    base_url = get_base_url(r.text, r.url)
    data = extruct.extract(r.text, base_url=base_url)["json-ld"]
    if len(data) > 0:
        if write_data:
            with open('last_data.json', 'w') as outfile:
                json.dump(data[0], outfile)
        return data[0]
    return []
Example #37
    def test_get_base_url_latin1(self):
        # page encoding does not affect URL path encoding before percent-escaping
        # we should still use UTF-8 by default
        baseurl = 'https://example.org'

        text = """
            <html>
            <head><title>Dummy</title><base href='http://example.org/sterling\u00a3' /></head>
            <body>blahablsdfsal&amp;</body>
            </html>"""
        self.assertEqual(get_base_url(text, baseurl, encoding='latin-1'),
                         'http://example.org/sterling%C2%A3')
Example #38
def get_metadata(url):
    """Fetch JSON-LD structured data."""
    reqs = requests.get(url)
    html = reqs.text
    metadata = extruct.extract(html,
                               base_url=get_base_url(html, url),
                               syntaxes=['json-ld'],
                               uniform=True)['json-ld']
    if bool(metadata) and isinstance(metadata, list):
        metadata = metadata[0]

    return metadata
Example #41
def parse_next_url(pageURL):
    response = urllib.request.urlopen(pageURL)
    result = response.read()
    tree = etree.HTML(result)
    #nodes = tree.xpath('//table/tr/td/table[@class="data"]')
    tables = tree.xpath('//table/tr/td/table[@class="data"]')
    print(len(tables))
    itemDict = {}
    for j in range(len(tables)):
        #titles = tables[j].xpath('.//tr/th/text()').extract()
        titles = tables[j].xpath('.//tr/th/text()')
        print(titles)
        values = tables[j].xpath('.//tr/td')
        if titles and values:
            print(len(titles), len(values))
            #input()
            for i in range(1, len(titles)):
                finalTitle = titles[i].replace(u'\xa0', ' ').strip()
                value = values[i].xpath('.//text()')[0].strip()
                if finalTitle == 'Associated with diseases' and value == '-':
                    #return None
                    pass
            for i in range(1, len(titles)):
                valueList = []
                if values[i].xpath('.//a/text()'):
                    value = values[i].xpath('.//a/text()')
                    url = values[i].xpath('.//a/@href')
                    print(url)
                    for l in range(len(value)):
                        finalValue = value[l].strip()
                        finalUrl = url[l].strip()
                        if 'http' not in finalUrl:
                            finalUrl = urljoin_rfc(html.get_base_url(result, baseurl=''), finalUrl)
                        valueList.append((finalValue, finalUrl))
                else:
                    value = values[i-1].xpath('.//text()')
                    for l in range(len(value)):
                        finalValue = value[l].strip()
                        valueList.append((finalValue, ''))
                finalTitle = titles[i].replace(u'\xa0', ' ').strip()
                #print(finalTitle, ":", valueList)
                itemDict[finalTitle] = valueList
    #print(itemDict)
    input(itemDict)
Example #42
    def _dochits_to_objset(self, docHits):
        '''Returns list of objects.
        '''

        objset = []
        for d in docHits:
            r = requests.get(d.text)

            # get JSON-LD from the page
            base_url = get_base_url(r.text, r.url)
            data = extruct.extract(r.text, base_url=base_url)
            jsld = data.get('json-ld')[0]

            obj = {}
            obj_mdata = defaultdict(list)
            for mdata in jsld:
                obj_mdata[mdata] = jsld[mdata]
            obj['metadata'] = dict(obj_mdata)
            objset.append(obj)
            self.docs_fetched += 1
        return objset