def test_get_base_url(self):
    baseurl = 'https://example.org'
    text = u"""\
<html>\
<head><title>Dummy</title><base href='http://example.org/something' /></head>\
<body>blahablsdfsal&</body>\
</html>"""
    self.assertEqual(get_base_url(text, baseurl),
                     'http://example.org/something')

    # relative url with absolute path
    text = u"""\
<html>\
<head><title>Dummy</title><base href='/absolutepath' /></head>\
<body>blahablsdfsal&</body>\
</html>"""
    self.assertEqual(get_base_url(text, baseurl),
                     'https://example.org/absolutepath')

    # no scheme url
    text = """\
<html>\
<head><title>Dummy</title><base href='//noscheme.com/path' /></head>\
<body>blahablsdfsal&</body>\
</html>"""
    self.assertEqual(get_base_url(text, baseurl),
                     'https://noscheme.com/path')
def test_get_base_url(self):
    baseurl = u'https://example.org'
    text = u"""\
<html>\
<head><title>Dummy</title><base href='http://example.org/something' /></head>\
<body>blahablsdfsal&</body>\
</html>"""
    self.assertEqual(get_base_url(text, baseurl),
                     'http://example.org/something')
    self.assertEqual(get_base_url(text, baseurl.encode('ascii')),
                     'http://example.org/something')
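# Usage sketch for the function exercised by the tests above. Assumption: these
# tests target w3lib.html.get_base_url, so the example imports it from there; the
# HTML string and URLs below are made up for illustration only.
from w3lib.html import get_base_url

page = "<html><head><base href='/docs/' /></head><body>hello</body></html>"
# The relative <base href> is joined against the page URL passed as baseurl.
print(get_base_url(page, 'https://example.org/index.html'))
# -> https://example.org/docs/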
def extraer_informacion_web():
    pp = pprint.PrettyPrinter(indent=2)
    r = requests.get(
        'https://www.casadellibro.com/libro-los-renglones-torcidos-de-hollywood/9788412094749/11187413'
    )
    base_url = get_base_url(r.text, r.url)
    data = extruct.extract(r.text, base_url=base_url)
    schema = data['json-ld']

    soup = BeautifulSoup(r.text, 'lxml')
    imagen = soup.find('img', {'class': 'product-image'})
    descripcion = soup.find('div', {'class': 'hidden-sm-and-down'})
    desc = descripcion.text
    desc = desc.replace("Ver más", '').strip()
    if "CRÍTICAS" in desc:
        desc = desc.split("CRÍTICAS")
        desc = desc[0]
    if "Resumen" in desc:
        desc = desc.split("Resumen")
        desc = desc[1]

    obj = {}
    obj['desc'] = desc
    obj['imagen'] = imagen['data-src']
    obj['schema'] = schema
    print(obj)
    print(type(obj))
def get_metadata(html: bytes, url: str):
    """Fetch JSON-LD structured data."""
    # get_base_url() needs both the page HTML and its URL so that a relative
    # <base href> can be resolved against the page URL.
    metadata = extruct.extract(html,
                               base_url=get_base_url(html, url),
                               syntaxes=['json-ld'],
                               uniform=True)['json-ld']
    return metadata
def get_jsons(url):
    r = requests.get(url)
    base_url = get_base_url(r.text, r.url)
    data = extruct.extract(r.text, base_url=base_url)
    if 'json-ld' not in data:
        raise Exception('No json-ld data found')
    return data['json-ld']
def get_json_ld_description(url):
    description = None
    error_code = None
    ## TODO: Get detail on the errors
    try:
        r = requests.get(url, timeout=5)
    except:
        error_code = '404'
        return description, error_code
    try:
        base_url = get_base_url(r.text, r.url)
    except:
        error_code = '500'
        return description, error_code
    try:
        data = extruct.extract(r.text, base_url=base_url, syntaxes=['json-ld'])
    except Exception as e:
        error_code = '500'
        return description, error_code
    jl = data['json-ld']
    for l in jl:
        if l.get('description', None):
            description = l['description']
    return description, '200'
def get_opengraph_description(url):
    description = None
    error_code = None
    ## TODO: Get detail on the errors
    try:
        r = requests.get(url, timeout=5)
    except:
        error_code = '404'
        return description, error_code
    try:
        base_url = get_base_url(r.text, r.url)
    except:
        error_code = '500'
        return description, error_code
    try:
        data = extruct.extract(r.text, base_url=base_url, syntaxes=['opengraph'])
    except Exception as e:
        error_code = '500'
        return description, error_code
    og = data['opengraph']
    for o in og:
        if o.get('properties', None):
            for i in o['properties']:
                if i[0] == 'og:description':
                    description = i[1]
    return description, '200'
def get_base_url(response): """Return the base url of the given response, joined with the response url""" if response not in _baseurl_cache: text = response.body_as_unicode()[0:4096] _baseurl_cache[response] = html.get_base_url(text, response.url, \ response.encoding) return _baseurl_cache[response]
def recursive_page_scrape(self, page):
    """
    Crawls across the website and scrapes recipes as discovered.
    The navigation is recursive, scraping each page it comes upon
    and all the links on that page.
    """
    # Mark as being read
    print_message = f"Recipes Found: {len(self.scraped_recipes)} | Scraping {page}"
    print(print_message.ljust(200, " "), end="\r", flush=True)
    self.link_library[page] = 1
    self.sdb.update_one({"_id": self.source},
                        {"$set": {"link_library": self.link_library}})

    # Scrape it, if it errors out it won't be tried again
    r = requests.get(page, headers=self.headers)
    soup = BeautifulSoup(r.content, "html.parser")
    base_url = get_base_url(r.text, r.url)
    data = extruct.extract(r.text, base_url=base_url)

    # Recipe
    self.scrape_recipe(data, page)

    # Look for all links on page
    for link_string in soup.findAll('a', attrs={'href': re.compile("^https://")}):
        link = clean_url(link_string.get('href'), self.utm_pages)
        if check_link(link, self.domain):
            if link not in self.link_library.keys():
                wait()
                self.recursive_page_scrape(link)
def get_json_ld_headline(url):
    headline = None
    error_code = None
    ## TODO: Get detail on the errors
    try:
        r = requests.get(url, timeout=5)
    except:
        error_code = '404'
        return headline, error_code
    try:
        base_url = get_base_url(r.text, r.url)
    except:
        error_code = '500'
        return headline, error_code
    try:
        data = extruct.extract(r.text, base_url=base_url, syntaxes=['json-ld'])
    except Exception as e:
        error_code = '500'
        return headline, error_code
    jl = data['json-ld']
    for l in jl:
        if l.get('headline', None):
            headline = l['headline']
            headline = (headline[:197] + '...') if len(headline) > 197 else headline
    return headline, '200'
def _extract_links(self, response_text, response_url, response_encoding,
                   base_url=None):
    def clean_text(text):
        return replace_escape_chars(
            remove_tags(text.decode(response_encoding))).strip()

    def clean_url(url):
        clean_url = ''
        try:
            clean_url = urljoin(
                base_url,
                replace_entities(clean_link(url.decode(response_encoding))))
        except ValueError:
            pass
        return clean_url

    if base_url is None:
        base_url = get_base_url(response_text, response_url, response_encoding)
    links_text = linkre.findall(response_text)
    return [
        Link(clean_url(url).encode(response_encoding), clean_text(text))
        for url, _, text in links_text
    ]
def transformed_response_body(
        response: Response,
        html_transform: Callable[[BeautifulSoup, str, ProxyUrl], None],
        proxy_url: ProxyUrl) -> Tuple[bool, bytes]:
    body = response.body or b''
    content_type = (response.headers or {}).get('content-type', '')
    if content_type.startswith('text/html'):
        encoding = http_content_type_encoding(content_type)
        try:
            base_url = get_base_url(body, response.url, encoding)
        except UnicodeDecodeError:
            base_url = response.url
        soup = BeautifulSoup(body, 'lxml', from_encoding=encoding)
        html_transform(soup, base_url=base_url, proxy_url=proxy_url)
        head = soup.find('head')
        if head:
            head.append(soup.new_tag('meta', charset='utf8'))
        return True, soup.encode('utf8')
    elif content_type.startswith('text/css'):
        css_source = body.decode('utf8', 'ignore')
        return (False,
                process_css(css_source, base_uri=response.url, proxy_url=proxy_url)
                .encode('utf8'))
    else:
        return False, body
def extract_urls(self, html: str, url: str) -> List[Tuple[float, str]]:
    """ Extract all URLs from html, return a list of (score, url) pairs. """
    sel = parsel.Selector(html)
    base_url = get_base_url(html[:4096], url)
    return self._extract_urls(html, url, sel, base_url)
def basic_recipe_from_opengraph(html: str, url: str) -> dict:
    base_url = get_base_url(html, url)
    data = extruct.extract(html, base_url=base_url)
    try:
        properties = data["opengraph"][0]["properties"]
    except:
        return

    return {
        "name": og_field(properties, "og:title"),
        "description": og_field(properties, "og:description"),
        "image": og_field(properties, "og:image"),
        "recipeYield": "",
        # FIXME: If recipeIngredient is an empty list, mongodb's data verification fails.
        "recipeIngredient": ["Could not detect ingredients"],
        # FIXME: recipeInstructions is allowed to be empty, but this message is added for user sanity.
        "recipeInstructions": [{"text": "Could not detect instructions"}],
        "slug": slugify(og_field(properties, "og:title")),
        "orgURL": og_field(properties, "og:url"),
        "categories": [],
        "tags": og_fields(properties, "og:article:tag"),
        "dateAdded": None,
        "notes": [],
        "extras": [],
    }
def __init__(self, url, test=False):
    self.format = None
    self.testing_mode = test
    self.data = {}
    self.format = None

    if test:
        # when testing, we load a file
        with url:
            r = url.read()
        data = extruct.extract(
            r,
            base_url="https://www.allrecipes.com/recipe/133948/four-cheese-margherita-pizza/",
            syntaxes=SYNTAXES,
            uniform=True,
        )
    else:
        r = requests.get(url, headers=HEADERS)
        data = extruct.extract(
            r.text,
            base_url=get_base_url(r.text, r.url),
            syntaxes=SYNTAXES,
            uniform=True,
        )

    for syntax in SYNTAXES:
        for item in data.get(syntax, []):
            if ("@context" in item
                    and item["@context"] == SCHEMA_ORG_HOST
                    and "@type" in item
                    and item["@type"].lower() == SCHEMA_NAME.lower()):
                self.format = syntax
                self.data = item
                return
def perform(kls, inputs):
    urls = inputs['target:url']
    for url in urls:
        try:
            r = requests.get(url)
            base_url = get_base_url(r.text, r.url)
            data = extruct.extract(r.text, base_url=base_url,
                                   syntaxes=['json-ld'])['json-ld']
            tree = Tree(data)
            break
        except:
            data = None
    return dict(
        **{
            'metric:30': {
                'answer': 1.0 if data else 0.0,
                'comment': 'jsonld was found and properly parsed'
                if data else 'jsonld could not be parsed',
            },
        },
        **{
            key: {
                'answer': 1.0 if attr else 0.0,
                'comment': attr if attr
                else 'json-ld %s not found' % (' '.join(to_schema[key])),
            } if key.startswith('metric:') else attr
            for key, attr in zip(
                to_schema.keys(),
                map(
                    bind(get_json_ld_attr, tree),
                    to_schema.values()
                )
            )
        } if data else {key: {} for key in to_schema.keys()}
    )
def get_base_url(self): """Return the base url of the given response, joined with the response url""" if self._response not in self._baseurl_cache: text = self._response.text[0:4096] self._baseurl_cache[self] = html.get_base_url( text, self._response.url, self._response.encoding) return self._baseurl_cache[self]
def get_base_url(response): """Return the base url of the given response, joined with the response url""" if response not in _baseurl_cache: text = response.text[0:4096] text = html.remove_comments(text, response.encoding) _baseurl_cache[response] = html.get_base_url(text, response.url, response.encoding) return _baseurl_cache[response]
def main():
    pp = pprint.PrettyPrinter(indent=2)
    url = 'https://www.imdb.com/title/tt4574334/?ref_=fn_al_tt_1'
    r = requests.get(url)
    base_url = get_base_url(r.text, r.url)
    data = extruct.extract(r.text, base_url=base_url)
    pp.pprint(data)
def get_metadata(html, url):
    """Fetch JSON-LD structured data."""
    metadata = extruct.extract(
        html,
        base_url=get_base_url(html, url),
        syntaxes=['json-ld'],
    )['json-ld'][0]
    return metadata
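# Minimal usage sketch for the get_metadata helper above. Assumptions: the page is
# fetched with requests, the URL is purely illustrative, and the page actually
# embeds at least one JSON-LD block (otherwise the [0] index above raises).
import requests

resp = requests.get('https://example.org/article')  # hypothetical page
article = get_metadata(resp.text, resp.url)
print(article.get('@type'), article.get('headline'))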
def test_no_scheme_url(self): baseurl = "https://example.org" text = b"""\ <html>\ <head><title>Dummy</title><base href='//noscheme.com/path' /></head>\ <body>blahablsdfsal&</body>\ </html>""" self.assertEqual(get_base_url(text, baseurl), "https://noscheme.com/path")
def test_no_scheme_url(self):
    baseurl = 'https://example.org'
    text = b"""\
<html>\
<head><title>Dummy</title><base href='//noscheme.com/path' /></head>\
<body>blahablsdfsal&</body>\
</html>"""
    self.assertEqual(get_base_url(text, baseurl), 'https://noscheme.com/path')
def test_relative_url_with_absolute_path(self):
    baseurl = 'https://example.org'
    text = u"""\
<html>\
<head><title>Dummy</title><base href='/absolutepath' /></head>\
<body>blahablsdfsal&</body>\
</html>"""
    self.assertEqual(get_base_url(text, baseurl), 'https://example.org/absolutepath')
def test_attributes_before_href(self):
    baseurl = u'https://example.org'
    text = u"""\
<html>\
<head><title>Dummy</title><base id='my_base_tag' href='http://example.org/something' /></head>\
<body>blahablsdfsal&</body>\
</html>"""
    self.assertEqual(get_base_url(text, baseurl), 'http://example.org/something')
def test_tag_name(self):
    baseurl = 'https://example.org'
    text = """\
<html>\
<head><title>Dummy</title><basefoo href='http://example.org/something' /></head>\
<body>blahablsdfsal&</body>\
</html>"""
    self.assertEqual(get_base_url(text, baseurl), 'https://example.org')
def get_metadata(html: str, url: str):
    """Fetch JSON-LD structured data."""
    # Pass both the page HTML and its URL so relative <base href> values resolve.
    metadata = extruct.extract(html,
                               base_url=get_base_url(html, url),
                               syntaxes=['json-ld'],
                               uniform=True)['json-ld']
    if bool(metadata) and isinstance(metadata, list):
        metadata = metadata[0]
    return metadata
def test_attributes_before_href(self): baseurl = "https://example.org" text = """\ <html>\ <head><title>Dummy</title><base id='my_base_tag' href='http://example.org/something' /></head>\ <body>blahablsdfsal&</body>\ </html>""" self.assertEqual(get_base_url(text, baseurl), "http://example.org/something")
def test_get_base_url_utf8(self):
    baseurl = u'https://example.org'
    text = u"""
        <html>
        <head><title>Dummy</title><base href='http://example.org/snowman\u2368' /></head>
        <body>blahablsdfsal&</body>
        </html>"""
    self.assertEqual(get_base_url(text, baseurl),
                     'http://example.org/snowman%E2%8D%A8')
def _check_external_identifier(self, url):
    session = requests.Session()
    session.headers.update({'User-Agent': self.config.get('user_agent')})
    r = session.get(url, timeout=30)
    base_url = get_base_url(r.text, r.url)
    data = extruct.extract(r.text, base_url=base_url, uniform=True)
    return {
        'data': self.schemaorg_normalizer.normalize_from_extruct(data),
        'timestamp': time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
    }
def test_get_base_url_latin1_percent(self): # non-UTF-8 percent-encoded characters sequence are left untouched baseurl = "https://example.org" text = """ <html> <head><title>Dummy</title><base href='http://example.org/sterling%a3' /></head> <body>blahablsdfsal&</body> </html>""" self.assertEqual(get_base_url(text, baseurl), "http://example.org/sterling%a3")
def get_jsons(url):
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
    r = requests.get(url, headers=headers)
    if not r.ok:
        return []
    base_url = get_base_url(r.text, r.url)
    data = extruct.extract(r.text, base_url=base_url)
    if 'json-ld' not in data:
        raise Exception('No json-ld data found')
    return data['json-ld']
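# Example call for the get_jsons helper above. Assumptions: the target URL is
# illustrative only and the page exposes schema.org JSON-LD; a failed request
# returns an empty list, a page without JSON-LD raises.
import pprint

items = get_jsons('https://example.org/product')  # hypothetical URL
pprint.pprint([item.get('@type') for item in items])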
def test_get_base_url_utf8(self):
    baseurl = 'https://example.org'
    text = """
        <html>
        <head><title>Dummy</title><base href='http://example.org/snowman\u2368' /></head>
        <body>blahablsdfsal&</body>
        </html>"""
    self.assertEqual(get_base_url(text, baseurl),
                     'http://example.org/snowman%E2%8D%A8')
def extractJSON(url, write_data=False):
    r = requests.get(url)
    base_url = get_base_url(r.text, r.url)
    data = extruct.extract(r.text, base_url=base_url)["json-ld"]
    if len(data) > 0:
        if write_data:
            with open('last_data.json', 'w') as outfile:
                json.dump(data[0], outfile)
        return data[0]
    return []
def test_get_base_url_latin1_percent(self):
    # non-UTF-8 percent-encoded character sequences are left untouched
    baseurl = u'https://example.org'
    text = u"""
        <html>
        <head><title>Dummy</title><base href='http://example.org/sterling%a3' /></head>
        <body>blahablsdfsal&</body>
        </html>"""
    self.assertEqual(get_base_url(text, baseurl),
                     'http://example.org/sterling%a3')
def test_get_base_url_latin1(self):
    # page encoding does not affect URL path encoding before percent-escaping;
    # we should still use UTF-8 by default
    baseurl = 'https://example.org'
    text = """
        <html>
        <head><title>Dummy</title><base href='http://example.org/sterling\u00a3' /></head>
        <body>blahablsdfsal&</body>
        </html>"""
    self.assertEqual(get_base_url(text, baseurl, encoding='latin-1'),
                     'http://example.org/sterling%C2%A3')
def get_metadata(url):
    """Fetch JSON-LD structured data."""
    reqs = requests.get(url)
    html = reqs.text
    # Pass both the fetched HTML and the URL so relative <base href> values resolve.
    metadata = extruct.extract(html,
                               base_url=get_base_url(html, url),
                               syntaxes=['json-ld'],
                               uniform=True)['json-ld']
    if bool(metadata) and isinstance(metadata, list):
        metadata = metadata[0]
    return metadata
def test_get_base_url_latin1(self):
    # page encoding does not affect URL path encoding before percent-escaping;
    # we should still use UTF-8 by default
    baseurl = u'https://example.org'
    text = u"""
        <html>
        <head><title>Dummy</title><base href='http://example.org/sterling\u00a3' /></head>
        <body>blahablsdfsal&</body>
        </html>"""
    self.assertEqual(get_base_url(text, baseurl, encoding='latin-1'),
                     'http://example.org/sterling%C2%A3')
def _extract_links(self, response_text, response_url, response_encoding,
                   base_url=None):
    def clean_text(text):
        return replace_escape_chars(remove_tags(text.decode(response_encoding))).strip()

    def clean_url(url):
        clean_url = ""
        try:
            clean_url = urljoin(base_url,
                                replace_entities(clean_link(url.decode(response_encoding))))
        except ValueError:
            pass
        return clean_url

    if base_url is None:
        base_url = get_base_url(response_text, response_url, response_encoding)
    links_text = linkre.findall(response_text)
    return [Link(clean_url(url).encode(response_encoding), clean_text(text))
            for url, _, text in links_text]
def parse_next_url(pageURL):
    response = urllib2.urlopen(pageURL)
    result = response.read()
    tree = etree.HTML(result)
    #nodes = tree.xpath('//table/tr/td/table[@class="data"]')
    tables = tree.xpath('//table/tr/td/table[@class="data"]')
    print len(tables)
    itemDict = {}
    for j in range(len(tables)):
        #titles = tables[j].xpath('.//tr/th/text()').extract()
        titles = tables[j].xpath('.//tr/th/text()')
        print titles
        values = tables[j].xpath('.//tr/td')
        if titles and values:
            print len(titles), len(values)
            #raw_input()
            for i in range(1, len(titles)):
                finalTitle = titles[i].replace(u'\xa0', ' ').encode('utf8').strip()
                value = values[i].xpath('.//text()')[0].encode('utf8').strip()
                if finalTitle == 'Associated with diseases' and value == '-':
                    #return None
                    pass
            for i in range(1, len(titles)):
                valueList = []
                if values[i].xpath('.//a/text()'):
                    value = values[i].xpath('.//a/text()')
                    url = values[i].xpath('.//a/@href')
                    print url
                    for l in range(len(value)):
                        finalValue = value[l].encode('utf8').strip()
                        finalUrl = url[l].encode('utf8').strip()
                        if 'http' not in finalUrl:
                            finalUrl = urljoin_rfc(html.get_base_url(result, baseurl=''), finalUrl)
                        valueList.append((finalValue, finalUrl))
                else:
                    value = values[i-1].xpath('.//text()')
                    for l in range(len(value)):
                        finalValue = value[l].encode('utf8').strip()
                        valueList.append((finalValue, ''))
                finalTitle = titles[i].replace(u'\xa0', ' ').encode('utf8').strip()
                #print finalTitle, ":", valueList
                itemDict[finalTitle] = valueList
    #print itemDict
    raw_input(itemDict)
def _dochits_to_objset(self, docHits):
    '''Returns list of objects.'''
    objset = []
    for d in docHits:
        r = requests.get(d.text)
        # get JSON-LD from the page; pass the response URL so a relative
        # <base href> can be resolved
        base_url = get_base_url(r.text, r.url)
        data = extruct.extract(r.text, base_url=base_url)
        jsld = data.get('json-ld')[0]
        obj = {}
        obj_mdata = defaultdict(list)
        for mdata in jsld:
            obj_mdata[mdata] = jsld[mdata]
        obj['metadata'] = dict(obj_mdata)
        objset.append(obj)
        self.docs_fetched += 1
    return objset