def test_schemaorg_CreativeWork(self):
    body = get_testdata('schema.org', 'CreativeWork.001.html')
    expected = json.loads(
        get_testdata('schema.org', 'CreativeWork.001.jsonld').decode('UTF-8'))
    jsonlde = JsonLdExtractor()
    data = jsonlde.extract(body)
    self.assertEqual(data, expected)
def metadata_from_url(url, microdata=True, jsonld=True, rdfa=True):
    resp = requests.get(url, timeout=30)
    result = {
        'url': url,
        'status': '{} {}'.format(resp.status_code, resp.reason),
    }
    try:
        resp.raise_for_status()
    except requests.exceptions.HTTPError:
        return result

    parser = XmlDomHTMLParser(encoding=resp.encoding)
    tree = lxml.html.fromstring(resp.content, parser=parser)

    if microdata:
        mde = MicrodataExtractor(nested=True)
        result['microdata'] = mde.extract_items(tree, resp.url)
    if jsonld:
        jsonlde = JsonLdExtractor()
        result['json-ld'] = jsonlde.extract_items(tree, resp.url)
    if rdfa:
        rdfae = RDFaExtractor()
        result['rdfa'] = rdfae.extract_items(tree, resp.url)
    return result
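A minimal usage sketch for the helper above; the URL is only an illustration, and the exact shape of each syntax entry depends on the installed extruct version:

# Hypothetical call showing the returned dict's top-level keys.
meta = metadata_from_url('https://example.com/product.html')
print(meta['status'])
print(list(meta.keys()))  # url, status, plus microdata / json-ld / rdfa when extraction ran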
def extract_all_json_ld(html_text):
    try:
        jslde = JsonLdExtractor()
        data = jslde.extract(html_text)
        return data
    except Exception:
        # Fall back to an empty result if the page has no parseable JSON-LD.
        return {}
def __init__(self, response, microdata=False, jsonld=False, rdfa=False):
    self.response = response
    self.microdata = microdata
    self.jsonld = jsonld
    self.rdfa = rdfa
    if rdfa:
        try:
            self.rdfae = RDFaExtractor()
            self.rdfadata = self.rdfae.extract(self.response.text,
                                               url=self.response.url)
        except JSONDecodeError:
            pass
    if microdata:
        try:
            self.mde = MicrodataExtractor()
            self.mdedata = self.mde.extract(self.response.text)
        except JSONDecodeError:
            pass
    if jsonld:
        try:
            self.jlde = JsonLdExtractor()
            self.jldata = self.jlde.extract(self.response.text)
        except (JSONDecodeError, TypeError):
            self.jldata = []
        finally:
            # Sometimes we get this in the meta dict from RISJExtractJSONLD
            self.jldata.extend(self.response.meta.get('json-ld', []))
def test_schemaorg_CreativeWork(self):
    for i in [1]:
        body = get_testdata('schema.org', 'CreativeWork.{:03d}.html'.format(i))
        expected = json.loads(
            get_testdata('schema.org',
                         'CreativeWork.{:03d}.jsonld'.format(i)).decode('UTF-8'))
        jsonlde = JsonLdExtractor()
        data = jsonlde.extract(body)
        self.assertEqual(data, expected)
def test_schemaorg_CreativeWork(self):
    for i in [1]:
        body = get_testdata('schema.org', 'CreativeWork.{:03d}.html'.format(i))
        expected = json.loads(
            get_testdata('schema.org',
                         'CreativeWork.{:03d}.jsonld'.format(i)).decode('UTF-8'))
        jsonlde = JsonLdExtractor()
        data = jsonlde.extract(body)
        self.assertDictEqual(data, expected)
def test_songkick(self):
    page = "Elysian Fields Brooklyn Tickets, The Owl Music Parlor, 31 Oct 2015"
    body = get_testdata('songkick', '{}.html'.format(page))
    expected = json.loads(
        get_testdata('songkick', '{}.jsonld'.format(page)).decode('UTF-8'))
    jsonlde = JsonLdExtractor()
    data = jsonlde.extract(body)
    self.assertEqual(data, expected)
def test_null(self):
    page = "null_ld_mock"
    body = get_testdata('misc', '{}.html'.format(page))
    expected = json.loads(
        get_testdata('misc', '{}.jsonld'.format(page)).decode('UTF-8'))
    jsonlde = JsonLdExtractor()
    data = jsonlde.extract(body)
    self.assertEqual(data, expected)
def parse_item(self, response):
    self.logger.info('Hi, this is an item page! %s', response.url)
    # Fields to output for each tool
    my_keys = [
        '@id', '@type', 'applicationCategory', 'name', 'alternateName',
        'description', 'url', 'sameAs', 'image', 'genre', 'softwareVersion',
        'softwareRequirements', 'operatingSystem', 'downloadUrl', 'installUrl'
    ]
    #oururl= urlopen(url).read()
    #print(oururl)
    extractor = JsonLdExtractor()
    #with urllib.request.urlopen(url) as response:
    #html_text = response.read().decode('utf-8')
    my_items = extractor.extract(response.body_as_unicode(), response.url)
    # If this is a -tool page only:
    #my_items = extractor.extract(html_text)
    this_item = my_items['items'][0]['@graph'][0]
    my_item = {}
    # output all basic items
    for this_key in my_keys:
        # print(this_key,'\t',this_item[this_key])
        my_item[this_key] = this_item[this_key]
    # get license
    license_type = this_item['license']['@type']
    license_text = self.removeNonAscii(this_item['license']['text'])
    # print('license_type','\t',license_type)
    # print('license_text','\t',license_text)
    my_item['license_type'] = license_type
    my_item['license_text'] = license_text
    # Get pmcrefcount of first only
    Entrez.email = "*****@*****.**"
    first_pub = this_item['publication'][0]
    pmcrefcount = 0
    if 'pubmed' in first_pub['url']:
        this_pmid = first_pub['url'].split('/')[-1:]
        pmcrefcount = Entrez.read(
            Entrez.efetch(db="pubmed", id=this_pmid,
                          rettype="docsum"))[0]['PmcRefCount']
    # print('primary_pub','\t',first_pub['name'])
    # print('primary_pub_url','\t',first_pub['url'])
    # print('primary_pub_pmcrefcount','\t',pmcrefcount)
    my_item['primary_pub'] = first_pub['name']
    my_item['primary_pub_url'] = first_pub['url']
    my_item['primary_pub_pmcrefcount'] = pmcrefcount
    #return my_item
    yield my_item
    return my_item
def test_jsonld_with_comments(self):
    for prefix in ['JoinAction.001', 'AllocateAction.001', ]:
        body = get_testdata('schema.org.invalid', '{}.html'.format(prefix))
        expected = json.loads(
            get_testdata('schema.org.invalid',
                         '{}.jsonld'.format(prefix)).decode('UTF-8'))
        jsonlde = JsonLdExtractor()
        data = jsonlde.extract(body)
        self.assertEqual(data, expected)
def parse_item(self, response): """Parse the recipe to get title and ingredients.""" schema_type = "mde" mde = MicrodataExtractor() data = mde.extract(response.body) # print('response.body:', response.body) # print('data:', data) if len(data) == 0: jslde = JsonLdExtractor() data = jslde.extract(response.body) schema_type = "jsonld" if schema_type == "mde": recipe = data[0]['properties'] # recipe_output_item = RecipeItem() # recipe_output_item['recipe_name'] = recipe['name'] # recipe_output_item['ingredients'] = [ # ingredient for ingredient in recipe['ingredients'] # if ingredient not in ['', 'Add all ingredients to list'] # ] # recipe_output_item['tags'] = [tag['properties']['title'] # for tag in data['items'][1:]] # try: # recipe_output_item['description'] = recipe['description'] # except KeyError: # recipe_output_item['description'] = None # recipe_output_item['url'] = recipe['url'] elif schema_type == "jsonld": recipe = data['items'][0] # recipe_output_item = RecipeItem() # recipe_output_item['recipe_name'] = recipe['name'] # recipe_output_item['ingredients'] = recipe['ingredients'] # recipe_output_item['tags'] = [tag['properties']['title'] # for tag in data['items'][1:]] # try: # recipe_output_item['description'] = recipe['description'] # except KeyError: # recipe_output_item['description'] = None # recipe_output_item['url'] = recipe['url'] properties = [ 'totalTime', 'nutrition', 'name', 'author', 'url', 'image', 'recipeIngredient', 'aggregateRating', 'recipeYield', 'recipeInstructions', 'video', 'mainEntityOfPage', 'cookTime', 'recipeCategory', 'review', 'prepTime', 'description' ] recipe_output_item = RecipeItem() for prop in properties: try: recipe_output_item[prop] = recipe[prop] except KeyError: recipe_output_item[prop] = None yield recipe_output_item
def test_songkick(self):
    for page in [
        "Elysian Fields Brooklyn Tickets, The Owl Music Parlor, 31 Oct 2015",
        #"Maxïmo Park Gigography, Tour History & Past Concerts",
        #"Years & Years Tickets, Tour Dates 2015 & Concerts",
    ]:
        body = get_testdata('songkick', '{}.html'.format(page))
        expected = json.loads(
            get_testdata('songkick', '{}.jsonld'.format(page)).decode('UTF-8'))
        jsonlde = JsonLdExtractor()
        data = jsonlde.extract(body)
        self.assertEqual(data, expected)
def set_data(self):
    """Extract JSON-LD data from self.page and store as a string in self.data.

    Return True if data is present, False if not. Raise RuntimeError if
    JSON-LD cannot be extracted (e.g. if it is malformed).
    """
    jslde = JsonLdExtractor()
    try:
        self.data = dumps(jslde.extract(self.page, base_url=self.base_url))
        if self.data == "[]":
            return False
        else:
            return True
    except Exception:
        msg = "Error extracting data from page"
        raise RuntimeError(msg)
def test_songkick(self):
    for page in [
        "Elysian Fields Brooklyn Tickets, The Owl Music Parlor, 31 Oct 2015",
        #"Maxïmo Park Gigography, Tour History & Past Concerts",
        #"Years & Years Tickets, Tour Dates 2015 & Concerts",
    ]:
        body = get_testdata('songkick', '{}.html'.format(page))
        expected = json.loads(
            get_testdata('songkick', '{}.jsonld'.format(page)).decode('UTF-8'))
        jsonlde = JsonLdExtractor()
        data = jsonlde.extract(body)
        self.assertDictEqual(data, expected)
def extract_json_ld(html, typ_str):
    try:
        jslde = JsonLdExtractor()
        items = jslde.extract(html)
        for item in items:
            item_context = item.get('@context', '').rstrip(' /')
            if (item_context == 'http://schema.org'
                    or item_context == 'https://schema.org') \
                    and item.get('@type', '') == typ_str:
                return item
        return None
    except Exception:
        return None
def extract(self, html_text: str,
            extract_title: bool = False,
            extract_meta: bool = False,
            extract_microdata: bool = False,
            microdata_base_url: str = "",
            extract_json_ld: bool = False,
            extract_rdfa: bool = False,
            rdfa_base_url: str = "") -> List[Extraction]:
    """
    Args:
        html_text (str): input html string to be extracted
        extract_title (bool): True if string of 'title' tag needs to be
            extracted, return as { "title": "..." }
        extract_meta (bool): True if string of 'meta' tags needs to be
            extracted, return as { "meta": { "author": "...", ...}}
        extract_microdata (bool): True if microdata needs to be extracted,
            returns as { "microdata": [...] }
        microdata_base_url (str): base namespace url for microdata, empty
            string if no base url is specified
        extract_json_ld (bool): True if json-ld needs to be extracted,
            return as { "json-ld": [...] }
        extract_rdfa (bool): True if rdfa needs to be extracted, returns as
            { "rdfa": [...] }
        rdfa_base_url (str): base namespace url for rdfa, empty string if no
            base url is specified

    Returns:
        List[Extraction]: the list of extraction or the empty list if there
        are no matches.
    """
    res = list()
    soup = BeautifulSoup(html_text, 'html.parser')
    if soup.title and extract_title:
        title = self._wrap_data(
            "title", soup.title.string.encode('utf-8').decode('utf-8'))
        res.append(title)
    if soup.title and extract_meta:
        meta_content = self._wrap_meta_content(soup.find_all("meta"))
        meta_data = self._wrap_data("meta", meta_content)
        res.append(meta_data)
    if extract_microdata:
        mde = MicrodataExtractor()
        mde_data = self._wrap_data("microdata",
                                   mde.extract(html_text, microdata_base_url))
        res.append(mde_data)
    if extract_json_ld:
        jslde = JsonLdExtractor()
        jslde_data = self._wrap_data("json-ld", jslde.extract(html_text))
        res.append(jslde_data)
    if extract_rdfa:
        rdfae = RDFaExtractor()
        rdfae_data = self._wrap_data("rdfa",
                                     rdfae.extract(html_text, rdfa_base_url))
        res.append(rdfae_data)
    return res
def parse_item(self, response):
    items = []

    def microdata2jsonld(md):
        if md.get('properties'):
            item = md['properties']
            item['@type'] = md.get('type')
            return item

    items += map(microdata2jsonld, MicrodataExtractor().extract(
        response.body_as_unicode(), response.url)['items'])
    items += JsonLdExtractor().extract(
        response.body_as_unicode(), response.url)['items']
    if not items:
        self.logger.debug("No Microdata items found for %s", response.url)
    self.logger.debug("Checking URL for item: %s", items)
    for item in items:
        if not item or not item.get('url'):
            self.logger.debug("No URL for item: %s", item)
            continue
        if item['url'] != response.url:
            self.logger.debug("Not in main URL, go there..")
            yield Request(item['url'], callback=self.parse_item)
        else:
            item['@type'] = item.get('type')
            self.logger.debug("Parsed microdata: %s" % item)
            yield item
def extract(htmlstring, url='http://www.example.com/', encoding="UTF-8"):
    domparser = XmlDomHTMLParser(encoding=encoding)
    tree = fromstring(htmlstring, parser=domparser)
    return {
        name: extractor.extract_items(tree, url=url)
        for name, extractor in (('json-ld', JsonLdExtractor()),
                                ('microdata', MicrodataExtractor()),
                                ('rdfa', RDFaExtractor()))
    }
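A short, hypothetical call of the helper above, assuming `requests` is available to fetch the page; it only counts how many items each syntax produced:

import requests

# Illustrative usage; any page URL works the same way.
html = requests.get('https://example.com/').text
data = extract(html, url='https://example.com/')
for syntax, items in data.items():
    print(syntax, len(list(items)))  # extract_items may return a generator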
def loadSOGraphFromHtml(html, url):
    """
    Extract jsonld entries from provided HTML text

    Args:
        html(string): HTML text to be parsed

    Returns:
        ConjunctiveGraph: Graph loaded from html
    """
    jslde = JsonLdExtractor()
    json_content = jslde.extract(html)
    g = ConjunctiveGraph()
    for json_data in json_content:
        g_data = loadSOGraph(data=json.dumps(json_data), publicID=url)
        g += g_data
    return g
def async_extruct(url, microdata=True, jsonld=True):
    response.content_type = 'application/json'
    resp = requests.get(url, timeout=30)
    parser = lxml.html.HTMLParser(encoding=resp.encoding)
    lxmldoc = lxml.html.fromstring(resp.content, parser=parser)
    result = {'url': url, 'status': 'ok'}
    if microdata:
        mde = MicrodataExtractor(nested=True)
        result['microdata'] = mde.extract_items(lxmldoc, url)
    if jsonld:
        jsonlde = JsonLdExtractor()
        result['json-ld'] = jsonlde.extract_items(lxmldoc)
    return result
def parse_scene(self, response):
    jslde = JsonLdExtractor()
    json = jslde.extract(response.text)
    data = {}
    for obj in json:
        if obj['@type'] == 'VideoObject':
            data = obj
            break

    item = SceneItem()
    item['title'] = self.cleanup_title(data['name'])
    item['description'] = self.cleanup_description(data['description'])
    item['image'] = data['thumbnail']
    item['image_blob'] = None
    item['id'] = self.get_id(response)
    item['trailer'] = data['contentUrl']
    item['url'] = response.url
    item['date'] = self.parse_date(data['datePublished']).isoformat()
    item['site'] = data['author']['name']
    item['network'] = self.network
    item['parent'] = item['site']
    item['performers'] = []
    for model in data['actor']:
        item['performers'].append(model['name'])
    item['tags'] = self.get_tags(response)

    days = int(self.days)
    if days > 27375:
        filterdate = "0000-00-00"
    else:
        filterdate = date.today() - timedelta(days)
        filterdate = filterdate.strftime('%Y-%m-%d')
    if self.debug:
        if not item['date'] > filterdate:
            item['filtered'] = "Scene filtered due to date restraint"
        print(item)
    else:
        if filterdate:
            if item['date'] > filterdate:
                yield item
        else:
            yield item
def parse_item(self, response): """Parse the recipe to get title and ingredients.""" schema_type = "mde" mde = MicrodataExtractor() data = mde.extract(response.body) if len(data['items']) == 0: jslde = JsonLdExtractor() data = jslde.extract(response.body) schema_type = "jsonld" if schema_type == "mde": recipe = data['items'][2]['properties'] recipe_output_item = RecipeItem() recipe_output_item['recipe_name'] = recipe['name'] recipe_output_item['ingredients'] = [ ingredient for ingredient in recipe['ingredients'] if ingredient not in ['', 'Add all ingredients to list'] ] recipe_tags = recipe['recipeCategory'] if 'recipeCuisine' in recipe.keys(): recipe_tags.append(recipe['recipeCuisine']) recipe_output_item['tags'] = recipe_tags try: recipe_output_item['description'] = recipe['description'] except KeyError: recipe_output_item['description'] = None recipe_output_item['url'] = recipe['url'] elif schema_type == "jsonld": recipe = data['items'][0] recipe_output_item = RecipeItem() recipe_output_item['recipe_name'] = recipe['name'] recipe_output_item['ingredients'] = recipe['ingredients'] recipe_output_item['tags'] = [ tag['properties']['title'] for tag in data['items'][1:] ] try: recipe_output_item['description'] = recipe['description'] except KeyError: recipe_output_item['description'] = None recipe_output_item['url'] = recipe['url'] yield recipe_output_item
def parse_item(self, response): """Parse the recipe to get title and ingredients.""" schema_type = "mde" mde = MicrodataExtractor() data = mde.extract(response.body) if len(data['items']) == 0: jslde = JsonLdExtractor() data = jslde.extract(response.body) schema_type = "jsonld" if schema_type == "mde": recipe = data['items'][2]['properties'] recipe_output_item = RecipeItem() recipe_output_item['recipe_name'] = recipe['name'] recipe_output_item['ingredients'] = [ ingredient for ingredient in recipe['ingredients'] if ingredient not in ['', 'Add all ingredients to list'] ] recipe_tags = recipe['recipeCategory'] if 'recipeCuisine' in recipe.keys(): recipe_tags.append(recipe['recipeCuisine']) recipe_output_item['tags'] = recipe_tags try: recipe_output_item['description'] = recipe['description'] except KeyError: recipe_output_item['description'] = None recipe_output_item['url'] = recipe['url'] elif schema_type == "jsonld": recipe = data['items'][0] recipe_output_item = RecipeItem() recipe_output_item['recipe_name'] = recipe['name'] recipe_output_item['ingredients'] = recipe['ingredients'] recipe_output_item['tags'] = [tag['properties']['title'] for tag in data['items'][1:]] try: recipe_output_item['description'] = recipe['description'] except KeyError: recipe_output_item['description'] = None recipe_output_item['url'] = recipe['url'] yield recipe_output_item
def async_extruct(url, microdata=True, jsonld=True):
    resp = requests.get(url, timeout=30)
    parser = lxml.html.HTMLParser(encoding=resp.encoding)
    lxmldoc = lxml.html.fromstring(resp.content, parser=parser)
    result = {"url": url, "status": "ok"}
    if microdata:
        mde = MicrodataExtractor(nested=True)
        microdata = mde.extract_items(lxmldoc, url)
        if microdata.get("items", []):
            result["microdata"] = microdata
    if jsonld:
        jsonlde = JsonLdExtractor()
        jsonldata = jsonlde.extract_items(lxmldoc)
        if jsonldata.get("items", []):
            result["json-ld"] = jsonldata
    return result
def async_extruct(url, microdata=True, jsonld=True):
    response.content_type = 'application/json'
    resp = requests.get(url, timeout=30)
    parser = lxml.html.HTMLParser(encoding=resp.encoding)
    lxmldoc = lxml.html.fromstring(resp.content, parser=parser)
    result = {'url': url, 'status': 'ok'}
    if microdata:
        mde = MicrodataExtractor(nested=True)
        microdata = mde.extract_items(lxmldoc, url)
        if microdata.get('items', []):
            result['microdata'] = microdata
    if jsonld:
        jsonlde = JsonLdExtractor()
        jsonldata = jsonlde.extract_items(lxmldoc)
        if jsonldata.get('items', []):
            result['json-ld'] = jsonldata
    return result
def parse_job(self, response):
    # Create the loader using the response
    # E.G. : l.add_xpath('item', '*xpath*', re='*expression*')
    jslde = JsonLdExtractor()
    data = jslde.extract(response.text)
    data = data[0]

    l = ItemLoader(item=JobsjsonItem(), response=response)
    l.add_value('search_postcode', self.search_postcode)
    l.add_value('search_radius', self.search_radius)
    l.add_value('date_scraped', time.strftime("%Y-%m-%d %H:%M:%S"))
    l.add_value('date_posted', data['datePosted'])
    l.add_value('valid_until', data['validThrough'])
    l.add_value('job_id', response.url, re=r'\d{7}')
    l.add_value('job_title', data['title'])
    l.add_value('job_type', data['employmentType'])
    l.add_value('location', ",".join([
        data['jobLocation']['address']['addressLocality'],
        data['jobLocation']['address']['addressRegion'],
        data['jobLocation']['address']['postalCode'],
        data['jobLocation']['address']['addressCountry']]))
    l.add_xpath('contact_name',
                '//table[@class="job-listing-table"]//tr[8]//td//text()')
    l.add_xpath('start_date',
                '//table[@class="job-listing-table"]//tr[6]//td//text()')
    try:
        l.add_value('salary_min', data['baseSalary']['value']['value'])
    except Exception:
        l.add_value('salary_min', 'NA')
    l.add_value('listed_on', data['datePosted'])
    l.add_value('recruiter', data['hiringOrganization']['name'])
    l.add_value('recruiter_url', data['hiringOrganization']['sameAs'])
    try:
        l.add_value('job_reference', data['identifier']['value'])
    except Exception:
        l.add_value('job_reference', 'NA')
    l.add_value('url', response.url)
    l.add_value('job_description', data['description'])
    l.add_value('job_skills', data['skills'])
    l.add_value('addressLocality', data['jobLocation']['address']['addressLocality'])
    l.add_value('addressRegion', data['jobLocation']['address']['addressRegion'])
    l.add_value('postalCode', data['jobLocation']['address']['postalCode'])
    l.add_value('addressCountry', data['jobLocation']['address']['addressCountry'])
    return l.load_item()
def extract(self, html_text: str,
            extract_title: bool = False,
            extract_meta: bool = False,
            extract_microdata: bool = False,
            extract_json_ld: bool = False,
            extract_rdfa: bool = False) -> List[Extraction]:
    res = list()
    soup = BeautifulSoup(html_text, 'html.parser')
    if soup.title and extract_title:
        title = self.wrap_data(
            "title", soup.title.string.encode('utf-8').decode('utf-8'))
        res.append(title)
    if soup.title and extract_meta:
        meta_content = self.wrap_meta_content(soup.find_all("meta"))
        meta_data = self.wrap_data("meta", meta_content)
        res.append(meta_data)
    if extract_microdata:
        mde = MicrodataExtractor()
        mde_data = self.wrap_data("microdata", mde.extract(html_text))
        res.append(mde_data)
    if extract_json_ld:
        jslde = JsonLdExtractor()
        jslde_data = self.wrap_data("json-ld", jslde.extract(html_text))
        res.append(jslde_data)
    if extract_rdfa:
        rdfae = RDFaExtractor()
        rdfae_data = self.wrap_data("rdfa", rdfae.extract(html_text))
        res.append(rdfae_data)
    return res
def extract_jsonld(self, response, _id=None):
    """
    Scrapy Spider Request Callback Function
    * Inject an _id field for database pipeline
    * Use response URL as default _id
    """
    jslds = JsonLdExtractor().extract(response.body)
    for jsld in jslds:
        if _id:
            jsld['_id'] = _id
        else:
            jsld['_id'] = response.url
        logging.debug(jsld)
        yield jsld
class RISJMetadataExtractor(object):
    """An extruct-based metadata extractor"""
    # TODO: Extend to microdata and RDFa, replacing bespoke xpath code. Then
    #       test on body of crawlers!

    def __init__(self, response, microdata=False, jsonld=False, rdfa=False):
        self.response = response
        self.microdata = microdata
        self.jsonld = jsonld
        self.rdfa = rdfa
        if rdfa:
            try:
                self.rdfae = RDFaExtractor()
                self.rdfadata = self.rdfae.extract(self.response.text,
                                                   url=self.response.url)
            except JSONDecodeError:
                pass
        if microdata:
            try:
                self.mde = MicrodataExtractor()
                self.mdedata = self.mde.extract(self.response.text)
            except JSONDecodeError:
                pass
        if jsonld:
            try:
                self.jlde = JsonLdExtractor()
                self.jldata = self.jlde.extract(self.response.text)
            except (JSONDecodeError, TypeError):
                self.jldata = []
            finally:
                # Sometimes we get this in the meta dict from RISJExtractJSONLD
                self.jldata.extend(self.response.meta.get('json-ld', []))

    def extract_newsarticle_schemaorg(self, microdata=None, jsonld=None,
                                      rdfa=None):
        """Extract schema.org NewsArticle metadata, encoded using any
        supported metadata format. Note that we only try to extract the
        *first* block of NewsArticle data for each method (which is then
        combined with the first extracted from other methods if more than
        one is selected)."""
        if microdata is None:
            microdata = self.microdata
        if jsonld is None:
            jsonld = self.jsonld
        if rdfa is None:
            rdfa = self.rdfa

        outd = {}
        if jsonld:
            for d in self.jldata:
                # logger.debug('Analysing JSON-LD data: '+pformat(d))
                try:
                    if (re.match(r'https?://schema.org/?', d['@context'])
                            and d['@type'] == 'NewsArticle'):
                        outd.update(d)
                except (KeyError, TypeError):
                    continue
        if microdata:
            for d in self.mdedata:
                logger.debug('Analysing W3C microdata: ' + pformat(d))
                if re.match(r'https?://schema.org/NewsArticle/?',
                            d.get('type', '')):
                    outd.update(d)
        if rdfa:
            raise NotImplementedError
        # logger.debug('Returning schema.org NewsArticle: '+pformat(outd))
        return outd
def extract(htmlstring,
            base_url=None,
            encoding="UTF-8",
            syntaxes=SYNTAXES,
            errors='strict',
            uniform=False,
            return_html_node=False,
            schema_context='http://schema.org',
            with_og_array=False,
            **kwargs):
    """htmlstring: string with valid html document;
       base_url: base url of the html document
       encoding: encoding of the html document
       syntaxes: list of syntaxes to extract, default SYNTAXES
       errors: set to 'log' to log the exceptions, 'ignore' to ignore them or
               'strict'(default) to raise them
       uniform: if True uniform output format of all syntaxes to a list of
                dicts. Returned dicts structure:
                {'@context': 'http://example.com',
                 '@type': 'example_type',
                 /* All other the properties in keys here */
                 }
       return_html_node: if True, it includes into the result a HTML node of
                         respective embedded metadata under 'htmlNode' key.
                         The feature is supported only by microdata syntax.
                         Each node is of `lxml.etree.Element` type.
       schema_context: schema's context for current page"""
    if base_url is None and 'url' in kwargs:
        warnings.warn('"url" argument is deprecated, please use "base_url"',
                      DeprecationWarning, stacklevel=2)
        base_url = kwargs.pop('url')
    if kwargs:
        raise TypeError('Unexpected keyword arguments')
    if not (isinstance(syntaxes, list)
            and all(v in SYNTAXES for v in syntaxes)):
        raise ValueError("syntaxes must be a list with any or all (default) of"
                         "these values: {}".format(SYNTAXES))
    if errors not in ['log', 'ignore', 'strict']:
        raise ValueError('Invalid error command, valid values are either "log"'
                         ', "ignore" or "strict"')
    try:
        tree = parse_xmldom_html(htmlstring, encoding=encoding)
    except Exception as e:
        if errors == 'ignore':
            return {}
        if errors == 'log':
            logger.exception('Failed to parse html, raises {}'.format(e))
            return {}
        if errors == 'strict':
            raise
    processors = []
    if 'microdata' in syntaxes:
        processors.append(
            ('microdata',
             MicrodataExtractor(add_html_node=return_html_node).extract_items,
             tree))
    if 'json-ld' in syntaxes:
        processors.append((
            'json-ld',
            JsonLdExtractor().extract_items,
            tree,
        ))
    if 'opengraph' in syntaxes:
        processors.append(
            ('opengraph', OpenGraphExtractor().extract_items, tree))
    if 'microformat' in syntaxes:
        processors.append(
            ('microformat', MicroformatExtractor().extract_items, htmlstring))
    if 'rdfa' in syntaxes:
        processors.append((
            'rdfa',
            RDFaExtractor().extract_items,
            tree,
        ))
    if 'dublincore' in syntaxes:
        processors.append((
            'dublincore',
            DublinCoreExtractor().extract_items,
            tree,
        ))
    output = {}
    for syntax, extract, document in processors:
        try:
            output[syntax] = list(extract(document, base_url=base_url))
        except Exception as e:
            if errors == 'log':
                logger.exception('Failed to extract {}, raises {}'.format(
                    syntax, e))
            if errors == 'ignore':
                pass
            if errors == 'strict':
                raise
    if uniform:
        uniform_processors = []
        if 'microdata' in syntaxes:
            uniform_processors.append((
                'microdata',
                _umicrodata_microformat,
                output['microdata'],
                schema_context,
            ))
        if 'microformat' in syntaxes:
            uniform_processors.append((
                'microformat',
                _umicrodata_microformat,
                output['microformat'],
                'http://microformats.org/wiki/',
            ))
        if 'opengraph' in syntaxes:
            uniform_processors.append((
                'opengraph',
                _uopengraph,
                output['opengraph'],
                None,
            ))
        if 'dublincore' in syntaxes:
            uniform_processors.append((
                'dublincore',
                _udublincore,
                output['dublincore'],
                None,
            ))

        for syntax, uniform, raw, schema_context in uniform_processors:
            try:
                if syntax == 'opengraph':
                    output[syntax] = uniform(raw, with_og_array=with_og_array)
                elif syntax == 'dublincore':
                    output[syntax] = uniform(raw)
                else:
                    output[syntax] = uniform(raw, schema_context)
            except Exception as e:
                if errors == 'ignore':
                    output[syntax] = []
                if errors == 'log':
                    output[syntax] = []
                    logger.exception(
                        'Failed to uniform extracted for {}, raises {}'.format(
                            syntax, e))
                if errors == 'strict':
                    raise
    return output
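A sketch of how the function above is typically called; the names follow the snippet itself, and the HTML string is made up for illustration:

html = """
<html><head>
  <script type="application/ld+json">
  {"@context": "http://schema.org", "@type": "Article", "headline": "Example"}
  </script>
</head><body></body></html>
"""

# Restrict extraction to JSON-LD only and log (rather than raise) any errors.
result = extract(html, base_url='http://example.com/',
                 syntaxes=['json-ld'], errors='log')
print(result['json-ld'])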
def _extract_json_data(self, blob):
    html = blob.download_as_string().decode()
    jslde = JsonLdExtractor()
    data = jslde.extract(html)
    return json.dumps(strip_strings(data))
def extract(htmlstring, base_url=None, encoding="UTF-8",
            syntaxes=SYNTAXES, errors='strict', uniform=False,
            schema_context='http://schema.org', **kwargs):
    """htmlstring: string with valid html document;
       base_url: base url of the html document
       encoding: encoding of the html document
       syntaxes: list of syntaxes to extract, default SYNTAXES
       errors: set to 'log' to log the exceptions, 'ignore' to ignore them or
               'strict'(default) to raise them
       uniform: if True uniform output format of all syntaxes to a list of
                dicts. Returned dicts structure:
                {'@context': 'http://example.com',
                 '@type': 'example_type',
                 /* All other the properties in keys here */
                 }
       schema_context: schema's context for current page"""
    if base_url is None and 'url' in kwargs:
        warnings.warn('"url" argument is deprecated, please use "base_url"',
                      DeprecationWarning)
        base_url = kwargs.pop('url')
    if kwargs:
        raise TypeError('Unexpected keyword arguments')
    if not (isinstance(syntaxes, list)
            and all(v in SYNTAXES for v in syntaxes)):
        raise ValueError("syntaxes must be a list with any or all (default) of"
                         "these values: {}".format(SYNTAXES))
    if errors not in ['log', 'ignore', 'strict']:
        raise ValueError('Invalid error command, valid values are either "log"'
                         ', "ignore" or "strict"')
    domparser = XmlDomHTMLParser(encoding=encoding)
    tree = fromstring(htmlstring, parser=domparser)
    processors = []
    if 'microdata' in syntaxes:
        processors.append(
            ('microdata', MicrodataExtractor().extract_items, tree))
    if 'json-ld' in syntaxes:
        processors.append(('json-ld', JsonLdExtractor().extract_items, tree))
    if 'opengraph' in syntaxes:
        processors.append(
            ('opengraph', OpenGraphExtractor().extract_items, tree))
    if 'microformat' in syntaxes:
        processors.append(
            ('microformat', MicroformatExtractor().extract_items, htmlstring))
    if 'rdfa' in syntaxes:
        processors.append(('rdfa', RDFaExtractor().extract_items, tree))
    output = {}
    for label, extract, document in processors:
        try:
            output[label] = list(extract(document, base_url=base_url))
        except Exception:
            if errors == 'log':
                logger.exception('Failed to extract {}'.format(label))
            if errors == 'ignore':
                pass
            if errors == 'strict':
                raise
    if uniform:
        if 'microdata' in syntaxes:
            output['microdata'] = _umicrodata_microformat(
                output['microdata'], schema_context=schema_context)
        if 'microformat' in syntaxes:
            output['microformat'] = _umicrodata_microformat(
                output['microformat'],
                schema_context='http://microformats.org/wiki/')
        if 'opengraph' in syntaxes:
            output['opengraph'] = _uopengraph(output['opengraph'])
    return output
def parse_scene(self, response):
    data = response.css('script:contains("dataLayer =")::text').get()
    data2 = response.xpath(
        "//script[contains(text(), 'ScenePlayerId = \"player\"')] | //script[contains(text(), 'ScenePlayerId = \"scenePlayer\"')] | //script[contains(text(), 'sceneReleaseDate')]"
    ).get()
    data3 = response.xpath(
        '//script[@type="application/ld+json"]/text()').get()
    if data3:
        data3 = json.loads(data3)
        data3 = data3[0]
    else:
        data3 = []

    if len(chompjs.parse_js_object(data)):
        json_data = chompjs.parse_js_object(data)[0]

        jslde = JsonLdExtractor().extract(response.text)
        jsonlde = {}
        for obj in jslde:
            jsonlde.update(obj)

        item = SceneItem()

        if 'name' in jsonlde:
            item['title'] = jsonlde['name']
        elif 'sceneDetails' in json_data and 'sceneTitle' in json_data['sceneDetails']:
            item['title'] = json_data['sceneDetails']['sceneTitle']
        else:
            item['title'] = self.get_title(response)

        if item['title']:
            if ", scene #01" in item['title'].lower():
                item['title'] = item['title'].replace(
                    ", Scene #01", "").replace(", scene #01", "")

        if 'sceneDetails' in json_data and 'sceneDescription' in json_data['sceneDetails']:
            item['description'] = json_data['sceneDetails']['sceneDescription']
        elif 'description' in jsonlde:
            item['description'] = jsonlde['description']
        else:
            item['description'] = self.get_description(response)

        if 'site' in response.meta:
            item['site'] = response.meta['site']
        elif 'productionCompany' in data3:
            item['site'] = data3['productionCompany']['name']
        elif 'siteName_pretty' in json_data:
            item['site'] = json_data['siteName_pretty']
        elif 'siteName' in json_data:
            item['site'] = json_data['siteName']

        if item['site']:
            item['site'] = match_site(item['site'])

        if not item['site']:
            item['site'] = self.get_site(response)

        if 'date' in response.meta:
            item['date'] = response.meta['date']
        elif 'dateCreated' in jsonlde and 'nudefightclub' not in response.url \
                and '0000-00-00' not in jsonlde['dateCreated']:
            item['date'] = self.parse_date(
                jsonlde['dateCreated'], date_formats=['%Y-%m-%d']).isoformat()
        elif 'datePublished' in jsonlde and 'nudefightclub' not in response.url \
                and '0000-00-00' not in jsonlde['datePublished']:
            item['date'] = self.parse_date(
                jsonlde['datePublished'], date_formats=['%Y-%m-%d']).isoformat()
        elif 'nudefightclub' in response.url:
            date1 = response.xpath(
                '//div[@class="updatedDate"]/b/following-sibling::text()').get()
            item['date'] = self.parse_date(date1.strip()).isoformat()
        else:
            item['date'] = self.get_date(response)

        if not item['date']:
            item['date'] = self.get_date(response)

        if data2:
            date2 = re.search(r'sceneReleaseDate\":\"(\d{4}-\d{2}-\d{2})', data2)
            if date2:
                date2 = date2.group(1)
                date2 = self.parse_date(
                    date2.strip(), date_formats=['%Y-%m-%d']).isoformat()
                if item['date'] and date2 > item['date']:
                    item['date'] = date2

        if 'image' in response.meta:
            item['image'] = response.meta['image']
        else:
            item['image'] = self.get_image(response)

        item['image_blob'] = None

        if 'performers' in response.meta:
            item['performers'] = response.meta['performers']
        elif 'actor' in jsonlde:
            item['performers'] = list(
                map(lambda x: x['name'].strip(), jsonlde['actor']))
        else:
            item['performers'] = self.get_performers(response)

        if 'tags' in response.meta:
            item['tags'] = response.meta['tags']
        elif 'keywords' in jsonlde:
            item['tags'] = jsonlde['keywords'].split(',')
        else:
            item['tags'] = self.get_tags(response)

        if item['tags']:
            item['tags'] = list(
                map(lambda x: string.capwords(x.strip()), item['tags']))

        if 'id' in response.meta:
            item['id'] = response.meta['id']
        else:
            item['id'] = self.get_id(response)

        if 'trailer' in response.meta:
            item['trailer'] = response.meta['trailer']
        else:
            item['trailer'] = self.get_trailer(response)

        item['url'] = self.get_url(response)

        if hasattr(self, 'network'):
            item['network'] = self.network
        else:
            item['network'] = self.get_network(response)

        if hasattr(self, 'parent'):
            item['parent'] = self.parent
        else:
            item['parent'] = self.get_parent(response)

        if item['title']:
            item['title'] = self.cleanup_title(item['title'])

        if item['description']:
            item['description'] = self.cleanup_description(item['description'])

        if item['id'] and item['title']:
            days = int(self.days)
            if days > 27375:
                filterdate = "0000-00-00"
            else:
                filterdate = date.today() - timedelta(days)
                filterdate = filterdate.strftime('%Y-%m-%d')
            if self.debug:
                if not item['date'] > filterdate:
                    item['filtered'] = "Scene filtered due to date restraint"
                print(item)
            else:
                if filterdate:
                    if item['date'] > filterdate:
                        yield item
                else:
                    yield item
    else:
        super().parse_scene(response)
def getJsonLdFromHTML(html_text):
    """
    Returns an array of json_ld structures found in the provided html_text
    """
    jslde = JsonLdExtractor()
    return jslde.extract(html_text)
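A tiny illustrative call of the helper above on an inline HTML string (the sample markup is made up):

sample_html = (
    '<html><head><script type="application/ld+json">'
    '{"@context": "http://schema.org", "@type": "Person", "name": "Jane Doe"}'
    '</script></head><body></body></html>'
)
for entry in getJsonLdFromHTML(sample_html):
    print(entry['@type'], entry.get('name'))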
def _check_jsonld(self, body, expected):
    jsonlde = JsonLdExtractor()
    data = jsonlde.extract(body)
    self.assertEqual(data, expected)