def metadata_from_url(url, microdata=True, jsonld=True, rdfa=True):
    resp = requests.get(url, timeout=30)
    result = {
        'url': url,
        'status': '{} {}'.format(resp.status_code, resp.reason)
    }
    try:
        resp.raise_for_status()
    except requests.exceptions.HTTPError:
        return result

    parser = XmlDomHTMLParser(encoding=resp.encoding)
    tree = lxml.html.fromstring(resp.content, parser=parser)

    if microdata:
        mde = MicrodataExtractor(nested=True)
        result['microdata'] = mde.extract_items(tree, resp.url)

    if jsonld:
        jsonlde = JsonLdExtractor()
        result['json-ld'] = jsonlde.extract_items(tree, resp.url)

    if rdfa:
        rdfae = RDFaExtractor()
        result['rdfa'] = rdfae.extract_items(tree, resp.url)

    return result
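# A minimal usage sketch for metadata_from_url above; it assumes the
# snippet's own imports (requests, lxml.html and the extruct extractors),
# and the URL is illustrative only.
result = metadata_from_url('http://example.com/')
print(result['status'])
# On a 4xx/5xx response only 'url' and 'status' are present.
if 'json-ld' in result:
    print(result['json-ld'])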
def test_expanded_opengraph_support(self):
    body = get_testdata('misc', 'expanded_OG_support_test.html')
    expected = json.loads(
        get_testdata('misc', 'expanded_OG_support_test.json').decode('UTF-8'))
    rdfae = RDFaExtractor()
    data = rdfae.extract(body, base_url='http://www.example.com/index.html')
    self.assertJsonLDEqual(data, expected)
def test_wikipedia_xhtml_rdfa_no_prefix(self):
    body = get_testdata('misc', 'Portfolio_Niels_Lubberman.html')
    expected = json.loads(
        get_testdata('misc', 'Portfolio_Niels_Lubberman.json').decode('UTF-8'))
    rdfae = RDFaExtractor()
    data = rdfae.extract(body, base_url='http://nielslubberman.nl/drupal/')
    self.assertJsonLDEqual(data, expected)
def test_w3c_rdf11primer(self):
    for i in [14]:
        fileprefix = 'w3c.rdf11primer.example{:03d}'.format(i)
        body = get_testdata('w3crdfa', fileprefix + '.html').decode('UTF-8')
        expected = json.loads(
            get_testdata('w3crdfa', fileprefix + '.expanded.json').decode('UTF-8'))
        rdfae = RDFaExtractor()
        data = rdfae.extract(body, url='http://www.exaple.com/index.html')
        self.assertJsonLDEqual(data, expected)
def test_w3c_rdf11primer(self):
    for i in [14]:
        fileprefix = 'w3c.rdf11primer.example{:03d}'.format(i)
        body = get_testdata('w3crdfa', fileprefix + '.html')
        expected = json.loads(
            get_testdata('w3crdfa', fileprefix + '.expanded.json').decode('UTF-8'))
        rdfae = RDFaExtractor()
        data = rdfae.extract(body, url='http://www.example.com/index.html')
        self.assertJsonLDEqual(data, expected)
def test_wikipedia_xhtml_rdfa(self):
    fileprefix = 'xhtml+rdfa'
    body = get_testdata('wikipedia', fileprefix + '.html').decode('UTF-8')
    expected = json.loads(
        get_testdata('wikipedia', fileprefix + '.expanded.json').decode('UTF-8'))
    rdfae = RDFaExtractor()
    data = rdfae.extract(body, url='http://www.exaple.com/index.html')
    self.assertJsonLDEqual(data, expected)
def extract(self, html_text: str,
            extract_title: bool = False,
            extract_meta: bool = False,
            extract_microdata: bool = False,
            microdata_base_url: str = "",
            extract_json_ld: bool = False,
            extract_rdfa: bool = False,
            rdfa_base_url: str = "") -> List[Extraction]:
    """
    Args:
        html_text (str): input html string to be extracted
        extract_title (bool): True if the text of the 'title' tag needs to be
            extracted, returned as { "title": "..." }
        extract_meta (bool): True if the 'meta' tags need to be extracted,
            returned as { "meta": { "author": "...", ... } }
        extract_microdata (bool): True if microdata needs to be extracted,
            returned as { "microdata": [...] }
        microdata_base_url (str): base namespace url for microdata,
            empty string if no base url is specified
        extract_json_ld (bool): True if json-ld needs to be extracted,
            returned as { "json-ld": [...] }
        extract_rdfa (bool): True if rdfa needs to be extracted,
            returned as { "rdfa": [...] }
        rdfa_base_url (str): base namespace url for rdfa,
            empty string if no base url is specified

    Returns:
        List[Extraction]: the list of extractions, or an empty list if there
        are no matches.
    """
    res = list()
    soup = BeautifulSoup(html_text, 'html.parser')

    if soup.title and extract_title:
        title = self._wrap_data(
            "title", soup.title.string.encode('utf-8').decode('utf-8'))
        res.append(title)

    if soup.title and extract_meta:
        meta_content = self._wrap_meta_content(soup.find_all("meta"))
        meta_data = self._wrap_data("meta", meta_content)
        res.append(meta_data)

    if extract_microdata:
        mde = MicrodataExtractor()
        mde_data = self._wrap_data(
            "microdata", mde.extract(html_text, microdata_base_url))
        res.append(mde_data)

    if extract_json_ld:
        jslde = JsonLdExtractor()
        jslde_data = self._wrap_data("json-ld", jslde.extract(html_text))
        res.append(jslde_data)

    if extract_rdfa:
        rdfae = RDFaExtractor()
        rdfae_data = self._wrap_data(
            "rdfa", rdfae.extract(html_text, rdfa_base_url))
        res.append(rdfae_data)

    return res
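# A hypothetical call of the extract() method above; the owning class name
# (MetadataExtractor) and the sample HTML are illustrative, and the class is
# assumed to provide the _wrap_data/_wrap_meta_content helpers it uses.
html = ('<html><head><title>Example</title>'
        '<script type="application/ld+json">{"@type": "NewsArticle"}</script>'
        '</head><body></body></html>')
extractions = MetadataExtractor().extract(html, extract_title=True,
                                          extract_json_ld=True)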
def test_w3c_rdfaprimer(self):
    for i in [5, 6, 7, 8, 9, 10, 11, 15]:
        fileprefix = 'w3c.rdfaprimer.example{:03d}'.format(i)
        print(fileprefix)
        body = get_testdata('w3crdfa', fileprefix + '.html').decode('UTF-8')
        expected = json.loads(
            get_testdata('w3crdfa', fileprefix + '.expanded.json').decode('UTF-8'))
        rdfae = RDFaExtractor()
        data = rdfae.extract(body, url='http://www.example.com/index.html')
        print("extracted:\n%s" % pformat(tupleize(data)))
        print("expected:\n%s" % pformat(tupleize(expected)))
        print("extracted:\n%s" % self.prettify(data))
        print("expected:\n%s" % self.prettify(expected))
        self.assertJsonLDEqual(data, expected)
def test_w3c_rdfaprimer(self):
    for i in [5, 6, 7, 8, 9, 10, 11, 15]:
        fileprefix = 'w3c.rdfaprimer.example{:03d}'.format(i)
        print(fileprefix)
        body = get_testdata('w3crdfa', fileprefix + '.html')
        expected = json.loads(
            get_testdata('w3crdfa', fileprefix + '.expanded.json').decode('UTF-8'))
        rdfae = RDFaExtractor()
        data = rdfae.extract(body, url='http://www.example.com/index.html')
        print("extracted:\n%s" % pformat(tupleize(data)))
        print("expected:\n%s" % pformat(tupleize(expected)))
        print("extracted:\n%s" % self.prettify(data))
        print("expected:\n%s" % self.prettify(expected))
        self.assertJsonLDEqual(data, expected)
def extract(htmlstring, url='http://www.example.com/', encoding="UTF-8"):
    domparser = XmlDomHTMLParser(encoding=encoding)
    tree = fromstring(htmlstring, parser=domparser)
    return {
        name: extractor.extract_items(tree, url=url)
        for name, extractor in (('json-ld', JsonLdExtractor()),
                                ('microdata', MicrodataExtractor()),
                                ('rdfa', RDFaExtractor()))
    }
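# A brief usage sketch for the extract() helper above; requests is an
# extra, illustrative dependency used only to fetch a page to parse.
import requests

resp = requests.get('http://example.com/')
data = extract(resp.text, url=resp.url)
print(data['rdfa'])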
def test_w3c_rdfaprimer(self):
    for i in [5, 6, 7, 8, 9, 10, 11, 15]:
        fileprefix = 'w3c.rdfaprimer.example{:03d}'.format(i)
        print(fileprefix)
        body = get_testdata('w3crdfa', fileprefix + '.html')
        expected = json.loads(
            get_testdata('w3crdfa', fileprefix + '.expanded.json').decode('UTF-8'))
        rdfae = RDFaExtractor()
        data = rdfae.extract(body, base_url='http://www.example.com/index.html')
        self.assertJsonLDEqual(data, expected)

        # This tests that the fix for issue 116 does not severely affect
        # rdfa output even in the presence of a bug in the code
        def mocked_fix_order(x, y, z):
            raise Exception()

        rdfae._fix_order = mocked_fix_order
        data = rdfae.extract(body, base_url='http://www.example.com/index.html')
        self.assertJsonLDEqual(data, expected)
def extract(self, html_text: str,
            extract_title: bool = False,
            extract_meta: bool = False,
            extract_microdata: bool = False,
            extract_json_ld: bool = False,
            extract_rdfa: bool = False) -> List[Extraction]:
    res = list()
    soup = BeautifulSoup(html_text, 'html.parser')

    if soup.title and extract_title:
        title = self.wrap_data(
            "title", soup.title.string.encode('utf-8').decode('utf-8'))
        res.append(title)

    if soup.title and extract_meta:
        meta_content = self.wrap_meta_content(soup.find_all("meta"))
        meta_data = self.wrap_data("meta", meta_content)
        res.append(meta_data)

    if extract_microdata:
        mde = MicrodataExtractor()
        mde_data = self.wrap_data("microdata", mde.extract(html_text))
        res.append(mde_data)

    if extract_json_ld:
        jslde = JsonLdExtractor()
        jslde_data = self.wrap_data("json-ld", jslde.extract(html_text))
        res.append(jslde_data)

    if extract_rdfa:
        rdfae = RDFaExtractor()
        rdfae_data = self.wrap_data("rdfa", rdfae.extract(html_text))
        res.append(rdfae_data)

    return res
def get_rdfa_from_warc(warc_file_no, path):
    global iteration_count
    global report_at_every_nth_step

    rdfaFileID = 1
    htmlURL = ''
    data = ''
    append = False
    rdfaExtractor = RDFaExtractor()

    if not os.path.exists('RDFa Files\\WARC_{0}'.format(warc_file_no)):
        os.makedirs('RDFa Files\\WARC_{0}'.format(warc_file_no))
    if not os.path.exists('XML Files\\WARC_{0}'.format(warc_file_no)):
        os.makedirs('XML Files\\WARC_{0}'.format(warc_file_no))

    print('[INFO/PROGRESS] The file being processed: {0}'.format(path))

    with open(path, encoding='utf-8', errors='replace') as file:
        for line in file:
            if debug and iteration_count % report_at_every_nth_step == 0:
                print("[DEBUG/PROGRESS] Processing line #{0:n}".format(iteration_count))
            if 'WARC/1.0' in line and append:
                append = False
                try:
                    rdfaData = rdfaExtractor.extract(data, base_url=htmlURL)
                    if rdfaData != []:
                        with open('RDFa Files\\WARC_{0}\\RDFa_{1}.txt'.format(
                                warc_file_no, rdfaFileID), 'w', encoding='utf-8') as f:
                            f.write('URL: {0}\n\n'.format(htmlURL))
                            f.write(str(rdfaData))
                        ConvertToXML.convertInstant(
                            str(rdfaData),
                            "XML Files\\WARC_{0}\\RDFa_{1}.xml".format(
                                warc_file_no, rdfaFileID))
                        if debug:
                            print("[DEBUG/PROGRESS] Processed file #{0} at URI {1} successfully"
                                  .format(rdfaFileID, htmlURL))
                except json.decoder.JSONDecodeError as jde:
                    print('[ERROR] Current file (#{0}) could not be converted to RDF/XML: {1} | '
                          'This JSON-LD may be invalid.'.format(rdfaFileID, str(jde)))
                except lxml.etree.ParserError as pe:
                    print('[ERROR] Current file (#{0}) could not be converted to RDF/XML: {1} | '
                          'This file may not have a valid RDFa representation.'.format(rdfaFileID, str(pe)))
                except Exception as exc:
                    if str(exc).startswith('Can\'t split'):
                        print('[ERROR] Current file (#{0}) could not be converted to RDF/XML: {1} | '
                              'This file may contain invalid XML namespaces.'.format(rdfaFileID, str(exc)))
                    else:
                        print('[ERROR] An error has occurred while processing the current file (#{0}): {1}'
                              .format(rdfaFileID, str(exc)))
                finally:
                    rdfaFileID += 1
                    data = ''
                    htmlURL = ''
            if 'WARC-Target-URI:' in line:
                htmlURL = line.replace('WARC-Target-URI: ', '').replace('\r', '').replace('\n', '')
            if '<!DOCTYPE html' in line or '<!doctype html' in line or '<html' in line:
                append = True
            if append:
                data = data + line + '\n'
            iteration_count = iteration_count + 1

    return iteration_count
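# A hypothetical driver for get_rdfa_from_warc above; the module-level
# globals the function relies on (debug, iteration_count,
# report_at_every_nth_step) are initialised here for illustration, and the
# WARC filename is a placeholder.
debug = True
iteration_count = 0
report_at_every_nth_step = 100000

lines_processed = get_rdfa_from_warc(1, 'example.warc')
print('[INFO/PROGRESS] Total lines processed: {0}'.format(lines_processed))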
def extract(htmlstring, base_url=None, encoding="UTF-8",
            syntaxes=SYNTAXES,
            errors='strict',
            uniform=False,
            schema_context='http://schema.org',
            **kwargs):
    """htmlstring: string with valid html document;
       base_url: base url of the html document
       encoding: encoding of the html document
       syntaxes: list of syntaxes to extract, default SYNTAXES
       errors: set to 'log' to log the exceptions, 'ignore' to ignore them
               or 'strict' (default) to raise them
       uniform: if True, uniform the output format of all syntaxes to a
                list of dicts. Returned dicts structure:
                {'@context': 'http://example.com',
                 '@type': 'example_type',
                 /* all other properties as keys here */
                }
       schema_context: schema's context for current page"""
    if base_url is None and 'url' in kwargs:
        warnings.warn('"url" argument is deprecated, please use "base_url"',
                      DeprecationWarning)
        base_url = kwargs.pop('url')
    if kwargs:
        raise TypeError('Unexpected keyword arguments')
    if not (isinstance(syntaxes, list) and
            all(v in SYNTAXES for v in syntaxes)):
        raise ValueError("syntaxes must be a list with any or all (default) of"
                         " these values: {}".format(SYNTAXES))
    if errors not in ['log', 'ignore', 'strict']:
        raise ValueError('Invalid error command, valid values are either "log"'
                         ', "ignore" or "strict"')
    domparser = XmlDomHTMLParser(encoding=encoding)
    tree = fromstring(htmlstring, parser=domparser)
    processors = []
    if 'microdata' in syntaxes:
        processors.append(
            ('microdata', MicrodataExtractor().extract_items, tree))
    if 'json-ld' in syntaxes:
        processors.append(('json-ld', JsonLdExtractor().extract_items, tree))
    if 'opengraph' in syntaxes:
        processors.append(
            ('opengraph', OpenGraphExtractor().extract_items, tree))
    if 'microformat' in syntaxes:
        processors.append(
            ('microformat', MicroformatExtractor().extract_items, htmlstring))
    if 'rdfa' in syntaxes:
        processors.append(('rdfa', RDFaExtractor().extract_items, tree))
    output = {}
    for label, extract, document in processors:
        try:
            output[label] = list(extract(document, base_url=base_url))
        except Exception:
            if errors == 'log':
                logger.exception('Failed to extract {}'.format(label))
            if errors == 'ignore':
                pass
            if errors == 'strict':
                raise
    if uniform:
        if 'microdata' in syntaxes:
            output['microdata'] = _umicrodata_microformat(
                output['microdata'], schema_context=schema_context)
        if 'microformat' in syntaxes:
            output['microformat'] = _umicrodata_microformat(
                output['microformat'],
                schema_context='http://microformats.org/wiki/')
        if 'opengraph' in syntaxes:
            output['opengraph'] = _uopengraph(output['opengraph'])
    return output
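# A sketch exercising the uniform and errors options of this extract()
# version; the HTML string and base_url are illustrative.
html = '<html><head><meta property="og:title" content="Example"></head></html>'
data = extract(html, base_url='http://example.com/',
               syntaxes=['opengraph', 'microformat'], errors='log', uniform=True)
print(data['opengraph'])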
def extract_all_rdfa(response):
    rdfa_extractor = RDFaExtractor()
    return rdfa_extractor.extract(response.text, url=response.url)
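# extract_all_rdfa only needs an object exposing .text and .url (e.g. a
# Scrapy Response). A minimal stand-in for local testing:
from types import SimpleNamespace

fake_response = SimpleNamespace(
    text='<html><body vocab="http://schema.org/">'
         '<span property="name">Jane</span></body></html>',
    url='http://example.com/')
print(extract_all_rdfa(fake_response))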
        } for s in subset[0].get('article:section') or []]

    json_ld_as_map = {
        '@context': 'http://schema.org',
        '@id': url,
        'author': authors,
        'section': sections
    }
    return json.dumps(json_ld_as_map)


def load_json_ld(tx, json_ld_data):
    cypher_neosemantics = " CALL semantics.importRDFSnippet($payload,'JSON-LD');"
    import_summary = tx.run(cypher_neosemantics, payload=json_ld_data)
    print(import_summary)


uri = "bolt://localhost:7687"
driver = GraphDatabase.driver(uri, auth=("neo4j", "neo"))

rss_entries_as_json_ld, entry_url_list = get_rss(
    'https://www.theguardian.com/uk/rss')

with driver.session() as session:
    session.write_transaction(load_json_ld, rss_entries_as_json_ld)
    rdfa_ext = RDFaExtractor()
    for url in entry_url_list:
        session.write_transaction(load_json_ld,
                                  get_article_additional_details(url))

driver.close()
def extract(htmlstring,
            base_url=None,
            encoding="UTF-8",
            syntaxes=SYNTAXES,
            errors='strict',
            uniform=False,
            return_html_node=False,
            schema_context='http://schema.org',
            with_og_array=False,
            **kwargs):
    """htmlstring: string with valid html document;
       base_url: base url of the html document
       encoding: encoding of the html document
       syntaxes: list of syntaxes to extract, default SYNTAXES
       errors: set to 'log' to log the exceptions, 'ignore' to ignore them
               or 'strict' (default) to raise them
       uniform: if True, uniform the output format of all syntaxes to a
                list of dicts. Returned dicts structure:
                {'@context': 'http://example.com',
                 '@type': 'example_type',
                 /* all other properties as keys here */
                }
       return_html_node: if True, include in the result an HTML node of the
                         respective embedded metadata under the 'htmlNode'
                         key. The feature is supported only by the microdata
                         syntax. Each node is of `lxml.etree.Element` type.
       schema_context: schema's context for current page"""
    if base_url is None and 'url' in kwargs:
        warnings.warn('"url" argument is deprecated, please use "base_url"',
                      DeprecationWarning, stacklevel=2)
        base_url = kwargs.pop('url')
    if kwargs:
        raise TypeError('Unexpected keyword arguments')
    if not (isinstance(syntaxes, list) and
            all(v in SYNTAXES for v in syntaxes)):
        raise ValueError("syntaxes must be a list with any or all (default) of"
                         " these values: {}".format(SYNTAXES))
    if errors not in ['log', 'ignore', 'strict']:
        raise ValueError('Invalid error command, valid values are either "log"'
                         ', "ignore" or "strict"')
    try:
        tree = parse_xmldom_html(htmlstring, encoding=encoding)
    except Exception as e:
        if errors == 'ignore':
            return {}
        if errors == 'log':
            logger.exception('Failed to parse html, raises {}'.format(e))
            return {}
        if errors == 'strict':
            raise
    processors = []
    if 'microdata' in syntaxes:
        processors.append(
            ('microdata',
             MicrodataExtractor(add_html_node=return_html_node).extract_items,
             tree))
    if 'json-ld' in syntaxes:
        processors.append(('json-ld', JsonLdExtractor().extract_items, tree))
    if 'opengraph' in syntaxes:
        processors.append(
            ('opengraph', OpenGraphExtractor().extract_items, tree))
    if 'microformat' in syntaxes:
        processors.append(
            ('microformat', MicroformatExtractor().extract_items, htmlstring))
    if 'rdfa' in syntaxes:
        processors.append(('rdfa', RDFaExtractor().extract_items, tree))
    if 'dublincore' in syntaxes:
        processors.append(
            ('dublincore', DublinCoreExtractor().extract_items, tree))
    output = {}
    for syntax, extract, document in processors:
        try:
            output[syntax] = list(extract(document, base_url=base_url))
        except Exception as e:
            if errors == 'log':
                logger.exception('Failed to extract {}, raises {}'.format(
                    syntax, e))
            if errors == 'ignore':
                pass
            if errors == 'strict':
                raise
    if uniform:
        uniform_processors = []
        if 'microdata' in syntaxes:
            uniform_processors.append(
                ('microdata', _umicrodata_microformat, output['microdata'],
                 schema_context))
        if 'microformat' in syntaxes:
            uniform_processors.append(
                ('microformat', _umicrodata_microformat, output['microformat'],
                 'http://microformats.org/wiki/'))
        if 'opengraph' in syntaxes:
            uniform_processors.append(
                ('opengraph', _uopengraph, output['opengraph'], None))
        if 'dublincore' in syntaxes:
            uniform_processors.append(
                ('dublincore', _udublincore, output['dublincore'], None))

        for syntax, uniform, raw, schema_context in uniform_processors:
            try:
                if syntax == 'opengraph':
                    output[syntax] = uniform(raw, with_og_array=with_og_array)
                elif syntax == 'dublincore':
                    output[syntax] = uniform(raw)
                else:
                    output[syntax] = uniform(raw, schema_context)
            except Exception as e:
                if errors == 'ignore':
                    output[syntax] = []
                if errors == 'log':
                    output[syntax] = []
                    logger.exception(
                        'Failed to uniform extracted for {}, raises {}'.format(
                            syntax, e))
                if errors == 'strict':
                    raise
    return output
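# A usage sketch for this newer extract() entry point, following extruct's
# documented call style; the HTML string is illustrative.
html = ('<html><body vocab="http://schema.org/" typeof="Person">'
        '<span property="name">Jane Doe</span></body></html>')
data = extract(html, base_url='http://example.com/',
               syntaxes=['rdfa', 'json-ld'], errors='ignore')
print(data['rdfa'])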
class RISJMetadataExtractor(object):
    """An extruct-based metadata extractor"""
    # TODO: Extend to microdata and RDFa, replacing bespoke xpath code. Then
    # test on body of crawlers!

    def __init__(self, response, microdata=False, jsonld=False, rdfa=False):
        self.response = response
        self.microdata = microdata
        self.jsonld = jsonld
        self.rdfa = rdfa
        if rdfa:
            try:
                self.rdfae = RDFaExtractor()
                self.rdfadata = self.rdfae.extract(self.response.text,
                                                   url=self.response.url)
            except JSONDecodeError:
                pass
        if microdata:
            try:
                self.mde = MicrodataExtractor()
                self.mdedata = self.mde.extract(self.response.text)
            except JSONDecodeError:
                pass
        if jsonld:
            try:
                self.jlde = JsonLdExtractor()
                self.jldata = self.jlde.extract(self.response.text)
            except (JSONDecodeError, TypeError):
                self.jldata = []
            finally:
                # Sometimes we get this in the meta dict from RISJExtractJSONLD
                self.jldata.extend(self.response.meta.get('json-ld', []))

    def extract_newsarticle_schemaorg(self, microdata=None, jsonld=None,
                                      rdfa=None):
        """Extract schema.org NewsArticle metadata, encoded using any
        supported metadata format. Note that we only try to extract the
        *first* block of NewsArticle data for each method (which is then
        combined with the first extracted from other methods if more than
        one is selected)."""
        if microdata is None:
            microdata = self.microdata
        if jsonld is None:
            jsonld = self.jsonld
        if rdfa is None:
            rdfa = self.rdfa

        outd = {}
        if jsonld:
            for d in self.jldata:
                # logger.debug('Analysing JSON-LD data: ' + pformat(d))
                try:
                    if (re.match(r'https?://schema.org/?', d['@context']) and
                            d['@type'] == 'NewsArticle'):
                        outd.update(d)
                except (KeyError, TypeError):
                    continue
        if microdata:
            for d in self.mdedata:
                logger.debug('Analysing W3C microdata: ' + pformat(d))
                if re.match(r'https?://schema.org/NewsArticle/?',
                            d.get('type', '')):
                    outd.update(d)
        if rdfa:
            raise NotImplementedError
        # logger.debug('Returning schema.org NewsArticle: ' + pformat(outd))
        return outd
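# A hedged sketch of driving RISJMetadataExtractor from a Scrapy spider
# callback; the spider class, start URL and yielded fields are assumed,
# not part of the original extractor.
import scrapy

class NewsSpider(scrapy.Spider):
    name = 'news'
    start_urls = ['http://example.com/article']

    def parse(self, response):
        extractor = RISJMetadataExtractor(response, jsonld=True, microdata=True)
        article = extractor.extract_newsarticle_schemaorg()
        yield {'url': response.url, 'headline': article.get('headline')}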