Example #1
# Imports required to run this example (assumed module paths from the
# extruct package):
import lxml.html
import requests
from extruct.jsonld import JsonLdExtractor
from extruct.rdfa import RDFaExtractor
from extruct.w3cmicrodata import MicrodataExtractor
from extruct.xmldom import XmlDomHTMLParser


def metadata_from_url(url, microdata=True, jsonld=True, rdfa=True):
    resp = requests.get(url, timeout=30)
    result = {
        'url': url,
        'status': '{} {}'.format(resp.status_code, resp.reason)
    }
    try:
        resp.raise_for_status()
    except requests.exceptions.HTTPError:
        return result

    parser = XmlDomHTMLParser(encoding=resp.encoding)
    tree = lxml.html.fromstring(resp.content, parser=parser)

    if microdata:
        mde = MicrodataExtractor(nested=True)
        result['microdata'] = mde.extract_items(tree, resp.url)

    if jsonld:
        jsonlde = JsonLdExtractor()
        result['json-ld'] = jsonlde.extract_items(tree, resp.url)

    if rdfa:
        rdfae = RDFaExtractor()
        result['rdfa'] = rdfae.extract_items(tree, resp.url)

    return result
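
A minimal usage sketch for the helper above; the target URL is a placeholder, and the keys read below are the ones the function itself puts into its result dict:

result = metadata_from_url('https://example.com/')
print(result['status'])                   # e.g. "200 OK"
for item in result.get('microdata', []):  # only present when extraction ran
    print(item.get('type'))
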
Example #2
    def __init__(self, response, microdata=False, jsonld=False, rdfa=False):
        self.response = response
        self.microdata = microdata
        self.jsonld = jsonld
        self.rdfa = rdfa

        if rdfa:
            try:
                self.rdfae = RDFaExtractor()
                self.rdfadata = self.rdfae.extract(self.response.text,
                                                   url=self.response.url)
            except JSONDecodeError:
                pass
        if microdata:
            try:
                self.mde = MicrodataExtractor()
                self.mdedata = self.mde.extract(self.response.text)
            except JSONDecodeError:
                pass
        if jsonld:
            try:
                self.jlde = JsonLdExtractor()
                self.jldata = self.jlde.extract(self.response.text)
            except (JSONDecodeError, TypeError):
                self.jldata = []
            finally:
                # Sometimes we get this in the meta dict from RISJExtractJSONLD
                self.jldata.extend(self.response.meta.get('json-ld', []))
Example #3
    def test_expanded_opengraph_support(self):
        body = get_testdata('misc', 'expanded_OG_support_test.html')
        expected = json.loads(
            get_testdata('misc',
                         'expanded_OG_support_test.json').decode('UTF-8'))

        rdfae = RDFaExtractor()
        data = rdfae.extract(body, base_url='http://www.example.com/index.html')

        self.assertJsonLDEqual(data, expected)
Example #4
    def test_wikipedia_xhtml_rdfa_no_prefix(self):
        body = get_testdata('misc', 'Portfolio_Niels_Lubberman.html')
        expected = json.loads(
            get_testdata('misc',
                         'Portfolio_Niels_Lubberman.json').decode('UTF-8'))

        rdfae = RDFaExtractor()
        data = rdfae.extract(body, base_url='http://nielslubberman.nl/drupal/')

        self.assertJsonLDEqual(data, expected)
Example #5
    def test_w3c_rdf11primer(self):
        for i in [14]:
            fileprefix = 'w3c.rdf11primer.example{:03d}'.format(i)
            body = get_testdata('w3crdfa', fileprefix + '.html').decode('UTF-8')
            expected = json.loads(
                get_testdata('w3crdfa',
                             fileprefix + '.expanded.json').decode('UTF-8'))

            rdfae = RDFaExtractor()
            data = rdfae.extract(body, url='http://www.example.com/index.html')
            self.assertJsonLDEqual(data, expected)
Example #6
    def test_w3c_rdf11primer(self):
        for i in [14]:
            fileprefix = 'w3c.rdf11primer.example{:03d}'.format(i)
            body = get_testdata('w3crdfa', fileprefix + '.html')
            expected = json.loads(
                get_testdata('w3crdfa',
                             fileprefix + '.expanded.json').decode('UTF-8'))

            rdfae = RDFaExtractor()
            data = rdfae.extract(body, url='http://www.example.com/index.html')
            self.assertJsonLDEqual(data, expected)
Example #7
    def test_wikipedia_xhtml_rdfa(self):
        fileprefix = 'xhtml+rdfa'
        body = get_testdata('wikipedia', fileprefix + '.html').decode('UTF-8')
        expected = json.loads(
            get_testdata('wikipedia',
                         fileprefix + '.expanded.json').decode('UTF-8'))

        rdfae = RDFaExtractor()
        data = rdfae.extract(body, url='http://www.example.com/index.html')

        self.assertJsonLDEqual(data, expected)
Example #8
    def test_wikipedia_xhtml_rdfa(self):
        fileprefix = 'xhtml+rdfa'
        body = get_testdata('wikipedia', fileprefix + '.html').decode('UTF-8')
        expected = json.loads(
            get_testdata('wikipedia',
                         fileprefix + '.expanded.json').decode('UTF-8'))

        rdfae = RDFaExtractor()
        data = rdfae.extract(body, url='http://www.example.com/index.html')

        self.assertJsonLDEqual(data, expected)
Example #9
    def extract(self, html_text: str,
                extract_title: bool = False,
                extract_meta: bool = False,
                extract_microdata: bool = False,
                microdata_base_url: str = "",
                extract_json_ld: bool = False,
                extract_rdfa: bool = False,
                rdfa_base_url: str = "") \
            -> List[Extraction]:
        """
        Args:
            html_text (str): input html string to be extracted
            extract_title (bool): True if the string of the 'title' tag needs to be extracted, returned as { "title": "..." }
            extract_meta (bool): True if the content of the 'meta' tags needs to be extracted, returned as { "meta": { "author": "...", ...}}
            extract_microdata (bool): True if microdata needs to be extracted, returned as { "microdata": [...] }
            microdata_base_url (str): base namespace url for microdata, empty string if no base url is specified
            extract_json_ld (bool): True if json-ld needs to be extracted, returned as { "json-ld": [...] }
            extract_rdfa (bool): True if rdfa needs to be extracted, returned as { "rdfa": [...] }
            rdfa_base_url (str): base namespace url for rdfa, empty string if no base url is specified

        Returns:
            List[Extraction]: the list of extractions, or an empty list if there are no matches.
        """
        res = list()
        soup = BeautifulSoup(html_text, 'html.parser')

        if soup.title and extract_title:
            title = self._wrap_data("title", soup.title.string.encode('utf-8').decode('utf-8'))
            res.append(title)

        if extract_meta:  # meta tags can be present even without a <title>
            meta_content = self._wrap_meta_content(soup.find_all("meta"))
            meta_data = self._wrap_data("meta", meta_content)
            res.append(meta_data)

        if extract_microdata:
            mde = MicrodataExtractor()
            mde_data = self._wrap_data("microdata", mde.extract(html_text, microdata_base_url))
            res.append(mde_data)

        if extract_json_ld:
            jslde = JsonLdExtractor()
            jslde_data = self._wrap_data("json-ld", jslde.extract(html_text))
            res.append(jslde_data)

        if extract_rdfa:
            rdfae = RDFaExtractor()
            rdfae_data = self._wrap_data("rdfa", rdfae.extract(html_text, rdfa_base_url))
            res.append(rdfae_data)

        return res
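
A sketch of calling the method above; the host class name HTMLMetadataExtractor and the sample HTML are illustrative assumptions, since the excerpt only shows the method itself:

extractor = HTMLMetadataExtractor()  # hypothetical host class for the method above
extractions = extractor.extract('<html><head><title>Hi</title></head></html>',
                                extract_title=True,
                                extract_json_ld=True)
for extraction in extractions:
    print(extraction)  # one entry per enabled syntax
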
Example #10
    def test_w3c_rdfaprimer(self):
        for i in [5, 6, 7, 8, 9, 10, 11, 15]:
            fileprefix = 'w3c.rdfaprimer.example{:03d}'.format(i)
            print(fileprefix)
            body = get_testdata('w3crdfa', fileprefix + '.html').decode('UTF-8')
            expected = json.loads(
                get_testdata('w3crdfa',
                             fileprefix + '.expanded.json').decode('UTF-8'))

            rdfae = RDFaExtractor()
            data = rdfae.extract(body, url='http://www.example.com/index.html')
            print("extracted:\n%s" % pformat(tupleize(data)))
            print("expected:\n%s" % pformat(tupleize(expected)))
            print("extracted:\n%s" % self.prettify(data))
            print("expected:\n%s" % self.prettify(expected))
            self.assertJsonLDEqual(data, expected)
Example #11
    def test_w3c_rdfaprimer(self):
        for i in [5, 6, 7, 8, 9, 10, 11, 15]:
            fileprefix = 'w3c.rdfaprimer.example{:03d}'.format(i)
            print(fileprefix)
            body = get_testdata('w3crdfa', fileprefix + '.html')
            expected = json.loads(
                get_testdata('w3crdfa',
                             fileprefix + '.expanded.json').decode('UTF-8'))

            rdfae = RDFaExtractor()
            data = rdfae.extract(body, url='http://www.example.com/index.html')
            print("extracted:\n%s" % pformat(tupleize(data)))
            print("expected:\n%s" % pformat(tupleize(expected)))
            print("extracted:\n%s" % self.prettify(data))
            print("expected:\n%s" % self.prettify(expected))
            self.assertJsonLDEqual(data, expected)
Example #12
# Imports required to run this example (assumed module paths from the
# extruct package):
from lxml.html import fromstring
from extruct.jsonld import JsonLdExtractor
from extruct.rdfa import RDFaExtractor
from extruct.w3cmicrodata import MicrodataExtractor
from extruct.xmldom import XmlDomHTMLParser


def extract(htmlstring, url='http://www.example.com/', encoding="UTF-8"):
    domparser = XmlDomHTMLParser(encoding=encoding)
    tree = fromstring(htmlstring, parser=domparser)
    return {
        name: extractor.extract_items(tree, url=url)
        for name, extractor in (('json-ld', JsonLdExtractor()),
                                ('microdata', MicrodataExtractor()),
                                ('rdfa', RDFaExtractor()))
    }
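
A short usage sketch for the function above, feeding it a made-up document that carries a single JSON-LD block:

html = ('<html><head><script type="application/ld+json">'
        '{"@context": "http://schema.org", "@type": "Article"}'
        '</script></head><body></body></html>')
data = extract(html)
print(data['json-ld'])  # [{'@context': 'http://schema.org', '@type': 'Article'}]
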
Example #13
    def test_w3c_rdfaprimer(self):
        for i in [5, 6, 7, 8, 9, 10, 11, 15]:
            fileprefix = 'w3c.rdfaprimer.example{:03d}'.format(i)
            print(fileprefix)
            body = get_testdata('w3crdfa', fileprefix + '.html')
            expected = json.loads(
                get_testdata('w3crdfa',
                             fileprefix + '.expanded.json').decode('UTF-8'))

            rdfae = RDFaExtractor()
            data = rdfae.extract(body, base_url='http://www.example.com/index.html')
            self.assertJsonLDEqual(data, expected)

            # This is for testing that the fix to issue 116 does not severely
            # affect the RDFa output, even in the presence of a bug in the code.
            def mocked_fix_order(x, y, z):
                raise Exception()

            rdfae._fix_order = mocked_fix_order
            data = rdfae.extract(body, base_url='http://www.example.com/index.html')
            self.assertJsonLDEqual(data, expected)
Example #14
    def extract(self, html_text: str,
                extract_title: bool = False,
                extract_meta: bool = False,
                extract_microdata: bool = False,
                extract_json_ld: bool = False,
                extract_rdfa: bool = False) \
            -> List[Extraction]:

        res = list()
        soup = BeautifulSoup(html_text, 'html.parser')

        if soup.title and extract_title:
            title = self.wrap_data(
                "title",
                soup.title.string.encode('utf-8').decode('utf-8'))
            res.append(title)

        if extract_meta:  # meta tags can be present even without a <title>
            meta_content = self.wrap_meta_content(soup.find_all("meta"))
            meta_data = self.wrap_data("meta", meta_content)
            res.append(meta_data)

        if extract_microdata:
            mde = MicrodataExtractor()
            mde_data = self.wrap_data("microdata", mde.extract(html_text))
            res.append(mde_data)

        if extract_json_ld:
            jslde = JsonLdExtractor()
            jslde_data = self.wrap_data("json-ld", jslde.extract(html_text))
            res.append(jslde_data)

        if extract_rdfa:
            rdfae = RDFaExtractor()
            rdfae_data = self.wrap_data("rdfa", rdfae.extract(html_text))
            res.append(rdfae_data)

        return res
Example #15
def get_rdfa_from_warc(warc_file_no, path):
    global iteration_count
    global report_at_every_nth_step

    rdfaFileID = 1
    htmlURL = ''
    data = ''
    append = False

    rdfaExtractor = RDFaExtractor()

    if not os.path.exists('RDFa Files\\WARC_{0}'.format(warc_file_no)):
        os.makedirs('RDFa Files\\WARC_{0}'.format(warc_file_no))

    if not os.path.exists('XML Files\\WARC_{0}'.format(warc_file_no)):
        os.makedirs('XML Files\\WARC_{0}'.format(warc_file_no))

    print('[INFO/PROGRESS] The file being processed: {0}'.format(path))

    with open(path, encoding='utf-8', errors='replace') as file:
        for line in file:
            if debug and iteration_count % report_at_every_nth_step == 0:
                print("[DEBUG/PROGRESS] Processing line #{0:n}".format(
                    iteration_count))

            if 'WARC/1.0' in line and append:
                append = False

                try:
                    rdfaData = rdfaExtractor.extract(data, base_url=htmlURL)

                    if rdfaData:
                        with open('RDFa Files\\WARC_{0}\\RDFa_{1}.txt'.format(
                                warc_file_no, rdfaFileID),
                                  'w',
                                  encoding='utf-8') as f:
                            f.write('URL: {0}\n\n'.format(htmlURL))
                            f.write(str(rdfaData))

                            ConvertToXML.convertInstant(
                                str(rdfaData),
                                "XML Files\\WARC_{0}\\RDFa_{1}.xml".format(
                                    warc_file_no, rdfaFileID))
                            if debug:
                                print(
                                    "[DEBUG/PROGRESS] Processed file #{0} at URI {1} successfully"
                                    .format(rdfaFileID, htmlURL))
                except json.decoder.JSONDecodeError as jde:
                    print(
                        '[ERROR] Current file (#{0}) could not be converted to RDF/XML: {1} | This JSON-LD may be invalid.'
                        .format(rdfaFileID, str(jde)))
                except lxml.etree.ParserError as pe:
                    print(
                        '[ERROR] Current file (#{0}) could not be converted to RDF/XML: {1} | This file may not have a valid RDFa representation.'
                        .format(rdfaFileID, str(pe)))
                except Exception as exc:
                    if str(exc).startswith('Can\'t split'):
                        print(
                            '[ERROR] Current file (#{0}) could not be converted to RDF/XML: {1} | This file may contain invalid XML namespaces.'
                            .format(rdfaFileID, str(exc)))
                    else:
                        print(
                            '[ERROR] An error occurred while processing the current file (#{0}): {1}'
                            .format(rdfaFileID, str(exc)))
                finally:
                    rdfaFileID += 1
                    data = ''
                    htmlURL = ''

            if 'WARC-Target-URI:' in line:
                htmlURL = line.replace('WARC-Target-URI: ',
                                       '').replace('\r', '').replace('\n', '')

            if '<!DOCTYPE html' in line or '<!doctype html' in line or '<html' in line:
                append = True

            if append:
                data = data + line + '\n'

            iteration_count = iteration_count + 1

    return iteration_count
Example #16
def extract(htmlstring,
            base_url=None,
            encoding="UTF-8",
            syntaxes=SYNTAXES,
            errors='strict',
            uniform=False,
            schema_context='http://schema.org',
            **kwargs):
    """htmlstring: string with valid html document;
       base_url: base url of the html document
       encoding: encoding of the html document
       syntaxes: list of syntaxes to extract, default SYNTAXES
       errors: set to 'log' to log the exceptions, 'ignore' to ignore them
               or 'strict'(default) to raise them
       uniform: if True uniform output format of all syntaxes to a list of dicts.
                Returned dicts structure:
                {'@context': 'http://example.com',
                 '@type': 'example_type',
                 /* All other the properties in keys here */
                 }
       schema_context: schema's context for current page"""
    if base_url is None and 'url' in kwargs:
        warnings.warn('"url" argument is deprecated, please use "base_url"',
                      DeprecationWarning)
        base_url = kwargs.pop('url')
    if kwargs:
        raise TypeError('Unexpected keyword arguments')
    if not (isinstance(syntaxes, list) and all(v in SYNTAXES
                                               for v in syntaxes)):
        raise ValueError("syntaxes must be a list with any or all (default) of"
                         "these values: {}".format(SYNTAXES))
    if errors not in ['log', 'ignore', 'strict']:
        raise ValueError('Invalid error command, valid values are either "log"'
                         ', "ignore" or "strict"')
    domparser = XmlDomHTMLParser(encoding=encoding)
    tree = fromstring(htmlstring, parser=domparser)
    processors = []
    if 'microdata' in syntaxes:
        processors.append(
            ('microdata', MicrodataExtractor().extract_items, tree))
    if 'json-ld' in syntaxes:
        processors.append(('json-ld', JsonLdExtractor().extract_items, tree))
    if 'opengraph' in syntaxes:
        processors.append(
            ('opengraph', OpenGraphExtractor().extract_items, tree))
    if 'microformat' in syntaxes:
        processors.append(
            ('microformat', MicroformatExtractor().extract_items, htmlstring))
    if 'rdfa' in syntaxes:
        processors.append(('rdfa', RDFaExtractor().extract_items, tree))
    output = {}
    for label, extract, document in processors:
        try:
            output[label] = list(extract(document, base_url=base_url))
        except Exception:
            if errors == 'log':
                logger.exception('Failed to extract {}'.format(label))
            if errors == 'ignore':
                pass
            if errors == 'strict':
                raise

    if uniform:
        if 'microdata' in syntaxes:
            output['microdata'] = _umicrodata_microformat(
                output['microdata'], schema_context=schema_context)
        if 'microformat' in syntaxes:
            output['microformat'] = _umicrodata_microformat(
                output['microformat'],
                schema_context='http://microformats.org/wiki/')
        if 'opengraph' in syntaxes:
            output['opengraph'] = _uopengraph(output['opengraph'])
    return output
Example #17
def extract_all_rdfa(response):
    rdfa_extractor = RDFaExtractor()
    return rdfa_extractor.extract(response.text, url=response.url)
Example #18
    } for s in subset[0].get('article:section') or []]
    json_ld_as_map = {
        '@context': 'http://schema.org',
        '@id': url,
        'author': authors,
        'section': sections
    }
    return json.dumps(json_ld_as_map)


def load_json_ld(tx, json_ld_data):
    cypher_neosemantics = " CALL semantics.importRDFSnippet($payload,'JSON-LD');"
    import_summary = tx.run(cypher_neosemantics, payload=json_ld_data)
    print(import_summary)


uri = "bolt://localhost:7687"
driver = GraphDatabase.driver(uri, auth=("neo4j", "neo"))

rss_entries_as_json_ld, entry_url_list = get_rss(
    'https://www.theguardian.com/uk/rss')

with driver.session() as session:
    session.write_transaction(load_json_ld, rss_entries_as_json_ld)
    rdfa_ext = RDFaExtractor()
    for url in entry_url_list:
        session.write_transaction(load_json_ld,
                                  get_article_additional_details(url))

driver.close()
Example #19
def extract(htmlstring,
            base_url=None,
            encoding="UTF-8",
            syntaxes=SYNTAXES,
            errors='strict',
            uniform=False,
            return_html_node=False,
            schema_context='http://schema.org',
            with_og_array=False,
            **kwargs):
    """htmlstring: string with valid html document;
       base_url: base url of the html document
       encoding: encoding of the html document
       syntaxes: list of syntaxes to extract, default SYNTAXES
       errors: set to 'log' to log the exceptions, 'ignore' to ignore them
               or 'strict'(default) to raise them
       uniform: if True uniform output format of all syntaxes to a list of dicts.
                Returned dicts structure:
                {'@context': 'http://example.com',
                 '@type': 'example_type',
                 /* All other the properties in keys here */
                 }
       return_html_node: if True, it includes into the result a HTML node of
                         respective embedded metadata under 'htmlNode' key.
                         The feature is supported only by microdata syntax.
                         Each node is of `lxml.etree.Element` type.
       schema_context: schema's context for current page"""
    if base_url is None and 'url' in kwargs:
        warnings.warn('"url" argument is deprecated, please use "base_url"',
                      DeprecationWarning,
                      stacklevel=2)
        base_url = kwargs.pop('url')
    if kwargs:
        raise TypeError('Unexpected keyword arguments')
    if not (isinstance(syntaxes, list) and all(v in SYNTAXES
                                               for v in syntaxes)):
        raise ValueError("syntaxes must be a list with any or all (default) of"
                         "these values: {}".format(SYNTAXES))
    if errors not in ['log', 'ignore', 'strict']:
        raise ValueError('Invalid error command, valid values are either "log"'
                         ', "ignore" or "strict"')
    try:
        tree = parse_xmldom_html(htmlstring, encoding=encoding)
    except Exception as e:
        if errors == 'ignore':
            return {}
        if errors == 'log':
            logger.exception('Failed to parse html, raises {}'.format(e))
            return {}
        if errors == 'strict':
            raise
    processors = []
    if 'microdata' in syntaxes:
        processors.append(
            ('microdata',
             MicrodataExtractor(add_html_node=return_html_node).extract_items,
             tree))
    if 'json-ld' in syntaxes:
        processors.append((
            'json-ld',
            JsonLdExtractor().extract_items,
            tree,
        ))
    if 'opengraph' in syntaxes:
        processors.append(
            ('opengraph', OpenGraphExtractor().extract_items, tree))
    if 'microformat' in syntaxes:
        processors.append(
            ('microformat', MicroformatExtractor().extract_items, htmlstring))
    if 'rdfa' in syntaxes:
        processors.append((
            'rdfa',
            RDFaExtractor().extract_items,
            tree,
        ))
    if 'dublincore' in syntaxes:
        processors.append((
            'dublincore',
            DublinCoreExtractor().extract_items,
            tree,
        ))
    output = {}
    for syntax, extract, document in processors:
        try:
            output[syntax] = list(extract(document, base_url=base_url))
        except Exception as e:
            if errors == 'log':
                logger.exception('Failed to extract {}, raises {}'.format(
                    syntax, e))
            if errors == 'ignore':
                pass
            if errors == 'strict':
                raise
    if uniform:
        uniform_processors = []
        if 'microdata' in syntaxes:
            uniform_processors.append((
                'microdata',
                _umicrodata_microformat,
                output['microdata'],
                schema_context,
            ))
        if 'microformat' in syntaxes:
            uniform_processors.append((
                'microformat',
                _umicrodata_microformat,
                output['microformat'],
                'http://microformats.org/wiki/',
            ))
        if 'opengraph' in syntaxes:
            uniform_processors.append((
                'opengraph',
                _uopengraph,
                output['opengraph'],
                None,
            ))
        if 'dublincore' in syntaxes:
            uniform_processors.append((
                'dublincore',
                _udublincore,
                output['dublincore'],
                None,
            ))

        for syntax, uniform, raw, schema_context in uniform_processors:
            try:
                if syntax == 'opengraph':
                    output[syntax] = uniform(raw, with_og_array=with_og_array)
                elif syntax == 'dublincore':
                    output[syntax] = uniform(raw)
                else:
                    output[syntax] = uniform(raw, schema_context)
            except Exception as e:
                if errors == 'ignore':
                    output[syntax] = []
                if errors == 'log':
                    output[syntax] = []
                    logger.exception(
                        'Failed to uniform extracted for {}, raises {}'.format(
                            syntax, e))
                if errors == 'strict':
                    raise

    return output
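
A hedged usage sketch for the function above, exercising the syntaxes, errors, and return_html_node options described in its docstring (the HTML string is a placeholder):

htmlstring = ('<html><body><div itemscope itemtype="http://schema.org/Person">'
              '<span itemprop="name">Jane</span></div></body></html>')
data = extract(htmlstring,
               base_url='http://www.example.com/',
               syntaxes=['microdata'],
               return_html_node=True,  # attaches an 'htmlNode' lxml element per item
               errors='log')
for item in data.get('microdata', []):
    print(item['properties']['name'], item.get('htmlNode'))
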
Example #20
class RISJMetadataExtractor(object):
    """An extruct-based metadata extractor"""

    # TODO: Extend to microdata and RDFa, replacing bespoke xpath code. Then
    #       test on body of crawlers!
    def __init__(self, response, microdata=False, jsonld=False, rdfa=False):
        self.response = response
        self.microdata = microdata
        self.jsonld = jsonld
        self.rdfa = rdfa

        if rdfa:
            try:
                self.rdfae = RDFaExtractor()
                self.rdfadata = self.rdfae.extract(self.response.text,
                                                   url=self.response.url)
            except JSONDecodeError:
                pass
        if microdata:
            try:
                self.mde = MicrodataExtractor()
                self.mdedata = self.mde.extract(self.response.text)
            except JSONDecodeError:
                pass
        if jsonld:
            try:
                self.jlde = JsonLdExtractor()
                self.jldata = self.jlde.extract(self.response.text)
            except (JSONDecodeError, TypeError):
                self.jldata = []
            finally:
                # Sometimes we get this in the meta dict from RISJExtractJSONLD
                self.jldata.extend(self.response.meta.get('json-ld', []))

    def extract_newsarticle_schemaorg(self,
                                      microdata=None,
                                      jsonld=None,
                                      rdfa=None):
        """Extract schema.org NewsArticle metadata, encoded using any
           supported metadata format. Note that we only try to extract the
           *first* block of NewsArticle data for each method (which is then
           combined with the first extracted from other methods if more than
           one is selected."""
        if microdata is None:
            microdata = self.microdata
        if jsonld is None:
            jsonld = self.jsonld
        if rdfa is None:
            rdfa = self.rdfa

        outd = {}
        if jsonld:
            for d in self.jldata:
                #                logger.debug('Analysing JSON-LD data: '+pformat(d))
                try:
                    if (re.match(r'https?://schema.org/?', d['@context'])
                            and d['@type'] == 'NewsArticle'):
                        outd.update(d)
                except (KeyError, TypeError):
                    continue
        if microdata:
            for d in self.mdedata:
                logger.debug('Analysing W3C microdata: ' + pformat(d))
                if re.match(r'https?://schema.org/NewsArticle/?',
                            d.get('type', '')):
                    outd.update(d)
        if rdfa:
            raise NotImplementedError
#        logger.debug('Returning schema.org NewsArticle: '+pformat(outd))
        return outd
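
Finally, a sketch of driving the extractor above; response stands in for a Scrapy-style object exposing .text, .url and .meta, which is what the constructor expects:

# 'response' is a stand-in for a Scrapy Response; it is not part of the excerpt.
extractor = RISJMetadataExtractor(response, jsonld=True, microdata=True)
article = extractor.extract_newsarticle_schemaorg()
print(article.get('headline'))  # a schema.org NewsArticle property, if present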