Ejemplo n.º 1
0
    def test_microformat(self):
        """Compare microformat extraction of the HTML fixture with the
        expected JSON fixture."""
        html = get_testdata('misc', 'microformat_test.html')
        raw_expected = get_testdata('misc', 'microformat_test.json')
        expected = json.loads(raw_expected.decode('UTF-8'))

        extractor = MicroformatExtractor()
        extracted = jsonize_dict(extractor.extract(html))
        self.assertEqual(extracted, expected)
Ejemplo n.º 2
0
def extract(htmlstring,
            base_url=None,
            encoding="UTF-8",
            syntaxes=SYNTAXES,
            errors='strict',
            uniform=False,
            schema_context='http://schema.org',
            **kwargs):
    """Extract embedded metadata from an html document, one entry per syntax.

       htmlstring: string with valid html document;
       base_url: base url of the html document
       encoding: encoding of the html document
       syntaxes: list of syntaxes to extract, default SYNTAXES
       errors: set to 'log' to log the exceptions, 'ignore' to ignore them
               or 'strict'(default) to raise them
       uniform: if True uniform output format of all syntaxes to a list of dicts.
                Returned dicts structure:
                {'@context': 'http://example.com',
                 '@type': 'example_type',
                 /* All other the properties in keys here */
                 }
       schema_context: schema's context for current page

       The deprecated keyword argument "url" is still accepted as an alias
       of base_url (it is only used when base_url is None).

       Returns a dict keyed by syntax name; each value is the list of items
       extracted for that syntax. A syntax whose extraction failed (with
       errors set to 'log' or 'ignore') has no key in the result.

       Raises TypeError on unexpected keyword arguments and ValueError when
       syntaxes or errors hold an unsupported value."""
    if base_url is None and 'url' in kwargs:
        # stacklevel=2 attributes the warning to the caller's line.
        warnings.warn('"url" argument is deprecated, please use "base_url"',
                      DeprecationWarning,
                      stacklevel=2)
        base_url = kwargs.pop('url')
    if kwargs:
        raise TypeError('Unexpected keyword arguments')
    if not (isinstance(syntaxes, list) and all(v in SYNTAXES
                                               for v in syntaxes)):
        raise ValueError("syntaxes must be a list with any or all (default) of "
                         "these values: {}".format(SYNTAXES))
    if errors not in ['log', 'ignore', 'strict']:
        raise ValueError('Invalid error command, valid values are either "log"'
                         ', "ignore" or "strict"')
    domparser = XmlDomHTMLParser(encoding=encoding)
    tree = fromstring(htmlstring, parser=domparser)

    # Each entry is (syntax label, extraction callable, document to feed it).
    # The microformat extractor receives the raw html string, every other
    # extractor receives the parsed tree.
    processors = []
    if 'microdata' in syntaxes:
        processors.append(
            ('microdata', MicrodataExtractor().extract_items, tree))
    if 'json-ld' in syntaxes:
        processors.append(('json-ld', JsonLdExtractor().extract_items, tree))
    if 'opengraph' in syntaxes:
        processors.append(
            ('opengraph', OpenGraphExtractor().extract_items, tree))
    if 'microformat' in syntaxes:
        processors.append(
            ('microformat', MicroformatExtractor().extract_items, htmlstring))
    if 'rdfa' in syntaxes:
        processors.append(('rdfa', RDFaExtractor().extract_items, tree))

    output = {}
    for label, extract_items, document in processors:
        try:
            output[label] = list(extract_items(document, base_url=base_url))
        except Exception:
            if errors == 'strict':
                raise
            if errors == 'log':
                logger.exception('Failed to extract {}'.format(label))
            # errors == 'ignore': drop the failure silently.

    if uniform:
        # Test membership in `output`, not `syntaxes`: a syntax whose
        # extraction failed under 'log'/'ignore' has no entry, and indexing
        # it would raise KeyError despite the requested error mode.
        if 'microdata' in output:
            output['microdata'] = _umicrodata_microformat(
                output['microdata'], schema_context=schema_context)
        if 'microformat' in output:
            output['microformat'] = _umicrodata_microformat(
                output['microformat'],
                schema_context='http://microformats.org/wiki/')
        if 'opengraph' in output:
            output['opengraph'] = _uopengraph(output['opengraph'])
    return output
Ejemplo n.º 3
0
def extract(htmlstring,
            base_url=None,
            encoding="UTF-8",
            syntaxes=SYNTAXES,
            errors='strict',
            uniform=False,
            return_html_node=False,
            schema_context='http://schema.org',
            with_og_array=False,
            **kwargs):
    """Extract embedded metadata from an html document, one entry per syntax.

       htmlstring: string with valid html document;
       base_url: base url of the html document
       encoding: encoding of the html document
       syntaxes: list of syntaxes to extract, default SYNTAXES
       errors: set to 'log' to log the exceptions, 'ignore' to ignore them
               or 'strict'(default) to raise them
       uniform: if True uniform output format of all syntaxes to a list of dicts.
                Returned dicts structure:
                {'@context': 'http://example.com',
                 '@type': 'example_type',
                 /* All other the properties in keys here */
                 }
       return_html_node: if True, it includes into the result a HTML node of
                         respective embedded metadata under 'htmlNode' key.
                         The feature is supported only by microdata syntax.
                         Each node is of `lxml.etree.Element` type.
       schema_context: schema's context for current page
       with_og_array: forwarded as-is to the opengraph uniform step; it is
                      only used when uniform is True and 'opengraph' is
                      extracted.

       The deprecated keyword argument "url" is still accepted as an alias
       of base_url (it is only used when base_url is None).

       Returns a dict keyed by syntax name; each value is the list of items
       extracted for that syntax. With errors set to 'log' or 'ignore':
       an html parsing failure returns an empty dict, a syntax whose
       extraction failed has no key in the result, and a syntax whose
       uniform step failed maps to an empty list.

       Raises TypeError on unexpected keyword arguments and ValueError when
       syntaxes or errors hold an unsupported value."""
    if base_url is None and 'url' in kwargs:
        warnings.warn('"url" argument is deprecated, please use "base_url"',
                      DeprecationWarning,
                      stacklevel=2)
        base_url = kwargs.pop('url')
    if kwargs:
        raise TypeError('Unexpected keyword arguments')
    if not (isinstance(syntaxes, list) and all(v in SYNTAXES
                                               for v in syntaxes)):
        raise ValueError("syntaxes must be a list with any or all (default) of "
                         "these values: {}".format(SYNTAXES))
    if errors not in ['log', 'ignore', 'strict']:
        raise ValueError('Invalid error command, valid values are either "log"'
                         ', "ignore" or "strict"')
    try:
        tree = parse_xmldom_html(htmlstring, encoding=encoding)
    except Exception as e:
        if errors == 'strict':
            raise
        if errors == 'log':
            logger.exception('Failed to parse html, raises {}'.format(e))
        # 'log' and 'ignore' both give up on the document.
        return {}

    # Each entry is (syntax, extraction callable, document to feed it).
    # The microformat extractor receives the raw html string, every other
    # extractor receives the parsed tree.
    processors = []
    if 'microdata' in syntaxes:
        processors.append(
            ('microdata',
             MicrodataExtractor(add_html_node=return_html_node).extract_items,
             tree))
    if 'json-ld' in syntaxes:
        processors.append((
            'json-ld',
            JsonLdExtractor().extract_items,
            tree,
        ))
    if 'opengraph' in syntaxes:
        processors.append(
            ('opengraph', OpenGraphExtractor().extract_items, tree))
    if 'microformat' in syntaxes:
        processors.append(
            ('microformat', MicroformatExtractor().extract_items, htmlstring))
    if 'rdfa' in syntaxes:
        processors.append((
            'rdfa',
            RDFaExtractor().extract_items,
            tree,
        ))
    if 'dublincore' in syntaxes:
        processors.append((
            'dublincore',
            DublinCoreExtractor().extract_items,
            tree,
        ))

    output = {}
    for syntax, extract_items, document in processors:
        try:
            output[syntax] = list(extract_items(document, base_url=base_url))
        except Exception as e:
            if errors == 'strict':
                raise
            if errors == 'log':
                logger.exception('Failed to extract {}, raises {}'.format(
                    syntax, e))
            # errors == 'ignore': drop the failure silently.

    if uniform:
        # (syntax, uniform callable, extra positional args, extra kwargs),
        # applied in this fixed order.
        uniform_steps = (
            ('microdata', _umicrodata_microformat, (schema_context,), {}),
            ('microformat', _umicrodata_microformat,
             ('http://microformats.org/wiki/',), {}),
            ('opengraph', _uopengraph, (),
             {'with_og_array': with_og_array}),
            ('dublincore', _udublincore, (), {}),
        )
        for syntax, uniformize, extra_args, extra_kwargs in uniform_steps:
            # Skip syntaxes that were not requested, and those whose
            # extraction failed under 'log'/'ignore': they have no entry in
            # `output`, so indexing it would raise KeyError.
            if syntax not in output:
                continue
            try:
                output[syntax] = uniformize(
                    output[syntax], *extra_args, **extra_kwargs)
            except Exception as e:
                if errors == 'strict':
                    raise
                # 'log' and 'ignore' both replace the raw items with an
                # empty list.
                output[syntax] = []
                if errors == 'log':
                    logger.exception(
                        'Failed to uniform extracted for {}, raises {}'.format(
                            syntax, e))

    return output