Example #1
0
        def parse_el(el, ctx):
            """Search ``el`` for microformats and append them to ``ctx``.

            An element carrying root (h-*) class names is parsed with
            ``handle_microformat``.  Failing that, backcompat root class
            names are looked up and, if present, the element is parsed with
            ``backcompat_mode=True``.  When neither applies, every child
            from ``get_children(el)`` is searched recursively.
            """
            class_names = el.get("class", [])

            # Workaround for a bs4+html5lib bug that prevents multi-valued
            # attrs on the <html> element from being recognized:
            # https://bugs.launchpad.net/beautifulsoup/+bug/1296481
            # TODO: possibly not needed anymore -- remove?
            if el.name == 'html' and not isinstance(class_names, list):
                class_names = class_names.split()

            # root classnames (h-*) take precedence
            roots = mf2_classes.root(class_names)
            if roots:
                parsed = handle_microformat(roots, el)
                ctx.append(parsed)
                return

            # otherwise try the backcompat root classnames
            roots = backcompat.root(class_names)
            if roots:
                parsed = handle_microformat(roots, el, backcompat_mode=True)
                ctx.append(parsed)
                return

            # not a root element: keep looking in the child tags
            for node in get_children(el):
                parse_el(node, ctx)
Example #2
0
        def parse_el(el, ctx, top_level=False):
            """Search ``el`` for microformats and append them to ``ctx``.

            An element carrying root (h-*) class names is parsed with
            ``handle_microformat`` and the result is appended to ``ctx``;
            otherwise every child from ``get_children(el)`` is searched
            recursively.  ``top_level`` is accepted but not read here.
            """
            class_names = el.get("class", [])

            # Workaround for a bs4+html5lib bug that prevents multi-valued
            # attrs on the <html> element from being recognized:
            # https://bugs.launchpad.net/beautifulsoup/+bug/1296481
            if el.name == 'html' and not isinstance(class_names, list):
                class_names = class_names.split()

            # root classnames (h-*) mark a microformat to parse
            roots = mf2_classes.root(class_names)
            if not roots:
                # nothing on this element: keep looking in the child tags
                for node in get_children(el):
                    parse_el(node, ctx)
                return

            parsed = handle_microformat(roots, el)
            ctx.append(parsed)
Example #3
0
        def parse_props(el):
            """Parse the properties from a single element

            Collects the p-*, u-*, dt-* and e-* properties named by the
            element's classes.  When the element is also a microformat root
            (h-*), each property value is the nested microformat built by
            ``handle_microformat``; a root that is not a property element is
            reported as a child instead.  Elements that are not roots are
            searched recursively through ``get_children(el)``.

            Returns a ``(props, children)`` tuple: ``props`` is a
            ``self.dict_class()`` mapping property names to lists of values
            and ``children`` is a list of nested microformats.
            """
            props = self.dict_class()
            children = []

            classes = el.get("class", [])
            # Is this element a microformat root?
            root_class_names = mf2_classes.root(classes)
            # Is this a property element (p-*, u-*, etc.)
            is_property_el = False

            # Parse plaintext p-* properties.
            p_value = None
            for prop_name in mf2_classes.text(classes):
                is_property_el = True
                prop_value = props.setdefault(prop_name, [])

                # if value has not been parsed then parse it
                if p_value is None:
                    p_value = text_type(parse_property.text(el).strip())

                if root_class_names:
                    prop_value.append(handle_microformat(
                        root_class_names, el, value_property="name",
                        simple_value=p_value))
                else:
                    prop_value.append(p_value)

            # Parse URL u-* properties.
            u_value = None
            for prop_name in mf2_classes.url(classes):
                is_property_el = True
                prop_value = props.setdefault(prop_name, [])

                # if value has not been parsed then parse it
                if u_value is None:
                    u_value = parse_property.url(el, base_url=self.__url__)

                if root_class_names:
                    prop_value.append(handle_microformat(
                        root_class_names, el, value_property="url",
                        simple_value=u_value))
                else:
                    prop_value.append(text_type(u_value))

            # Parse datetime dt-* properties.
            dt_value = None
            for prop_name in mf2_classes.datetime(classes):
                is_property_el = True
                prop_value = props.setdefault(prop_name, [])

                # if value has not been parsed then parse it
                if dt_value is None:
                    dt_value, new_date = parse_property.datetime(
                        el, self._default_date)
                    # update the default date
                    if new_date:
                        self._default_date = new_date

                if root_class_names:
                    # A missing datetime must stay None: text_type(None)
                    # would hand the nested microformat the literal
                    # value "None".
                    if dt_value is None:
                        dt_simple_value = None
                    else:
                        dt_simple_value = text_type(dt_value)
                    prop_value.append(handle_microformat(
                        root_class_names, el,
                        simple_value=dt_simple_value))
                elif dt_value is not None:
                    prop_value.append(text_type(dt_value))

            # Parse embedded markup e-* properties.
            e_value = None
            for prop_name in mf2_classes.embedded(classes):
                is_property_el = True
                prop_value = props.setdefault(prop_name, [])

                # if value has not been parsed then parse it
                if e_value is None:
                    e_value = parse_property.embedded(el)

                if root_class_names:
                    prop_value.append(handle_microformat(
                        root_class_names, el, simple_value=e_value))
                else:
                    prop_value.append(e_value)

            # if this is not a property element, but it is a h-* microformat,
            # add it to our list of children
            if not is_property_el and root_class_names:
                children.append(handle_microformat(root_class_names, el))

            # parse child tags, provided this isn't a microformat root-class
            if not root_class_names:
                for child in get_children(el):
                    child_properties, child_microformats = parse_props(child)
                    # merge the child's values into this element's lists
                    for prop_name, values in child_properties.items():
                        props.setdefault(prop_name, []).extend(values)
                    children.extend(child_microformats)

            return props, children
Example #4
0
        def handle_microformat(root_class_names, el, value_property=None,
                               simple_value=None):
            """Build the result structure for a (possibly nested) h-* root.

            Every child of ``el`` is run through ``parse_props`` and the
            resulting properties and child microformats are merged.  The
            "name", "photo" and "url" properties fall back to
            ``implied_properties`` when they were not found explicitly.

            Arguments:
              root_class_names: root class names, emitted under "type".
              el: the root element.
              value_property: optional property name; when that property
                was parsed, its first value replaces ``simple_value``.
              simple_value: the value the element would have as a plain
                property; a dict is merged into the result, anything else
                is stored under "value".

            Returns a ``self.dict_class`` holding "type" and "properties",
            plus "shape"/"coords" (for area elements), "children" and
            "value" when applicable.  Resets ``self._default_date`` to None
            as a side effect.
            """
            found_props = self.dict_class()
            nested = []
            self._default_date = None

            # gather properties and child microformats from the subtree
            for node in get_children(el):
                node_props, node_children = parse_props(node)
                for name, values in node_props.items():
                    found_props.setdefault(name, []).extend(values)
                nested.extend(node_children)

            # complex h-* objects can take their "value" from the
            # first explicit property ("name" for p-* or "url" for u-*)
            if value_property and value_property in found_props:
                simple_value = found_props[value_property][0]

            # properties that were not found explicitly are implied
            if "name" not in found_props:
                found_props["name"] = [
                    text_type(implied)
                    for implied in implied_properties.name(el)
                ]
            for key, imply in (("photo", implied_properties.photo),
                               ("url", implied_properties.url)):
                if key in found_props:
                    continue
                implied = imply(el, base_url=self.__url__)
                if implied is not None:
                    found_props[key] = [text_type(item) for item in implied]

            # assemble the microformat from its type and properties
            type_names = [text_type(class_name)
                          for class_name in root_class_names]
            microformat = self.dict_class([
                ("type", type_names),
                ("properties", found_props),
            ])

            # area elements additionally expose their shape and coords
            if str(el.name) == "area":
                for attr_name in ('shape', 'coords'):
                    attr_value = get_attr(el, attr_name)
                    if attr_value is not None:
                        microformat[attr_name] = text_type(attr_value)

            # attach child microformats, if there are any
            if nested:
                microformat["children"] = nested

            # simple value is the parsed property value if it were not
            # an h-* class
            if simple_value is None:
                return microformat

            if isinstance(simple_value, dict):
                # for e-* properties, the simple value will be
                # {"html":..., "value":...}  which we should fold
                # into the microformat object
                # details: https://github.com/tommorris/mf2py/issues/35
                microformat.update(simple_value)
            else:
                microformat["value"] = text_type(simple_value)

            return microformat
Example #5
0
File: parser.py Project: sgml/mf2py
        def parse_props(el):
            """Parse the properties from a single element

            Collects the p-*, u-*, dt-* and e-* properties named by the
            element's classes.  When the element is also a microformat root
            (h-*), each property value is the nested microformat built by
            ``handle_microformat``; a root that is not a property element is
            reported as a child instead.  Elements that are not roots are
            searched recursively through ``get_children(el)``.

            Returns a ``(props, children)`` tuple: ``props`` is a
            ``self.dict_class()`` mapping property names to lists of values
            and ``children`` is a list of nested microformats.
            """
            props = self.dict_class()
            children = []

            classes = el.get("class", [])
            # Is this element a microformat root?
            root_class_names = mf2_classes.root(classes)
            # Is this a property element (p-*, u-*, etc.)
            is_property_el = False

            # Parse plaintext p-* properties.
            p_value = None
            for prop_name in mf2_classes.text(classes):
                is_property_el = True
                prop_value = props.setdefault(prop_name, [])

                # if value has not been parsed then parse it
                if p_value is None:
                    p_value = text_type(parse_property.text(el).strip())

                if root_class_names:
                    prop_value.append(
                        handle_microformat(root_class_names,
                                           el,
                                           value_property="name",
                                           simple_value=p_value))
                else:
                    prop_value.append(p_value)

            # Parse URL u-* properties.
            u_value = None
            for prop_name in mf2_classes.url(classes):
                is_property_el = True
                prop_value = props.setdefault(prop_name, [])

                # if value has not been parsed then parse it
                if u_value is None:
                    u_value = parse_property.url(el, base_url=self.__url__)

                if root_class_names:
                    prop_value.append(
                        handle_microformat(root_class_names,
                                           el,
                                           value_property="url",
                                           simple_value=u_value))
                else:
                    prop_value.append(text_type(u_value))

            # Parse datetime dt-* properties.
            dt_value = None
            for prop_name in mf2_classes.datetime(classes):
                is_property_el = True
                prop_value = props.setdefault(prop_name, [])

                # if value has not been parsed then parse it
                if dt_value is None:
                    dt_value, new_date = parse_property.datetime(
                        el, self._default_date)
                    # update the default date
                    if new_date:
                        self._default_date = new_date

                if root_class_names:
                    # A missing datetime must stay None: text_type(None)
                    # would hand the nested microformat the literal
                    # value "None".
                    if dt_value is None:
                        dt_simple_value = None
                    else:
                        dt_simple_value = text_type(dt_value)
                    prop_value.append(
                        handle_microformat(root_class_names,
                                           el,
                                           simple_value=dt_simple_value))
                elif dt_value is not None:
                    prop_value.append(text_type(dt_value))

            # Parse embedded markup e-* properties.
            e_value = None
            for prop_name in mf2_classes.embedded(classes):
                is_property_el = True
                prop_value = props.setdefault(prop_name, [])

                # if value has not been parsed then parse it
                if e_value is None:
                    e_value = parse_property.embedded(el)

                if root_class_names:
                    prop_value.append(
                        handle_microformat(root_class_names,
                                           el,
                                           simple_value=e_value))
                else:
                    prop_value.append(e_value)

            # if this is not a property element, but it is a h-* microformat,
            # add it to our list of children
            if not is_property_el and root_class_names:
                children.append(handle_microformat(root_class_names, el))

            # parse child tags, provided this isn't a microformat root-class
            if not root_class_names:
                for child in get_children(el):
                    child_properties, child_microformats = parse_props(child)
                    # merge the child's values into this element's lists
                    for prop_name, values in child_properties.items():
                        props.setdefault(prop_name, []).extend(values)
                    children.extend(child_microformats)

            return props, children
Example #6
0
File: parser.py Project: sgml/mf2py
        def handle_microformat(root_class_names,
                               el,
                               value_property=None,
                               simple_value=None):
            """Parse one (possibly nested) h-* microformat rooted at ``el``.

            The children of ``el`` are passed to ``parse_props`` and their
            properties and child microformats are merged.  Missing "name",
            "photo" and "url" properties are taken from
            ``implied_properties``.  If ``value_property`` names a property
            that was parsed, its first value replaces ``simple_value``.

            Returns a ``self.dict_class`` with "type" and "properties", and
            with "shape"/"coords" (area elements), "children" and "value"
            added when applicable; a dict ``simple_value`` is merged into
            the result instead of being stored under "value".  Resets
            ``self._default_date`` to None as a side effect.
            """
            collected = self.dict_class()
            sub_microformats = []
            self._default_date = None

            # walk the children for properties and nested microformats
            for child_el in get_children(el):
                parsed_props, parsed_children = parse_props(child_el)
                for prop_name, prop_values in parsed_props.items():
                    collected.setdefault(prop_name, []).extend(prop_values)
                sub_microformats.extend(parsed_children)

            # complex h-* objects can take their "value" from the
            # first explicit property ("name" for p-* or "url" for u-*)
            if value_property and value_property in collected:
                simple_value = collected[value_property][0]

            # fall back to implied values for properties not found
            if "name" not in collected:
                implied_names = implied_properties.name(el)
                collected["name"] = [text_type(n) for n in implied_names]

            if "photo" not in collected:
                implied_photo = implied_properties.photo(
                    el, base_url=self.__url__)
                if implied_photo is not None:
                    collected["photo"] = [
                        text_type(p) for p in implied_photo
                    ]

            if "url" not in collected:
                implied_url = implied_properties.url(
                    el, base_url=self.__url__)
                if implied_url is not None:
                    collected["url"] = [text_type(u) for u in implied_url]

            # build the microformat from its type and properties
            entries = [
                ("type", [text_type(name) for name in root_class_names]),
                ("properties", collected),
            ]
            microformat = self.dict_class(entries)

            # area elements also carry their shape and coords
            if str(el.name) == "area":
                for attr_name in ('shape', 'coords'):
                    attr_value = get_attr(el, attr_name)
                    if attr_value is not None:
                        microformat[attr_name] = text_type(attr_value)

            # include nested microformats when there are any
            if sub_microformats:
                microformat["children"] = sub_microformats

            # simple value is the parsed property value if it were not
            # an h-* class
            if simple_value is not None and isinstance(simple_value, dict):
                # for e-* properties, the simple value will be
                # {"html":..., "value":...}  which we should fold
                # into the microformat object
                # details: https://github.com/tommorris/mf2py/issues/35
                microformat.update(simple_value)
            elif simple_value is not None:
                microformat["value"] = text_type(simple_value)

            return microformat