def parse_el(el, ctx):
    """Walk ``el`` and collect every top-most microformat found into ``ctx``.

    The element's own class names are checked first for microformats2 root
    classes, then for backcompat root classes. When either matches, the
    element is handed to ``handle_microformat`` and the result is appended
    to ``ctx``; its descendants are not visited from here. When neither
    matches, the search recurses into the element's children.

    ``ctx`` is mutated in place; nothing is returned.
    """
    class_names = el.get("class", [])

    # Workaround for a bs4+html5lib bug that prevents multi-valued attrs
    # from being recognized on the <html> element, leaving ``class`` as a
    # single string instead of a list.
    # https://bugs.launchpad.net/beautifulsoup/+bug/1296481
    # NOTE(review): the original author questioned whether this workaround
    # is still needed -- confirm before removing.
    if el.name == 'html' and not isinstance(class_names, list):
        class_names = class_names.split()

    # microformats2 root class names (h-*) take precedence
    mf2_roots = mf2_classes.root(class_names)
    if mf2_roots:
        ctx.append(handle_microformat(mf2_roots, el))
        return

    # otherwise fall back to backcompat root class names
    legacy_roots = backcompat.root(class_names)
    if legacy_roots:
        ctx.append(
            handle_microformat(legacy_roots, el, backcompat_mode=True))
        return

    # not a root of any kind: keep looking further down the tree
    for child in get_children(el):
        parse_el(child, ctx)
def parse_el(el, ctx, top_level=False):
    """Walk ``el`` and collect every top-most h-* microformat into ``ctx``.

    If the element carries microformat root class names it is handed to
    ``handle_microformat`` and the result is appended to ``ctx``; otherwise
    the search recurses into the element's children.

    ``top_level`` is accepted for callers that pass it but is not read by
    this function, and it is not forwarded to the recursive calls.

    ``ctx`` is mutated in place; nothing is returned.
    """
    class_names = el.get("class", [])

    # Workaround for a bs4+html5lib bug that prevents multi-valued attrs
    # from being recognized on the <html> element, leaving ``class`` as a
    # single string instead of a list.
    # https://bugs.launchpad.net/beautifulsoup/+bug/1296481
    if el.name == 'html' and not isinstance(class_names, list):
        class_names = class_names.split()

    root_names = mf2_classes.root(class_names)

    if not root_names:
        # not a microformat root: keep looking further down the tree
        for child in get_children(el):
            parse_el(child, ctx)
        return

    # root class names found: parse the element as a microformat
    ctx.append(handle_microformat(root_names, el))
def parse_props(el):
    """Parse the properties from a single element.

    The element's class names are inspected for p-*, u-*, dt-* and e-*
    property classes. Each property value is parsed at most once per kind
    and appended under every matching property name. If the element is
    also a microformat root (h-*), the value appended is the nested
    microformat built by ``handle_microformat`` instead of the plain value.

    A root element that is not a property is returned as a child
    microformat. Elements that are not roots are searched recursively and
    the properties and children found below them are merged in.

    Returns a ``(props, children)`` tuple: ``props`` maps property names to
    lists of values, ``children`` is a list of nested microformats that are
    not attached to any property.

    Side effect: parsing a dt-* property may update ``self._default_date``.
    """
    props = self.dict_class()
    children = []

    classes = el.get("class", [])

    # Is this element a microformat root?
    root_class_names = mf2_classes.root(classes)

    # Is this a property element (p-*, u-*, etc.)
    is_property_el = False

    # Parse plaintext p-* properties.
    p_value = None
    for prop_name in mf2_classes.text(classes):
        is_property_el = True
        prop_value = props.setdefault(prop_name, [])

        # if value has not been parsed then parse it
        if p_value is None:
            p_value = text_type(parse_property.text(el).strip())

        if root_class_names:
            prop_value.append(handle_microformat(
                root_class_names, el, value_property="name",
                simple_value=p_value))
        else:
            prop_value.append(p_value)

    # Parse URL u-* properties.
    u_value = None
    for prop_name in mf2_classes.url(classes):
        is_property_el = True
        prop_value = props.setdefault(prop_name, [])

        # if value has not been parsed then parse it
        if u_value is None:
            u_value = parse_property.url(el, base_url=self.__url__)

        if root_class_names:
            prop_value.append(handle_microformat(
                root_class_names, el, value_property="url",
                simple_value=u_value))
        else:
            prop_value.append(text_type(u_value))

    # Parse datetime dt-* properties.
    dt_value = None
    for prop_name in mf2_classes.datetime(classes):
        is_property_el = True
        prop_value = props.setdefault(prop_name, [])

        # if value has not been parsed then parse it
        if dt_value is None:
            dt_value, new_date = parse_property.datetime(
                el, self._default_date)
            # update the default date
            if new_date:
                self._default_date = new_date

        if root_class_names:
            # A datetime that could not be parsed stays None. Converting
            # it with text_type() would hand the nested microformat the
            # literal string "None" as its value, so pass None through.
            dt_simple = None if dt_value is None else text_type(dt_value)
            prop_value.append(handle_microformat(
                root_class_names, el, simple_value=dt_simple))
        elif dt_value is not None:
            prop_value.append(text_type(dt_value))

    # Parse embedded markup e-* properties.
    e_value = None
    for prop_name in mf2_classes.embedded(classes):
        is_property_el = True
        prop_value = props.setdefault(prop_name, [])

        # if value has not been parsed then parse it
        if e_value is None:
            e_value = parse_property.embedded(el)

        if root_class_names:
            prop_value.append(handle_microformat(
                root_class_names, el, simple_value=e_value))
        else:
            prop_value.append(e_value)

    # if this is not a property element, but it is a h-* microformat,
    # add it to our list of children
    if not is_property_el and root_class_names:
        children.append(handle_microformat(root_class_names, el))

    # parse child tags, provided this isn't a microformat root-class
    # (a root's descendants belong to the nested microformat, which
    # handle_microformat has already parsed)
    if not root_class_names:
        for child in get_children(el):
            child_properties, child_microformats = parse_props(child)
            for prop_name in child_properties:
                props.setdefault(prop_name, []).extend(
                    child_properties[prop_name])
            children.extend(child_microformats)

    return props, children
def handle_microformat(root_class_names, el, value_property=None,
                       simple_value=None):
    """Handles a (possibly nested) microformat, i.e. h-*

    Parameters:
      root_class_names: the h-* class names found on ``el``; they become
        the microformat's "type" list.
      el: the element that is the root of the microformat.
      value_property: optional property name ("name" for p-*, "url" for
        u-*) whose first explicitly parsed value, if any, replaces
        ``simple_value``.
      simple_value: the value the element would have had as a plain
        property, used when this microformat is nested inside a property.

    Returns a ``self.dict_class`` instance with "type" and "properties",
    plus "shape"/"coords" for <area> elements that have those attributes,
    "children" when nested non-property microformats were found, and
    either "value" or the folded-in keys of a dict ``simple_value``.

    Side effect: resets ``self._default_date`` to None before parsing the
    microformat's children.
    """
    properties = self.dict_class()
    children = []

    # dt-* parsing inside this microformat starts without a default date
    self._default_date = None

    # parse for properties and children
    for child in get_children(el):
        child_props, child_children = parse_props(child)
        for key, new_value in child_props.items():
            properties.setdefault(key, []).extend(new_value)
        children.extend(child_children)

    # complex h-* objects can take their "value" from the
    # first explicit property ("name" for p-* or "url" for u-*).
    # The property list can exist but be empty (parse_props creates the
    # list before it knows whether a value will be appended), so only
    # index into it when it actually holds a value.
    if value_property and properties.get(value_property):
        simple_value = properties[value_property][0]

    # if some properties not already found find in implied ways
    if "name" not in properties:
        properties["name"] = [text_type(prop)
                              for prop in implied_properties.name(el)]

    if "photo" not in properties:
        implied_photo = implied_properties.photo(el, base_url=self.__url__)
        if implied_photo is not None:
            properties["photo"] = [text_type(u) for u in implied_photo]

    if "url" not in properties:
        implied_url = implied_properties.url(el, base_url=self.__url__)
        if implied_url is not None:
            properties["url"] = [text_type(u) for u in implied_url]

    # build microformat with type and properties
    microformat = self.dict_class([
        ("type", [text_type(class_name)
                  for class_name in root_class_names]),
        ("properties", properties),
    ])

    # <area> elements additionally expose their shape and coords
    if str(el.name) == "area":
        shape = get_attr(el, 'shape')
        if shape is not None:
            microformat['shape'] = text_type(shape)

        coords = get_attr(el, 'coords')
        if coords is not None:
            microformat['coords'] = text_type(coords)

    # insert children if any
    if children:
        microformat["children"] = children

    # simple value is the parsed property value if it were not
    # an h-* class
    if simple_value is not None:
        if isinstance(simple_value, dict):
            # for e-* properties, the simple value will be
            # {"html":..., "value":...} which we should fold
            # into the microformat object
            # details: https://github.com/tommorris/mf2py/issues/35
            microformat.update(simple_value)
        else:
            microformat["value"] = text_type(simple_value)

    return microformat
def parse_props(el):
    """Parse the properties from a single element.

    Looks at the element's class names for p-*, u-*, dt-* and e-* property
    classes. For each kind the value is parsed lazily, at most once, and
    appended under every matching property name. When the element is also
    a microformat root (h-*), the nested microformat produced by
    ``handle_microformat`` is appended in place of the plain value.

    A root element carrying no property class is reported as a child
    microformat. For non-root elements the function recurses into the
    children and merges what they yield.

    Returns ``(props, children)``: a ``self.dict_class`` mapping property
    names to lists of values, and a list of nested microformats that are
    not attached to any property.

    Side effect: dt-* parsing may assign a new ``self._default_date``.
    """
    props = self.dict_class()
    children = []

    classes = el.get("class", [])

    # Is this element a microformat root?
    root_class_names = mf2_classes.root(classes)

    # Is this a property element (p-*, u-*, etc.)
    is_property_el = False

    # Parse plaintext p-* properties.
    p_value = None
    for prop_name in mf2_classes.text(classes):
        is_property_el = True
        prop_value = props.setdefault(prop_name, [])

        # if value has not been parsed then parse it
        if p_value is None:
            p_value = text_type(parse_property.text(el).strip())

        if root_class_names:
            prop_value.append(
                handle_microformat(root_class_names, el,
                                   value_property="name",
                                   simple_value=p_value))
        else:
            prop_value.append(p_value)

    # Parse URL u-* properties.
    u_value = None
    for prop_name in mf2_classes.url(classes):
        is_property_el = True
        prop_value = props.setdefault(prop_name, [])

        # if value has not been parsed then parse it
        if u_value is None:
            u_value = parse_property.url(el, base_url=self.__url__)

        if root_class_names:
            prop_value.append(
                handle_microformat(root_class_names, el,
                                   value_property="url",
                                   simple_value=u_value))
        else:
            prop_value.append(text_type(u_value))

    # Parse datetime dt-* properties.
    dt_value = None
    for prop_name in mf2_classes.datetime(classes):
        is_property_el = True
        prop_value = props.setdefault(prop_name, [])

        # if value has not been parsed then parse it
        if dt_value is None:
            dt_value, new_date = parse_property.datetime(
                el, self._default_date)
            # update the default date
            if new_date:
                self._default_date = new_date

        if root_class_names:
            # Only stringify a datetime that was actually parsed:
            # text_type(None) would give the nested microformat the
            # literal string "None" as its value.
            if dt_value is None:
                nested_value = None
            else:
                nested_value = text_type(dt_value)
            prop_value.append(
                handle_microformat(root_class_names, el,
                                   simple_value=nested_value))
        elif dt_value is not None:
            prop_value.append(text_type(dt_value))

    # Parse embedded markup e-* properties.
    e_value = None
    for prop_name in mf2_classes.embedded(classes):
        is_property_el = True
        prop_value = props.setdefault(prop_name, [])

        # if value has not been parsed then parse it
        if e_value is None:
            e_value = parse_property.embedded(el)

        if root_class_names:
            prop_value.append(
                handle_microformat(root_class_names, el,
                                   simple_value=e_value))
        else:
            prop_value.append(e_value)

    # if this is not a property element, but it is a h-* microformat,
    # add it to our list of children
    if not is_property_el and root_class_names:
        children.append(handle_microformat(root_class_names, el))

    # parse child tags, provided this isn't a microformat root-class
    # (descendants of a root are parsed by handle_microformat instead)
    if not root_class_names:
        for child in get_children(el):
            child_properties, child_microformats = parse_props(child)
            for prop_name in child_properties:
                merged = props.setdefault(prop_name, [])
                merged.extend(child_properties[prop_name])
            children.extend(child_microformats)

    return props, children
def handle_microformat(root_class_names, el, value_property=None,
                       simple_value=None):
    """Handles a (possibly nested) microformat, i.e. h-*

    Parameters:
      root_class_names: the h-* class names found on ``el``; emitted as
        the microformat's "type" list.
      el: the root element of the microformat.
      value_property: optional property name ("name" for p-*, "url" for
        u-*); when that property was explicitly parsed, its first value
        overrides ``simple_value``.
      simple_value: the plain property value of the element, used when the
        microformat is itself the value of a property.

    Returns a ``self.dict_class`` instance holding "type" and
    "properties"; "shape"/"coords" are added for <area> elements that
    have them, "children" when there are nested non-property
    microformats, and "value" (or the keys of a dict ``simple_value``)
    when a simple value is available.

    Side effect: ``self._default_date`` is reset to None before the
    children are parsed.
    """
    properties = self.dict_class()
    children = []

    # every microformat starts its dt-* parsing without a default date
    self._default_date = None

    # parse for properties and children
    for child in get_children(el):
        child_props, child_children = parse_props(child)
        for key, new_value in child_props.items():
            properties.setdefault(key, []).extend(new_value)
        children.extend(child_children)

    # complex h-* objects can take their "value" from the
    # first explicit property ("name" for p-* or "url" for u-*).
    # Guard against a property that is present but has no values
    # (parse_props creates the list before any value is appended), which
    # would otherwise raise IndexError here.
    explicit_values = properties.get(value_property) if value_property \
        else None
    if explicit_values:
        simple_value = explicit_values[0]

    # if some properties not already found find in implied ways
    if "name" not in properties:
        properties["name"] = [
            text_type(prop) for prop in implied_properties.name(el)
        ]

    if "photo" not in properties:
        photo = implied_properties.photo(el, base_url=self.__url__)
        if photo is not None:
            properties["photo"] = [text_type(u) for u in photo]

    if "url" not in properties:
        url = implied_properties.url(el, base_url=self.__url__)
        if url is not None:
            properties["url"] = [text_type(u) for u in url]

    # build microformat with type and properties
    microformat = self.dict_class([
        ("type", [text_type(class_name)
                  for class_name in root_class_names]),
        ("properties", properties),
    ])

    # <area> elements additionally expose their shape and coords
    if str(el.name) == "area":
        shape = get_attr(el, 'shape')
        if shape is not None:
            microformat['shape'] = text_type(shape)

        coords = get_attr(el, 'coords')
        if coords is not None:
            microformat['coords'] = text_type(coords)

    # insert children if any
    if children:
        microformat["children"] = children

    # simple value is the parsed property value if it were not
    # an h-* class
    if simple_value is not None:
        if isinstance(simple_value, dict):
            # for e-* properties, the simple value will be
            # {"html":..., "value":...} which we should fold
            # into the microformat object
            # details: https://github.com/tommorris/mf2py/issues/35
            microformat.update(simple_value)
        else:
            microformat["value"] = text_type(simple_value)

    return microformat