def parse(self, element, *args, **kwargs):
    """Build a "list" output element from ``element``.

    The list type is "unordered" when the element is a ``ul`` and
    "ordered" otherwise. Each non-empty ``Tag`` child is inspected: if it
    contains any elements matching ``self.applicable_elements``, each of
    those is parsed recursively with this method; otherwise the child is
    handed to ``self.list_item_parser`` when that parser is applicable.

    Returns:
        A ``ParseResult`` wrapping the list output; always a match.
    """
    kind = 'unordered' if element.name == 'ul' else 'ordered'
    collected = []

    def _keep(candidate, required_key):
        # Only dict outputs carrying a truthy ``required_key`` are kept.
        if isinstance(candidate, dict) and candidate.get(required_key):
            collected.append(candidate)

    for entry in element.contents:
        if not (isinstance(entry, Tag) and entry.contents):
            continue
        nested_lists = entry.find_all(self.applicable_elements)
        if nested_lists:
            for nested in nested_lists:
                _keep(self.parse(nested).output, "items")
        elif self.list_item_parser.is_applicable(entry):
            _keep(self.list_item_parser.parse(entry).output, "content")

    output = self.construct_output(element, "list")
    output["list_type"] = kind
    output["items"] = collected
    return ParseResult(output, True)
def parse(self, element, *args, **kwargs):
    """Build a "list" output element from ``element``.

    The list type is "unordered" for ``ul`` and "ordered" otherwise. Each
    child is parsed either as a nested list (recursively) or as a
    paragraph; only dict outputs with a truthy "items" / "content" field,
    respectively, are kept.

    Returns:
        A ``ParseResult`` wrapping the list output; always a match.
    """
    kind = 'unordered' if element.name == 'ul' else 'ordered'
    items = []
    for node in element.contents:
        # as of ANS 0.6.2, a list item has to be either text or another list
        # https://github.com/washingtonpost/ans-schema/blob/master/src/main/resources/schema/ans/0.6.2/story_elements/list_element.json
        if isinstance(node, Tag) and node.contents:
            # List items are assumed to be fairly simple, so this parser
            # has no sub-parsers of its own: when an item wraps something
            # else, only its first child is considered.
            node = node.contents[0]

        is_nested_list = isinstance(node, Tag) and node.name in self.applicable_elements
        if is_nested_list:
            parsed, required_key = self.parse(node).output, "items"
        else:
            parsed, required_key = ParagraphParser().parse(node).output, "content"

        if isinstance(parsed, dict) and parsed.get(required_key):
            items.append(parsed)

    output = self.construct_output(element, "list")
    output["list_type"] = kind
    output["items"] = items
    return ParseResult(output, True)
def parse(self, element, *args, **kwargs):
    """Parse ``element`` only when ``self.is_text_only`` accepts it.

    Returns:
        ``ParseResult(output, True)`` with the value of
        ``self.construct_output(element)`` for text-only elements,
        ``ParseResult(None, False)`` otherwise.
    """
    if not self.is_text_only(element):
        return ParseResult(None, False)
    return ParseResult(self.construct_output(element), True)
def parse(self, element, *args, **kwargs):
    """Build a "header" output element from the text of ``element``.

    Returns:
        A matching ``ParseResult``. Its output is ``None`` when the
        element has no text; otherwise it is the constructed header with
        an integer "level" field.
    """
    text = element.text
    if not text:
        return ParseResult(None, True)

    output = self.construct_output(element, "header", text)
    # The tag name minus its first character is read as the level
    # (e.g. "h2" -> 2).
    output["level"] = int(element.name[1:])
    return ParseResult(output, True)
def parse(self, element, *args, **kwargs):
    """Build a "quote" output element from a text-only ``element``.

    If the element contains ``p`` elements, each non-empty one is parsed
    with ``ParagraphParser``; otherwise the element itself is parsed as a
    single paragraph. Only matching results with a truthy output are
    kept.

    Returns:
        ``ParseResult(None, False)`` when the element is not text-only.
        Otherwise a matching ``ParseResult`` whose output is the quote
        dict, or ``None`` when no paragraph content was produced.
    """
    if not self.is_text_only(element):
        return ParseResult(None, False)

    paragraphs = element.find_all('p')
    if paragraphs:
        # don't want to process oembeds (twitter, instagram, etc) as
        # blockquotes, so only text is considered; technically since
        # v0.6.0 oembeds could be included but shouldn't be by default
        candidates = [
            ParagraphParser().parse(paragraph)
            for paragraph in paragraphs
            if not self.is_empty(paragraph)
        ]
    else:
        candidates = [ParagraphParser().parse(element)]

    quoted = [
        candidate.output
        for candidate in candidates
        if candidate.match and candidate.output
    ]
    if not quoted:
        return ParseResult(None, True)
    return ParseResult({"type": "quote", "content_elements": quoted}, True)
def parse(self, tag):
    """Return a fixed pair of "foo" elements as a matching result.

    ``tag`` is ignored; the output is always the same two-item list.
    """
    canned = [
        {'type': 'foo', 'bar': words}
        for words in ("dummy words", "others")
    ]
    return ParseResult(canned, True)
def parse(self, element, *args, **kwargs):
    """Parse a text-only ``element``, discarding unusable output.

    The constructed output is kept when it is a list, or a dict with a
    truthy "content" field; anything else is replaced by ``None``.

    Returns:
        ``ParseResult(None, False)`` when the element is not text-only,
        otherwise a matching ``ParseResult`` with the kept output.
    """
    if not self.is_text_only(element):
        return ParseResult(None, False)

    output = self.construct_output(element)
    if isinstance(output, dict):
        usable = bool(output.get("content"))
    else:
        usable = isinstance(output, list)
    return ParseResult(output if usable else None, True)
def parse(self, element, *args, **kwargs):
    """Parse a text-only ``element`` and wrap its content in its own tag.

    When the constructed output is a dict, its "content" field is
    rewritten as ``<name>content</name>`` using the element's tag name.

    Returns:
        ``ParseResult(None, False)`` when the element is not text-only,
        otherwise a matching ``ParseResult`` with the constructed output.
    """
    if not self.is_text_only(element):
        return ParseResult(None, False)

    output = self.construct_output(element)
    if isinstance(output, dict):
        tag_name = element.name
        output["content"] = "<{}>{}</{}>".format(
            tag_name, output.get("content"), tag_name)
    return ParseResult(output, True)
def parse(self, element, *args, **kwargs):
    """Build an "interstitial_link" output element from ``element``.

    The link needs both an ``href`` attribute and a truthy "content"
    field to be emitted.

    Returns:
        A ``ParseResult`` that is always a match; its output is ``None``
        when either the URL or the content is missing.
    """
    output = self.construct_output(element, "interstitial_link", element.text)
    href = element.attrs.get('href')
    output["url"] = href
    if href and output.get("content"):
        return ParseResult(output, True)
    # Still reported as a match even though nothing is emitted.
    return ParseResult(None, True)
def parse(self, element, *args, **kwargs):
    """Parse the first ``img`` inside ``element`` via the parent parser.

    When the parent parser produces output and ``element`` has an
    ``href`` attribute, that link is stored under
    ``additional_properties["image_link"]``.

    Returns:
        The parent parser's ``ParseResult`` for the image, or
        ``ParseResult(None, False)`` when there is no ``img``.
    """
    image = element.find('img')
    if not image:
        return ParseResult(None, False)

    image_result = super(LinkedImageParser, self).parse(image)
    href = element.attrs.get('href') if image_result.output else None
    if href:
        image_result.output["additional_properties"] = {"image_link": href}
    return image_result
def parse(self, element, *args, **kwargs):
    """Parse the first ``img`` inside ``element`` and attach its caption.

    The image is parsed by the parent parser. When that yields output
    and ``self._parse_caption(element)`` returns something truthy, the
    caption is stored under the "caption" key.

    Returns:
        The parent parser's ``ParseResult`` for the image, or
        ``ParseResult(None, True)`` when there is no ``img``.
    """
    image = element.find('img')
    if not image:
        return ParseResult(None, True)

    image_result = super(FigureParser, self).parse(image)
    caption = self._parse_caption(element) if image_result.output else None
    if caption:
        image_result.output['caption'] = caption
    return image_result
def parse(self, element, *args, **kwargs):
    """Parse ``element`` via the parent parser and add iframe attributes.

    The iframe is ``element`` itself when it is an ``iframe`` tag,
    otherwise the first ``iframe`` found inside it. Its attributes are
    merged into the output's "additional_properties", followed by a call
    to ``parse_dimensions``.

    Returns:
        A ``ParseResult`` wrapping the parent parser's output; always a
        match.
    """
    output = super(IFrameParser, self).parse(element).output
    frame = element if element.name == 'iframe' else element.find('iframe')

    extra = output.get("additional_properties", {})
    if frame.attrs:
        extra.update(frame.attrs)
        parse_dimensions(frame, extra, ['height'])
    # Only attach the properties when there is something to attach.
    if extra:
        output["additional_properties"] = extra
    return ParseResult(output, True)
def parse(self, element, *args, **kwargs):
    """Build an "audio" output element from the first ``source`` child.

    Returns:
        A matching ``ParseResult`` with a single stream when a ``source``
        element with a truthy ``src`` attribute exists, otherwise
        ``ParseResult(None, False)``.
    """
    source = element.find('source')
    src = source.attrs.get('src') if source else None
    if not src:
        return ParseResult(None, False)

    output = self.construct_output(element, "audio")
    # The URL is stored exactly as found in the ``src`` attribute.
    output["streams"] = [{"url": src}]
    return ParseResult(output, True)
def parse(self, element, *args, **kwargs):
    """Build an "audio" output element from the first ``source`` child.

    The stream URL is the ``src`` attribute passed through
    ``self._create_encoded_url``.

    Returns:
        A matching ``ParseResult`` with a single stream when a ``source``
        element with a truthy ``src`` attribute exists, otherwise
        ``ParseResult(None, False)``.
    """
    source = element.find('source')
    if not source:
        return ParseResult(None, False)

    raw_url = source.attrs.get('src')
    if not raw_url:
        return ParseResult(None, False)

    output = self.construct_output(element, "audio")
    # url encodes any illegal url characters
    stream = {"url": self._create_encoded_url(raw_url)}
    output["streams"] = [stream]
    return ParseResult(output, True)
def _attempt_element_parse(self, element, parser_candidates):
    """
    Tries to parse the given ``element`` using each parser in
    ``parser_candidates`` until one is a match.

    If the parser has ``version_required`` set to true, the ANS version
    is added to the output. A parser may return a list of elements as
    its output; in that case the version is added to every dict in the
    list rather than failing on the list itself.

    Any exception raised by a parser is either skipped (when
    ``self.suppress_exceptions`` is set) or re-raised wrapped in a
    ``ParsingException``.

    Returns the first matching parser result, or
    ``ParseResult(None, False)`` when no candidate matches.
    """
    for parser in parser_candidates:
        try:
            if not parser.is_applicable(element):
                continue
            parser_result = parser.parse(element)
            if not parser_result.match:
                continue

            output = parser_result.output
            if output and parser.version_required:
                if isinstance(output, (list, tuple)):
                    # Multi-element output: stamp each element instead of
                    # calling ``setdefault`` on the sequence.
                    for item in output:
                        if isinstance(item, dict):
                            item.setdefault("version", self.ans_version)
                else:
                    output.setdefault("version", self.ans_version)
            return parser_result
        except Exception as exc:
            if self.suppress_exceptions:
                continue
            raise ParsingException(exc)
    return ParseResult(None, False)
def parse(self, element, *args, **kwargs):
    """Build a "reference" output element for an embed.

    The embed id comes from ``self.get_tag_id(element)``; when that
    yields nothing, the first element matching ``self.tag`` inside
    ``element`` that does yield an id is used instead. The id is
    unquoted and any ``http://`` is rewritten to ``https://``.

    Returns:
        A matching ``ParseResult`` with the reference when an id was
        found, otherwise ``ParseResult(None, False)``.
    """
    embed_id = self.get_tag_id(element)
    if not embed_id:
        for nested in element.find_all(self.tag):
            nested_id = self.get_tag_id(nested)
            if nested_id:
                embed_id = nested_id
                break

    if not embed_id:
        return ParseResult(None, False)

    output = self.construct_output(element, "reference")
    referent = {
        "provider": self.provider,
        "type": self.embed_type,
        "id": unquote(embed_id).replace("http://", "https://")
    }
    output["referent"] = referent
    if self.embed_type not in self.non_oembed_service_types:
        referent['service'] = 'oembed'
    # many embeds come with a script that should be removed
    self._remove_embed_script(element)
    return ParseResult(output, True)
def parse(self, element, *args, **kwargs):
    """Wrap the text form of ``element`` in a "raw_html" output element.

    Returns:
        A ``ParseResult`` wrapping the constructed output; always a
        match.
    """
    markup = six.text_type(element)
    output = self.construct_output(element, "raw_html", markup)
    return ParseResult(output, True)
def parse(self, tag):
    """Return a matching "foo" element whose "bar" is the text of ``tag``."""
    output = {"type": "foo", "bar": tag.text}
    return ParseResult(output, True)
def parse(self, element, *args, **kwargs):
    """Construct output for ``element`` only when it has a ``src``.

    Returns:
        A ``ParseResult`` that is always a match; its output is ``None``
        when the ``src`` attribute is missing or empty.
    """
    has_source = element.attrs.get('src')
    output = self.construct_output(element) if has_source else None
    return ParseResult(output, True)