def parse(self, xml, provider=None):
    """Turn a NewsML document tree into an item dictionary.

    Every path is looked up relative to ``xml``. The helper methods on
    ``self`` fill in identifier, news lines and news management data.

    :param xml: root element of the NewsML document
    :param provider: ingest provider, only forwarded to the parser error
    :return: whatever ``self.populate_fields`` returns for the built item
    :raises ParserError: wraps any exception raised while parsing
    """
    item = {}
    try:
        self.root = xml

        source_el = xml.find('NewsItem/NewsComponent/AdministrativeMetadata/Source')
        if source_el is not None:
            party_el = source_el.find('Party')
            item['original_source'] = party_el.get('FormalName', '')

        transmission_el = xml.find('NewsEnvelope/TransmissionId')
        if transmission_el is not None:
            item['ingest_provider_sequence'] = transmission_el.text

        priority_el = xml.find('NewsEnvelope/Priority')
        if priority_el is None:
            raw_priority = None
        else:
            raw_priority = priority_el.text
        item['priority'] = self.map_priority(raw_priority)

        self.parse_news_identifier(item, xml)
        self.parse_newslines(item, xml)
        self.parse_news_management(item, xml)

        language_els = xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/Language')
        if language_els is not None:
            languages = self.parse_attributes_as_dictionary(language_els)
            item['language'] = languages[0]['FormalName'] if len(languages) else ''

        property_els = xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/Property')
        item['keywords'] = self.parse_attribute_values(property_els, 'Keyword')

        # Subject details come first, then subject matters, then plain subjects.
        subject_els = xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail')
        subject_els.extend(xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter'))
        subject_els.extend(xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject'))
        item['subject'] = self.format_subjects(subject_els)

        # The body is the serialized <body.content> element without its own tags.
        body_el = xml.find('NewsItem/NewsComponent/ContentItem/DataContent/nitf/body/body.content')
        body_markup = etree.tostring(body_el, encoding='unicode')
        body_markup = body_markup.replace('<body.content>', '')
        item['body_html'] = body_markup.replace('</body.content>', '')

        characteristic_els = xml.findall('NewsItem/NewsComponent/ContentItem/Characteristics/Property')
        word_counts = self.parse_attribute_values(characteristic_els, 'Words')
        item['word_count'] = word_counts[0] if len(word_counts) else None

        usage_el = xml.find('NewsItem/NewsComponent/RightsMetadata/UsageRights/UsageType')
        if usage_el is not None:
            # Keep a value that one of the helpers may already have set.
            item.setdefault('usageterms', usage_el.text)

        genre_els = xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/Genre')
        if genre_els is not None:
            item['genre'] = [{'name': genre_el.get('FormalName')} for genre_el in genre_els]

        return self.populate_fields(item)
    except Exception as ex:
        raise ParserError.newsmlOneParserError(ex, provider)
def parse_content(self, item, xml):
    """Store the NITF body markup of the document in ``item["body_html"]``.

    The ``body.content`` element is serialized and its own opening and
    closing tags are stripped, leaving only the inner markup.

    :param item: item dictionary updated in place
    :param xml: root element the body path is resolved against
    """
    body_content = xml.find("NewsItem/NewsComponent/ContentItem/DataContent/nitf/body/body.content")
    markup = etree.tostring(body_content, encoding="unicode")
    for wrapper_tag in ("<body.content>", "</body.content>"):
        markup = markup.replace(wrapper_tag, "")
    item["body_html"] = markup
def get_word_count(html):
    """Get word count for given html.

    The markup is wrapped in a ``<doc>`` root so that fragments with several
    top-level elements can be parsed, then reduced to its text content before
    counting. Markup that cannot be parsed is counted as plain text.

    :param html: html string to count
    :return: result of ``get_text_word_count`` for the extracted text
    """
    try:
        # ``fromstring`` parses the document in one go; ``fromstringlist``
        # expects a sequence of fragments and would consume a plain string
        # one character at a time.
        root = etree.fromstring('<doc>{0}</doc>'.format(html))
    except ParseError:
        return get_text_word_count(html)
    text = etree.tostring(root, encoding='unicode', method='text')
    return get_text_word_count(text)
def parse_content(self, item, xml):
    """Copy role-mapped news components and the source party into ``item``.

    Each third-level ``NewsComponent`` whose ``Role`` maps to a destination
    field through ``self.COMPONENT_ROLE_MAPPING`` is copied to that field:
    the headline as plain text, the abstract from the ``DataContent`` text,
    anything else as the HTML of the XHTML body children. The source party
    name, when present, is appended to ``item["subject"]``.

    :param item: item dictionary updated in place
    :param xml: root element of the document
    """
    for component in xml.findall("NewsItem/NewsComponent/NewsComponent/NewsComponent"):
        role = component.find("Role")
        dest = None if role is None else self.COMPONENT_ROLE_MAPPING.get(role.get("FormalName"))
        if not dest:
            # No role, or a role that is not mapped to any field.
            continue

        body = component.find("ContentItem/DataContent/xhtml:html/xhtml:body", namespaces=NS)

        if dest == "headline":
            headline_text = etree.tostring(body, encoding="unicode", method="text")
            item[dest] = headline_text.strip()
            continue

        if dest == "abstract":
            item[dest] = component.find("ContentItem/DataContent").text
            continue

        fragments = []
        for child in body:
            child_html = etree.tostring(child, encoding="unicode", method="html")
            # Drop the namespace declaration left on each serialized child.
            fragments.append(child_html.replace(' xmlns="http://www.w3.org/1999/xhtml"', ""))
        item[dest] = "\n".join(fragments)

    party = xml.find("NewsItem/NewsComponent/AdministrativeMetadata/Source/Party")
    party_name = party.get("FormalName") if party is not None else None
    if party_name:
        subjects = item.setdefault("subject", [])
        subjects.append(
            {
                "name": party_name,
                "qcode": party_name,
                "scheme": cp.ORGANISATION,
            }
        )
def get_body(self, news_item):
    """Build the body markup for a news item.

    The HTML stored as text in the ``announcement_html`` content item is
    parsed, its top-level ``h1`` is removed, the "Message Category" value is
    inserted as a leading paragraph and a trailing paragraph linking to the
    "nordicAgencyWebsite" URL is appended.

    :param news_item: element whose components are queried with XPath
    :return: serialized body, or an empty string when there is no content
    :raises ValueError: when the website URL does not start with ``http``
    """
    raw_contents = news_item.xpath(
        'NewsComponent/ContentItem[@Euid="announcement_html"]/DataContent/text()'
    )
    if not raw_contents:
        item_xml = etree.tostring(news_item, encoding="unicode")
        logger.warning("No content found in element: {xml}".format(xml=item_xml))
        return ""

    content_elt = sd_etree.parse_html(raw_contents[0])

    title_elt = content_elt.find('h1')
    if title_elt is not None:
        content_elt.remove(title_elt)

    categories = news_item.xpath(
        'NewsComponent/Metadata/Property[@FormalName="Message Category"]/@Value'
    )
    if categories:
        category_elt = etree.Element('p')
        category_elt.text = categories[0]
        content_elt.insert(0, category_elt)

    website_urls = news_item.xpath(
        'NewsComponent/Metadata/Property[@FormalName="nordicAgencyWebsite"]/@Value'
    )
    if website_urls:
        url = website_urls[0]
        if not url.startswith('http'):
            raise ValueError("Invalid url: {url}".format(url=url))
        link_paragraph = etree.SubElement(content_elt, "p")
        link_paragraph.text = 'Se saken i sin helhet: '
        anchor = etree.SubElement(link_paragraph, "a", attrib={'href': url})
        anchor.text = url

    return sd_etree.to_string(content_elt)
def parse(self, xml, provider=None):
    """Parse a NewsML document tree into an item dictionary.

    On top of the identifier, news lines and news management data filled in
    by the helper methods on ``self``, this sets the original source
    (defaulting to "ANA"), priority, language, IPTC subjects, body, word
    count and, when the City and Country properties match exactly one known
    city, the dateline.

    :param xml: root element of the NewsML document
    :param provider: ingest provider; its ``source`` is used for the dateline
    :return: whatever ``self.populate_fields`` returns for the built item
    :raises ParserError: wraps any exception raised while parsing
    """
    item = {}
    try:
        self.root = xml

        parsed_el = xml.find("NewsItem/NewsComponent/AdministrativeMetadata/Source/Party")
        if parsed_el is not None:
            item["original_source"] = parsed_el.attrib.get("FormalName", "ANA")

        parsed_el = xml.find("NewsEnvelope/Priority")
        item["priority"] = self.map_priority(parsed_el.text if parsed_el is not None else None)

        self.parse_news_identifier(item, xml)
        self.parse_newslines(item, xml)
        self.parse_news_management(item, xml)

        # findall() always returns a list, so no None check is needed here.
        language = self.parse_attributes_as_dictionary(
            xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/Language"))
        item["language"] = language[0]["FormalName"] if len(language) else ""

        subjects = xml.findall(
            'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail[@Scheme="IptcSubjectCodes"]')
        subjects += xml.findall(
            'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter[@Scheme="IptcSubjectCodes"]')
        subjects += xml.findall(
            'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject[@Scheme="IptcSubjectCodes"]')
        item["subject"] = self.format_subjects(subjects)

        # The body arrives as escaped markup inside <DataContent>: unescape
        # it, strip the wrapper tags and lower-case the paragraph tags.
        body_html = html.unescape(etree.tostring(
            xml.find("NewsItem/NewsComponent/NewsComponent/ContentItem/DataContent"),
            encoding="unicode"))
        for old, new in (("<DataContent>", ""), ("</DataContent>", ""),
                         ("<P>", "<p>"), ("</P>", "</p>")):
            body_html = body_html.replace(old, new)
        # Remove the agency copyright paragraph from the body.
        copyright_notice = ("<p>© ΑΠΕ-ΜΠΕ ΑΕ. Τα πνευματικά δικαιώματα ανήκουν στο "
                            "ΑΠΕ-ΜΠΕ ΑΕ και παραχωρούνται σε συνδρομητές μόνον "
                            "για συγκεκριμένη χρήση.</p>")
        item["body_html"] = body_html.replace(copyright_notice, "").strip()

        parsed_el = xml.findall(
            "NewsItem/NewsComponent/NewsComponent/ContentItem/Characteristics/Property")
        characteristics = self.parse_attribute_values(parsed_el, "WordCount")
        item["word_count"] = characteristics[0] if len(characteristics) else None

        # Extract the city and country for setting into the dateline. A
        # document without them is still parsed, just without a location.
        city_el = xml.find(
            'NewsItem/NewsComponent/DescriptiveMetadata/Property[@FormalName="City"]')
        country_el = xml.find(
            'NewsItem/NewsComponent/DescriptiveMetadata/Property[@FormalName="Country"]')
        if city_el is not None and country_el is not None:
            city = city_el.attrib.get("Value")
            # Anglicise the greek for Athens if required
            city = "Athens" if city == "Αθήνα" else city
            country = country_el.attrib.get("Value")
            # Normalise the country code
            country = "GR" if country == "GRC" else country

            cities = app.locators.find_cities()
            located = [
                c for c in cities
                if c["city"] == city and c["country_code"] == country
            ]
            if len(located) == 1:
                # provider defaults to None, so read its source defensively.
                source = provider.get("source") if provider else None
                dateline = item.setdefault("dateline", {})
                dateline["located"] = located[0]
                dateline["source"] = source
                dateline["text"] = format_dateline_to_locmmmddsrc(
                    located[0], dateline.get("date"), source)

        return self.populate_fields(item)
    except Exception as ex:
        raise ParserError.newsmlOneParserError(ex, provider)
def parse(self, xml, provider=None):
    """Build one item from a NewsML document holding one or more ``NewsItem``.

    The item starts from fixed category, genre, subject and editorial note
    values. Among the ``NewsItem`` elements one is selected by language and
    handed to ``self.do_mapping`` to fill the item.

    :param xml: root element of the document; it is copied, not modified
    :param provider: ingest provider, only forwarded to the parser error
    :return: a list containing the single parsed item
    :raises ParserError: wraps any exception raised while parsing, including
        the case where no ``NewsItem`` declares a language
    """
    item = {
        'versioncreated': utcnow(),
        'anpa_category': [{
            "name": "Formidlingstjenester",
            "qcode": "r"
        }],
        'genre': [{
            "name": "Fulltekstmeldinger",
            "qcode": "Fulltekstmeldinger",
            "scheme": "genre_custom"
        }],
        'subject': [{
            'qcode': 'Børsmelding',
            'name': 'Børsmelding',
            'scheme': 'category'
        }],
        'ednote': '*** Dette er en børsmelding formidlet av NTB pva. andre ***'
    }
    self.populate_fields(item)

    try:
        # we remove newsml namespace for convenience (to avoid to write prefix each time)
        # we deepcopy first to avoid modifying original item
        xml = deepcopy(xml)
        for elt in xml.iter():
            # nodes such as comments may expose a non-string tag; leave them alone
            if isinstance(elt.tag, str):
                elt.tag = elt.tag.replace('{' + NEWSML_NS + '}', '')

        news_items = xml.findall('NewsItem')
        # there may be several items (for different languages), we keep in order of
        # preference: Norwegian, English, first item (cf. SDNTB-573)
        selected = None
        for news_item in news_items:
            try:
                lang = news_item.xpath(
                    'NewsComponent/DescriptiveMetadata/Language/@FormalName',
                )[0]
            except IndexError:
                logger.warning(
                    "missing language in item, ignoring it.\nxml: {xml}".format(
                        xml=etree.tostring(news_item, encoding="unicode")))
                continue
            if selected is None or lang in ('no', 'en'):
                selected = news_item
            if lang == 'no':
                break

        if selected is None:
            # Report the whole document: the loop variable is unbound when
            # the document contains no NewsItem at all.
            document = etree.tostring(xml, encoding="unicode")
            logger.warning("can't find any valid item\nxml={xml}".format(xml=document))
            raise ParserError.parseFileError(source=document)

        self.do_mapping(item, selected)
        return [item]
    except Exception as ex:
        raise ParserError.newsmlOneParserError(ex, provider)
def parse(self, xml, provider=None):
    """Parse a NewsML document tree into an item dictionary.

    On top of the identifier, news lines and news management data filled in
    by the helper methods on ``self``, this sets the original source
    (defaulting to 'ANA'), priority, language, IPTC subjects, body, word
    count and, when the City and Country properties match exactly one known
    city, the dateline.

    :param xml: root element of the NewsML document
    :param provider: ingest provider; its ``source`` is used for the dateline
    :return: whatever ``self.populate_fields`` returns for the built item
    :raises ParserError: wraps any exception raised while parsing
    """
    item = {}
    try:
        self.root = xml

        parsed_el = xml.find('NewsItem/NewsComponent/AdministrativeMetadata/Source/Party')
        if parsed_el is not None:
            item['original_source'] = parsed_el.attrib.get('FormalName', 'ANA')

        parsed_el = xml.find('NewsEnvelope/Priority')
        item['priority'] = self.map_priority(parsed_el.text if parsed_el is not None else None)

        self.parse_news_identifier(item, xml)
        self.parse_newslines(item, xml)
        self.parse_news_management(item, xml)

        # findall() always returns a list, so no None check is needed here.
        language = self.parse_attributes_as_dictionary(
            xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/Language'))
        item['language'] = language[0]['FormalName'] if len(language) else ''

        subjects = xml.findall(
            'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail[@Scheme="IptcSubjectCodes"]')
        subjects += xml.findall(
            'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter[@Scheme="IptcSubjectCodes"]')
        subjects += xml.findall(
            'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject[@Scheme="IptcSubjectCodes"]')
        item['subject'] = self.format_subjects(subjects)

        # The body arrives as escaped markup inside <DataContent>: unescape
        # it, strip the wrapper tags and lower-case the paragraph tags.
        body_html = html.unescape(etree.tostring(
            xml.find('NewsItem/NewsComponent/NewsComponent/ContentItem/DataContent'),
            encoding='unicode'))
        for old, new in (('<DataContent>', ''), ('</DataContent>', ''),
                         ('<P>', '<p>'), ('</P>', '</p>')):
            body_html = body_html.replace(old, new)
        # Remove the agency copyright paragraph from the body.
        copyright_notice = ('<p>© ΑΠΕ-ΜΠΕ ΑΕ. Τα πνευματικά δικαιώματα ανήκουν στο '
                            'ΑΠΕ-ΜΠΕ ΑΕ και παραχωρούνται σε συνδρομητές μόνον '
                            'για συγκεκριμένη χρήση.</p>')
        item['body_html'] = body_html.replace(copyright_notice, '').strip()

        parsed_el = xml.findall(
            'NewsItem/NewsComponent/NewsComponent/ContentItem/Characteristics/Property')
        characteristics = self.parse_attribute_values(parsed_el, 'WordCount')
        item['word_count'] = characteristics[0] if len(characteristics) else None

        # Extract the city and country for setting into the dateline. A
        # document without them is still parsed, just without a location.
        city_el = xml.find(
            'NewsItem/NewsComponent/DescriptiveMetadata/Property[@FormalName="City"]')
        country_el = xml.find(
            'NewsItem/NewsComponent/DescriptiveMetadata/Property[@FormalName="Country"]')
        if city_el is not None and country_el is not None:
            city = city_el.attrib.get('Value')
            # Anglicise the greek for Athens if required
            city = 'Athens' if city == 'Αθήνα' else city
            country = country_el.attrib.get('Value')
            # Normalise the country code
            country = 'GR' if country == 'GRC' else country

            cities = app.locators.find_cities()
            located = [
                c for c in cities
                if c['city'] == city and c['country_code'] == country
            ]
            if len(located) == 1:
                # provider defaults to None, so read its source defensively.
                source = provider.get('source') if provider else None
                dateline = item.setdefault('dateline', {})
                dateline['located'] = located[0]
                dateline['source'] = source
                dateline['text'] = format_dateline_to_locmmmddsrc(
                    located[0], dateline.get('date'), source)

        return self.populate_fields(item)
    except Exception as ex:
        raise ParserError.newsmlOneParserError(ex, provider)
def parse(self, xml, provider=None):
    """Parse a NewsML document tree into an item dictionary.

    Every path is looked up relative to ``xml``. The helper methods on
    ``self`` fill in identifier, news lines and news management data; this
    method adds source, transmission id, priority, language, keywords,
    subjects, body, word count, usage terms and genre.

    :param xml: root element of the NewsML document
    :param provider: ingest provider, only forwarded to the parser error
    :return: whatever ``self.populate_fields`` returns for the built item
    :raises ParserError: wraps any exception raised while parsing
    """
    item = {}
    try:
        self.root = xml

        # Look the party up directly so a Source without a Party is skipped
        # instead of failing on None.
        parsed_el = xml.find('NewsItem/NewsComponent/AdministrativeMetadata/Source/Party')
        if parsed_el is not None:
            item['original_source'] = parsed_el.get('FormalName', '')

        parsed_el = xml.find('NewsEnvelope/TransmissionId')
        if parsed_el is not None:
            item['ingest_provider_sequence'] = parsed_el.text

        parsed_el = xml.find('NewsEnvelope/Priority')
        # Compare with None explicitly: an element without children is falsy,
        # so a plain truth test would always discard the priority text.
        item['priority'] = self.map_priority(
            parsed_el.text if parsed_el is not None else None)

        self.parse_news_identifier(item, xml)
        self.parse_newslines(item, xml)
        self.parse_news_management(item, xml)

        # findall() always returns a list, so no None check is needed here.
        language = self.parse_attributes_as_dictionary(
            xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/Language'))
        item['language'] = language[0]['FormalName'] if len(language) else ''

        keywords = xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/Property')
        item['keywords'] = self.parse_attribute_values(keywords, 'Keyword')

        subjects = xml.findall(
            'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail')
        subjects += xml.findall(
            'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter')
        subjects += xml.findall(
            'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject')
        item['subject'] = self.format_subjects(subjects)

        # The body is the serialized <body.content> element without its own tags.
        item['body_html'] = etree.tostring(
            xml.find('NewsItem/NewsComponent/ContentItem/DataContent/nitf/body/body.content'),
            encoding='unicode').replace('<body.content>', '').replace('</body.content>', '')

        parsed_el = xml.findall('NewsItem/NewsComponent/ContentItem/Characteristics/Property')
        characteristics = self.parse_attribute_values(parsed_el, 'Words')
        item['word_count'] = characteristics[0] if len(characteristics) else None

        parsed_el = xml.find('NewsItem/NewsComponent/RightsMetadata/UsageRights/UsageType')
        if parsed_el is not None:
            # Keep a value that one of the helpers may already have set.
            item.setdefault('usageterms', parsed_el.text)

        item['genre'] = [
            {'name': el.get('FormalName')}
            for el in xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/Genre')
        ]

        return self.populate_fields(item)
    except Exception as ex:
        raise ParserError.newsmlOneParserError(ex, provider)
def parser_contentitem(self, item, content_el):
    """Copy the data of a ``ContentItem`` element into ``item``.

    Reads the media type, mime type and format names, the entries of the
    ``Characteristics`` element and the NITF head, body head and body
    content found under ``DataContent``.

    Example of the expected element::

        <ContentItem>
            <MediaType FormalName="Text"/>
            <Format FormalName="NITF3.1"/>
            <Characteristics>
                <SizeInBytes>2520</SizeInBytes>
                <Property FormalName="Words" Value="420"/>
            </Characteristics>
            <DataContent>
                <nitf>
                    <body>
                        <body.content>
                            <p>...</p>
                        </body.content>
                    </body>
                </nitf>
            </DataContent>
        </ContentItem>

    :param item: item dictionary updated in place
    :param content_el: the ``ContentItem`` element, or None to do nothing
    :return: None
    """
    if content_el is None:
        return

    # Elements whose FormalName attribute maps straight to an item field.
    for tag, field in (('MediaType', 'type'), ('MimeType', 'mimetype'), ('Format', 'format')):
        node = content_el.find(tag)
        if node is not None:
            item[field] = node.get('FormalName', '')

    characteristics_el = content_el.find('Characteristics')
    if characteristics_el is not None:
        characteristics = {}
        item['characteristics'] = characteristics

        size_el = characteristics_el.find('SizeInBytes')
        if size_el is not None:
            characteristics['size_bytes'] = size_el.text

        # A SizeInBytes property overrides the SizeInBytes element read above.
        property_fields = {
            'Words': 'word_count',
            'SizeInBytes': 'size_bytes',
            'Creator': 'creator',
            'Characters': 'characters',
        }
        for property_el in characteristics_el.findall('Property'):
            field = property_fields.get(property_el.attrib.get('FormalName'))
            if field is not None:
                characteristics[field] = property_el.attrib.get('Value')

    body_content_el = content_el.find('DataContent/nitf/body/body.content')
    if body_content_el is not None:
        body_markup = etree.tostring(body_content_el, encoding='unicode')
        body_markup = body_markup.replace('<body.content>', '')
        item['body_html'] = body_markup.replace('</body.content>', '')

    head_el = content_el.find('DataContent/nitf/head')
    if head_el is not None:
        item['header_content'] = etree.tostring(head_el, encoding='unicode')

    body_head_el = content_el.find('DataContent/nitf/body/body.head')
    if body_head_el is not None:
        item['body_head'] = etree.tostring(body_head_el, encoding='unicode')
def parse(self, xml, provider=None):
    """Parse a NewsML document tree into an item dictionary.

    Every path is looked up relative to ``xml``. The helper methods on
    ``self`` fill in identifier, news lines and news management data; this
    method adds source, transmission id, priority, language, keywords,
    subjects, body, word count, usage terms and genre.

    :param xml: root element of the NewsML document
    :param provider: ingest provider, only forwarded to the parser error
    :return: whatever ``self.populate_fields`` returns for the built item
    :raises ParserError: wraps any exception raised while parsing
    """
    item = {}
    try:
        self.root = xml

        # Look the party up directly so a Source without a Party is skipped
        # instead of failing on None.
        parsed_el = xml.find("NewsItem/NewsComponent/AdministrativeMetadata/Source/Party")
        if parsed_el is not None:
            item["original_source"] = parsed_el.get("FormalName", "")

        parsed_el = xml.find("NewsEnvelope/TransmissionId")
        if parsed_el is not None:
            item["ingest_provider_sequence"] = parsed_el.text

        parsed_el = xml.find("NewsEnvelope/Priority")
        # Compare with None explicitly: an element without children is falsy,
        # so a plain truth test would always discard the priority text.
        item["priority"] = self.map_priority(parsed_el.text if parsed_el is not None else None)

        self.parse_news_identifier(item, xml)
        self.parse_newslines(item, xml)
        self.parse_news_management(item, xml)

        # findall() always returns a list, so no None check is needed here.
        language = self.parse_attributes_as_dictionary(
            xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/Language")
        )
        item["language"] = language[0]["FormalName"] if len(language) else ""

        keywords = xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/Property")
        item["keywords"] = self.parse_attribute_values(keywords, "Keyword")

        subjects = xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail")
        subjects += xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter")
        subjects += xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject")
        item["subject"] = self.format_subjects(subjects)

        # The body is the serialized <body.content> element without its own tags.
        item["body_html"] = (
            etree.tostring(
                xml.find("NewsItem/NewsComponent/ContentItem/DataContent/nitf/body/body.content"),
                encoding="unicode",
            )
            .replace("<body.content>", "")
            .replace("</body.content>", "")
        )

        parsed_el = xml.findall("NewsItem/NewsComponent/ContentItem/Characteristics/Property")
        characteristics = self.parse_attribute_values(parsed_el, "Words")
        item["word_count"] = characteristics[0] if len(characteristics) else None

        parsed_el = xml.find("NewsItem/NewsComponent/RightsMetadata/UsageRights/UsageType")
        if parsed_el is not None:
            # Keep a value that one of the helpers may already have set.
            item.setdefault("usageterms", parsed_el.text)

        item["genre"] = [
            {"name": el.get("FormalName")}
            for el in xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/Genre")
        ]

        return self.populate_fields(item)
    except Exception as ex:
        raise ParserError.newsmlOneParserError(ex, provider)