from superdesk.io.feed_parsers.nitf import NITFFeedParser
from superdesk.io import register_feed_parser


class PAFeedParser(NITFFeedParser):
    """
    NITF parser extension for Press Association; it maps the category meta tag
    to an anpa category.
    """

    NAME = 'pa_nitf'

    def _category_mapping(self, elem):
        """
        Map the category supplied by PA to a best guess anpa_category in the system.

        :param elem:
        :return: anpa category list qcode
        """
        if elem.get('content') is not None:
            category = elem.get('content')[:1].upper()
            if category in {'S', 'R', 'F'}:
                return [{'qcode': 'S'}]
            if category == 'Z':
                return [{'qcode': 'V'}]
        return [{'qcode': 'I'}]

    def __init__(self):
        self.MAPPING = {'anpa_category': {'xpath': "head/meta/[@name='category']",
                                          'filter': self._category_mapping}}
        super().__init__()


register_feed_parser(PAFeedParser.NAME, PAFeedParser())
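# Illustration only (not part of the parser above): the mapping guesses the anpa qcode from the
# first letter of the PA "category" meta content; 'S', 'R' and 'F' all yield qcode 'S', 'Z'
# yields 'V' and anything else falls back to 'I'. The sample content values here are made up.
assert 'Racing'[:1].upper() in {'S', 'R', 'F'}
assert 'Finance'[:1].upper() in {'S', 'R', 'F'}
assert 'Zodiac'[:1].upper() == 'Z'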
    def parse_content_set(self, tree, item):
        """Parse out the nitf like content.

        :param tree:
        :param item:
        :return: item populated with a headline and body_html
        """
        for content in tree.find(self.qname('contentSet')):
            if content.tag == self.qname('inlineXML') and content.attrib['contenttype'] == 'application/nitf+xml':
                nitf = content.find(self.qname('nitf'))
                head = nitf.find(self.qname('head'))
                item['headline'] = head.find(self.qname('title')).text
                body = nitf.find(self.qname('body'))
                content = self.parse_inline_content(body)
                item['body_html'] = content.get('content')

    def parse_inline_content(self, tree):
        body = tree.find(self.qname('body.content'))
        elements = []
        for elem in body:
            if elem.text:
                tag = elem.tag.rsplit('}')[1]
                elements.append('<%s>%s</%s>' % (tag, elem.text, tag))

        content = dict()
        content['content'] = "\n".join(elements)
        return content


register_feed_parser(ScoopNewsMLTwoFeedParser.NAME, ScoopNewsMLTwoFeedParser())
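# Illustration only (not part of the parser above): parse_inline_content() strips the XML
# namespace from each element tag with rsplit('}') before wrapping the text back into markup.
# The namespace URI below is just an example.
assert '{http://iptc.org/std/nar/2006-10-01/}p'.rsplit('}')[1] == 'p'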
            # slugline
            if len(header_lines) > 1:
                m = re.match('[A-Z]{2}-[A-Z]{2}--([a-z-0-9.]+)', header_lines[0], flags=re.I)
                if m:
                    item['slugline'] = m.group(1)

            # ednote
            for line in header_lines:
                m = re.search("EDITOR'S NOTE _(.*)", line)
                if m:
                    item['ednote'] = m.group(1).strip()

            return item
        except Exception as ex:
            raise ParserError.anpaParseFileError(file_path, ex)

    def map_priority(self, source_priority):
        mapping = {
            'f': Priority.Flash.value,
            'u': Priority.Urgent.value,
            'b': Priority.Three_Paragraph.value,
            'z': Priority.Ordinary.value
        }
        source_priority = source_priority.lower().strip() if isinstance(source_priority, str) else ''
        return mapping.get(source_priority, Priority.Ordinary.value)


register_feed_parser(ANPAFeedParser.NAME, ANPAFeedParser())
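# Illustration only (not part of the parser above): the header pattern that yields the slugline,
# applied to a made-up ANPA header line.
import re

m = re.match('[A-Z]{2}-[A-Z]{2}--([a-z-0-9.]+)', 'BC-US--airlines.1stld', flags=re.I)
assert m and m.group(1) == 'airlines.1stld'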
"""Parse out the nitf like content. :param tree: :param item: :return: item populated with a headline and body_html """ for content in tree.find(self.qname('contentSet')): if content.tag == self.qname('inlineXML') and content.attrib[ 'contenttype'] == 'application/nitf+xml': nitf = content.find(self.qname('nitf')) head = nitf.find(self.qname('head')) item['headline'] = head.find(self.qname('title')).text body = nitf.find(self.qname('body')) content = self.parse_inline_content(body) item['body_html'] = content.get('content') def parse_inline_content(self, tree): body = tree.find(self.qname('body.content')) elements = [] for elem in body: if elem.text: tag = elem.tag.rsplit('}')[1] elements.append('<%s>%s</%s>' % (tag, elem.text, tag)) content = dict() content['content'] = "\n".join(elements) return content register_feed_parser(ScoopNewsMLTwoFeedParser.NAME, ScoopNewsMLTwoFeedParser())
        elements = []
        for elem in body:
            if elem.text:
                tag = elem.tag.rsplit('}')[1]
                elements.append('<%s>%s</%s>' % (tag, elem.text, tag))

        content = dict()
        content['contenttype'] = tree.attrib['contenttype']
        content['content'] = "\n".join(elements)
        return content

    def parse_remote_content(self, tree):
        content = dict()
        content['residRef'] = tree.attrib['residref']
        content['sizeinbytes'] = int(tree.attrib.get('size', '0'))
        content['rendition'] = tree.attrib['rendition'].split(':')[1]
        content['mimetype'] = tree.attrib['contenttype']
        content['href'] = tree.attrib.get('href', None)
        return content

    def datetime(self, string):
        return datetime.datetime.strptime(string, '%Y-%m-%dT%H:%M:%S.000Z')

    def get_literal_name(self, item):
        """Get name for item with fallback to literal attribute if name is not provided."""
        name = item.find(self.qname('name'))
        return name.text if name is not None else item.attrib.get('literal')


register_feed_parser(NewsMLTwoFeedParser.NAME, NewsMLTwoFeedParser())
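# Illustration only (not part of the parser above): the fixed timestamp layout that datetime()
# above parses; the sample value is made up.
import datetime

assert datetime.datetime.strptime('2015-03-09T16:32:59.000Z',
                                  '%Y-%m-%dT%H:%M:%S.000Z') == datetime.datetime(2015, 3, 9, 16, 32, 59)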
                        if BYLINE in user and user.get(BYLINE, ''):
                            item['byline'] = user.get(BYLINE)
                        item[SIGN_OFF] = user.get(SIGN_OFF)

                    # attempt to match the given desk name against the defined desks
                    query = {'name': re.compile('^{}$'.format(mail_item.get('Desk', '')), re.IGNORECASE)}
                    desk = superdesk.get_resource_service('desks').find_one(req=None, **query)
                    if desk:
                        item['task'] = {'desk': desk.get('_id'), 'stage': desk.get('incoming_stage')}
                    if 'Place' in mail_item:
                        locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='locators')
                        place = [x for x in locator_map.get('items', []) if
                                 x['qcode'] == mail_item.get('Place', '').upper()]
                        if place is not None:
                            item['place'] = place
                    if mail_item.get('Legal flag', '') == 'LEGAL':
                        item['flags'] = {'marked_for_legal': True}
                    break
            return [item]
        except Exception as ex:
            raise IngestEmailError.emailParseError(ex, provider)


register_feed_parser(EMailRFC822FeedParser.NAME, EMailRFC822FeedParser())
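# Illustration only (not part of the parser above): the case-insensitive exact-match pattern used
# to look up the desk named in the mail; 'Sports' is a made-up desk name.
import re

desk_pattern = re.compile('^{}$'.format('Sports'), re.IGNORECASE)
assert desk_pattern.match('SPORTS')
assert not desk_pattern.match('Sports Desk')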
        raise SkipValue()

    def _get_slugline(self, elem):
        """
        Capitalize the first word of the slugline (removing any leading digits).

        :param elem:
        :return:
        """
        # Remove any leading numbers and split to a list of words
        sluglineList = re.sub(r'^[\d.]+\W+', '', elem.text).split(' ')
        slugline = sluglineList[0].capitalize()
        if len(sluglineList) > 1:
            slugline = '{} {}'.format(slugline, ' '.join(sluglineList[1:]))
        return slugline

    def _get_pubstatus(self, elem):
        """
        Mark anything that is embargoed as usable; the editorial note still describes the embargo.

        :param elem:
        :return:
        """
        return 'usable' if elem.attrib['management-status'] == 'embargoed' else elem.attrib['management-status']

    def __init__(self):
        self.MAPPING = {'anpa_category': {'xpath': "head/meta/[@name='category']",
                                          'filter': self._category_mapping},
                        'slugline': {'xpath': 'head/title', 'filter': self._get_slugline},
                        'pubstatus': {'xpath': 'head/docdata', 'filter': self._get_pubstatus}}
        super().__init__()


register_feed_parser(PAFeedParser.NAME, PAFeedParser())
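# Illustration only (not part of the parser above): how _get_slugline() strips leading digits and
# capitalises the first word; the sample title is made up.
import re

words = re.sub(r'^[\d.]+\W+', '', '2 racing fields tuesday').split(' ')
assert '{} {}'.format(words[0].capitalize(), ' '.join(words[1:])) == 'Racing fields tuesday'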
            item['body_html'] = get_content(xml)
            item['place'] = get_places(docdata)
            item['keywords'] = get_keywords(docdata)

            if docdata.find('ed-msg') is not None:
                item['ednote'] = docdata.find('ed-msg').attrib.get('info')

            if xml.find('body/body.head/hedline/hl1') is not None:
                item['headline'] = xml.find('body/body.head/hedline/hl1').text
            else:
                if xml.find('head/title') is not None:
                    item['headline'] = xml.find('head/title').text

            elem = xml.find('body/body.head/abstract')
            item['abstract'] = elem.text if elem is not None else ''

            elem = xml.find('body/body.head/dateline/location/city')
            if elem is not None:
                self.set_dateline(item, city=elem.text)

            item['byline'] = get_byline(xml)

            parse_meta(xml, item)
            item.setdefault('word_count', get_word_count(item['body_html']))
            return item
        except Exception as ex:
            raise ParserError.nitfParserError(ex, provider)


register_feed_parser(NITFFeedParser.NAME, NITFFeedParser())
                            or line.decode('latin-1', 'replace').find(
                                'The following information is not intended for publication') != -1:
                        inNote = True
                        inText = False
                        item['ednote'] = ''
                        continue
                    item['body_html'] += line.decode('latin-1', 'replace')
                if inNote:
                    item['ednote'] += line.decode('latin-1', 'replace')
                    continue
                if inHeader:
                    if 'slugline' not in item:
                        item['slugline'] = ''
                    item['slugline'] += line.decode('latin-1', 'replace').rstrip('/\r\n')
                    continue

            return item
        except Exception as ex:
            raise ParserError.IPTC7901ParserError(exception=ex, provider=provider)

    def map_category(self, source_category):
        if source_category == 'x' or source_category == 'X':
            return 'i'
        else:
            return source_category


register_feed_parser(IPTC7901FeedParser.NAME, IPTC7901FeedParser())
"The following information is not intended for publication" ) != -1 ): inNote = True inText = False item["ednote"] = "" continue item["body_html"] += line.decode("latin-1", "replace") if inNote: item["ednote"] += line.decode("latin-1", "replace") continue if inHeader: if "slugline" not in item: item["slugline"] = "" item["slugline"] += line.decode("latin-1", "replace").rstrip("/\r\n") continue return item except Exception as ex: raise ParserError.IPTC7901ParserError(exception=ex, provider=provider) def map_category(self, source_category): if source_category == "x" or source_category == "X": return "i" else: return source_category register_feed_parser(IPTC7901FeedParser.NAME, IPTC7901FeedParser())
            if item.get(self.ITEM_SLUGLINE, '').find('Trot') != -1:
                item[self.ITEM_HEADLINE] = item.get(self.ITEM_SLUGLINE) + ' ' + item.get(self.ITEM_TAKE_KEY, '')
                item[self.ITEM_SUBJECT] = [{'qcode': '15030000', 'name': subject_codes['15030000']}]
                item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'r'}]
            elif item.get(self.ITEM_SLUGLINE, '').find('AFL') != -1:
                item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 't'}]
                item[self.ITEM_SUBJECT] = [{'qcode': '15084000', 'name': subject_codes['15084000']}]
            else:
                item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'f'}]
                item[self.ITEM_SUBJECT] = [{'qcode': '04000000', 'name': subject_codes['04000000']}]
        elif provider.get('source') == 'BRA':
            # It is from the Racing system
            item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'r'}]
            item[self.ITEM_SUBJECT] = [{'qcode': '15030001', 'name': subject_codes['15030001']}]
            lines = item['body_html'].split('\n')
            if lines[2] and lines[2].find(':SPORT -') != -1:
                item[self.ITEM_HEADLINE] = lines[2][9:]
            elif lines[1] and lines[1].find('RACING : ') != -1:
                item[self.ITEM_HEADLINE] = lines[1][8:]
            elif lines[0] and lines[0].find('YY FORM') != -1:
                item[self.ITEM_HEADLINE] = lines[1]
            elif lines[1] and lines[1].find(':POTTED :') != -1:
                item[self.ITEM_HEADLINE] = lines[1][9:]
            else:
                item[self.ITEM_HEADLINE] = lines[2]

        return item


register_feed_parser(ZCZCFeedParser.NAME, ZCZCFeedParser())
    def parse_news_management(self, item, entry):
        news_mgmt_el = entry.find(self.qname('NewsManagement', self.WENN_NM_NS))
        if news_mgmt_el:
            item['firstcreated'] = self.datetime(self.get_elem_content(
                news_mgmt_el.find(self.qname('published', self.WENN_NM_NS))))
            item['versioncreated'] = self.datetime(self.get_elem_content(
                news_mgmt_el.find(self.qname('updated', self.WENN_NM_NS))))
            item['guid'] = self.get_elem_content(
                news_mgmt_el.find(self.qname('original_article_id', self.WENN_NM_NS)))

    def parse_content_management(self, item, entry):
        content_mgmt_el = entry.find(self.qname('ContentMetadata', self.WENN_CM_NS))
        if content_mgmt_el:
            item['headline'] = self.get_elem_content(content_mgmt_el.find(self.qname('title', self.WENN_CM_NS)))
            item['abstract'] = self.get_elem_content(
                content_mgmt_el.find(self.qname('first_line', self.WENN_CM_NS)))
            item['keywords'] = [element.attrib.get('value')
                                for element in content_mgmt_el.findall(self.qname('tags', self.WENN_CM_NS) + '/' +
                                                                       self.qname('tag', self.WENN_CM_NS))
                                if element.attrib.get('value')]

    def get_elem_content(self, elem):
        return elem.text if elem is not None else ''

    def datetime(self, string):
        return datetime.datetime.strptime(string, '%Y-%m-%dT%H:%M:%S+00:00').replace(tzinfo=utc)


register_feed_parser(WENNFeedParser.NAME, WENNFeedParser())
#
# Copyright 2013, 2014 Sourcefabric z.u. and contributors.
#
# For the full copyright and license information, please see the
# AUTHORS and LICENSE files distributed with this source code, or
# at https://www.sourcefabric.org/superdesk/license

from superdesk.io.feed_parsers.newsml_1_2 import NewsMLOneFeedParser
from superdesk.io import register_feed_parser
from superdesk.utc import utcnow
from pytz import utc


class AFPNewsMLOneFeedParser(NewsMLOneFeedParser):
    """AFP specific NewsML parser.

    Feed parser which can parse the AFP feed; basically it is in NewsML 1.2 format,
    but the firstcreated and versioncreated times are localised.
    """

    NAME = 'afpnewsml12'

    def parse(self, xml, provider=None):
        item = super().parse(xml, provider)
        item['firstcreated'] = utc.localize(item['firstcreated']) if item.get('firstcreated') else utcnow()
        item['versioncreated'] = utc.localize(item['versioncreated']) if item.get('versioncreated') else utcnow()
        return item


register_feed_parser(AFPNewsMLOneFeedParser.NAME, AFPNewsMLOneFeedParser())
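# Illustration only (not part of the parser above): how a naive firstcreated/versioncreated value
# gets its UTC tzinfo attached by utc.localize(); the sample datetime is made up.
import datetime
from pytz import utc

localised = utc.localize(datetime.datetime(2014, 5, 2, 9, 30, 0))
assert localised.tzinfo is not None and localised.utcoffset().total_seconds() == 0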
                    if any(char.isdigit() for char in city):
                        return
                    cities = app.locators.find_cities()
                    located = [c for c in cities if c['city'].lower() == city.lower()]
                    item.setdefault('dateline', {})
                    item['dateline']['located'] = located[0] if len(located) > 0 else {'city_code': city,
                                                                                       'city': city,
                                                                                       'tz': 'UTC',
                                                                                       'dateline': 'city'}
                    item['dateline']['source'] = item.get('original_source', 'AP')
                    item['dateline']['text'] = format_dateline_to_locmmmddsrc(item['dateline']['located'],
                                                                              get_date(item['firstcreated']),
                                                                              source=item.get('original_source', 'AP'))
                    break
            return item
        except Exception:
            logging.exception('AP dateline extraction exception')


register_feed_parser(AP_ANPAFeedParser.NAME, AP_ANPAFeedParser())
"""Parse dateline from item body. This function attempts to parse a dateline from the first few lines of the item body and populate the dataline location, it also populates the dateline source. If a dateline is matched the coresponding string is removed from the article text. :param item: :return: """ lines = item['body_html'].splitlines() if lines: # expect the dateline in the first 5 lines, sometimes there is what appears to be a headline preceeding it. for line_num in range(0, min(len(lines), 5)): city, source, the_rest = lines[line_num].partition(' (dpa) - ') # test if we found a candidate and ensure that the city starts the line and is not crazy long if source and lines[line_num].find(city) == 0 and len(city.strip()) < 20: cities = app.locators.find_cities() located = [c for c in cities if c['city'].lower() == city.strip().lower()] if 'dateline' not in item: item['dateline'] = {} item['dateline']['located'] = located[0] if len(located) > 0 else {'city_code': city.strip(), 'city': city.strip(), 'tz': 'UTC', 'dateline': 'city'} item['dateline']['source'] = 'dpa' item['dateline']['text'] = city.strip() item['body_html'] = item['body_html'].replace(city + source, '', 1) break return item register_feed_parser(DPAIPTC7901FeedParser.NAME, DPAIPTC7901FeedParser())