def init_app(app):
    # register new parsers
    register_feed_parser(BusinessWireParser.NAME, BusinessWireParser())
    register_feed_parser(GlobeNewswireParser.NAME, GlobeNewswireParser())
    # override core parsers
    registered_feed_parsers[CP_APMediaFeedParser.NAME] = CP_APMediaFeedParser()
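# A minimal standalone sketch (not from the source) of why the two registration
# styles above differ: register_feed_parser raises AlreadyExistsError for a
# duplicate name, while writing straight into the registered_feed_parsers dict
# silently replaces a core parser. The dict and function below are stubs
# standing in for superdesk.io.registry.
registered = {}


def register(name, parser):
    if name in registered:
        raise ValueError('AlreadyExistsError: {}'.format(name))
    registered[name] = parser


register('demo', object())     # normal registration, guarded against duplicates
registered['demo'] = object()  # forced override, as done for CP_APMediaFeedParser above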
        item['urgency'] = 5
        item['pubstatus'] = 'usable'
        item['anpa_category'] = [{'qcode': 'e'}]
        item['subject'] = [{'qcode': '01000000', 'name': 'arts, culture and entertainment'}]

    def parse_news_management(self, item, entry):
        news_mgmt_el = entry.find(self.qname('NewsManagement', self.WENN_NM_NS))
        if news_mgmt_el is not None:
            item['firstcreated'] = self.datetime(self.get_elem_content(
                news_mgmt_el.find(self.qname('published', self.WENN_NM_NS))))
            item['versioncreated'] = self.datetime(self.get_elem_content(
                news_mgmt_el.find(self.qname('updated', self.WENN_NM_NS))))
            item['guid'] = self.get_elem_content(
                news_mgmt_el.find(self.qname('original_article_id', self.WENN_NM_NS)))

    def parse_content_management(self, item, entry):
        content_mgmt_el = entry.find(self.qname('ContentMetadata', self.WENN_CM_NS))
        if content_mgmt_el is not None:
            item['headline'] = self.get_elem_content(content_mgmt_el.find(self.qname('title', self.WENN_CM_NS)))
            item['abstract'] = self.get_elem_content(
                content_mgmt_el.find(self.qname('first_line', self.WENN_CM_NS)))

    def get_elem_content(self, elem):
        return elem.text if elem is not None else ''

    def datetime(self, string):
        return datetime.datetime.strptime(string, '%Y-%m-%dT%H:%M:%S+00:00').replace(tzinfo=utc)


register_feed_parser(WENNFeedParser.NAME, WENNFeedParser())
            'qcode': category,
            'name': category,
            'scheme': 'category'
        }]
        genre = "Nyheter"
        item['genre'] = [{
            'qcode': genre,
            'name': genre,
            'scheme': 'genre_custom'
        }]
        xhtml = [html.escape(article['biography']).replace('\n', '<br/>\n')]
        if photo_url is not None:
            label = "photo"
            xhtml.append('<a href="{url}">{label}</a>'.format(url=html.escape(photo_url), label=label))
        item['body_html'] = '<p>{}</p>'.format('\n<br/>\n'.join(xhtml))
        # format each piece separately so braces in user-supplied fields are not
        # re-interpreted as placeholders
        item['ednote'] = (
            "Kilder: \n" + article['further sources'] + '\n\n' +
            "Fødested: {}\n".format(article['birth place']) +
            "Sendt inn av: {}\n".format(article['author']) +
            "Godkjent: {}\n".format("Ja" if article['permission'] else "Nei") +
            "Epost: {}\n".format(article['email']) +
            "Tlf: {}".format(article['phone']))
        item['versioncreated'] = datetime.strptime(article['DateCreated'], DATETIME_FORMAT)
        item['sign_off'] = '*****@*****.**'
        return item


register_feed_parser(WufooFeedParser.NAME, WufooFeedParser())
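# Why the ednote above formats each piece separately (a standalone illustration
# with invented user text): running .format over the fully concatenated string
# would re-interpret any brace in user-supplied fields as a placeholder.
sources = 'see {this} link'
try:
    ('Kilder: ' + sources + '\nTlf: {}').format('12345678')
except KeyError as ex:
    print('format() choked on user text:', ex)                 # KeyError: 'this'
print('Kilder: ' + sources + '\nTlf: {}'.format('12345678'))   # formats only the literal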
                                                                  **query)
        if user:
            return user.get('_id')
        raise SkipValue()

    def get_task(self, tree):
        desk_name = tree.find('head/meta[@name="aap-desk"]')
        if desk_name is not None:
            desk = superdesk.get_resource_service('desks').find_one(req=None, name=desk_name.get('content'))
            if desk:
                task = {'desk': desk.get('_id')}
                stage_name = tree.find('head/meta[@name="aap-stage"]')
                if stage_name is not None:
                    lookup = {'$and': [{'name': stage_name.get('content')}, {'desk': str(desk.get('_id'))}]}
                    stages = superdesk.get_resource_service('stages').get(req=None, lookup=lookup)
                    if stages is not None and stages.count() == 1:
                        task['stage'] = stages[0].get('_id')
                return task
        raise SkipValue()


register_feed_parser(NITFFeedParser.NAME, NITFFeedParser())
            item['byline'] = element.text

        # headline
        element = newslines_el.find('HeadLine')
        if element is not None and element.text:
            item['headline'] = element.text.strip()

        # copyrightholder
        element = newslines_el.find('CopyrightLine')
        if element is not None and element.text:
            item['copyrightholder'] = element.text

        # line_type
        element = newslines_el.find('NewsLine/NewsLineType')
        if element is not None and element.get('FormalName'):
            item['line_type'] = element.get('FormalName')

        # line_text
        element = newslines_el.find('NewsLine/NewsLineText')
        if element is not None and element.text:
            item['line_text'] = element.text

        # keywords
        for element in newslines_el.findall('KeywordLine'):
            if element is not None and element.text:
                item.setdefault('keywords', []).append(element.text)


register_feed_parser(BelgaTipNewsMLOneFeedParser.NAME, BelgaTipNewsMLOneFeedParser())
        # Now need to append the issue time
        item['anpa_take_key'] = item['anpa_take_key'] + ' ' + time[0]

    def _set_headline(self, item, lines, time):
        city_code = lines[0][2:3]
        item['headline'] = item['slugline'] + ' ' + self.city_code_map.get(city_code, {}).get('state', '') + \
            ': Issued ' + time[0] + ', ' + time[1]

    def parse(self, filename, provider=None):
        try:
            with open(filename, 'r', encoding='latin-1') as f:
                lines = f.readlines()
                item = {}
                time_date = self._get_time(lines)
                self.set_item_defaults(item, filename)
                self._set_slugline(item, lines, provider)
                self._set_take_key(item, lines, time_date)
                self._set_headline(item, lines, time_date)
                item['body_html'] = '<pre>' + ''.join(lines[1:]) + '</pre>'
                return item
        except Exception as ex:
            logging.exception(ex)


try:
    register_feed_parser(BOMParser.NAME, BOMParser())
except AlreadyExistsError:
    pass
                item['authors'].append({
                    'uri': creator.get('uri'),
                    'role': role.text,
                })

    def _get_data_subject(self, subject_elt):
        qcode_parts = subject_elt.get('qcode', '').split(':')
        if len(qcode_parts) == 2 and qcode_parts[0] in self.SUBJ_QCODE_PREFIXES:
            scheme = self.SUBJ_QCODE_PREFIXES[qcode_parts[0]]
            if scheme:
                # we use the given name if it exists
                name_elt = subject_elt.find(self.qname('name'))
                name = name_elt.text if name_elt is not None and name_elt.text else ""
                try:
                    name = self.getVocabulary(scheme, qcode_parts[1], name)
                    subject_data = {
                        'qcode': qcode_parts[1],
                        'name': name,
                        "scheme": scheme
                    }
                    return subject_data
                except ValueError:
                    logger.info('Subject element rejected for "{code}"'.format(code=qcode_parts[1]))
        return None


register_feed_parser(BelgaDPANewsMLTwoFeedParser.NAME, BelgaDPANewsMLTwoFeedParser())
            if item.get(FORMAT) == FORMATS.PRESERVED:
                item['body_html'] = '<pre>' + html.escape(item['body_html']) + '</pre>'
            return self.post_process_item(item, provider)
        except Exception as ex:
            raise AAPParserError.ZCZCParserError(exception=ex, provider=provider)

    def set_item_defaults(self, item, provider):
        item['urgency'] = 5
        item['pubstatus'] = 'usable'
        item['versioncreated'] = utcnow()
        item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        item[FORMAT] = FORMATS.HTML

    def post_process_item(self, item, provider):
        """
        Applies the transformations required based on the provider of the content and the item itself
        :param item:
        :param provider:
        :return: item
        """
        return item


try:
    register_feed_parser(ZCZCFeedParser.NAME, ZCZCFeedParser())
except AlreadyExistsError:
    pass

register_feeding_service_error('file', AAPParserError.ZCZCParserError().get_error_description())
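# What the FORMATS.PRESERVED branch above produces, reduced to the stdlib call
# it relies on (standalone check):
import html

print('<pre>' + html.escape('AC/DC <take 2>') + '</pre>')  # <pre>AC/DC &lt;take 2&gt;</pre>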
    def set_item_defaults(self, item, filename):
        item['guid'] = filename + ':' + str(uuid.uuid4())
        item['urgency'] = 5
        item['pubstatus'] = 'usable'
        item['versioncreated'] = utcnow()
        item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        item['anpa_category'] = [{'qcode': 'f'}]
        item['subject'] = [{'qcode': '04000000', 'name': subject_codes['04000000']}]
        item[FORMAT] = FORMATS.HTML

    def datetime(self, string):
        """
        Convert the date string parsed from the source file to a datetime, assumes that the time is local to
        Sydney Australia
        :param string:
        :return:
        """
        # 06 June 2016 14:00:00
        local_dt = datetime.datetime.strptime(string, '%d %B %Y %H:%M:%S')
        local_tz = pytz.timezone('Australia/Sydney')
        aus_dt = local_tz.localize(local_dt, is_dst=None)
        return aus_dt.astimezone(pytz.utc)


try:
    register_feed_parser(NewsBitesFeedParser.NAME, NewsBitesFeedParser())
except AlreadyExistsError:
    pass

register_feeding_service_error('file', AAPParserError.NewsBitesParserError().get_error_description())
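# A standalone check of the Sydney-to-UTC conversion above (assumes pytz and an
# English locale for the month name): 14:00 AEST in June is UTC+10.
import datetime

import pytz

local_dt = datetime.datetime.strptime('06 June 2016 14:00:00', '%d %B %Y %H:%M:%S')
aus_dt = pytz.timezone('Australia/Sydney').localize(local_dt, is_dst=None)
print(aus_dt.astimezone(pytz.utc))  # 2016-06-06 04:00:00+00:00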
            # private editorial note
            try:
                private_note = xml.xpath("//iptc:edNote[@role='sttnote:private']",
                                         namespaces={'iptc': IPTC_NS})[0].text
            except IndexError:
                pass
            else:
                if private_note:
                    item.setdefault('extra', {})['sttnote_private'] = private_note

            return [item]
        except Exception as ex:
            raise ParserError.newsmlTwoParserError(ex, provider)

    def parse_inline_content(self, tree, item):
        html_elt = tree.find(self.qname('html'))
        body_elt = html_elt.find(self.qname('body'))
        body_elt = sd_etree.clean_html(body_elt)
        content = dict()
        content['contenttype'] = tree.attrib['contenttype']
        if len(body_elt) > 0:
            contents = [sd_etree.to_string(e, encoding='unicode', method="html") for e in body_elt]
            content['content'] = '\n'.join(contents)
        elif body_elt.text:
            content['content'] = '<pre>' + body_elt.text + '</pre>'
            content['format'] = CONTENT_TYPE.PREFORMATTED
        return content


register_feed_parser(STTNewsMLFeedParser.NAME, STTNewsMLFeedParser())
    def parse_content_set(self, tree, item):
        """Parse out the nitf like content.

        :param tree:
        :param item:
        :return: item populated with a headline and body_html
        """
        for content in tree.find(self.qname('contentSet')):
            if content.tag == self.qname('inlineXML') and content.attrib['contenttype'] == 'application/nitf+xml':
                nitf = content.find(self.qname('nitf'))
                head = nitf.find(self.qname('head'))
                item['headline'] = head.find(self.qname('title')).text
                body = nitf.find(self.qname('body'))
                content = self.parse_inline_content(body)
                item['body_html'] = content.get('content')

    def parse_inline_content(self, tree):
        body = tree.find(self.qname('body.content'))
        elements = []
        for elem in body:
            if elem.text:
                tag = elem.tag.rsplit('}')[1]
                elements.append('<%s>%s</%s>' % (tag, elem.text, tag))
        content = dict()
        content['content'] = "\n".join(elements)
        return content


register_feed_parser(ScoopNewsMLTwoFeedParser.NAME, ScoopNewsMLTwoFeedParser())
        :return:
        """
        keywords = self.get_keywords(docdata)
        return keywords[0] if len(keywords) > 0 else None

    def get_subjects(self, tree):
        """Finds all the IPTC subject tags in the passed tree and returns the parsed subjects.

        All entries will have both the name and qcode populated.

        :param tree:
        :return: a list of subject dictionaries
        """
        subjects = []
        qcodes = []  # we check qcodes to avoid duplicates
        for elem in tree.findall('head/tobject/tobject.subject[@tobject.subject.ipr="IPTC"]'):
            qcode = elem.get('tobject.subject.refnum')
            if qcode in qcodes:
                # we ignore duplicates
                continue
            else:
                qcodes.append(qcode)
            # if the subject_fields are not specified
            if not any(c['qcode'] == qcode for c in subjects) and subject_codes.get(qcode):
                subjects.append({'name': subject_codes[qcode], 'qcode': qcode})
        return subjects


register_feed_parser(EFEFeedParser.NAME, EFEFeedParser())
            byline = item.get('byline') or ''
            if byline:
                byline_prefix = ''
                if not byline.startswith('By '):
                    byline_prefix = 'By '
                byline_found = elem_text.lower().startswith('{}{}'.format(byline_prefix, byline).lower())
            else:
                byline_found = elem_text.startswith('By ')
                if byline_found:
                    item['byline'] = elem_text
            # remove the byline from the body text
            if not byline_found:
                elements.append('<%s>%s</%s>' % (tag, elem_text, tag))
            line_counter += 1
        content = dict()
        content['contenttype'] = tree.attrib['contenttype']
        if len(elements) > 0:
            content['content'] = "\n".join(elements)
        elif body.text:
            content['content'] = '<pre>' + body.text + '</pre>'
            content['format'] = CONTENT_TYPE.PREFORMATTED
        return content


register_feed_parser(ReutersNewsMLTwoFeedParser.NAME, ReutersNewsMLTwoFeedParser())
register_feeding_service_parser(ReutersHTTPFeedingService.NAME, ReutersNewsMLTwoFeedParser.NAME)
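# The byline test above, reduced to a standalone snippet with invented values:
# 'By ' is prepended to the ingested byline when missing, then the first body
# line is compared case-insensitively.
byline = 'Jane Doe'
elem_text = 'By Jane Doe'
byline_prefix = '' if byline.startswith('By ') else 'By '
print(elem_text.lower().startswith('{}{}'.format(byline_prefix, byline).lower()))  # True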
# AUTHORS and LICENSE files distributed with this source code, or
# at https://www.sourcefabric.org/superdesk/license*.

from superdesk.io.registry import register_feed_parser
from .text_file import TextFileParser
from superdesk.errors import AlreadyExistsError
import time


class TickerFileParser(TextFileParser):
    """
    A simple parser for ticker files, the headline gives an indication it is an AAP ticker story.
    The body of the story is the content for the ticker.
    """

    NAME = 'AAP Ticker File'

    def parse(self, filename, provider=None):
        item = super().parse(filename, provider)
        item['headline'] = 'AAP Ticker on {}'.format(time.strftime("%A %H:%M:%S", time.localtime()))
        return item

    def post_process_item(self, item):
        item['headline'] = item['headline'][:40]
        return item


try:
    register_feed_parser(TickerFileParser.NAME, TickerFileParser())
except AlreadyExistsError:
    pass
                              x['qcode'] == 'Results (sport)' and x['is_active']]
            self.truncate_fields(item)
            return item
        except Exception as ex:
            logging.exception(ex)

    def truncate_fields(self, item):
        """
        Given an item it will truncate the headline and slugline to the lengths defined in the auto publish
        validation schema
        :param item:
        :return:
        """
        lookup = {'act': 'auto_publish', 'type': CONTENT_TYPE.TEXT}
        validators = get_resource_service('validators').get(req=None, lookup=lookup)
        if validators.count():
            max_slugline_len = validators[0]['schema']['slugline']['maxlength']
            max_headline_len = validators[0]['schema']['headline']['maxlength']
            if 'headline' in item:
                item['headline'] = item['headline'][:max_headline_len] \
                    if len(item['headline']) > max_headline_len else item['headline']
            if 'slugline' in item:
                item['slugline'] = item['slugline'][:max_slugline_len] \
                    if len(item['slugline']) > max_slugline_len else item['slugline']


try:
    register_feed_parser(PDAResultsParser.NAME, PDAResultsParser())
except AlreadyExistsError:
    pass
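# The truncation above is a guarded slice; its core, standalone, with an
# invented maximum length:
max_headline_len = 10
headline = 'A very long headline'
print(headline[:max_headline_len] if len(headline) > max_headline_len else headline)  # A very lon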
        return item

    def _format_qcodes(self, items):
        return [{'name': item.get('name'), 'qcode': item.get('code')} for item in items]

    def datetime(self, string):
        try:
            return datetime.datetime.strptime(string, '%Y-%m-%dT%H:%M:%S+0000')
        except ValueError:
            return datetime.datetime.strptime(string, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=utc)

    def _parse_authors(self, authors):
        return [self._parse_author(author) for author in authors]

    def _parse_author(self, author):
        parsed = {
            'name': author['name'],
            'role': author.get('role', ''),
        }
        if author.get('avatar_url'):
            parsed['avatar_url'] = author['avatar_url']
        if author.get('biography'):
            parsed['biography'] = author['biography']
        return parsed


register_feed_parser(NINJSFeedParser.NAME, NINJSFeedParser())
    superdesk.privilege(name='planning_event_spike',
                        label='Planning - Spike Event Items',
                        description='Ability to spike an Event')

    superdesk.privilege(name='planning_event_unspike',
                        label='Planning - Unspike Event Items',
                        description='Ability to unspike an Event')

    superdesk.intrinsic_privilege(PlanningUnlockResource.endpoint_name, method=['POST'])
    superdesk.intrinsic_privilege(EventsUnlockResource.endpoint_name, method=['POST'])

    import planning.output_formatters  # noqa

    app.client_config['max_recurrent_events'] = get_max_recurrent_events(app)


register_feeding_service(EventFileFeedingService.NAME,
                         EventFileFeedingService(),
                         EventFileFeedingService.ERRORS)
register_feeding_service(EventHTTPFeedingService.NAME,
                         EventHTTPFeedingService(),
                         EventHTTPFeedingService.ERRORS)
register_feeding_service(EventEmailFeedingService.NAME,
                         EventEmailFeedingService(),
                         EventEmailFeedingService.ERRORS)
register_feed_parser(IcsTwoFeedParser.NAME, IcsTwoFeedParser())
register_feed_parser(NTBEventXMLFeedParser.NAME, NTBEventXMLFeedParser())
        the item body and populate the dateline location, it also populates the dateline source.
        If a dateline is matched the corresponding string is removed from the article text.

        :param item:
        :return:
        """
        lines = item["body_html"].splitlines()
        if lines:
            # expect the dateline in the first 5 lines, sometimes there is what appears to be a headline preceding it
            for line_num in range(0, min(len(lines), 5)):
                city, source, the_rest = lines[line_num].partition(" (dpa) - ")
                # test if we found a candidate and ensure that the city starts the line and is not crazy long
                if source and lines[line_num].find(city) == 0 and len(city.strip()) < 20:
                    cities = app.locators.find_cities()
                    located = [c for c in cities if c["city"].lower() == city.strip().lower()]
                    if "dateline" not in item:
                        item["dateline"] = {}
                    item["dateline"]["located"] = (
                        located[0]
                        if len(located) > 0
                        else {"city_code": city.strip(), "city": city.strip(), "tz": "UTC", "dateline": "city"}
                    )
                    item["dateline"]["source"] = "dpa"
                    item["dateline"]["text"] = city.strip()
                    item["body_html"] = item["body_html"].replace(city + source, "", 1)
                    break
        return item


register_feed_parser(DPAIPTC7901FeedParser.NAME, DPAIPTC7901FeedParser())
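# How the partition above splits a dpa dateline (standalone, invented line):
line = 'BERLIN (dpa) - Something happened on Monday.'
city, source, the_rest = line.partition(' (dpa) - ')
print((city, source, the_rest))
# ('BERLIN', ' (dpa) - ', 'Something happened on Monday.')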
            }
        elif self.missing_voc == "continue":
            return name
        else:
            raise RuntimeError("Unexpected missing_voc value: {}".format(self.missing_voc))
        try:
            items = voc["items"]
        except KeyError:
            logger.warning("Creating missing items for {qcode}".format(qcode=qcode))
            voc["items"] = items = []
        for item in items:
            if item["qcode"] == qcode:
                if item.get("is_active", True):
                    return item.get("name", name)
                else:
                    # the vocabulary exists but is disabled
                    raise ValueError
        items.append({"is_active": True, "name": name, "qcode": qcode})
        if create:
            vocabularies_service.post([voc])
        else:
            vocabularies_service.put(voc_id, voc)
        return name


register_feed_parser(NewsMLTwoFeedParser.NAME, NewsMLTwoFeedParser())
from aap.errors import AAPParserError
import superdesk


class ZCZCSportsResultsParser(ZCZCFeedParser):

    NAME = 'Sportsresults_zczc'

    def set_item_defaults(self, item, provider):
        super().set_item_defaults(item, provider)
        item['original_source'] = 'Sports Results'

    def post_process_item(self, item, provider):
        genre_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='genre')
        item['genre'] = [x for x in genre_map.get('items', [])
                         if x['qcode'] == 'Results (sport)' and x['is_active']]
        # If the format is HTML we need to convert the content
        if item[FORMAT] == FORMATS.HTML:
            item['body_html'] = '</p><p>'.join(item['body_html'].split('\n\n'))
            item['body_html'] = item['body_html'].replace('\n', '<br>').replace('\t', '')
            item['body_html'] = '<p>' + item['body_html'] + '</p>'
        return item


try:
    register_feed_parser(ZCZCSportsResultsParser.NAME, ZCZCSportsResultsParser())
except AlreadyExistsError:
    pass

register_feeding_service_error('file', AAPParserError.ZCZCParserError().get_error_description())
            item['body_html'] = '<pre>' + '\n'.join(lines[lines_to_remove:])
            # if the concatenation of the slugline and take key contains the phrase 'Brief Form'
            # change the category to h
            if (item.get(self.ITEM_SLUGLINE, '') + item.get(self.ITEM_TAKE_KEY, '')).lower().find('brief form') >= 0:
                item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'h'}]
            # Another exception
            if 'NZ/AUST FIELDS' in item.get('body_html', ''):
                item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'h'}]
            # if the item has been marked as convert to HTML then we need to use the racing reformat macro
            # to convert it.
            if lines[0] and lines[0].find('HH ') != -1:
                racing_reformat_macro(item)
            genre_map = get_resource_service('vocabularies').find_one(req=None, _id='genre')
            if genre_map:
                item['genre'] = [x for x in genre_map.get('items', [])
                                 if x['qcode'] == 'Racing Data' and x['is_active']]
            return item
        except Exception as ex:
            logger.exception(ex)


try:
    register_feed_parser(ZCZCRacingParser.NAME, ZCZCRacingParser())
except AlreadyExistsError:
    pass

register_feeding_service_error('file', AAPParserError.ZCZCParserError().get_error_description())
        :param elem:
        :return:
        """
        # Remove any leading numbers and split to list of words
        sluglineList = re.sub(r'^[\d.]+\W+', '', elem.text).split(' ')
        slugline = sluglineList[0].capitalize()
        if len(sluglineList) > 1:
            slugline = '{} {}'.format(slugline, ' '.join(sluglineList[1:]))
        return slugline

    def _get_pubstatus(self, elem):
        """Mark anything that is embargoed as usable, the editorial note still describes the embargo.

        :param elem:
        :return:
        """
        return 'usable' if elem.attrib['management-status'] == 'embargoed' else elem.attrib['management-status']

    def __init__(self):
        self.MAPPING = {'anpa_category': {'xpath': "head/meta[@name='category']", 'filter': self._category_mapping},
                        'slugline': {'xpath': 'head/title', 'filter': self._get_slugline},
                        'pubstatus': {'xpath': 'head/docdata', 'filter': self._get_pubstatus}}
        super().__init__()

    def parse(self, xml, provider=None):
        self.xml = xml
        return super().parse(xml, provider=provider)


register_feed_parser(PAFeedParser.NAME, PAFeedParser())
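# What the slugline clean-up above does to a typical numbered title
# (standalone, invented input):
import re

title = '2. POLICE Shooting latest'
words = re.sub(r'^[\d.]+\W+', '', title).split(' ')
slugline = words[0].capitalize()
if len(words) > 1:
    slugline = '{} {}'.format(slugline, ' '.join(words[1:]))
print(slugline)  # Police Shooting latest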
        date = date_parser(dateline, fuzzy=True).replace(tzinfo=utc)
        item['dateline']['date'] = date
        item['dateline']['source'] = source[:-4].strip()
        item['dateline']['text'] = dateline.strip()

        # Attempt to set the city data to the dateline.located key
        cities = app.locators.find_cities()
        for city in dateline.replace(' and ', ',').split(','):
            located = [c for c in cities if c['city'].lower() == city.strip().lower()]
            if len(located) > 0:
                item['dateline']['located'] = located[0]
                break
        if 'located' not in item['dateline']:
            city = dateline.split(',')[0]
            item['dateline']['located'] = {
                'city_code': city,
                'city': city,
                'tz': 'UTC',
                'dateline': 'city'
            }


try:
    register_feed_parser(AsiaNetFeedParser.NAME, AsiaNetFeedParser())
except AlreadyExistsError:
    pass

register_feeding_service_error('file', AAPParserError.AsiaNetParserError().get_error_description())
        return dateutil.parser.parse(value)

    def _subject_filter(self, qcode):
        try:
            subject = self.subjects_map[qcode]
        except KeyError:
            return None
        else:
            if not subject.get('is_active', False):
                return None
            name = subject.get('name', '')
        return {'qcode': qcode, 'name': name, 'scheme': 'subject_custom'}

    def _publish_date_filter(self, date_string):
        local = dateutil.parser.parse(date_string)
        return local_to_utc(self.TIMEZONE, local)

    def _set_headline(self, item, value):
        if not value:
            # if there is no headline, we use first 100 chars of body
            # cf. SDNTB-481
            value = text_utils.get_text(item.get('body_html', ''), 'html')[:100]
        item['headline'] = value

    def _ednote_filter(self, ednote):
        return text_utils.get_text(ednote, lf_on_block=True).strip()


register_feed_parser(RitzauFeedParser.NAME, RitzauFeedParser())
                item[self.ITEM_SUBJECT] = [{'qcode': '15030001', 'name': subject_codes['15030001']}]
                item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'r'}]
            elif item.get(self.ITEM_SLUGLINE, '').find('AFL') != -1:
                item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 't'}]
                item[self.ITEM_SUBJECT] = [{'qcode': '15084000', 'name': subject_codes['15084000']}]
                self._set_results_genre(item)
            else:
                item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'f'}]
                item[self.ITEM_SUBJECT] = [{'qcode': '04000000', 'name': subject_codes['04000000']}]

            # truncate the slugline to the length defined in the validation schema
            lookup = {'act': 'auto_publish', 'type': CONTENT_TYPE.TEXT}
            validators = get_resource_service('validators').get(req=None, lookup=lookup)
            if validators.count():
                max_slugline_len = validators[0]['schema']['slugline']['maxlength']
                if 'slugline' in item:
                    item['slugline'] = item['slugline'][:max_slugline_len] \
                        if len(item['slugline']) > max_slugline_len else item['slugline']
            return item
        except Exception as ex:
            logger.exception(ex)


try:
    register_feed_parser(ZCZCPMFParser.NAME, ZCZCPMFParser())
except AlreadyExistsError:
    pass

register_feeding_service_error('file', AAPParserError.ZCZCParserError().get_error_description())
                    if line.decode('latin-1', 'replace') \
                            .find('The following information is not for publication') != -1 \
                            or line.decode('latin-1', 'replace').find(
                                'The following information is not intended for publication') != -1:
                        inNote = True
                        inText = False
                        item['ednote'] = ''
                        continue
                    item['body_html'] += line.decode('latin-1', 'replace')
                if inNote:
                    item['ednote'] += line.decode('latin-1', 'replace')
                    continue
                if inHeader:
                    if 'slugline' not in item:
                        item['slugline'] = ''
                    item['slugline'] += line.decode('latin-1', 'replace').rstrip('/\r\n')
                    continue

            return item
        except Exception as ex:
            raise ParserError.IPTC7901ParserError(exception=ex, provider=provider)

    def map_category(self, source_category):
        if source_category == 'x' or source_category == 'X':
            return 'i'
        else:
            return source_category


register_feed_parser(IPTC7901FeedParser.NAME, IPTC7901FeedParser())
        item['guid'] = filename + str(uuid.uuid4())

    def parse(self, filename, provider=None):
        """
        Attempt to parse the text file and return the item
        :param filename:
        :param provider:
        :return:
        """
        try:
            with open(filename, 'r', encoding='latin-1') as f:
                lines = f.readlines()
                item = {}
                self.set_item_defaults(item, filename)
                text = StringIO()
                if len(lines) > 0:
                    item['headline'] = lines[0].strip()
                for line in lines:
                    text.write(line)
                item['body_html'] = '<pre>' + html.escape(text.getvalue()) + '</pre>'
                return item
        except Exception as ex:
            logging.exception(ex)


try:
    register_feed_parser(TextFileParser.NAME, TextFileParser())
except AlreadyExistsError:
    pass
            'subject'))
        # check for sports using all ingested subjects
        item['subject'] = filter_missing_subjects(item.get('subject'))
        item['subject'].append(category)

        urgency = item.get('urgency', None)
        if urgency == 2:
            item['urgency'] = 3
        elif urgency == 4:
            item['urgency'] = 5

        set_default_service(item)

        if not item.get('headline') and item.get('body_html'):
            first_line = item.get('body_html').strip().split('\n')[0]
            parsed_headline = etree.parse_html(first_line, 'html')
            item['headline'] = etree.to_string(parsed_headline, method="text").strip().split('\n')[0]

        return item

    def parse_newslines(self, item, tree):
        super().parse_newslines(item, tree)
        newsline_type = tree.find('NewsItem/NewsComponent/NewsLines/NewsLine/NewsLineType[@FormalName="AdvisoryLine"]')
        if newsline_type is not None and newsline_type.getnext() is not None:
            item['ednote'] = newsline_type.getnext().text or ''


register_feed_parser(NTBAFPNewsMLParser.NAME, NTBAFPNewsMLParser())
        # just the video html tag with the source as the
        # video file path
        body = '''
        <br>
        <video controls="" height=400 width=500 src="%s"></video>
        <br>
        ''' % video_path
        # create new file so that the file feeding service won't complain
        cmd = shlex.split('touch %s' % file_path)
        subprocess.check_output(cmd)
        guid = str(uuid4())
        item = {
            'body_html': body,
            'headline': headline,
            'type': 'text',
            'versioncreated': utcnow(),
            'guid': guid
        }
        return item


register_feed_parser(SimpleVideoParser.NAME, SimpleVideoParser())
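# The touch-via-subprocess above could also be done in-process; a sketch of the
# same effect with pathlib (an alternative, not what the parser ships with):
from pathlib import Path

Path('/tmp/example-video-stub.txt').touch()  # hypothetical path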
                item[self.ITEM_SLUGLINE] = lines[1][:(lines[1].find(' Comment ') + 8)]
                item[self.ITEM_TAKE_KEY] = lines[1][(lines[1].find(' Comment ') + 9):]
            else:
                self._scan_lines(item, lines)

            # Truncate the slugline and headline to the lengths defined on the validators if required
            lookup = {'act': ITEM_PUBLISH, 'type': CONTENT_TYPE.TEXT}
            validators = superdesk.get_resource_service('validators').get(req=None, lookup=lookup)
            if validators.count():
                max_slugline_len = validators[0]['schema']['slugline']['maxlength']
                max_headline_len = validators[0]['schema']['headline']['maxlength']
                if self.ITEM_SLUGLINE in item and len(item[self.ITEM_SLUGLINE]) > max_slugline_len:
                    # the overflow of the slugline is dumped in the take key
                    item[self.ITEM_TAKE_KEY] = item.get(self.ITEM_SLUGLINE)[max_slugline_len:]
                    item[self.ITEM_SLUGLINE] = item[self.ITEM_SLUGLINE][:max_slugline_len]
                if self.ITEM_HEADLINE in item:
                    item[self.ITEM_HEADLINE] = item[self.ITEM_HEADLINE][:max_headline_len] \
                        if len(item[self.ITEM_HEADLINE]) > max_headline_len else item[self.ITEM_HEADLINE]
            return item
        except Exception as ex:
            logger.exception(ex)


try:
    register_feed_parser(ZCZCRacingParser.NAME, ZCZCRacingParser())
except AlreadyExistsError:
    pass

register_feeding_service_error('file', AAPParserError.ZCZCParserError().get_error_description())
                if BYLINE in user and user.get(BYLINE, ''):
                    item['byline'] = user.get(BYLINE)
                item[SIGN_OFF] = user.get(SIGN_OFF)

                # attempt to match the given desk name against the defined desks
                query = {'name': re.compile('^{}$'.format(mail_item.get('Desk', '')), re.IGNORECASE)}
                desk = superdesk.get_resource_service('desks').find_one(req=None, **query)
                if desk:
                    item['task'] = {'desk': desk.get('_id'), 'stage': desk.get('incoming_stage')}

                if 'Place' in mail_item:
                    locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='locators')
                    place = [x for x in locator_map.get('items', [])
                             if x['qcode'] == mail_item.get('Place', '').upper()]
                    if place is not None:
                        item['place'] = place

                if mail_item.get('Legal flag', '') == 'LEGAL':
                    item['flags'] = {'marked_for_legal': True}

                break

            return [item]
        except Exception as ex:
            raise IngestEmailError.emailParseError(ex, provider)


register_feed_parser(EMailRFC822FeedParser.NAME, EMailRFC822FeedParser())
                                           content_type, rendition_spec, url_for_media)
        item['renditions'] = renditions
        try:
            date_created, time_created = metadata[TAG.DATE_CREATED], metadata[TAG.TIME_CREATED]
        except KeyError:
            pass
        else:
            # we format proper ISO 8601 date so we can parse it with dateutil
            datetime_created = '{}-{}-{}T{}:{}:{}{}{}:{}'.format(date_created[0:4],
                                                                 date_created[4:6],
                                                                 date_created[6:8],
                                                                 time_created[0:2],
                                                                 time_created[2:4],
                                                                 time_created[4:6],
                                                                 time_created[6],
                                                                 time_created[7:9],
                                                                 time_created[9:])
            item['firstcreated'] = dateutil.parser.parse(datetime_created)
        # now we map IPTC metadata to superdesk metadata
        for source_key, dest_key in IPTC_MAPPING.items():
            try:
                item[dest_key] = metadata[source_key]
            except KeyError:
                continue
        return item


register_feed_parser(ImageIPTCFeedParser.NAME, ImageIPTCFeedParser())
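# A standalone run of the ISO 8601 assembly above, using sample IPTC values
# (DateCreated is CCYYMMDD, TimeCreated is HHMMSS±HHMM; assumes python-dateutil):
import dateutil.parser

date_created, time_created = '20160606', '140000+1000'
datetime_created = '{}-{}-{}T{}:{}:{}{}{}:{}'.format(
    date_created[0:4], date_created[4:6], date_created[6:8],
    time_created[0:2], time_created[2:4], time_created[4:6],
    time_created[6], time_created[7:9], time_created[9:])
print(datetime_created)                         # 2016-06-06T14:00:00+10:00
print(dateutil.parser.parse(datetime_created))  # 2016-06-06 14:00:00+10:00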
                elt.tag = 'blockquote'
            elif tag == 'MELLIS':
                elt.tag = 'h2'
            elif tag == 'FRAGA':
                elt.tag = 'p'
            elif tag == 'SVAR':
                elt.tag = 'p'
            elif tag == 'UL':
                elt.tag = 'ul'
            elif tag == 'LI':
                elt.tag = 'li'
            elif tag == 'TABELL':
                elt.tag = 'table'
            elif tag == 'TH':
                elt.tag = 'th'
            elif tag == 'TR':
                elt.tag = 'tr'
            elif tag == 'TD':
                elt.tag = 'td'
            else:
                logger.warning('unknown tag: {tag}'.format(tag=tag))
                elt.tag = 'p'
        div_elt = etree.Element('div')
        div_elt[:] = body_elt[:]
        contents = [etree.tostring(e, encoding='unicode', method='html') for e in div_elt]
        return {'content': '\n'.join(contents)}


register_feed_parser(TTNewsMLFeedParser.NAME, TTNewsMLFeedParser())
    def get_datetime(self, value):
        return dateutil.parser.parse(value)

    def _subject_filter(self, qcode):
        try:
            subject = self.subjects_map[qcode]
        except KeyError:
            return None
        else:
            if not subject.get('is_active', False):
                return None
            name = subject.get('name', '')
        return {'qcode': qcode, 'name': name, 'scheme': 'subject_custom'}

    def _publish_date_filter(self, date_string):
        dt = dateutil.parser.parse(date_string)
        return dt.replace(tzinfo=timezone('CET'))

    def _set_headline(self, item, value):
        if not value:
            # if there is no headline, we use first 100 chars of body
            # cf. SDNTB-481
            value = text_utils.get_text(item.get('body_html', ''), 'html')[:100]
        item['headline'] = value


register_feed_parser(RitzauFeedParser.NAME, RitzauFeedParser())
        localities = [l for l in self.localityHierarchy if stadium.raw.get('address', {}).get(l)]
        areas = [a for a in self.areaHierarchy if stadium.raw.get('address', {}).get(a)]
        line = stadium.raw.get('address', {}).get('house_number', '')
        line = stadium.raw.get('address', {}).get('road', '') if line == '' else \
            line + ' ' + stadium.raw.get('address', {}).get('road', '')
        location['address'] = {
            'locality': stadium.raw.get('address', {}).get(localities[0], '') if len(localities) > 0 else '',
            'area': stadium.raw.get('address', {}).get(areas[0], '') if len(areas) > 0 else '',
            'country': stadium.raw.get('address', {}).get('country', ''),
            'postal_code': stadium.raw.get('address', {}).get('postcode', ''),
            'external': {'nominatim': stadium.raw},
            'line': [line]
        }
        location['name'] = stadiums[0].raw.get('address', {}).get(stadiums[0].raw.get('type', 'stadium'), '')
        ret = locations_service.post([location])
        location = locations_service.find_one(req=None, _id=ret[0])
        item['location'] = [{
            'name': location.get('name'),
            'address': {
                'line': location.get('address', {}).get('line', []),
                'area': location.get('address', {}).get('area', ''),
                'locality': location.get('address', {}).get('locality', ''),
                'postal_code': location.get('address', {}).get('postal_code', ''),
                'country': location.get('address', {}).get('country', ''),
            },
            'qcode': location.get('guid')
        }]


register_feed_parser(AAPSportsFixturesParser.NAME, AAPSportsFixturesParser())
        if json.get('embargotime'):
            main['embargo'] = json['embargotime']
        main['type'] = self._convert_type(json['type'])
        return main

    def _parse_date(self, string):
        """Attempts to parse BBC ninjs time in format YYYY-MM-DDTHH:MM:SS

        :param string:
        :return: datetime
        """
        return datetime.datetime.strptime(string, '%Y-%m-%dT%H:%M:%S')

    def _convert_type(self, content_type):
        """Attempts to convert BBC's types to standard ninjs types

        :param content_type:
        :return:
        """
        if content_type == 'image':
            return CONTENT_TYPE.PICTURE
        if content_type == 'story' or content_type == 'advisory':
            return CONTENT_TYPE.TEXT
        logger.error("could not find content type ({}), defaulting to text".format(content_type))
        return CONTENT_TYPE.TEXT


register_feed_parser(BBCNINJSFeedParser.NAME, BBCNINJSFeedParser())
        :param dict item: The item where the data will be stored
        :param str header: The header of the file
        """
        source = 'anpa_take_key'
        for line in header.split('\n'):
            if line.lower().startswith('media release'):
                break
            if source not in item:
                item[source] = line
            else:
                item[source] += line

        # Clean up the header entries
        item['anpa_take_key'] = item['anpa_take_key'][8:].replace('\n', '').strip()
        item['headline'] = 'Media Release: ' + item.get('anpa_take_key', '')
        item['slugline'] = 'AAP Medianet'

        self._truncate_headers(item)


try:
    register_feed_parser(AsiaNetFeedParser.NAME, AsiaNetFeedParser())
except AlreadyExistsError:
    pass

register_feeding_service_error('file', AAPParserError.AsiaNetParserError().get_error_description())
            'qcode': '04000000',
            'name': subject_codes['04000000']
        }]
        item[FORMAT] = FORMATS.HTML

    def datetime(self, string):
        """
        Convert the date string parsed from the source file to a datetime, assumes that the time is local to
        Sydney Australia
        :param string:
        :return:
        """
        # 06 June 2016 14:00:00
        try:
            local_dt = datetime.datetime.strptime(string, '%d %B %Y %H:%M:%S')
        except ValueError:
            local_dt = datetime.datetime.strptime(string, '%d %b %Y %H:%M:%S')
        local_tz = pytz.timezone('Australia/Sydney')
        aus_dt = local_tz.localize(local_dt, is_dst=None)
        return aus_dt.astimezone(pytz.utc)


try:
    register_feed_parser(NewsBitesFeedParser.NAME, NewsBitesFeedParser())
except AlreadyExistsError:
    pass

register_feeding_service_error('file', AAPParserError.NewsBitesParserError().get_error_description())
            place_strs = item.pop('place').split(' ')
            for place in place_strs:
                if place in self.place_map:
                    replace = [x for x in locator_map.get('items', [])
                               if x['qcode'] == self.place_map.get(place, '').upper()]
                    if replace is not None:
                        item[self.ITEM_PLACE] = replace
                if place in self.subject_map:
                    if item.get(self.ITEM_SUBJECT) is None:
                        item[self.ITEM_SUBJECT] = []
                    item['subject'].append({'qcode': self.subject_map.get(place),
                                            'name': subject_codes[self.subject_map.get(place)]})
        return item


try:
    register_feed_parser(ZCZCMedianetParser.NAME, ZCZCMedianetParser())
except AlreadyExistsError:
    pass

register_feeding_service_error('file', AAPParserError.ZCZCParserError().get_error_description())
                        'qcode': qcode,
                        'name': sport,
                        'scheme': 'subject_custom'
                    })
                service = {'qcode': SERVICE_QCODE, 'name': self.service_name}
                item = {
                    'guid': event['uid'],
                    ITEM_TYPE: CONTENT_TYPE.EVENT,
                    'dates': {'start': event_start, 'end': event_end, 'tz': ''},
                    'name': name,
                    'slugline': sport,
                    'subject': subject,
                    'anpa_category': [service],
                    'calendars': [self.calendar_item],
                    'firstcreated': utcnow(),
                    'versioncreated': utcnow()
                }
                items.append(item)
            return items
        except Exception as ex:
            raise ParserError.parseMessageError(ex, provider)


register_feed_parser(NTBNIFSFeedParser.NAME, NTBNIFSFeedParser())
            item['body_html'] = '<p>{}</p>'.format(
                re.sub('<p> ', '<p>', item.get('body_html', '').replace('\n\n', '\n').replace('\n', '</p><p>')))

            if self.ITEM_PLACE in item:
                if item[self.ITEM_PLACE]:
                    item['headline'] = '{}: {}'.format(item[self.ITEM_PLACE], item.get(self.ITEM_HEADLINE, ''))
                    locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='locators')
                    place = [x for x in locator_map.get('items', [])
                             if x['qcode'] == item.get(self.ITEM_PLACE, '').upper()]
                    if place is not None:
                        item[self.ITEM_PLACE] = place
                else:
                    item.pop(self.ITEM_PLACE)

            genre_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='genre')
            item['genre'] = [x for x in genre_map.get('items', [])
                             if x['qcode'] == 'Broadcast Script' and x['is_active']]

            # Remove the attribution
            item['body_html'] = item.get('body_html', '').replace('<p>AAP RTV</p>', '')

            item['sign_off'] = 'RTV'
        except Exception as ex:
            logger.exception(ex)
        return item


try:
    register_feed_parser(ZCZCBOBParser.NAME, ZCZCBOBParser())
except AlreadyExistsError:
    pass

register_feeding_service_error('file', AAPParserError.ZCZCParserError().get_error_description())
        return True

    def format_subjects(self, subjects):
        """Map the ingested Subject Codes to their corresponding names as per IPTC Specification.

        :param subjects: list of dicts where each dict gives the category the article is mapped to.
        :type subjects: list
        :returns [{"qcode": "01001000", "name": "archaeology"}, {"qcode": "01002000", "name": "architecture"}]
        :rtype list
        """
        formatted_subjects = []

        def is_not_formatted(qcode):
            for formatted_subject in formatted_subjects:
                if formatted_subject['qcode'] == qcode:
                    return False
            return True

        for subject in subjects:
            formal_name = subject.get('FormalName')
            if formal_name and is_not_formatted(formal_name):
                formatted_subjects.append({'qcode': formal_name, 'name': subject_codes.get(formal_name, '')})
        return formatted_subjects


register_feed_parser(NewsMLOneFeedParser.NAME, NewsMLOneFeedParser())
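# A standalone rerun of the dedup-and-name mapping above, with a stubbed
# subject_codes table and a repeated FormalName:
subject_codes = {'01001000': 'archaeology'}
subjects = [{'FormalName': '01001000'}, {'FormalName': '01001000'}, {'FormalName': ''}]

formatted_subjects = []
for subject in subjects:
    formal_name = subject.get('FormalName')
    if formal_name and not any(s['qcode'] == formal_name for s in formatted_subjects):
        formatted_subjects.append({'qcode': formal_name, 'name': subject_codes.get(formal_name, '')})
print(formatted_subjects)  # [{'qcode': '01001000', 'name': 'archaeology'}]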
        html = etree.tostring(content, encoding="unicode")
        item['body_html'] = html

    def attachments_hook(self, item, attachments):
        """Attachments are parsed at the end

        if it's the first image found, it's used as feature media
        else it's used as embed and put at the end of body_html
        """
        for url in attachments:
            try:
                key, media_data = self._add_image(item, url)
            except Exception as e:
                logger.error(e)
                continue
            if key == 'featuremedia':
                # no need to embed the image for featuremedia
                continue
            embed_start = "<!--" + embed_TPL.format('START', key) + "-->"
            embed_end = "<!--" + embed_TPL.format('END', key) + "-->"
            new_url = media_data['renditions']['original']['href']
            img = '<img src={src} height="{height}" width="{width}">'.format(
                src=quoteattr(new_url),
                height=media_data['renditions']['original']['height'],
                width=media_data['renditions']['original']['width'])
            item['body_html'] += '<div>' + embed_start + img + embed_end + '</div>'


register_feed_parser(WPWXRFeedParser.NAME, WPWXRFeedParser())
        item['body_html'] = html

    def attachments_hook(self, item, attachments):
        """Attachments are parsed at the end

        if it's the first image found, it's used as feature media
        else it's used as embed and put at the end of body_html
        """
        for url in attachments:
            try:
                key, media_data = self._add_image(item, url)
            except Exception as e:
                logger.error(e)
                continue
            if key == 'featuremedia':
                # no need to embed the image for featuremedia
                continue
            embed_start = "<!--" + embed_TPL.format('START', key) + "-->"
            embed_end = "<!--" + embed_TPL.format('END', key) + "-->"
            _id = media_data['_id']
            new_url = url_for_media(_id)
            img = '<img src={src} height="{height}" width="{width}">'.format(
                src=quoteattr(new_url),
                height=media_data['renditions']['original']['height'],
                width=media_data['renditions']['original']['width'])
            item['body_html'] += '<div>' + embed_start + img + embed_end + '</div>'


register_feed_parser(WPWXRFeedParser.NAME, WPWXRFeedParser())
# at https://www.sourcefabric.org/superdesk/license

from superdesk.io.feed_parsers.newsml_1_2 import NewsMLOneFeedParser
from superdesk.io.registry import register_feed_parser
from superdesk.utc import utcnow
from pytz import utc


class AFPNewsMLOneFeedParser(NewsMLOneFeedParser):
    """AFP specific NewsML parser.

    Feed Parser which can parse the AFP feed basically it is in NewsML 1.2 format,
    but the firstcreated and versioncreated times are localised.
    """

    NAME = "afpnewsml12"
    label = "AFP News ML 1.2 Parser"

    def parse(self, xml, provider=None):
        item = super().parse(xml, provider)
        item["firstcreated"] = utc.localize(item["firstcreated"]) if item.get("firstcreated") else utcnow()
        item["versioncreated"] = utc.localize(item["versioncreated"]) if item.get("versioncreated") else utcnow()
        return item


register_feed_parser(AFPNewsMLOneFeedParser.NAME, AFPNewsMLOneFeedParser())
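# utc.localize only attaches tzinfo to the naive datetimes that NewsML 1.2
# parsing yields; it does not shift the wall-clock time (standalone check):
import datetime

from pytz import utc

print(utc.localize(datetime.datetime(2016, 6, 6, 14, 0, 0)))  # 2016-06-06 14:00:00+00:00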
        item['priority'] = item['urgency']
        item['byline'] = ', '.join(article.get('authors', []))
        for category_qcode in [c for c in article.get('categories', [])
                               if c in self._vocabularies['anp_genres']]:
            item.setdefault('subject', []).append({
                'name': self._vocabularies['anp_genres'][category_qcode]['name'],
                'qcode': category_qcode,
                'scheme': 'anp_genres'
            })
        for keyword in article.get('keywords', []):
            item.setdefault('keywords', []).append(keyword)

        # fetch media if item contains a media_link
        if article.get('media_link'):
            self._add_featuremedia(provider, item, article['media_link'])

        return item

    def _parse_date(self, string):
        return datetime.datetime.strptime(string, '%Y-%m-%dT%H:%M:%SZ')


register_feed_parser(ANPNewsApiFeedParser.NAME, ANPNewsApiFeedParser())
        return item

    def parse_date_time(self, date, time):
        if not date or not time:
            return
        datetime_string = '{}T{}'.format(date, time)
        try:
            return datetime.strptime(datetime_string, self.DATETIME_FORMAT)
        except ValueError:
            try:
                return arrow.get(datetime_string).datetime
            except ValueError:
                return

    def parse_meta(self, item, metadata):
        datetime_created = self.parse_date_time(metadata.get(TAG.DATE_CREATED), metadata.get(TAG.TIME_CREATED))
        if datetime_created:
            item['firstcreated'] = datetime_created
        # now we map IPTC metadata to superdesk metadata
        for source_key, dest_key in self.IPTC_MAPPING.items():
            try:
                item[dest_key] = metadata[source_key]
            except KeyError:
                continue
        return item


register_feed_parser(ImageIPTCFeedParser.NAME, ImageIPTCFeedParser())
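# The arrow fallback above accepts lenient ISO 8601 input that the strict
# strptime format rejects (standalone check; assumes the arrow package):
import arrow

print(arrow.get('2016-06-06T14:00:00+10:00').datetime)  # 2016-06-06 14:00:00+10:00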
    Feed Parser which can parse STT variant of NewsML
    """

    NAME = 'ntb_sttnewsml'
    label = "NTB STT NewsML"

    def can_parse(self, xml):
        return xml.tag.endswith('newsItem')

    def parse(self, xml, provider=None):
        try:
            item = super().parse(xml, provider)[0]
            # SDNTB-462 requires that slugline is removed
            del item['slugline']
            sport = bool(self.root.xpath('//iptc:subject[@type="cpnat:abstract" and @qcode="sttsubj:15000000"]',
                                         namespaces={'iptc': IPTC_NS}))
            cat = utils.SPORT_CATEGORY if sport else utils.DEFAULT_CATEGORY
            category = {'qcode': cat, 'name': cat, 'scheme': 'category'}
            item['subject'] = utils.filter_missing_subjects(item.get('subject'))
            item['subject'].append(category)
            utils.set_default_service(item)
            return [item]
        except Exception as ex:
            raise ParserError.newsmlTwoParserError(ex, provider)


register_feed_parser(NTBSTTNewsMLFeedParser.NAME, NTBSTTNewsMLFeedParser())
            m = re.match('[A-Z]{2}-[A-Z]{2}--([a-z-0-9.]+)', header_lines[0], flags=re.I)
            if m:
                item['slugline'] = m.group(1)

            # ednote
            self._parse_ednote(header_lines, item)

            return item
        except Exception as ex:
            raise ParserError.anpaParseFileError(file_path, ex)

    def _parse_ednote(self, header_lines, item):
        for line in header_lines:
            m = re.search("EDITOR'S NOTE _(.*)", line)
            if m:
                item['ednote'] = m.group(1).strip()

    def map_priority(self, source_priority):
        mapping = {
            'f': Priority.Flash.value,
            'u': Priority.Urgent.value,
            'b': Priority.Three_Paragraph.value,
            'z': Priority.Ordinary.value
        }
        source_priority = source_priority.lower().strip() if isinstance(source_priority, str) else ''
        return mapping.get(source_priority, Priority.Ordinary.value)


register_feed_parser(ANPAFeedParser.NAME, ANPAFeedParser())
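# The priority mapping above, runnable standalone with a stub enum in place of
# superdesk's Priority (the numeric values here are illustrative):
from enum import Enum


class Priority(Enum):
    Flash = 1
    Urgent = 2
    Three_Paragraph = 3
    Ordinary = 6


mapping = {'f': Priority.Flash.value, 'u': Priority.Urgent.value,
           'b': Priority.Three_Paragraph.value, 'z': Priority.Ordinary.value}
print(mapping.get('F'.lower().strip(), Priority.Ordinary.value))  # 1
print(mapping.get('q', Priority.Ordinary.value))                  # 6 (default)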
        item['calendars'] = [c for c in calendars.get('items', [])
                             if c.get('qcode').lower() == 'abs statistics']
        return item

    def parse(self, filename, provider=None):
        default_item = self._set_default_item()
        items = []
        with open(filename, 'r', encoding='UTF-8') as f:
            csv_reader = csv.reader(f)
            for row in list(csv_reader)[1:]:
                if not len(row):
                    continue
                item = deepcopy(default_item)
                item[GUID_FIELD] = ('urn:www.abs.gov.au:' + row[0].split(' ')[0] +
                                    row[0].split(',')[-1]).replace('/', '-').replace(' ', '-')
                if row[5] == 'true':
                    start = datetime.strptime('{} 11:30'.format(row[1]), '%d/%m/%Y %H:%M')
                    end = datetime.strptime('{} 11:30'.format(row[1]), '%d/%m/%Y %H:%M')
                    item['dates'] = {
                        'start': local_to_utc(config.DEFAULT_TIMEZONE, start),
                        'end': local_to_utc(config.DEFAULT_TIMEZONE, end),
                        'tz': config.DEFAULT_TIMEZONE,
                    }
                item['name'] = ' '.join(row[0].split(' ')[1:])
                item['definition_short'] = row[0]
                items.append(item)
        return items


register_feed_parser(ABSCalendarCSVParser.NAME, ABSCalendarCSVParser())
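# How the GUID above is assembled from the first CSV column (standalone run
# with an invented row value):
row0 = '8501.0 Retail Trade, Australia'
guid = ('urn:www.abs.gov.au:' + row0.split(' ')[0] + row0.split(',')[-1]).replace('/', '-').replace(' ', '-')
print(guid)  # urn:www.abs.gov.au:8501.0-Australia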