def update_to_pass_validation(item, **kwargs):
    """Test macro that adjusts a text item so it will pass publication validation.

    Intended for testing auto publishing (publishing directly from ingest);
    at the moment virtually all content received from Reuters fails validation.

    :param item: the item to adjust in place
    :param kwargs: unused, accepted for macro-interface compatibility
    :return: the adjusted item, or None if an unexpected error occurred
    """
    try:
        # Read the publish-validation schema for text items to learn the limits.
        lookup = {'act': ITEM_PUBLISH, 'type': CONTENT_TYPE.TEXT}
        validators = superdesk.get_resource_service('validators').get(req=None, lookup=lookup)
        if validators.count():
            max_slugline_len = validators[0]['schema']['slugline']['maxlength']
            max_headline_len = validators[0]['schema']['headline']['maxlength']
            # Truncate the slugline/headline to the schema maximum lengths.
            item['slugline'] = item['slugline'][:max_slugline_len] \
                if len(item['slugline']) > max_slugline_len else item['slugline']
            item['headline'] = item['headline'][:max_headline_len] \
                if len(item['headline']) > max_headline_len else item['headline']
            if 'dateline' not in item:
                # Default the missing dateline to Sydney, NSW, AU.
                cities = app.locators.find_cities(country_code='AU', state_code='NSW')
                located = [c for c in cities if c['city'].lower() == 'sydney']
                if located:
                    item['dateline'] = {'date': item['firstcreated'], 'located': located[0]}
                    item['dateline']['source'] = item['source']
                    item['dateline']['text'] = format_dateline_to_locmmmddsrc(located[0],
                                                                              get_date(item['firstcreated']),
                                                                              source=item['source'])
        return item
    except Exception:  # narrowed from a bare except so SystemExit/KeyboardInterrupt propagate
        logging.exception('Test update to pass validation macro exception')
def test_format_dateline_to_format_when_city_state_and_country_are_present(self):
    """City, state and country should all be rendered into the dateline text."""
    located, expected_date, now_ts = self._get_located_and_current_utc_ts()
    located['dateline'] = "city,state,country"
    result = format_dateline_to_locmmmddsrc(located, now_ts)
    expected = 'SYDNEY, NSW, AU %s %s -' % (expected_date, ORGANIZATION_NAME_ABBREVIATION)
    self.assertEqual(result, expected)
def noise11_derive_metadata(item, **kwargs):
    """Derive metadata for NOISE11 content.

    By definition anything from NOISE11 will be entertainment, so set the
    category, subject and dateline appropriately.

    :param item: ingested item, updated in place
    :param kwargs: unused, accepted for macro-interface compatibility
    :return: the updated item, or None if an unexpected error occurred
    """
    try:
        if "anpa_category" not in item:
            category_map = superdesk.get_resource_service("vocabularies").find_one(req=None, _id="categories")
            if category_map:
                map_entry = next(
                    (code for code in category_map["items"] if code["qcode"] == "e" and code["is_active"]), None
                )
                # Guard against a missing/inactive "e" entry: previously this raised
                # TypeError (map_entry is None), which the except below swallowed,
                # so the macro returned None and all other updates were lost.
                if map_entry:
                    item["anpa_category"] = [{"qcode": "e", "name": map_entry["name"]}]
        if "subject" not in item:
            qcode = "01000000"  # IPTC subject code for arts/culture/entertainment
            item["subject"] = [{"qcode": qcode, "name": subject_codes[qcode]}]
        # Default the dateline to Sydney, NSW, AU when none is present.
        cities = find_cities(country_code="AU", state_code="NSW")
        located = [c for c in cities if c["city"].lower() == "sydney"]
        if located and "dateline" not in item:
            item["dateline"] = {"date": item["firstcreated"], "located": located[0]}
            item["dateline"]["source"] = item["source"]
            item["dateline"]["text"] = format_dateline_to_locmmmddsrc(
                located[0], get_date(item["firstcreated"]), source=item["source"]
            )
        return item
    except Exception as ex:
        logger.exception(ex)
def ap_derive_dateline(self, item):
    """Derive a dateline from an ' (AP) _ ' marker in the article body.

    :param item: item whose body_html may contain a dateline paragraph
    :return: item populated with a dateline (or None when the candidate
        city contains digits, preserving the original early-out)
    """
    try:
        html = item.get('body_html')
        if html:
            soup = BeautifulSoup(html, "html.parser")
            pars = soup.findAll('p')
            for par in pars:
                city, source, the_rest = par.get_text().partition(' (AP) _ ')
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(',')[0]
                    if any(char.isdigit() for char in city):
                        return
                    cities = app.locators.find_cities()
                    located = [c for c in cities if c['city'].lower() == city.lower()]
                    item.setdefault('dateline', {})
                    # fall back to a minimal UTC stub when the city is unknown
                    item['dateline']['located'] = located[0] if len(located) > 0 else {'city_code': city,
                                                                                      'city': city,
                                                                                      'tz': 'UTC',
                                                                                      'dateline': 'city'}
                    item['dateline']['source'] = item.get('original_source', 'AP')
                    item['dateline']['text'] = format_dateline_to_locmmmddsrc(item['dateline']['located'],
                                                                              get_date(item['firstcreated']),
                                                                              source=item.get('original_source', 'AP'))
                    break
        return item
    except Exception:  # narrowed from a bare except so SystemExit/KeyboardInterrupt propagate
        logging.exception('AP dateline extraction exception')
def derive_dateline(self, item):
    """Try to derive a dateline from the item's place.

    Only when exactly one city matches can we be sure we have the correct
    country; 'place' is always removed from the item afterwards.

    :param item:
    :return:
    """
    try:
        places = item.get('place', [])
        if len(places) == 1:
            candidates = app.locators.find_cities()
            city_name = places[0].get('name', '')
            if city_name:
                matches = [c for c in candidates if c['city'].lower() == city_name.lower()]
                if len(matches) == 1:
                    source = item.get('original_source', 'EFE')
                    dateline = item.setdefault('dateline', {})
                    dateline['located'] = matches[0]
                    dateline['source'] = source
                    dateline['text'] = format_dateline_to_locmmmddsrc(
                        dateline['located'], get_date(item['firstcreated']), source=source)
    except Exception as ex:
        logging.exception('EFE dateline extraction exception {}'.format(ex))
    finally:
        item.pop('place', None)
def reuters_derive_dateline(item, **kwargs):
    """Derive a dateline for a Reuters item from its body text.

    Most locations injected into the item by the parser are Bangalore, so
    look for a '(Reuters)' dateline in the first body paragraph and prefer
    that instead.

    :param item: ingested item, updated in place
    :param kwargs: unused, accepted for macro-interface compatibility
    :return: the updated item, or None on early exit / unexpected error
    """
    try:
        html = item.get("body_html")
        if html:
            soup = BeautifulSoup(html, "html.parser")
            pars = soup.findAll("p")
            if len(pars) >= 2:
                # Skip the byline paragraph when it is the first one.
                if BYLINE in item and item.get(BYLINE) in pars[0].get_text():
                    first = pars[1].get_text()
                else:
                    first = pars[0].get_text()
                city, source, the_rest = first.partition(" (Reuters) - ")
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(",")[0]
                    if any(char.isdigit() for char in city):
                        return
                    cities = find_cities()
                    located = [c for c in cities if c["city"].lower() == city.lower()]
                    # if not dateline we create one
                    if "dateline" not in item:
                        item["dateline"] = {}
                    # there is already a dateline that is not Bangalore don't do anything just return
                    elif (
                        "located" in item["dateline"]
                        and "BANGALORE" != item["dateline"]["located"].get("city").upper()
                    ):
                        return
                    item["dateline"]["located"] = (
                        located[0]
                        if len(located) > 0
                        else {"city_code": city, "city": city, "tz": "UTC", "dateline": "city"}
                    )
                    item["dateline"]["source"] = item.get("original_source", "Reuters")
                    item["dateline"]["text"] = format_dateline_to_locmmmddsrc(
                        item["dateline"]["located"],
                        get_date(item["firstcreated"]),
                        source=item.get("original_source", "Reuters"),
                    )
        return item
    except Exception:  # narrowed from a bare except so SystemExit/KeyboardInterrupt propagate
        logging.exception("Reuters dateline macro exception")
def reuters_derive_dateline(item, **kwargs):
    """Derive a dateline for a Reuters item from its body text.

    Most locations injected into the item by the parser are Bangalore, so
    look for a '(Reuters)' dateline in the first body paragraph and prefer
    that instead.

    :param item: ingested item, updated in place
    :param kwargs: unused, accepted for macro-interface compatibility
    :return: the updated item, or None on early exit / unexpected error
    """
    try:
        html = item.get('body_html')
        if html:
            parsed = parse_html(html, content='xml')
            pars = parsed.xpath('//p')
            if len(pars) >= 2:
                # Skip the byline paragraph when it is the first one.
                if BYLINE in item and item.get(BYLINE) in ''.join(pars[0].itertext()):
                    first = ''.join(pars[1].itertext())
                else:
                    first = ''.join(pars[0].itertext())
                city, source, the_rest = first.partition(' (Reuters) - ')
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(',')[0]
                    if any(char.isdigit() for char in city):
                        return
                    cities = app.locators.find_cities()
                    located = [c for c in cities if c['city'].lower() == city.lower()]
                    # if not dateline we create one
                    if 'dateline' not in item:
                        item['dateline'] = {}
                    # there is already a dateline that is not Bangalore don't do anything just return
                    elif 'located' in item['dateline'] and 'BANGALORE' != item['dateline']['located'].get(
                            'city').upper():
                        return
                    item['dateline']['located'] = located[0] if len(located) > 0 else {'city_code': city,
                                                                                      'city': city,
                                                                                      'tz': 'UTC',
                                                                                      'dateline': 'city'}
                    item['dateline']['source'] = item.get('original_source', 'Reuters')
                    item['dateline']['text'] = format_dateline_to_locmmmddsrc(item['dateline']['located'],
                                                                              get_date(item['firstcreated']),
                                                                              source=item.get('original_source',
                                                                                              'Reuters'))
        return item
    except Exception:  # narrowed from a bare except so SystemExit/KeyboardInterrupt propagate
        logging.exception('Reuters dateline macro exception')
def ap_derive_dateline(self, item):
    """This function looks for a dateline in the article body and uses that.

    :param item:
    :return: item populated with a dateline (or None when the candidate
        city contains digits, preserving the original early-out)
    """
    try:
        html = item.get('body_html')
        if html:
            parsed = parse_html(html, content='html')
            for par in parsed.xpath('/html/div/child::*'):
                if not par.text:
                    continue
                city, source, the_rest = par.text.partition(' (AP) _ ')
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(',')[0]
                    if any(char.isdigit() for char in city):
                        return
                    cities = app.locators.find_cities()
                    located = [c for c in cities if c['city'].lower() == city.lower()]
                    item.setdefault('dateline', {})
                    # fall back to a minimal UTC stub unless exactly one city matched
                    item['dateline']['located'] = located[0] if len(located) == 1 else {
                        'city_code': city, 'city': city, 'tz': 'UTC', 'dateline': 'city'}
                    item['dateline']['source'] = item.get('original_source', 'AP')
                    item['dateline']['text'] = format_dateline_to_locmmmddsrc(
                        item['dateline']['located'],
                        get_date(item['firstcreated']),
                        source=item.get('original_source', 'AP'))
                    break
        return item
    except Exception:  # narrowed from a bare except so SystemExit/KeyboardInterrupt propagate
        logging.exception('AP dateline extraction exception')
def get_item_from_template(template):
    """Build an item dict using the data from a content template.

    :param dict template
    """
    new_item = template.get('data', {})
    new_item[ITEM_STATE] = CONTENT_STATE.SUBMITTED
    new_item['task'] = {
        'desk': template.get('template_desk'),
        'stage': template.get('template_stage'),
    }
    new_item['template'] = template.get('_id')
    # creation timestamps must not be inherited from the template
    for stale_field in ('firstcreated', 'versioncreated'):
        new_item.pop(stale_field, None)
    # refresh the dateline date, and its text when a location is set
    dateline = new_item.get('dateline', {})
    dateline['date'] = utcnow()
    if dateline.get('located'):
        dateline['text'] = format_dateline_to_locmmmddsrc(dateline['located'], dateline['date'])
    return new_item
def set_dateline(item, city, source, set_date=False, text=None):
    """Set the dateline for item"""
    if not city:
        return
    matches = [c for c in app.locators.find_cities() if c['city'].lower() == city.lower()]
    dateline = item.setdefault('dateline', {})
    if matches:
        dateline['located'] = matches[0]
    else:
        # unknown city: fall back to a minimal located stub in UTC
        dateline['located'] = {'city_code': city, 'city': city, 'tz': 'UTC', 'dateline': 'city'}
    if set_date:
        created = get_date(item['firstcreated'])
        dateline['date'] = datetime.fromtimestamp(created.timestamp(),
                                                  tz=timezone(dateline['located']['tz']))
    dateline['source'] = source
    if text:
        dateline['text'] = text
    else:
        dateline['text'] = format_dateline_to_locmmmddsrc(dateline['located'],
                                                          get_date(item['firstcreated']),
                                                          source=source)
def noise11_derive_metadata(item, **kwargs):
    """Derive metadata for NOISE11 content.

    By definition anything from NOISE11 will be entertainment, so set the
    category, subject and dateline appropriately.

    :param item: ingested item, updated in place
    :param kwargs: unused, accepted for macro-interface compatibility
    :return: the updated item, or None if an unexpected error occurred
    """
    try:
        if 'anpa_category' not in item:
            category_map = superdesk.get_resource_service(
                'vocabularies').find_one(req=None, _id='categories')
            if category_map:
                map_entry = next(
                    (code for code in category_map['items']
                     if code['qcode'] == 'e' and code['is_active']), None)
                # Guard against a missing/inactive "e" entry: previously this raised
                # TypeError (map_entry is None), which the except below swallowed,
                # so the macro returned None and all other updates were lost.
                if map_entry:
                    item['anpa_category'] = [{
                        'qcode': 'e',
                        'name': map_entry['name']
                    }]
        if 'subject' not in item:
            qcode = '01000000'  # IPTC subject code for arts/culture/entertainment
            item['subject'] = [{'qcode': qcode, 'name': subject_codes[qcode]}]
        # Default the dateline to Sydney, NSW, AU when none is present.
        cities = find_cities(country_code='AU', state_code='NSW')
        located = [c for c in cities if c['city'].lower() == 'sydney']
        if located and 'dateline' not in item:
            item['dateline'] = {
                'date': item['firstcreated'],
                'located': located[0]
            }
            item['dateline']['source'] = item['source']
            item['dateline']['text'] = format_dateline_to_locmmmddsrc(
                located[0],
                get_date(item['firstcreated']),
                source=item['source'])
        return item
    except Exception as ex:
        logger.exception(ex)
def ap_derive_dateline(self, item):
    """This function looks for a dateline in the article body and uses that.

    :param item:
    :return: item populated with a dateline
    """
    try:
        body = item.get("body_html")
        if body:
            tree = parse_html(body, content="html")
            for node in tree.xpath("/div/child::*"):
                text = node.text
                if not text:
                    continue
                city, marker, _rest = text.partition(" (AP) _ ")
                if not marker:
                    continue
                # sometimes the city is followed by a comma and either a date or a state
                city = city.split(",")[0]
                if any(ch.isdigit() for ch in city):
                    return
                matches = [c for c in app.locators.find_cities() if c["city"].lower() == city.lower()]
                fallback = {"city_code": city, "city": city, "tz": "UTC", "dateline": "city"}
                src = item.get("original_source", "AP")
                dateline = item.setdefault("dateline", {})
                dateline["located"] = matches[0] if len(matches) == 1 else fallback
                dateline["source"] = src
                dateline["text"] = format_dateline_to_locmmmddsrc(
                    dateline["located"], get_date(item["firstcreated"]), source=src)
                break
        return item
    except Exception:
        logging.exception("AP dateline extraction exception")
def derive_dateline(self, item):
    """
    Attempt to derive a dateline using the place, only if there is exactly one
    match on the city can we be sure we have the correct country.

    :param item: item whose single 'place' entry supplies the candidate city
    :return: None; the item is updated in place
    """
    try:
        if len(item.get('place', [])) == 1:
            cities = app.locators.find_cities()
            # NOTE(review): no guard that the place actually has a name; an
            # empty string simply fails to match any city below.
            city = item.get('place', '')[0].get('name', '')
            located = [c for c in cities if c['city'].lower() == city.lower()]
            if len(located) == 1:
                item.setdefault('dateline', {})
                item['dateline']['located'] = located[0]
                item['dateline']['source'] = item.get('original_source', 'EFE')
                item['dateline']['text'] = format_dateline_to_locmmmddsrc(item['dateline']['located'],
                                                                          get_date(item['firstcreated']),
                                                                          source=item.get('original_source', 'EFE'))
                # NOTE(review): 'place' is only removed when a dateline was set;
                # a sibling revision pops it unconditionally in a finally block
                # with a default — confirm which behavior is intended.
                item.pop('place')
    except Exception:
        logging.exception('EFE dateline extraction exception')
def update_to_pass_validation(item, **kwargs):
    """Test macro that adjusts a text item so it will pass publication validation.

    Intended for testing auto publishing (publishing directly from ingest);
    at the moment virtually all content received from Reuters fails validation.

    :param item: the item to adjust in place
    :param kwargs: unused, accepted for macro-interface compatibility
    :return: the adjusted item, or None if an unexpected error occurred
    """
    try:
        # Read the publish-validation schema for text items to learn the limits.
        lookup = {'act': ITEM_PUBLISH, 'type': CONTENT_TYPE.TEXT}
        validators = superdesk.get_resource_service('validators').get(
            req=None, lookup=lookup)
        if validators.count():
            max_slugline_len = validators[0]['schema']['slugline']['maxlength']
            max_headline_len = validators[0]['schema']['headline']['maxlength']
            # Truncate the slugline/headline to the schema maximum lengths.
            item['slugline'] = item['slugline'][:max_slugline_len] \
                if len(item['slugline']) > max_slugline_len else item['slugline']
            item['headline'] = item['headline'][:max_headline_len] \
                if len(item['headline']) > max_headline_len else item['headline']
            if 'dateline' not in item:
                # Default the missing dateline to Sydney, NSW, AU.
                cities = find_cities(country_code='AU', state_code='NSW')
                located = [c for c in cities if c['city'].lower() == 'sydney']
                if located:
                    item['dateline'] = {
                        'date': item['firstcreated'],
                        'located': located[0]
                    }
                    item['dateline']['source'] = item['source']
                    item['dateline']['text'] = format_dateline_to_locmmmddsrc(
                        located[0],
                        get_date(item['firstcreated']),
                        source=item['source'])
        return item
    except Exception:  # narrowed from a bare except so SystemExit/KeyboardInterrupt propagate
        logging.exception('Test update to pass validation macro exception')
def parse(self, xml, provider=None):
    """Parse an ANA (Athens News Agency) NewsML document into an item dict.

    :param xml: root element of the NewsML document
    :param provider: ingest provider config; its 'source' feeds the dateline
    :return: the populated item, passed through self.populate_fields
    :raises ParserError: wraps any exception raised while parsing
    """
    item = {}
    try:
        self.root = xml
        # Source party -> original_source (defaults to "ANA").
        parsed_el = xml.find(
            "NewsItem/NewsComponent/AdministrativeMetadata/Source/Party")
        if parsed_el is not None:
            item["original_source"] = parsed_el.attrib.get(
                "FormalName", "ANA")
        parsed_el = xml.find("NewsEnvelope/Priority")
        item["priority"] = self.map_priority(
            parsed_el.text if parsed_el is not None else None)
        self.parse_news_identifier(item, xml)
        self.parse_newslines(item, xml)
        self.parse_news_management(item, xml)
        parsed_el = xml.findall(
            "NewsItem/NewsComponent/DescriptiveMetadata/Language")
        if parsed_el is not None:
            language = self.parse_attributes_as_dictionary(parsed_el)
            item["language"] = language[0]["FormalName"] if len(
                language) else ""
        # Subject codes may appear at three levels of the IPTC hierarchy.
        subjects = xml.findall(
            'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail[@Scheme="IptcSubjectCodes"]'
        )
        subjects += xml.findall(
            'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter[@Scheme="IptcSubjectCodes"]'
        )
        subjects += xml.findall(
            'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject[@Scheme="IptcSubjectCodes"]'
        )
        item["subject"] = self.format_subjects(subjects)
        # Body: serialise DataContent, strip the wrapper tags, lowercase <P>.
        item["body_html"] = (html.unescape(
            etree.tostring(xml.find(
                "NewsItem/NewsComponent/NewsComponent/ContentItem/DataContent"
            ), encoding="unicode")).replace(
                "<DataContent>", "").replace("</DataContent>", "").replace(
                    "<P>", "<p>").replace("</P>", "</p>"))
        # Strip the boilerplate Greek copyright notice.
        # NOTE(review): this literal was garbled at a chunk boundary in the
        # source under review — the exact text (and any embedded newline)
        # must be confirmed against the actual feed/repository.
        item["body_html"] = (item.get("body_html").replace(
            "<p>© ΑΠΕ-ΜΠΕ ΑΕ. "
            "Τα πνευματικά δικαιώματα ανήκουν στο "
            "ΑΠΕ-ΜΠΕ ΑΕ και παραχωρούνται σε συνδρομητές μόνον "
            "για συγκεκριμένη χρήση.</p>",
            "",
        ).strip())
        parsed_el = xml.findall(
            "NewsItem/NewsComponent/NewsComponent/ContentItem/Characteristics/Property"
        )
        characteristics = self.parse_attribute_values(
            parsed_el, "WordCount")
        item["word_count"] = characteristics[0] if len(
            characteristics) else None
        # Extract the city for setting into the dateline
        city = xml.find(
            'NewsItem/NewsComponent/DescriptiveMetadata/Property[@FormalName="City"]'
        ).attrib.get("Value")
        # Anglicise the greek for Athens if required
        city = "Athens" if city == "Αθήνα" else city
        country = xml.find(
            'NewsItem/NewsComponent/DescriptiveMetadata/Property[@FormalName="Country"]'
        ).attrib.get("Value")
        # Normalise the country code
        country = "GR" if country == "GRC" else country
        cities = app.locators.find_cities()
        located = [
            c for c in cities
            if c["city"] == city and c["country_code"] == country
        ]
        if len(located) == 1:
            item["dateline"]["located"] = located[0]
        # NOTE(review): item["dateline"] is assumed to already exist here
        # (presumably created by parse_newslines above) — confirm; otherwise
        # these lines raise KeyError, surfaced as a ParserError below.
        item["dateline"]["source"] = provider.get("source")
        item["dateline"]["text"] = format_dateline_to_locmmmddsrc(
            item["dateline"]["located"],
            item.get("dateline", {}).get("date"),
            provider.get("source"))
        return self.populate_fields(item)
    except Exception as ex:
        raise ParserError.newsmlOneParserError(ex, provider)
def _parse_formatted_email(self, data, provider):
    """Construct an item from an email that was constructed as a notification from a google form submission.

    The google form submits to a google sheet, this sheet creates the email
    as a notification.

    :param data: raw response parts as returned by the mail server
    :param provider: ingest provider config
    :return: A list of 1 item (or an empty list when the subject does not match)
    :raises IngestEmailError: wraps any exception raised while parsing
    """
    try:
        item = dict()
        item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        item["versioncreated"] = utcnow()
        for response_part in data:
            if isinstance(response_part, tuple):
                msg = email.message_from_bytes(response_part[1])
                # Check that the subject line matches what we expect, ignore it if not
                if self.parse_header(
                        msg["subject"]) != "Formatted Editorial Story":
                    return []
                item["guid"] = msg["Message-ID"]
                date_tuple = email.utils.parsedate_tz(msg["Date"])
                if date_tuple:
                    dt = datetime.datetime.utcfromtimestamp(
                        email.utils.mktime_tz(date_tuple))
                    dt = dt.replace(tzinfo=timezone("utc"))
                    item["firstcreated"] = dt
                for part in msg.walk():
                    if part.get_content_type() == "text/plain":
                        body = part.get_payload(decode=True)
                        # if we don't know the charset just have a go!
                        # NOTE(review): the first replace target below renders
                        # as a plain space in the source under review — it is
                        # most likely a non-breaking space garbled by
                        # extraction; confirm against the repository.
                        if part.get_content_charset() is None:
                            json_str = body.decode().replace("\r\n", "").replace(
                                " ", " ")
                        else:
                            charset = part.get_content_charset()
                            json_str = body.decode(charset).replace(
                                "\r\n", "").replace(" ", " ")
                        # The email body is a JSON dict of single-element lists.
                        mail_item = dict(
                            (k, v[0]) for k, v in json.loads(json_str).items())
                        self._expand_category(item, mail_item)
                        item["original_source"] = mail_item.get(
                            "Username", mail_item.get("Email Address", ""))
                        item["headline"] = mail_item.get("Headline", "")
                        item["abstract"] = mail_item.get("Abstract", "")
                        item["slugline"] = mail_item.get("Slugline", "")
                        item["body_html"] = "<p>" + mail_item.get(
                            "Body", "").replace("\n", "</p><p>") + "</p>"
                        default_source = app.config.get(
                            "DEFAULT_SOURCE_VALUE_FOR_MANUAL_ARTICLES")
                        # Dateline: match the supplied city, else a UTC stub.
                        city = mail_item.get("Dateline", "")
                        cities = app.locators.find_cities()
                        located = [
                            c for c in cities
                            if c["city"].lower() == city.lower()
                        ]
                        item.setdefault("dateline", {})
                        item["dateline"]["located"] = (
                            located[0] if len(located) > 0 else {
                                "city_code": city,
                                "city": city,
                                "tz": "UTC",
                                "dateline": "city"
                            })
                        item["dateline"]["source"] = default_source
                        item["dateline"][
                            "text"] = format_dateline_to_locmmmddsrc(
                                item["dateline"]["located"],
                                get_date(item["firstcreated"]),
                                source=default_source)
                        # Priority: numeric value, else look up by name.
                        if mail_item.get("Priority") != "":
                            if mail_item.get("Priority", "3").isdigit():
                                item["priority"] = int(
                                    mail_item.get("Priority", "3"))
                            else:
                                priority_map = superdesk.get_resource_service(
                                    "vocabularies").find_one(
                                        req=None, _id="priority")
                                priorities = [
                                    x for x in priority_map.get("items", [])
                                    if x["name"].upper() == mail_item.get(
                                        "Priority", "").upper()
                                ]
                                if priorities is not None and len(
                                        priorities) > 0:
                                    item["priority"] = int(
                                        priorities[0].get("qcode", "3"))
                                else:
                                    item["priority"] = 3
                        if mail_item.get("News Value") != "":
                            item["urgency"] = int(
                                mail_item.get("News Value", "3"))
                        # We expect the username passed corresponds to a superdesk user
                        query = {
                            "email": re.compile(
                                "^{}$".format(
                                    mail_item.get(
                                        "Username",
                                        mail_item.get("Email Address", ""))),
                                re.IGNORECASE,
                            )
                        }
                        user = superdesk.get_resource_service(
                            "users").find_one(req=None, **query)
                        if not user:
                            logger.error(
                                "Failed to find user for email {}".format(
                                    mail_item.get(
                                        "Username",
                                        mail_item.get("Email Address", ""))))
                            raise UserNotRegisteredException()
                        item["original_creator"] = user.get("_id")
                        if BYLINE in user and user.get(BYLINE, ""):
                            item["byline"] = user.get(BYLINE)
                        item[SIGN_OFF] = user.get(SIGN_OFF)
                        # attempt to match the given desk name against the defined desks
                        query = {
                            "name": re.compile(
                                "^{}$".format(mail_item.get("Desk", "")),
                                re.IGNORECASE)
                        }
                        desk = superdesk.get_resource_service(
                            "desks").find_one(req=None, **query)
                        if desk:
                            item["task"] = {
                                "desk": desk.get("_id"),
                                "stage": desk.get("incoming_stage")
                            }
                        if "Place" in mail_item:
                            locator_map = superdesk.get_resource_service(
                                "vocabularies").find_one(req=None,
                                                         _id="locators")
                            place = [
                                x for x in locator_map.get("items", [])
                                if x["qcode"] == mail_item.get(
                                    "Place", "").upper()
                            ]
                            if place is not None:
                                item["place"] = place
                        if mail_item.get("Legal flag", "") == "LEGAL":
                            item["flags"] = {"marked_for_legal": True}
                        break
        return [item]
    except Exception as ex:
        raise IngestEmailError.emailParseError(ex, provider)
def parse(self, s_json, provider=None):
    """Parse an AP Media API JSON payload into a superdesk item dict.

    :param s_json: payload with 'data.item' plus optional 'nitf' and 'associations'
    :param provider: ingest provider config; its 'source' feeds item/dateline source
    :return: the parsed item dict
    """
    in_item = s_json.get('data', {}).get('item')
    nitf_item = s_json.get('nitf', {})
    # guid combines the AP item id with the version, so new versions are distinct
    item = {
        'guid': in_item.get('altids', {}).get('itemid') + ':' +
        str(in_item.get('version'))
    }
    item['source'] = provider.get('source') if provider else 'AP'
    for copy_property in self.direct_copy_properties:
        if in_item.get(copy_property) is not None:
            item[copy_property] = in_item[copy_property]
    if in_item.get('version'):
        item['version'] = in_item['version']
    if in_item.get('versioncreated'):
        item['versioncreated'] = self.datetime(
            in_item.get('versioncreated'))
    if in_item.get('firstcreated'):
        item['firstcreated'] = self.datetime(in_item.get('firstcreated'))
    if len(in_item.get('infosource', [])):
        item['original_source'] = ','.join(
            [n.get('name') for n in in_item.get('infosource', [])])
    if in_item.get('datelinelocation'):
        cities = app.locators.find_cities()
        # Try to find a single matching city either by city and country or city country and state
        located = [
            c for c in cities
            if c['city'] == in_item.get('datelinelocation').get('city')
            and c['country'] == in_item.get('datelinelocation').get(
                'countryname')
        ]
        if len(located) > 1:
            located = [
                c for c in cities
                if c['city'] == in_item.get('datelinelocation').get('city')
                and c['country'] == in_item.get('datelinelocation').get(
                    'countryname') and c['state'] == in_item.get(
                        'datelinelocation').get('countryareaname')
            ]
        # Only set a dateline when the match is unambiguous.
        if len(located) == 1:
            item['dateline'] = dict()
            item['dateline']['located'] = located[0]
            item['dateline']['source'] = provider.get('source')
            item['dateline']['text'] = format_dateline_to_locmmmddsrc(
                item['dateline']['located'],
                get_date(item['firstcreated']),
                provider.get('source'))
    if len(in_item.get('bylines', [])):
        item['byline'] = ','.join([
            n.get('name') if n.get('name') else n.get('by', '') +
            (' ({})'.format(n.get('title')) if n.get('title') else '')
            for n in in_item.get('bylines', [])
        ])
        if item.get('byline').startswith('By '):
            # drop the leading "By " prefix
            item['byline'] = item['byline'][3:]
    if len(in_item.get('usageterms', [])):
        item['usageterms'] = ', '.join(
            [n for n in in_item.get('usageterms', [])])
    if in_item.get('type') == 'picture':
        if in_item.get('renditions'):
            self._parse_renditions(in_item['renditions'], item, provider)
        if in_item.get('description_caption'):
            item['description_text'] = in_item.get('description_caption')
            item['archive_description'] = in_item.get(
                'description_caption')
        if in_item.get('description_creditline'):
            item['credit'] = in_item.get('description_creditline')
        if in_item.get('photographer', {}).get('name'):
            item['byline'] = in_item.get('photographer', {}).get('name')
    if in_item.get('type') == 'text':
        # Peel off the take key if possible
        if ',' in item['slugline']:
            item['anpa_take_key'] = item['slugline'].split(',')[1]
            item['slugline'] = item['slugline'].split(',')[0]
        if item['slugline'].startswith('BC-'):
            item['slugline'] = item['slugline'][3:]
        if item.get('ednote', '').startswith('Eds:'):
            item['ednote'] = item['ednote'][5:]
        if in_item.get('headline_extended'):
            item['abstract'] = in_item.get('headline_extended')
        self.categorisation_mapping(in_item, item)
    # Map the urgency to urgency and priority
    if in_item.get('urgency'):
        item[ITEM_URGENCY] = int(
            in_item['urgency']) if in_item['urgency'] <= 5 else 5
        item[ITEM_PRIORITY] = self.priority_map.get(
            in_item['urgency'], 5)
    if nitf_item.get('body_html'):
        # item['body_html'] = sd_etree.clean_html_str(nitf_item.get('body_html'))
        item['body_html'] = nitf_item.get('body_html').replace(
            '<block id="Main">', '').replace('</block>', '')
    if s_json.get('associations'):
        self._parse_associations(s_json['associations'], item, provider)
    return item
def test_format_dateline_to_format_when_city_state_and_country_are_present(self):
    """City, state and country should all be rendered into the dateline text."""
    located, expected_date, now_ts = self._get_located_and_current_utc_ts()
    located['dateline'] = "city,state,country"
    result = format_dateline_to_locmmmddsrc(located, now_ts)
    expected = 'SYDNEY, NSW, AU, %s %s -' % (expected_date, get_default_source())
    self.assertEqual(result, expected)
def test_format_dateline_to_format_when_only_city_is_present(self):
    """Only the city should appear when state and country are not set."""
    located, expected_date, now_ts = self._get_located_and_current_utc_ts()
    result = format_dateline_to_locmmmddsrc(located, now_ts)
    expected = 'SYDNEY, %s %s -' % (expected_date, get_default_source())
    self.assertEqual(result, expected)
def _parse_formatted_email(self, data, provider):
    """Construct an item from an email that was constructed as a notification from a google form submission.

    The google form submits to a google sheet, this sheet creates the email
    as a notification.

    :param data: raw response parts as returned by the mail server
    :param provider: ingest provider config
    :return: A list of 1 item (or an empty list when the subject does not match)
    :raises IngestEmailError: wraps any exception raised while parsing
    """
    try:
        item = dict()
        item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        item['versioncreated'] = utcnow()
        for response_part in data:
            if isinstance(response_part, tuple):
                msg = email.message_from_bytes(response_part[1])
                # Check that the subject line matches what we expect, ignore it if not
                if self.parse_header(msg['subject']) != 'Formatted Editorial Story':
                    return []
                item['guid'] = msg['Message-ID']
                date_tuple = email.utils.parsedate_tz(msg['Date'])
                if date_tuple:
                    dt = datetime.datetime.utcfromtimestamp(
                        email.utils.mktime_tz(date_tuple))
                    dt = dt.replace(tzinfo=timezone('utc'))
                    item['firstcreated'] = dt
                for part in msg.walk():
                    if part.get_content_type() == "text/plain":
                        body = part.get_payload(decode=True)
                        # if we don't know the charset just have a go!
                        # NOTE(review): the first replace target below renders
                        # as a plain space in the source under review — it is
                        # most likely a non-breaking space garbled by
                        # extraction; confirm against the repository.
                        if part.get_content_charset() is None:
                            json_str = body.decode().replace('\r\n', '').replace(' ', ' ')
                        else:
                            charset = part.get_content_charset()
                            json_str = body.decode(charset).replace('\r\n', '').replace(' ', ' ')
                        # The email body is a JSON dict of single-element lists.
                        mail_item = dict((k, v[0]) for k, v in json.loads(json_str).items())
                        self._expand_category(item, mail_item)
                        item['original_source'] = mail_item.get('Username', '')
                        item['headline'] = mail_item.get('Headline', '')
                        item['abstract'] = mail_item.get('Abstract', '')
                        item['slugline'] = mail_item.get('Slugline', '')
                        item['body_html'] = '<p>' + mail_item.get('Body', '').replace('\n', '</p><p>') + '</p>'
                        default_source = app.config.get('DEFAULT_SOURCE_VALUE_FOR_MANUAL_ARTICLES')
                        # Dateline: match the supplied city, else a UTC stub.
                        city = mail_item.get('Dateline', '')
                        cities = app.locators.find_cities()
                        located = [c for c in cities if c['city'].lower() == city.lower()]
                        item.setdefault('dateline', {})
                        item['dateline']['located'] = located[0] if len(located) > 0 else {'city_code': city,
                                                                                          'city': city, 'tz': 'UTC',
                                                                                          'dateline': 'city'}
                        item['dateline']['source'] = default_source
                        item['dateline']['text'] = format_dateline_to_locmmmddsrc(item['dateline']['located'],
                                                                                  get_date(item['firstcreated']),
                                                                                  source=default_source)
                        # Priority: numeric value, else look up by name.
                        if mail_item.get('Priority') != '':
                            if mail_item.get('Priority', '3').isdigit():
                                item['priority'] = int(mail_item.get('Priority', '3'))
                            else:
                                priority_map = superdesk.get_resource_service('vocabularies').find_one(
                                    req=None, _id='priority')
                                priorities = [x for x in priority_map.get('items', [])
                                              if x['name'].upper() == mail_item.get('Priority', '').upper()]
                                if priorities is not None and len(priorities) > 0:
                                    item['priority'] = int(priorities[0].get('qcode', '3'))
                                else:
                                    item['priority'] = 3
                        if mail_item.get('News Value') != '':
                            item['urgency'] = int(mail_item.get('News Value', '3'))
                        # We expect the username passed corresponds to a superdesk user
                        query = {'email': re.compile('^{}$'.format(mail_item.get('Username')), re.IGNORECASE)}
                        user = superdesk.get_resource_service('users').find_one(req=None, **query)
                        if not user:
                            logger.error('Failed to find user for email {}'.format(mail_item.get('Username')))
                            raise UserNotRegisteredException()
                        item['original_creator'] = user.get('_id')
                        if BYLINE in user and user.get(BYLINE, ''):
                            item['byline'] = user.get(BYLINE)
                        item[SIGN_OFF] = user.get(SIGN_OFF)
                        # attempt to match the given desk name against the defined desks
                        query = {'name': re.compile('^{}$'.format(mail_item.get('Desk', '')), re.IGNORECASE)}
                        desk = superdesk.get_resource_service('desks').find_one(
                            req=None, **query)
                        if desk:
                            item['task'] = {'desk': desk.get('_id'), 'stage': desk.get('incoming_stage')}
                        if 'Place' in mail_item:
                            locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None,
                                                                                                  _id='locators')
                            place = [x for x in locator_map.get('items', []) if
                                     x['qcode'] == mail_item.get('Place', '').upper()]
                            if place is not None:
                                item['place'] = place
                        if mail_item.get('Legal flag', '') == 'LEGAL':
                            item['flags'] = {'marked_for_legal': True}
                        break
        return [item]
    except Exception as ex:
        raise IngestEmailError.emailParseError(ex, provider)
def parse(self, s_json, provider=None):
    """Parse an AP Media API JSON payload into a superdesk item.

    :param s_json: source JSON with ``data.item`` plus optional ``nitf`` and ``associations``
    :param provider: ingest provider dict; its ``source`` is used when present (may be None)
    :return: the parsed superdesk item dict
    """
    in_item = s_json.get("data", {}).get("item")
    nitf_item = s_json.get("nitf", {})
    # guid is the AP item id qualified with the version so each version is unique
    item = {
        "guid": in_item.get("altids", {}).get("itemid") + ":" + str(in_item.get("version"))
    }
    item["source"] = provider.get("source") if provider else "AP"

    for copy_property in self.direct_copy_properties:
        if in_item.get(copy_property) is not None:
            item[copy_property] = in_item[copy_property]

    if in_item.get("version"):
        item["version"] = in_item["version"]
    if in_item.get("versioncreated"):
        item["versioncreated"] = self.datetime(in_item.get("versioncreated"))
    if in_item.get("firstcreated"):
        item["firstcreated"] = self.datetime(in_item.get("firstcreated"))

    if len(in_item.get("infosource", [])):
        item["original_source"] = ",".join(
            [n.get("name") for n in in_item.get("infosource", [])])

    if in_item.get("datelinelocation"):
        cities = app.locators.find_cities()
        # Try to find a single matching city by city and country; if that is
        # ambiguous, narrow further with the state (countryareaname)
        located = [
            c for c in cities
            if c["city"] == in_item.get("datelinelocation").get("city")
            and c["country"] == in_item.get("datelinelocation").get("countryname")
        ]
        if len(located) > 1:
            located = [
                c for c in cities
                if c["city"] == in_item.get("datelinelocation").get("city")
                and c["country"] == in_item.get("datelinelocation").get("countryname")
                and c["state"] == in_item.get("datelinelocation").get("countryareaname")
            ]
        if len(located) == 1:
            item["dateline"] = dict()
            item["dateline"]["located"] = located[0]
            # Use the already-derived item source so a missing provider (the
            # declared default) does not raise AttributeError on provider.get
            item["dateline"]["source"] = item["source"]
            item["dateline"]["text"] = format_dateline_to_locmmmddsrc(
                item["dateline"]["located"], get_date(item["firstcreated"]),
                item["source"])

    if len(in_item.get("bylines", [])):
        item["byline"] = ",".join([
            n.get("name") if n.get("name") else n.get("by", "") +
            (" ({})".format(n.get("title")) if n.get("title") else "")
            for n in in_item.get("bylines", [])
        ])
        # Strip a leading "By " so it is not doubled up by downstream formatting
        if item.get("byline").startswith("By "):
            item["byline"] = item["byline"][3:]

    if len(in_item.get("usageterms", [])):
        item["usageterms"] = ", ".join([n for n in in_item.get("usageterms", [])])

    if in_item.get("type") == "picture":
        if in_item.get("renditions"):
            self._parse_renditions(in_item["renditions"], item, provider)
        if in_item.get("description_caption"):
            item["description_text"] = in_item.get("description_caption")
            item["archive_description"] = in_item.get("description_caption")
        if in_item.get("description_creditline"):
            item["credit"] = in_item.get("description_creditline")
        if in_item.get("photographer", {}).get("name"):
            item["byline"] = in_item.get("photographer", {}).get("name")

    if in_item.get("type") == "text":
        # Peel off the take key if possible
        if "," in item["slugline"]:
            item["anpa_take_key"] = item["slugline"].split(",")[1]
            item["slugline"] = item["slugline"].split(",")[0]
        if item["slugline"].startswith("BC-"):
            item["slugline"] = item["slugline"][3:]
        # [5:] drops "Eds:" plus the following space
        if item.get("ednote", "").startswith("Eds:"):
            item["ednote"] = item["ednote"][5:]
        if in_item.get("headline_extended"):
            item["abstract"] = in_item.get("headline_extended")
        self.categorisation_mapping(in_item, item)

    # Map the AP urgency to urgency and priority, clamped at 5
    if in_item.get("urgency"):
        item[ITEM_URGENCY] = int(in_item["urgency"]) if in_item["urgency"] <= 5 else 5
        item[ITEM_PRIORITY] = self.priority_map.get(in_item["urgency"], 5)

    if nitf_item.get("body_html"):
        # item['body_html'] = sd_etree.clean_html_str(nitf_item.get('body_html'))
        item["body_html"] = nitf_item.get("body_html").replace(
            '<block id="Main">', "").replace("</block>", "")

    if s_json.get("associations"):
        self._parse_associations(s_json["associations"], item, provider)

    return item
def update_dateline(item): # handle dateline dateline = item.get("dateline", {}) dateline["date"] = utcnow() if dateline.get("located"): dateline["text"] = format_dateline_to_locmmmddsrc(dateline["located"], dateline["date"])
def ap_weather_format(item, **kwargs):
    """Reformat the AP global weather (Celsius) table into a fixed-width story.

    Validates that the item is the AP weather table, measures the width of
    each mapped column over the whole table, then rewrites the table padded
    to those widths inside a ``<pre>`` block, and sets slugline, dateline,
    headline, subject and place.

    :param item: the article to reformat
    :param kwargs: unused, accepted for macro-interface compatibility
    :return: the reformatted item
    :raises SuperdeskApiError: if the item is not an AP Celsius weather table
    """
    if not item.get('slugline', '').startswith('WEA--GlobalWeather-Ce') or not item.get('source', '') == 'AP':
        raise SuperdeskApiError.badRequestError("Article should be an AP sourced weather table")
    item['slugline'] = 'WORLD WEATHER'

    text = get_text(item['body_html'], content='html')
    lines = text.splitlines()
    if not lines[0] == 'BC-WEA--Global Weather-Celsius,<':
        raise SuperdeskApiError.badRequestError("Table should be in Celsius only")

    # tabular column max lengths are extracted into this list
    columns = []
    # map of the columns to extract and the substitutions to apply to the column
    columnMap = ({'index': 0}, {'index': 1}, {'index': 2},
                 {'index': 3, 'substitute': [('COND', 'CONDITIONS'), ('pc', 'partly cloudy'), ('clr', 'clear'),
                                             ('cdy', 'cloudy'), ('rn', 'rain'), ('sn', 'snow')]})
    # compile once; raw string avoids the invalid escape sequence '\<'
    row_splitter = re.compile(r'[;<]+')

    # story preamble
    preamble = 'Temperatures and conditions in world centres:\r\n'
    output = StringIO()
    output.write(preamble)

    # story is always datelined New York
    city = 'New York City'
    cities = app.locators.find_cities()
    located = [c for c in cities if c['city'].lower() == city.lower()]
    if 'dateline' not in item:
        item['dateline'] = {}
    item['dateline']['located'] = located[0] if len(located) > 0 else {'city_code': city, 'city': city,
                                                                       'tz': 'UTC', 'dateline': 'city'}
    item['dateline']['date'] = datetime.fromtimestamp(get_date(item['firstcreated']).timestamp(),
                                                      tz=timezone(item['dateline']['located']['tz']))
    item['dateline']['source'] = 'AP'
    item['dateline']['text'] = format_dateline_to_locmmmddsrc(item['dateline']['located'],
                                                              get_date(item['firstcreated']),
                                                              source=item.get('original_source', 'AP'))

    item['headline'] = 'World Weather for ' + item['dateline']['date'].strftime('%b %-d')
    item['subject'] = [{"name": "weather", "qcode": "17000000"}]
    locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='locators')
    item['place'] = [x for x in locator_map.get('items', []) if x['qcode'] == 'US']

    if lines:
        # First pass: scan all collimated lines and record the widest value
        # seen in each column so the output can be aligned
        for line in lines:
            row = row_splitter.split(line)
            # only consider it if there are more than two columns
            if len(row) > 2:
                for index, col in enumerate(row):
                    # apply the substitutions of a mapped column before measuring
                    mapped = [me for me in columnMap if me['index'] == index]
                    if len(mapped):
                        for sub in mapped[0].get('substitute', ()):
                            col = col.replace(sub[0], sub[1])
                    if 0 <= index < len(columns):
                        # column already seen: keep the maximum width
                        if len(col) > columns[index]:
                            columns[index] = len(col)
                    else:
                        # first time this column index appears
                        columns.append(len(col))
        # Second pass: emit only the mapped columns, padded to the measured widths
        for line in lines:
            row = row_splitter.split(line)
            if len(row) > 2:
                for index, col in enumerate(row):
                    mapped = [me for me in columnMap if me['index'] == index]
                    if len(mapped) > 0:
                        for sub in mapped[0].get('substitute', ()):
                            col = col.replace(sub[0], sub[1])
                        output.write(
                            '{}'.format(col.lstrip('\t').ljust(columns[mapped[0].get('index')] + 2)).rstrip('\r\n'))
                output.write('\r\n')

    item['body_html'] = '<pre>' + output.getvalue() + '</pre>'
    return item
def parse(self, xml, provider=None):
    """Parse an ANA (Athens News Agency) NewsML item into a superdesk item.

    :param xml: the NewsML etree root
    :param provider: ingest provider dict, used for the dateline source (may be None)
    :return: the populated item dict
    :raises ParserError: wrapping any exception raised while parsing
    """
    item = {}
    try:
        self.root = xml

        parsed_el = xml.find(
            'NewsItem/NewsComponent/AdministrativeMetadata/Source/Party')
        if parsed_el is not None:
            item['original_source'] = parsed_el.attrib.get('FormalName', 'ANA')

        parsed_el = xml.find('NewsEnvelope/Priority')
        item['priority'] = self.map_priority(
            parsed_el.text if parsed_el is not None else None)

        self.parse_news_identifier(item, xml)
        self.parse_newslines(item, xml)
        self.parse_news_management(item, xml)

        parsed_el = xml.findall(
            'NewsItem/NewsComponent/DescriptiveMetadata/Language')
        if parsed_el is not None:
            language = self.parse_attributes_as_dictionary(parsed_el)
            item['language'] = language[0]['FormalName'] if len(language) else ''

        # Collect IPTC subject codes from all three levels of the subject tree
        subjects = xml.findall(
            'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail[@Scheme="IptcSubjectCodes"]'
        )
        subjects += xml.findall(
            'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter[@Scheme="IptcSubjectCodes"]'
        )
        subjects += xml.findall(
            'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject[@Scheme="IptcSubjectCodes"]'
        )
        item['subject'] = self.format_subjects(subjects)

        # Serialise the body, strip the DataContent wrapper and lowercase the P tags
        item['body_html'] = html.unescape(
            etree.tostring(xml.find(
                'NewsItem/NewsComponent/NewsComponent/ContentItem/DataContent'
            ), encoding='unicode')).replace(
                '<DataContent>', '').replace('</DataContent>', '').replace(
                    '<P>', '<p>').replace('</P>', '</p>')
        # Remove the boilerplate ANA copyright notice
        item['body_html'] = item.get('body_html').replace(
            '<p>© ΑΠΕ-ΜΠΕ ΑΕ. Τα πνευματικά δικαιώματα ανήκουν στο '
            'ΑΠΕ-ΜΠΕ ΑΕ και παραχωρούνται σε συνδρομητές μόνον '
            'για συγκεκριμένη χρήση.</p>', '').strip()

        parsed_el = xml.findall(
            'NewsItem/NewsComponent/NewsComponent/ContentItem/Characteristics/Property'
        )
        characteristics = self.parse_attribute_values(parsed_el, 'WordCount')
        item['word_count'] = characteristics[0] if len(characteristics) else None

        # Extract the city for setting into the dateline
        city = xml.find(
            'NewsItem/NewsComponent/DescriptiveMetadata/Property[@FormalName="City"]'
        ).attrib.get('Value')
        # Anglicise the greek for Athens if required
        city = 'Athens' if city == 'Αθήνα' else city
        country = xml.find(
            'NewsItem/NewsComponent/DescriptiveMetadata/Property[@FormalName="Country"]'
        ).attrib.get('Value')
        # Normalise the country code
        country = 'GR' if country == 'GRC' else country

        cities = app.locators.find_cities()
        located = [
            c for c in cities
            if c['city'] == city and c['country_code'] == country
        ]
        if len(located) == 1:
            # The dateline dict may already exist (with a date) from the
            # newsline parsing above; setdefault guards against a KeyError
            # when it does not, which previously aborted the whole parse
            item.setdefault('dateline', {})
            item['dateline']['located'] = located[0]
            # Fall back when provider is None (the declared default) instead
            # of raising AttributeError on provider.get
            item['dateline']['source'] = (provider.get('source') if provider
                                          else item.get('original_source', 'ANA'))
            item['dateline']['text'] = format_dateline_to_locmmmddsrc(
                item['dateline']['located'],
                item.get('dateline', {}).get('date'),
                item['dateline']['source'])

        return self.populate_fields(item)
    except Exception as ex:
        raise ParserError.newsmlOneParserError(ex, provider)
def _parse_formatted_email(self, data, provider):
    """Construct an item from an email that was constructed as a notification from a google form submission.

    The google form submits to a google sheet, this sheet creates the email as a
    notification. The email body is a JSON document mapping form field names to
    values; it is decoded and mapped onto a new text item.

    :param data: the raw email response parts
    :param provider: the ingest provider
    :return: A list of 1 item (or an empty list if the subject does not match)
    :raises IngestEmailError: wrapping any parsing failure
    """
    try:
        item = dict()
        item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        item['versioncreated'] = utcnow()
        for response_part in data:
            if isinstance(response_part, tuple):
                msg = email.message_from_bytes(response_part[1])
                # Check that the subject line matches what we expect, ignore it if not
                if self.parse_header(msg['subject']) != 'Formatted Editorial Story':
                    return []

                item['guid'] = msg['Message-ID']
                date_tuple = email.utils.parsedate_tz(msg['Date'])
                if date_tuple:
                    dt = datetime.datetime.utcfromtimestamp(
                        email.utils.mktime_tz(date_tuple))
                    dt = dt.replace(tzinfo=timezone('utc'))
                    item['firstcreated'] = dt

                for part in msg.walk():
                    if part.get_content_type() == "text/plain":
                        body = part.get_payload(decode=True)
                        # if we don't know the charset just have a go!
                        if part.get_content_charset() is None:
                            # NOTE(review): the second replace looks like it was
                            # meant to collapse a non-breaking space — confirm
                            json_str = body.decode().replace('\r\n', '').replace(' ', ' ')
                        else:
                            charset = part.get_content_charset()
                            json_str = body.decode(charset).replace('\r\n', '').replace(' ', ' ')
                        # The sheet posts each value as a single-element list
                        mail_item = dict((k, v[0]) for k, v in json.loads(json_str).items())

                        self._expand_category(item, mail_item)

                        item['original_source'] = mail_item.get('Username', '')
                        item['headline'] = mail_item.get('Headline', '')
                        item['abstract'] = mail_item.get('Abstract', '')
                        item['slugline'] = mail_item.get('Slugline', '')
                        item['body_html'] = '<p>' + mail_item.get('Body', '').replace('\n', '</p><p>') + '</p>'

                        default_source = app.config.get('DEFAULT_SOURCE_VALUE_FOR_MANUAL_ARTICLES')
                        city = mail_item.get('Dateline', '')
                        cities = app.locators.find_cities()
                        located = [c for c in cities if c['city'].lower() == city.lower()]
                        item.setdefault('dateline', {})
                        # Fall back to a minimal synthetic location when the city is unknown
                        item['dateline']['located'] = located[0] if len(located) > 0 else {
                            'city_code': city, 'city': city, 'tz': 'UTC', 'dateline': 'city'}
                        item['dateline']['source'] = default_source
                        item['dateline']['text'] = format_dateline_to_locmmmddsrc(
                            item['dateline']['located'],
                            get_date(item['firstcreated']),
                            source=default_source)

                        if mail_item.get('Priority') != '':
                            if mail_item.get('Priority', '3').isdigit():
                                item['priority'] = int(mail_item.get('Priority', '3'))
                            else:
                                # Non-numeric priority: resolve the name via the vocabulary
                                priority_map = superdesk.get_resource_service('vocabularies').find_one(
                                    req=None, _id='priority')
                                priorities = [x for x in priority_map.get('items', [])
                                              if x['name'].upper() == mail_item.get('Priority', '').upper()]
                                if priorities:
                                    item['priority'] = int(priorities[0].get('qcode', '3'))
                                else:
                                    item['priority'] = 3
                        if mail_item.get('News Value') != '':
                            item['urgency'] = int(mail_item.get('News Value', '3'))

                        # We expect the username passed corresponds to a superdesk user
                        query = {'email': re.compile('^{}$'.format(mail_item.get('Username')), re.IGNORECASE)}
                        user = superdesk.get_resource_service('users').find_one(req=None, **query)
                        if not user:
                            logger.error('Failed to find user for email {}'.format(mail_item.get('Username')))
                            raise UserNotRegisteredException()
                        item['original_creator'] = user.get('_id')
                        if BYLINE in user and user.get(BYLINE, ''):
                            item['byline'] = user.get(BYLINE)
                        item[SIGN_OFF] = user.get(SIGN_OFF)

                        # attempt to match the given desk name against the defined desks
                        query = {'name': re.compile('^{}$'.format(mail_item.get('Desk', '')), re.IGNORECASE)}
                        desk = superdesk.get_resource_service('desks').find_one(req=None, **query)
                        if desk:
                            item['task'] = {'desk': desk.get('_id'), 'stage': desk.get('incoming_stage')}

                        if 'Place' in mail_item:
                            locator_map = superdesk.get_resource_service('vocabularies').find_one(
                                req=None, _id='locators')
                            place = [x for x in locator_map.get('items', [])
                                     if x['qcode'] == mail_item.get('Place', '').upper()]
                            # Only set place when a locator actually matched; the
                            # previous `is not None` check was always true and
                            # stored an empty list for unknown locators
                            if place:
                                item['place'] = place

                        if mail_item.get('Legal flag', '') == 'LEGAL':
                            item['flags'] = {'marked_for_legal': True}
                        break
        return [item]
    except Exception as ex:
        raise IngestEmailError.emailParseError(ex, provider)
def test_format_dateline_to_format_when_only_city_is_present(self): located, formatted_date, current_ts = self._get_located_and_current_utc_ts() formatted_dateline = format_dateline_to_locmmmddsrc(located, current_ts) self.assertEqual(formatted_dateline, 'SYDNEY %s %s -' % (formatted_date, ORGANIZATION_NAME_ABBREVIATION))
def test_format_dateline_to_format_when_only_city_and_state_are_present(self): located, formatted_date, current_ts = self._get_located_and_current_utc_ts() located["dateline"] = "city,state" formatted_dateline = format_dateline_to_locmmmddsrc(located, current_ts) self.assertEqual(formatted_dateline, "SYDNEY, NSW %s %s -" % (formatted_date, ORGANIZATION_NAME_ABBREVIATION))
def test_format_dateline_to_format_when_only_city_and_country_are_present(self): located, formatted_date, current_ts = self._get_located_and_current_utc_ts() located['dateline'] = "city,country" formatted_dateline = format_dateline_to_locmmmddsrc(located, current_ts) self.assertEqual(formatted_dateline, 'SYDNEY, AU %s %s -' % (formatted_date, ORGANIZATION_NAME_ABBREVIATION))