def dpa_derive_dateline(item, **kwargs):
    """
    Derive the dateline of a DPA item from the opening lines of its body.

    DPA content is received in IPTC7901 format. This macro inspects the first
    five lines of ``item["body_html"]`` for a line of the form
    ``<city> (dpa) - <text>``. On the first match the dateline location, source
    and text are populated and the ``<city> (dpa) - `` prefix is removed from
    the article text (the body is then re-joined with ``\\r\\n``).

    :param item: article to update in place; a missing or empty ``body_html`` leaves it untouched
    :type item: dict
    :param kwargs: unused
    :return: the same ``item``
    """
    marker = " (dpa) - "
    body = item.get("body_html")
    if not body:
        return item

    lines = body.splitlines()
    # expect the dateline in the first 5 lines, sometimes there is what appears to be a headline preceding it.
    for line_num, line in enumerate(lines[:5]):
        prefix, source, the_rest = line.partition(marker)
        city = prefix.strip()
        # a candidate needs the marker, a non-empty city, and a city that is not crazy long
        if not source or not city or len(city) >= 20:
            continue

        located = next((c for c in find_cities() if c["city"].lower() == city.lower()), None)
        if located is None:
            # unknown city: fall back to a placeholder location in UTC
            located = {"city_code": city, "city": city, "tz": "UTC", "dateline": "city"}

        dateline = item.setdefault("dateline", {})
        dateline["located"] = located
        dateline["source"] = "dpa"
        dateline["text"] = city

        # drop only the leading "<city> (dpa) - " prefix from the matched line
        lines[line_num] = the_rest
        item["body_html"] = "\r\n".join(lines)
        break

    return item
def set_dateline(self, item, city=None, text=None):
    """
    Populate the 'dateline' of the given article.

    An existing dateline dictionary is kept and updated; otherwise an empty one is created.
    When a city is supplied it is looked up (exact name match) in the Cities collection and
    the first matching entry becomes the dateline's location. A city that is not found is
    stored as a placeholder location using the city name and the UTC timezone.

    :param item: article.
    :type item: dict
    :param city: Name of the city, if passed the system will search in Cities collection.
    :type city: str
    :param text: dateline in full. For example, "STOCKHOLM, Aug 29, 2014"
    :type text: str
    """
    dateline = item.setdefault('dateline', {})

    if city:
        matches = [entry for entry in find_cities() if entry['city'] == city]
        if matches:
            dateline['located'] = matches[0]
        else:
            dateline['located'] = {'city_code': city, 'city': city, 'tz': 'UTC', 'dateline': 'city'}

    if text:
        dateline['text'] = text
def dpa_derive_dateline(item, **kwargs):
    """
    Derive the dateline of a DPA item from the opening lines of its body.

    DPA content is received in IPTC7901 format. This macro inspects the first
    five lines of ``item['body_html']`` for a line of the form
    ``<city> (dpa) - <text>``. On the first match the dateline location, source
    and text are populated and the ``<city> (dpa) - `` prefix is removed from
    the article text (the body is then re-joined with ``\\r\\n``).

    :param item: article to update in place; a missing or empty ``body_html`` leaves it untouched
    :type item: dict
    :param kwargs: unused
    :return: the same ``item``
    """
    body = item.get('body_html')
    if not body:
        return item

    lines = body.splitlines()
    # expect the dateline in the first 5 lines, sometimes there is what appears to be a headline preceding it.
    for line_num, line in enumerate(lines[:5]):
        prefix, source, the_rest = line.partition(' (dpa) - ')
        city = prefix.strip()
        # a candidate needs the marker, a non-empty city, and a city that is not crazy long
        if not source or not city or len(city) >= 20:
            continue

        located = next(
            (c for c in find_cities() if c['city'].lower() == city.lower()),
            None)
        if located is None:
            # unknown city: fall back to a placeholder location in UTC
            located = {
                'city_code': city,
                'city': city,
                'tz': 'UTC',
                'dateline': 'city'
            }

        dateline = item.setdefault('dateline', {})
        dateline['located'] = located
        dateline['source'] = 'dpa'
        dateline['text'] = city

        # drop only the leading "<city> (dpa) - " prefix from the matched line
        lines[line_num] = the_rest
        item['body_html'] = '\r\n'.join(lines)
        break

    return item
def noise11_derive_metadata(item, **kwargs):
    """
    Fill in category, subject and dateline for NOISE11 content.

    By definition anything from NOISE11 will be entertainment, so the category,
    subject and dateline are set accordingly. Each of the three is only filled
    in when the item does not already carry it:

    * ``anpa_category`` is set to the active ``e`` entry of the ``categories`` vocabulary,
      and is left unset when the vocabulary or an active ``e`` entry cannot be found;
    * ``subject`` is set to subject code ``01000000``;
    * ``dateline`` is set to Sydney (looked up among the AU/NSW cities) using the item's
      ``firstcreated`` and ``source``, and is left unset when Sydney cannot be found.

    :param item: article to update in place
    :type item: dict
    :param kwargs: unused
    :return: the same ``item``; ``None`` if an exception was raised (the exception is logged)
    """
    try:
        if "anpa_category" not in item:
            category_map = superdesk.get_resource_service("vocabularies").find_one(req=None, _id="categories")
            if category_map:
                map_entry = next(
                    (code for code in category_map["items"] if code["qcode"] == "e" and code["is_active"]), None
                )
                # next() falls back to None when there is no active "e" entry; do not dereference it
                if map_entry is not None:
                    item["anpa_category"] = [{"qcode": "e", "name": map_entry["name"]}]

        if "subject" not in item:
            qcode = "01000000"
            item["subject"] = [{"qcode": qcode, "name": subject_codes[qcode]}]

        # only look the city up when a dateline actually has to be created
        if "dateline" not in item:
            cities = find_cities(country_code="AU", state_code="NSW")
            sydney = next((c for c in cities if c["city"].lower() == "sydney"), None)
            if sydney is not None:
                item["dateline"] = {
                    "date": item["firstcreated"],
                    "located": sydney,
                    "source": item["source"],
                    "text": format_dateline_to_locmmmddsrc(
                        sydney, get_date(item["firstcreated"]), source=item["source"]
                    ),
                }

        return item
    except Exception as ex:
        logger.exception(ex)
def update_to_pass_validation(item, **kwargs):
    """
    Test macro that adjusts a text item so that it will pass publication validation.

    It is intended to be used to test auto publishing, that is publishing directly from ingest.
    At the moment virtually all content received from Reuters fails validation.

    When a publish validator for text content exists, the slugline and headline are truncated
    to the maximum lengths defined in its schema. When the item has no dateline, a Sydney
    dateline is created from the item's ``firstcreated`` and ``source``; if Sydney cannot be
    found among the AU/NSW cities the item is left without a dateline.

    :param item: article to update in place
    :type item: dict
    :param kwargs: unused
    :return: the same ``item``; ``None`` if an exception was raised (the exception is logged)
    """
    try:
        lookup = {'act': ITEM_PUBLISH, 'type': CONTENT_TYPE.TEXT}
        validators = superdesk.get_resource_service('validators').get(req=None, lookup=lookup)
        if validators.count():
            schema = validators[0]['schema']
            # read both limits before touching the item so a schema error cannot leave it half updated
            max_lengths = {
                'slugline': schema['slugline']['maxlength'],
                'headline': schema['headline']['maxlength'],
            }
            for field, max_len in max_lengths.items():
                if len(item[field]) > max_len:
                    item[field] = item[field][:max_len]

        if 'dateline' not in item:
            cities = find_cities(country_code='AU', state_code='NSW')
            sydney = next((c for c in cities if c['city'].lower() == 'sydney'), None)
            # the dateline fields are only written when there is a location to base them on
            if sydney is not None:
                item['dateline'] = {
                    'date': item['firstcreated'],
                    'located': sydney,
                    'source': item['source'],
                    'text': format_dateline_to_locmmmddsrc(sydney, get_date(item['firstcreated']),
                                                           source=item['source']),
                }
        return item
    except Exception:
        # best effort: log and return None, but let SystemExit/KeyboardInterrupt propagate
        logging.exception('Test update to pass validation macro exception')
def update_to_pass_validation(item, **kwargs):
    """
    Test macro that does what is required for a text item to pass publication validation.

    It is intended to be used to test auto publishing, that is publishing directly from ingest.
    At the moment virtually all content received from Reuters fails validation.

    Steps performed:

    * if a publish validator for text content exists, cut the slugline and the headline down
      to the ``maxlength`` values of its schema;
    * if the item has no dateline, create a Sydney dateline from the item's ``firstcreated``
      and ``source``. Nothing is added when Sydney is not among the AU/NSW cities.

    :param item: article to update in place
    :type item: dict
    :param kwargs: unused
    :return: the same ``item``; ``None`` if an exception was raised (the exception is logged)
    """
    try:
        lookup = {'act': ITEM_PUBLISH, 'type': CONTENT_TYPE.TEXT}
        validators = superdesk.get_resource_service('validators').get(req=None, lookup=lookup)
        if validators.count():
            schema = validators[0]['schema']
            # both limits are resolved up front, before the item is modified
            max_slugline_len = schema['slugline']['maxlength']
            max_headline_len = schema['headline']['maxlength']
            if len(item['slugline']) > max_slugline_len:
                item['slugline'] = item['slugline'][:max_slugline_len]
            if len(item['headline']) > max_headline_len:
                item['headline'] = item['headline'][:max_headline_len]

        if 'dateline' not in item:
            cities = find_cities(country_code='AU', state_code='NSW')
            located = [c for c in cities if c['city'].lower() == 'sydney']
            if located:
                # source and text are set together with the dateline itself, never on a missing one
                dateline = {'date': item['firstcreated'], 'located': located[0]}
                dateline['source'] = item['source']
                dateline['text'] = format_dateline_to_locmmmddsrc(located[0], get_date(item['firstcreated']),
                                                                  source=item['source'])
                item['dateline'] = dateline
        return item
    except Exception:
        # best effort: log and return None, but do not swallow SystemExit/KeyboardInterrupt
        logging.exception('Test update to pass validation macro exception')
def dpa_derive_dateline(self, item):
    """
    Derive the dateline of an item from the opening lines of its body.

    The first five lines of ``item['body_html']`` are inspected for a line of the form
    ``<city> (dpa) - <text>``. On the first match the dateline location, source and text
    are populated and the first occurrence of the ``<city> (dpa) - `` string is removed
    from the article text.

    :param item: article to update in place; a missing or empty ``body_html`` leaves it untouched
    :type item: dict
    :return: the same ``item``
    """
    body = item.get('body_html')
    if not body:
        return item

    # expect the dateline in the first 5 lines, sometimes there is what appears to be a headline preceding it.
    for line in body.splitlines()[:5]:
        prefix, source, _ = line.partition(' (dpa) - ')
        city = prefix.strip()
        # a candidate needs the marker, a non-empty city, and a city that is not crazy long
        if not source or not city or len(city) >= 20:
            continue

        located = next((c for c in find_cities() if c['city'].lower() == city.lower()), None)
        if located is None:
            # unknown city: fall back to a placeholder location in UTC
            located = {'city_code': city, 'city': city, 'tz': 'UTC', 'dateline': 'city'}

        dateline = item.setdefault('dateline', {})
        dateline['located'] = located
        dateline['source'] = 'dpa'
        dateline['text'] = city

        # remove the unstripped prefix exactly as it appears in the body, first occurrence only
        item['body_html'] = body.replace(prefix + source, '', 1)
        break

    return item
def ap_derive_dateline(self, item):
    """
    Look for a dateline in the article body and use it.

    The body is only inspected when it contains at least two ``<p>`` elements. The text of
    the first paragraph is split on ``' (AP) _ '``; the part before it, up to the first
    comma, is taken as the city. The city is matched case-insensitively against the known
    cities; an unknown city is stored as a placeholder location in UTC. The dateline source
    is the item's ``original_source`` (default ``AP``).

    :param item: article to update in place
    :type item: dict
    :return: ``item`` (populated with a dateline when one was found); ``None`` when the
        candidate city contains a digit or when an exception was raised (and logged)
    """
    try:
        html = item.get('body_html')
        if html:
            soup = BeautifulSoup(html, "html.parser")
            pars = soup.findAll('p')
            if len(pars) >= 2:
                first = pars[0].get_text()
                city, source, _ = first.partition(' (AP) _ ')
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(',')[0].strip()
                    if any(char.isdigit() for char in city):
                        return None
                    located = next((c for c in find_cities() if c['city'].lower() == city.lower()), None)
                    if located is None:
                        located = {'city_code': city, 'city': city, 'tz': 'UTC', 'dateline': 'city'}
                    source_name = item.get('original_source', 'AP')
                    dateline = item.setdefault('dateline', {})
                    dateline['located'] = located
                    dateline['source'] = source_name
                    dateline['text'] = format_dateline_to_locmmmddsrc(located, get_date(item['firstcreated']),
                                                                      source=source_name)
        return item
    except Exception:
        # best effort: log and return None, but let SystemExit/KeyboardInterrupt propagate
        logging.exception('AP dateline extraction exception')
def reuters_derive_dateline(item, **kwargs):
    """
    Look for a dateline in the article body and use it.

    It seems that most locations injected into the item by the parser are Bangalore,
    so an existing dateline is only replaced when it is located in Bangalore (or has
    no location at all); any other existing location is left alone.

    The body is only inspected when it contains at least two ``<p>`` elements. The
    dateline is read from the first paragraph, or from the second one when the first
    contains the item's byline. The paragraph text is split on ``' (Reuters) - '`` and
    the part before it, up to the first comma, is taken as the city.

    :param item: article to update in place
    :type item: dict
    :param kwargs: unused
    :return: ``item`` when the body was processed or nothing was found; ``None`` when the
        candidate city contains a digit, when an existing non-Bangalore location is kept,
        or when an exception was raised (and logged)
    """
    try:
        html = item.get('body_html')
        if html:
            soup = BeautifulSoup(html, "html.parser")
            pars = soup.findAll('p')
            if len(pars) >= 2:
                if BYLINE in item and item.get(BYLINE) in pars[0].get_text():
                    first = pars[1].get_text()
                else:
                    first = pars[0].get_text()
                city, source, _ = first.partition(' (Reuters) - ')
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(',')[0].strip()
                    if any(char.isdigit() for char in city):
                        return None
                    located = next(
                        (c for c in find_cities()
                         if c['city'].lower() == city.lower()), None)
                    if located is None:
                        located = {
                            'city_code': city,
                            'city': city,
                            'tz': 'UTC',
                            'dateline': 'city'
                        }
                    # if there is no dateline we create one
                    dateline = item.setdefault('dateline', {})
                    # there is already a dateline that is not Bangalore don't do anything just return
                    if 'located' in dateline and 'BANGALORE' != (
                            dateline['located'].get('city') or '').upper():
                        return None
                    source_name = item.get('original_source', 'Reuters')
                    dateline['located'] = located
                    dateline['source'] = source_name
                    dateline['text'] = format_dateline_to_locmmmddsrc(
                        located,
                        get_date(item['firstcreated']),
                        source=source_name)
        return item
    except Exception:
        # best effort: log and return None, but let SystemExit/KeyboardInterrupt propagate
        logging.exception('Reuters dateline macro exception')
def reuters_derive_dateline(item, **kwargs):
    """
    Look for a dateline in the article body and use it.

    It seems that most locations injected into the item by the parser are Bangalore,
    so an existing dateline is only replaced when it is located in Bangalore (or has
    no location at all); any other existing location is left alone.

    The body is only inspected when it contains at least two ``<p>`` elements. The
    dateline is read from the first paragraph, or from the second one when the first
    contains the item's byline. The paragraph text is split on ``" (Reuters) - "`` and
    the part before it, up to the first comma, is taken as the city.

    :param item: article to update in place
    :type item: dict
    :param kwargs: unused
    :return: ``item`` when the body was processed or nothing was found; ``None`` when the
        candidate city contains a digit, when an existing non-Bangalore location is kept,
        or when an exception was raised (and logged)
    """
    try:
        html = item.get("body_html")
        if html:
            soup = BeautifulSoup(html, "html.parser")
            pars = soup.findAll("p")
            if len(pars) >= 2:
                if BYLINE in item and item.get(BYLINE) in pars[0].get_text():
                    first = pars[1].get_text()
                else:
                    first = pars[0].get_text()
                city, source, _ = first.partition(" (Reuters) - ")
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(",")[0].strip()
                    if any(char.isdigit() for char in city):
                        return None
                    located = next((c for c in find_cities() if c["city"].lower() == city.lower()), None)
                    if located is None:
                        located = {"city_code": city, "city": city, "tz": "UTC", "dateline": "city"}
                    # if there is no dateline we create one
                    dateline = item.setdefault("dateline", {})
                    # there is already a dateline that is not Bangalore don't do anything just return
                    current_city = dateline["located"].get("city") or "" if "located" in dateline else None
                    if current_city is not None and current_city.upper() != "BANGALORE":
                        return None
                    source_name = item.get("original_source", "Reuters")
                    dateline["located"] = located
                    dateline["source"] = source_name
                    dateline["text"] = format_dateline_to_locmmmddsrc(
                        located,
                        get_date(item["firstcreated"]),
                        source=source_name,
                    )
        return item
    except Exception:
        # best effort: log and return None, but let SystemExit/KeyboardInterrupt propagate
        logging.exception("Reuters dateline macro exception")
def reuters_derive_dateline(item, **kwargs):
    """
    Look for a dateline in the article body and use it.

    It seems that most locations injected into the item by the parser are Bangalore,
    so an existing dateline is only replaced when it is located in Bangalore (or has
    no location at all); any other existing location is left alone.

    The body is only inspected when it contains at least two ``<p>`` elements. The
    dateline is read from the first paragraph, or from the second one when the first
    contains the item's byline. The paragraph text is split on ``' (Reuters) - '`` and
    the part before it, up to the first comma, is taken as the city.

    :param item: article to update in place
    :type item: dict
    :param kwargs: unused
    :return: ``item`` when the body was processed or nothing was found; ``None`` when the
        candidate city contains a digit, when an existing non-Bangalore location is kept,
        or when an exception was raised (and logged)
    """
    try:
        html = item.get('body_html')
        if html:
            # name the parser explicitly so the result does not depend on which parsers are installed
            soup = BeautifulSoup(html, 'html.parser')
            pars = soup.findAll('p')
            if len(pars) >= 2:
                if BYLINE in item and item.get(BYLINE) in pars[0].get_text():
                    first = pars[1].get_text()
                else:
                    first = pars[0].get_text()
                city, source, _ = first.partition(' (Reuters) - ')
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(',')[0].strip()
                    if any(char.isdigit() for char in city):
                        return None
                    located = next((c for c in find_cities() if c['city'].lower() == city.lower()), None)
                    if located is None:
                        located = {'city_code': city, 'city': city, 'tz': 'UTC', 'dateline': 'city'}
                    # if there is no dateline we create one
                    if 'dateline' not in item:
                        item['dateline'] = {}
                    # there is already a dateline that is not Bangalore don't do anything just return
                    elif 'located' in item['dateline']:
                        existing_city = item['dateline']['located'].get('city') or ''
                        if existing_city.upper() != 'BANGALORE':
                            return None
                    source_name = item.get('original_source', 'Reuters')
                    item['dateline']['located'] = located
                    item['dateline']['source'] = source_name
                    item['dateline']['text'] = format_dateline_to_locmmmddsrc(located,
                                                                              get_date(item['firstcreated']),
                                                                              source=source_name)
        return item
    except Exception:
        # best effort: log and return None, but let SystemExit/KeyboardInterrupt propagate
        logging.exception('Reuters dateline macro exception')
def set_dateline(self, item, city=None, text=None):
    """
    Sets the 'dateline' to the article identified by item.

    An existing dateline is updated rather than replaced, so fields that are not touched
    here are preserved. If the item has no dateline an empty one is created.

    If city is passed then the system checks if city is available in Cities collection
    (exact name match). If city is not found in Cities collection then dateline's located
    is set with default values (the city name as code and name, UTC timezone).

    :param item: article.
    :type item: dict
    :param city: Name of the city, if passed the system will search in Cities collection.
    :type city: str
    :param text: dateline in full. For example, "STOCKHOLM, Aug 29, 2014"
    :type text: str
    """
    # setdefault keeps whatever dateline data the item already carries
    dateline = item.setdefault('dateline', {})

    if city:
        located = next((c for c in find_cities() if c['city'] == city), None)
        if located is None:
            located = {'city_code': city, 'city': city, 'tz': 'UTC', 'dateline': 'city'}
        dateline['located'] = located

    if text:
        dateline['text'] = text
def noise11_derive_metadata(item, **kwargs):
    """
    Fill in category, subject and dateline for NOISE11 content.

    By definition anything from NOISE11 will be entertainment, so the category,
    subject and dateline are set accordingly. Each of the three is only filled
    in when the item does not already carry it:

    * ``anpa_category`` is set to the active ``e`` entry of the ``categories`` vocabulary,
      and is left unset when the vocabulary or an active ``e`` entry cannot be found;
    * ``subject`` is set to subject code ``01000000``;
    * ``dateline`` is set to Sydney (looked up among the AU/NSW cities) using the item's
      ``firstcreated`` and ``source``, and is left unset when Sydney cannot be found.

    :param item: article to update in place
    :type item: dict
    :param kwargs: unused
    :return: the same ``item``; ``None`` if an exception was raised (the exception is logged)
    """
    try:
        if 'anpa_category' not in item:
            category_map = superdesk.get_resource_service(
                'vocabularies').find_one(req=None, _id='categories')
            if category_map:
                map_entry = next(
                    (code for code in category_map['items']
                     if code['qcode'] == 'e' and code['is_active']), None)
                # next() falls back to None when there is no active 'e' entry; do not dereference it
                if map_entry is not None:
                    item['anpa_category'] = [{
                        'qcode': 'e',
                        'name': map_entry['name']
                    }]

        if 'subject' not in item:
            qcode = '01000000'
            item['subject'] = [{'qcode': qcode, 'name': subject_codes[qcode]}]

        # only look the city up when a dateline actually has to be created
        if 'dateline' not in item:
            cities = find_cities(country_code='AU', state_code='NSW')
            located = [c for c in cities if c['city'].lower() == 'sydney']
            if located:
                dateline = {
                    'date': item['firstcreated'],
                    'located': located[0]
                }
                dateline['source'] = item['source']
                dateline['text'] = format_dateline_to_locmmmddsrc(
                    located[0],
                    get_date(item['firstcreated']),
                    source=item['source'])
                item['dateline'] = dateline

        return item
    except Exception as ex:
        logger.exception(ex)