def _yonhap_derive_dateline(item, **kwargs):
    """
    It seems that most locations injected into the item by the parser are Bangalore.
    This function looks for a dateline in the article body and uses that.
    :param item:
    :return:
    """
    try:
        html = item.get('body_html')
        if html:
            parsed = sd_etree.parse_html(html, content='xml')
            pars = parsed.xpath('//p')
            for par in pars:
                if not par.text:
                    continue
                city, source, the_rest = par.text.partition(' (Yonhap) -- ')
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(',')[0]
                    if any(char.isdigit() for char in city):
                        return
                    set_dateline(item, city, 'Yonhap')
                    break
        return item
    except Exception:
        logging.exception('Yonhap dateline macro exception')
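
# A minimal, self-contained sketch (not part of the original module; the sample
# paragraph text is invented) of the str.partition() technique the macro above
# relies on to split a lead paragraph into a city and the article body.
def _demo_yonhap_partition():
    par_text = 'SEOUL, May 3 (Yonhap) -- South Korea reported ...'
    city, source, the_rest = par_text.partition(' (Yonhap) -- ')
    assert source  # the agency marker was found, so this is a dateline paragraph
    city = city.split(',')[0]  # drop the trailing date/state
    assert city == 'SEOUL'
    # a candidate city containing digits would be rejected by the macro above
    assert not any(char.isdigit() for char in city)
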
def post_process_item(self, item, provider):
    try:
        is_broadcast_script = False
        item['body_html'] = '<p>{}</p>'.format(
            re.sub('<p> ', '<p>',
                   item.get('body_html', '').replace('\n\n', '\n').replace('\n', '</p><p>')))
        if not item.get('genre'):
            genre_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='genre')
            item['genre'] = [x for x in genre_map.get('items', [])
                             if x['qcode'] == 'Broadcast Script' and x['is_active']]
            item['sign_off'] = 'RTV'
            is_broadcast_script = True

        if self.ITEM_PLACE in item:
            if item[self.ITEM_PLACE] and is_broadcast_script:
                item['headline'] = '{}: {}'.format(item[self.ITEM_PLACE], item.get(self.ITEM_HEADLINE, ''))
            locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='locators')
            place = [x for x in locator_map.get('items', [])
                     if x['qcode'] == item.get(self.ITEM_PLACE, '').upper()]
            # a list comprehension is never None; test for an empty result instead
            if place:
                item[self.ITEM_PLACE] = place
            else:
                item.pop(self.ITEM_PLACE)

        if item.get('genre') and item.get('genre')[0] and item.get('genre')[0].get('qcode') == 'AM Service':
            item['firstcreated'] = utcnow()
            item['abstract'] = item['headline']
            slugline = (item.get('slugline') or '').lower()
            dateline_city = ''
            for city in ['sydney', 'melbourne', 'brisbane', 'adelaide', 'perth']:
                if city in slugline:
                    dateline_city = city
                    break
            set_dateline(item, dateline_city, provider.get('source'), set_date=True)

        # Remove the attribution
        item['body_html'] = item.get('body_html', '').replace('<p>AAP RTV</p>', '')
    except Exception as ex:
        logger.exception(ex)
    return item
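
# A minimal sketch (not part of the original module; the sample body is invented)
# of the body normalisation performed above: double newlines collapse to one,
# remaining newlines become paragraph breaks, and a stray space after '<p>' is removed.
def _demo_rtv_body_normalisation():
    import re
    body = 'First line\n\nSecond line'
    normalised = '<p>{}</p>'.format(
        re.sub('<p> ', '<p>', body.replace('\n\n', '\n').replace('\n', '</p><p>')))
    assert normalised == '<p>First line</p><p>Second line</p>'
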
def reuters_derive_dateline(item, **kwargs):
    """
    It seems that most locations injected into the item by the parser are Bangalore.
    This function looks for a dateline in the article body and uses that.
    :param item:
    :return:
    """
    try:
        html = item.get('body_html')
        if html:
            parsed = parse_html(html, content='xml')
            pars = parsed.xpath('//p')
            for par in pars:
                if not par.text:
                    continue
                city, source, the_rest = par.text.partition(' (Reuters) - ')
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(',')[0]
                    if any(char.isdigit() for char in city):
                        return
                    # there is already a dateline that is not Bangalore/BENGALURU,
                    # don't do anything, just return
                    if 'located' in (item.get('dateline') or {}) and \
                            item['dateline']['located'].get('city').upper() not in ['BANGALORE', 'BENGALURU']:
                        return
                    set_dateline(item, city, 'Reuters')
                    break
        return item
    except Exception:
        logging.exception('Reuters dateline macro exception')
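
# A minimal sketch (not part of the original module; the item dict is invented)
# of the guard above: an item already datelined somewhere other than
# Bangalore/BENGALURU is left untouched.
def _demo_reuters_dateline_guard():
    item = {'dateline': {'located': {'city': 'London'}}}
    has_real_dateline = ('located' in (item.get('dateline') or {}) and
                         item['dateline']['located'].get('city').upper()
                         not in ['BANGALORE', 'BENGALURU'])
    assert has_real_dateline  # the macro would return without changing the item
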
def dpa_derive_dateline(item, **kwargs):
    """
    DPA content is received in IPTC7901 format. This macro attempts to parse a dateline
    from the first few lines of the item body and populate the dateline location; it also
    populates the dateline source. If a dateline is matched, the corresponding string is
    removed from the article text.
    :param item:
    :param kwargs:
    :return:
    """
    lines = item['body_html'].splitlines()
    if lines:
        # expect the dateline in the first 5 lines, sometimes there is what appears
        # to be a headline preceding it.
        for line_num in range(0, min(len(lines), 5)):
            city, source, the_rest = lines[line_num].partition(' (dpa) - ')
            # test if we found a candidate and ensure that the city starts the line
            # and is not crazy long
            if source and lines[line_num].find(city) == 0 and len(city) < 20:
                set_dateline(item, city, 'dpa', text=city)
                lines[line_num] = lines[line_num].replace(city + source, '')
                item['body_html'] = '\r\n'.join(lines)
                break
    return item
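
# A minimal sketch (not part of the original module; the body text is invented)
# of the removal step above: once 'BERLIN (dpa) - ' is matched at the start of
# a line, city plus marker are stripped and the lines are rejoined with CRLF.
def _demo_dpa_dateline_strip():
    lines = ['BERLIN (dpa) - Chancellor announces ...', 'Second paragraph']
    city, source, the_rest = lines[0].partition(' (dpa) - ')
    assert source and lines[0].find(city) == 0 and len(city) < 20
    lines[0] = lines[0].replace(city + source, '')
    assert '\r\n'.join(lines) == 'Chancellor announces ...\r\nSecond paragraph'
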
def ap_weather_format(item, **kwargs):
    if not item.get('slugline', '').startswith('WEA--GlobalWeather-Ce') or not item.get('source', '') == 'AP':
        raise SuperdeskApiError.badRequestError("Article should be an AP sourced weather table")
    item['slugline'] = 'WORLD WEATHER'

    text = get_text(item['body_html'], content='html')
    lines = text.splitlines()
    if not lines[0] == 'BC-WEA--Global Weather-Celsius,<':
        raise SuperdeskApiError.badRequestError("Table should be in Celsius only")

    # tabular column max lengths are extracted into this list
    columns = []
    # map of the columns to extract and the substitutions to apply to the column
    columnMap = ({'index': 0},
                 {'index': 1},
                 {'index': 2},
                 {'index': 3, 'substitute': [('COND', 'CONDITIONS'), ('pc', 'partly cloudy'),
                                             ('clr', 'clear'), ('cdy', 'cloudy'),
                                             ('rn', 'rain'), ('sn', 'snow')]})
    # story preamble
    preamble = 'Temperatures and conditions in world centres:\r\n'
    output = StringIO()
    output.write(preamble)

    # the story is always datelined New York
    set_dateline(item, 'New York City', 'AP', set_date=True)
    item['headline'] = 'World Weather for ' + item['dateline']['date'].strftime('%b %-d')
    item['subject'] = [{"name": "weather", "qcode": "17000000"}]
    locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='locators')
    item['place'] = [x for x in locator_map.get('items', []) if x['qcode'] == 'US']

    if lines:
        # first pass: scan all the lines in the file for potential columnar lines
        # and record the maximum width of each column
        for line in lines:
            row = re.split(r'[;\<]+', line)
            # only consider it if there are more than two columns
            if len(row) > 2:
                for index, col in enumerate(row):
                    # check if the column is mapped and apply any substitutions
                    mapped = [me for me in columnMap if me['index'] == index]
                    if mapped:
                        for sub in mapped[0].get('substitute', ()):
                            col = col.replace(sub[0], sub[1])
                    if index < len(columns):
                        # known column, keep the widest value seen so far
                        if len(col) > columns[index]:
                            columns[index] = len(col)
                    else:
                        # a new column
                        columns.append(len(col))
        # second pass: write the mapped columns padded to the measured widths
        for line in lines:
            row = re.split(r'[;\<]+', line)
            if len(row) > 2:
                for index, col in enumerate(row):
                    mapped = [me for me in columnMap if me['index'] == index]
                    if mapped:
                        for sub in mapped[0].get('substitute', ()):
                            col = col.replace(sub[0], sub[1])
                        output.write('{}'.format(
                            col.lstrip('\t').ljust(columns[mapped[0].get('index')] + 2)).rstrip('\r\n'))
                output.write('\r\n')
    item['body_html'] = '<pre>' + output.getvalue() + '</pre>'
    return item
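
# A minimal, self-contained sketch (not part of the original module; the sample
# rows are invented) of the two-pass alignment used above: pass one measures the
# widest value per column, pass two pads each value to that width with ljust().
def _demo_two_pass_alignment():
    rows = [['Amsterdam', '12', '7', 'clr'], ['Rio', '31', '24', 'rn']]
    widths = []
    for row in rows:
        for index, col in enumerate(row):
            if index < len(widths):
                widths[index] = max(widths[index], len(col))
            else:
                widths.append(len(col))
    lines = [''.join(col.ljust(widths[i] + 2) for i, col in enumerate(row))
             for row in rows]
    assert lines[0].startswith('Amsterdam  12')
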
def process_victorian_harness_racing(item, **kwargs):
    number_words_map = {1: 'One', 2: 'Two', 3: 'Three', 4: 'Four', 5: 'Five', 6: 'Six', 7: 'Seven',
                        8: 'Eight', 9: 'Nine', 10: 'Ten', 11: 'Eleven', 12: 'Twelve', 13: 'Thirteen',
                        14: 'Fourteen', 15: 'Fifteen', 16: 'Sixteen', 17: 'Seventeen', 18: 'Eighteen',
                        19: 'Nineteen', 20: 'Twenty', 30: 'Thirty', 40: 'Forty', 50: 'Fifty', 60: 'Sixty',
                        70: 'Seventy', 80: 'Eighty', 90: 'Ninety', 0: 'Zero'}

    substitution_map = OrderedDict({"second": "2nd", "third": "3rd", "fourth": "4th", "fifth": "5th",
                                    "sixth": "6th", "seventh": "7th", "eighth": "8th", "ninth": "9th",
                                    "2nd row": "second row", "2nd up": "second up", "2nd line": "second line",
                                    "2nd run": "second run", "2nd pick": "second pick",
                                    "January": "Jan", "February": "Feb", "August": "Aug", "September": "Sept",
                                    "October": "Oct", "November": "Nov", "December": "Dec",
                                    "Harold Park": "HP", "Moonee Valley": "MV"})

    def race_number_to_words(race):
        n = int(race.replace('Race', '').replace(':', ''))
        try:
            return titlecase(number_words_map[n])
        except KeyError:
            try:
                return titlecase(number_words_map[n - n % 10] + number_words_map[n % 10].lower())
            except KeyError:
                return str(n)

    content = item.get('body_html', '')

    comment_item = {
        "anpa_category": [
            {
                "qcode": "r",
                "name": "Racing (Turf)",
                "subject": "15030001"
            }
        ],
        "subject": [
            {
                "parent": "15000000",
                "name": "horse racing, harness racing",
                "qcode": "15030000"
            }
        ],
        "place": [
            {
                "state": "Victoria",
                "name": "VIC",
                "group": "Australia",
                "country": "Australia",
                "qcode": "VIC",
                "world_region": "Oceania"
            }
        ],
        FORMAT: FORMATS.HTML,
        ITEM_TYPE: CONTENT_TYPE.TEXT
    }

    selections_item = deepcopy(comment_item)
    # copy the genre of the item that we are operating on
    if 'genre' in item:
        selections_item['genre'] = deepcopy(item['genre'])

    parsed = parse_html(content, content='html')

    for tag in parsed.xpath('/html/div/child::*'):
        if tag.tag == 'p':
            if tag.text.startswith('VENUE: '):
                venue = tag.text.replace('VENUE: ', '')
            elif tag.text.startswith('DATE: '):
                try:
                    meeting_date = datetime.strptime(tag.text.replace('DATE: ', '').replace(' ', ''), '%d/%m/%y')
                except Exception:
                    logger.warning('Date format exception for {}'.format(tag.text.replace('DATE: ', '')))
                    try:
                        meeting_date = datetime.strptime(tag.text.replace('DATE: ', '').replace(' ', ''),
                                                         '%d/%m/%Y')
                    except Exception:
                        logger.warning('Date format exception 2 for {}'.format(tag.text.replace('DATE: ', '')))
                        try:
                            meeting_date = get_date(tag.text.replace('DATE: ', '').replace(' ', ''))
                        except Exception:
                            logger.warning('Date format exception 3 for {}'.format(tag.text.replace('DATE: ', '')))
                            meeting_date = utcnow()

                comment_item['slugline'] = venue + ' Comment'
                comment_item['anpa_take_key'] = meeting_date.strftime('%A')
                comment_item['headline'] = venue + ' Trot Comment ' + meeting_date.strftime('%A')
                comment_item['firstcreated'] = utcnow()
                set_dateline(comment_item, 'Melbourne', 'AAP')

                selections_item['slugline'] = venue + ' Selections'
                selections_item['anpa_take_key'] = meeting_date.strftime('%A')
                selections_item['headline'] = venue + ' Trot Selections ' + meeting_date.strftime('%A')
                selections_item['firstcreated'] = utcnow()
                set_dateline(selections_item, 'Melbourne', 'AAP')
                selections_item['body_html'] = '<p>{} Selections for {}\'s {} trots.-</p>'.format(
                    selections_item.get('dateline').get('text'), meeting_date.strftime('%A'), venue)
                break

    regex = r"Race ([1-9][0-9]|[1-9]):"
    for tag in parsed.xpath('/html/div/child::*'):
        if tag.tag == 'p':
            m = re.match(regex, tag.text)
            if m:
                selections_item['body_html'] += '<p>{} '.format(tag.text)
            if tag.text.startswith('SELECTIONS: '):
                sels = titlecase(tag.text.replace('SELECTIONS: ', ''))
                # In some cases there is no comma between the selections, apparently there should be!
                sels = sels.replace(') ', '), ')
                sels = re.sub(r'\s\(.*?\)', '', sels)
                # get rid of the trailing comma
                sels = re.sub(r'(, $|,$)', ' ', sels)
                selections_item['body_html'] += '{}</p>'.format(sels)
    selections_item['body_html'] += '<p>AAP SELECTIONS</p>'

    comment_item['body_html'] = ''
    overview = ''
    regex = r"Race ([1-9][0-9]|[1-9]):"
    for tag in parsed.xpath('/html/div/child::*'):
        if tag.tag == 'p':
            m = re.match(regex, tag.text)
            if m:
                comment_item['body_html'] += '<p>Race {}:</p>'.format(race_number_to_words(tag.text))
            if tag.text.startswith('EARLY SPEED: '):
                comment_item['body_html'] += '<p>{}</p>'.format(overview.rstrip())
                overview = ''
                comment_item['body_html'] += '<p>{}</p>'.format(tag.text.rstrip())
            if tag.text.startswith('OVERVIEW: '):
                overview = tag.text
            elif overview:
                overview += tag.text

    for i, j in substitution_map.items():
        comment_item['body_html'] = comment_item['body_html'].replace(i, j)
    comment_item['body_html'] += '<p>AAP COMMENT</p>'

    service = get_resource_service('archive')
    selections_item['task'] = item.get('task')
    selections_item['profile'] = item.get('profile')
    selections_item[ITEM_STATE] = CONTENT_STATE.PROGRESS
    service.post([selections_item])

    item.update(comment_item)
    return item
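
# A minimal sketch (not part of the original module) of the tens-plus-units
# composition used by race_number_to_words above: a number missing from the map
# is built from its tens and units entries, e.g. 21 -> 'Twenty' + 'one'.
def _demo_race_number_composition():
    number_words_map = {1: 'One', 20: 'Twenty'}
    n = 21
    assert number_words_map[n - n % 10] + number_words_map[n % 10].lower() == 'Twentyone'
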