# --- Code example #1 ---
def _yonhap_derive_dateline(item, **kwargs):
    """
    It seems that most locations injected into the item by the parser are Bangalor
    This function looks for a dateline in the article body an uses that.
    :param items:
    :return:
    """
    try:
        html = item.get('body_html')
        if html:
            parsed = sd_etree.parse_html(html, content='xml')
            pars = parsed.xpath('//p')
            for par in pars:
                if not par.text:
                    continue
                city, source, the_rest = par.text.partition(' (Yonhap) -- ')
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(',')[0]
                    if any(char.isdigit() for char in city):
                        return
                    set_dateline(item, city, 'Yonhap')
                    break

        return item
    except:
        logging.exception('Yonhap dateline macro exception')
# --- Code example #2 ---
    def post_process_item(self, item, provider):
        """
        Normalise an ingested item: wrap the body text in paragraphs,
        default the genre to an active Broadcast Script, map the place
        locator and, for AM Service items, derive a dateline from the
        slugline.

        :param item: the ingested item (mutated in place)
        :param provider: ingest provider config; ``source`` feeds the dateline
        :return: the processed item (returned even when an exception was logged)
        """
        try:
            is_broadcast_script = False
            # collapse blank lines, then turn remaining newlines into
            # paragraph breaks and wrap the whole body in <p>...</p>
            item['body_html'] = '<p>{}</p>'.format(
                re.sub(
                    '<p>   ', '<p>',
                    item.get('body_html',
                             '').replace('\n\n',
                                         '\n').replace('\n', '</p><p>')))
            if not item.get('genre'):
                # no genre supplied: default to the active Broadcast Script genre
                genre_map = superdesk.get_resource_service(
                    'vocabularies').find_one(req=None, _id='genre')
                item['genre'] = [
                    x for x in genre_map.get('items', [])
                    if x['qcode'] == 'Broadcast Script' and x['is_active']
                ]
                item['sign_off'] = 'RTV'
                is_broadcast_script = True

            if self.ITEM_PLACE in item:
                if item[self.ITEM_PLACE] and is_broadcast_script:
                    item['headline'] = '{}: {}'.format(
                        item[self.ITEM_PLACE],
                        item.get(self.ITEM_HEADLINE, ''))
                locator_map = superdesk.get_resource_service(
                    'vocabularies').find_one(req=None, _id='locators')
                place = [
                    x for x in locator_map.get('items', [])
                    if x['qcode'] == item.get(self.ITEM_PLACE, '').upper()
                ]
                # the comprehension always yields a list, never None, so test
                # for emptiness; the previous ``is not None`` check was always
                # true and left unmapped locator codes on the item
                if place:
                    item[self.ITEM_PLACE] = place
                else:
                    item.pop(self.ITEM_PLACE)

            if item.get('genre') and item.get('genre')[0] and item.get(
                    'genre')[0].get('qcode') == 'AM Service':
                item['firstcreated'] = utcnow()
                item['abstract'] = item['headline']
                # derive the dateline city from the slugline for AM Service items
                slugline = (item.get('slugline') or '').lower()
                dateline_city = ''
                for city in [
                        'sydney', 'melbourne', 'brisbane', 'adelaide', 'perth'
                ]:
                    if city in slugline:
                        dateline_city = city
                        break

                set_dateline(item,
                             dateline_city,
                             provider.get('source'),
                             set_date=True)

            # Remove the attribution
            item['body_html'] = item.get('body_html',
                                         '').replace('<p>AAP RTV</p>', '')
        except Exception as ex:
            logger.exception(ex)

        return item
# --- Code example #3 ---
def reuters_derive_dateline(item, **kwargs):
    """
    Derive the dateline from the article body.

    Most locations injected into the item by the parser are Bangalore,
    so this macro scans the body paragraphs for a ``' (Reuters) - '``
    marker and uses the city that precedes it as the dateline, unless a
    non-Bangalore dateline is already present on the item.

    :param item: article item; ``body_html`` is parsed for the dateline
    :param kwargs: unused, accepted for macro-interface compatibility
    :return: the (possibly modified) item, or ``None`` when no change
             should be made or an exception was logged
    """
    try:
        html = item.get('body_html')
        if html:
            parsed = parse_html(html, content='xml')
            for par in parsed.xpath('//p'):
                if not par.text:
                    continue
                city, source, _ = par.text.partition(' (Reuters) - ')
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(',')[0]
                    # a digit means we matched a date, not a city name
                    if any(char.isdigit() for char in city):
                        return

                    # there is already a dateline that is not Bangalore/BENGALURU - don't do anything, just return.
                    # ``or ''`` guards against a missing 'city' key, which previously raised AttributeError
                    if 'located' in (item.get('dateline') or {}) and \
                            (item['dateline']['located'].get('city') or '').upper() not in ['BANGALORE', 'BENGALURU']:
                        return

                    set_dateline(item, city, 'Reuters')
                    break

        return item
    except Exception:
        # narrowed from a bare ``except`` so SystemExit/KeyboardInterrupt propagate
        logging.exception('Reuters dateline macro exception')
# --- Code example #4 ---
def dpa_derive_dateline(item, **kwargs):
    """
    Populate the dateline of a DPA (IPTC7901) item from its body text.

    Scans up to the first five body lines for a ``' (dpa) - '`` marker
    preceded by a plausible city name; when found, sets the dateline
    location and source (``dpa``) and removes the matched prefix from the
    article text.

    :param item: article item with a ``body_html`` field
    :param kwargs: unused, accepted for macro-interface compatibility
    :return: the (possibly modified) item
    """
    lines = item['body_html'].splitlines()
    # expect the dateline within the first 5 lines; sometimes what appears
    # to be a headline precedes it
    for idx, line in enumerate(lines[:5]):
        city, marker, _ = line.partition(' (dpa) - ')
        # accept the candidate only if the city starts the line and is not crazy long
        if marker and line.find(city) == 0 and len(city) < 20:
            set_dateline(item, city, 'dpa', text=city)
            lines[idx] = line.replace(city + marker, '')
            item['body_html'] = '\r\n'.join(lines)
            break
    return item
# --- Code example #5 ---
def ap_weather_format(item, **kwargs):
    """
    Reformat the AP global weather table (Celsius) into a fixed-width
    ``<pre>`` block and populate slugline/headline/dateline metadata.

    :param item: an AP-sourced item whose slugline starts with
        ``WEA--GlobalWeather-Ce``
    :param kwargs: unused, accepted for macro-interface compatibility
    :return: the reformatted item
    :raises SuperdeskApiError: when the item is not an AP weather table or
        the table is not in Celsius
    """
    if not item.get('slugline',
                    '').startswith('WEA--GlobalWeather-Ce') or not item.get(
                        'source', '') == 'AP':
        raise SuperdeskApiError.badRequestError(
            "Article should be an AP sourced weather table")
    item['slugline'] = 'WORLD WEATHER'

    text = get_text(item['body_html'], content='html')
    lines = text.splitlines()
    if not lines[0] == 'BC-WEA--Global Weather-Celsius,<':
        raise SuperdeskApiError.badRequestError(
            "Table should be in Celsius only")

    # tabular column max lengths are extracted into this list
    columns = []
    # map of the columns to extract and the substitutions to apply to each
    # column (renamed from ``columnMap``/``map`` to avoid shadowing builtins)
    column_map = ({
        'index': 0
    }, {
        'index': 1
    }, {
        'index': 2
    }, {
        'index':
        3,
        'substitute': [('COND', 'CONDITIONS'), ('pc', 'partly cloudy'),
                       ('clr', 'clear'), ('cdy', 'cloudy'), ('rn', 'rain'),
                       ('sn', 'snow')]
    })
    # story preamble
    preamble = 'Temperatures and conditions in world centres:\r\n'
    output = StringIO()
    output.write(preamble)

    # story is always datelined New York
    set_dateline(item, 'New York City', 'AP', set_date=True)

    item['headline'] = 'World Weather for ' + item['dateline'][
        'date'].strftime('%b %-d')
    item['subject'] = [{"name": "weather", "qcode": "17000000"}]
    locator_map = superdesk.get_resource_service('vocabularies').find_one(
        req=None, _id='locators')
    item['place'] = [
        x for x in locator_map.get('items', []) if x['qcode'] == 'US'
    ]

    if lines:
        # first pass: scan all the lines for potential columnated rows and
        # record the maximum width of each column
        for line in lines:
            row = re.split(r'[;\<]+', line)
            # only consider rows with more than two columns
            if len(row) > 2:
                for index, col in enumerate(row):
                    # apply the substitutions for mapped columns before measuring
                    matched = [me for me in column_map if me['index'] == index]
                    if matched:
                        for sub in matched[0].get('substitute', ()):
                            col = col.replace(sub[0], sub[1])
                    if index < len(columns):
                        # existing column: widen if this value is longer
                        if len(col) > columns[index]:
                            columns[index] = len(col)
                    else:
                        # first time this column is seen
                        columns.append(len(col))

        # second pass: emit the mapped columns padded to the recorded widths
        for line in lines:
            row = re.split(r'[;\<]+', line)
            if len(row) > 2:
                for index, col in enumerate(row):
                    matched = [me for me in column_map if me['index'] == index]
                    if matched:
                        for sub in matched[0].get('substitute', ()):
                            col = col.replace(sub[0], sub[1])
                        output.write('{}'.format(
                            col.lstrip('\t').ljust(
                                columns[matched[0].get('index')] +
                                2)).rstrip('\r\n'))
                output.write('\r\n')

        item['body_html'] = '<pre>' + output.getvalue() + '</pre>'
    return item
# --- Code example #6 ---
def process_victorian_harness_racing(item, **kwargs):
    """
    Split a Victorian harness racing file into two stories: a selections
    item (posted to the archive in progress state) and a comment item
    (written back onto *item*).

    The incoming body is expected to contain ``VENUE:`` and ``DATE:``
    header paragraphs followed by per-race ``Race N:``, ``SELECTIONS:``,
    ``OVERVIEW:`` and ``EARLY SPEED:`` paragraphs.

    :param item: the source item; mutated in place with the comment story
    :param kwargs: unused, accepted for macro-interface compatibility
    :return: the updated item (now carrying the comment story fields)
    """

    # lookup table for spelling out race numbers (units, teens and tens)
    number_words_map = {1: 'One', 2: 'Two', 3: 'Three', 4: 'Four', 5: 'Five',
                        6: 'Six', 7: 'Seven', 8: 'Eight', 9: 'Nine', 10: 'Ten',
                        11: 'Eleven', 12: 'Twelve', 13: 'Thirteen', 14: 'Fourteen',
                        15: 'Fifteen', 16: 'Sixteen', 17: 'Seventeen', 18: 'Eighteen',
                        19: 'Nineteen', 20: 'Twenty', 30: 'Thirty', 40: 'Forty',
                        50: 'Fifty', 60: 'Sixty', 70: 'Seventy', 80: 'Eighty',
                        90: 'Ninety', 0: 'Zero'}

    # text substitutions applied to the finished comment body; insertion order
    # matters because the "2nd row/up/line/..." entries restore phrases that
    # the earlier "second" -> "2nd" substitution would otherwise mangle
    substitution_map = OrderedDict({"second": "2nd", "third": "3rd", "fourth": "4th", "fifth": "5th", "sixth": "6th",
                                    "seventh": "7th", "eighth": "8th", "ninth": "9th", "2nd row": "second row",
                                    "2nd up": "second up", "2nd line": "second line", "2nd run": "second run",
                                    "2nd pick": "second pick", "January": "Jan", "February": "Feb", "August": "Aug",
                                    "September": "Sept", "October": "Oct", "November": "Nov", "December": "Dec",
                                    "Harold Park": "HP", "Moonee Valley": "MV"})

    def race_number_to_words(race):
        # convert e.g. 'Race 12:' to 'Twelve'; numbers not directly in the
        # map are composed as tens + units (e.g. 21 -> 'Twentyone'), and
        # anything else falls back to the bare digits
        n = int(race.replace('Race', '').replace(':', ''))
        try:
            return titlecase(number_words_map[n])
        except KeyError:
            try:
                return titlecase(number_words_map[n - n % 10] + number_words_map[n % 10].lower())
            except KeyError:
                return str(n)

    content = item.get('body_html', '')
    # base metadata shared by both generated stories
    comment_item = {
        "anpa_category": [
            {
                "qcode": "r",
                "name": "Racing (Turf)",
                "subject": "15030001"
            }
        ],
        "subject": [
            {
                "parent": "15000000",
                "name": "horse racing, harness racing",
                "qcode": "15030000"
            }
        ],
        "place": [
            {
                "state": "Victoria",
                "name": "VIC",
                "group": "Australia",
                "country": "Australia",
                "qcode": "VIC",
                "world_region": "Oceania"
            }
        ],
        FORMAT: FORMATS.HTML,
        ITEM_TYPE: CONTENT_TYPE.TEXT
    }
    selections_item = deepcopy(comment_item)
    # copy the genre of the item that we are operating on
    if 'genre' in item:
        selections_item['genre'] = deepcopy(item['genre'])

    parsed = parse_html(content, content='html')

    # first pass: pick up the VENUE/DATE headers and initialise both stories.
    # NOTE(review): assumes a VENUE: paragraph precedes the DATE: paragraph
    # (otherwise ``venue`` is unbound) and that every <p> has non-None text -
    # confirm against the feed format.
    for tag in parsed.xpath('/html/div/child::*'):
        if tag.tag == 'p':
            if tag.text.startswith('VENUE: '):
                venue = tag.text.replace('VENUE: ', '')
            elif tag.text.startswith('DATE: '):
                # try %d/%m/%y, then %d/%m/%Y, then a generic date parse,
                # finally falling back to "now"; each failure is logged
                try:
                    meeting_date = datetime.strptime(tag.text.replace('DATE: ', '').replace(' ', ''), '%d/%m/%y')
                except Exception:
                    logger.warning('Date format exception for {}'.format(tag.text.replace('DATE: ', '')))
                    try:
                        meeting_date = datetime.strptime(tag.text.replace('DATE: ', '').replace(' ', ''), '%d/%m/%Y')
                    except Exception:
                        logger.warning('Date format exception 2 for {}'.format(tag.text.replace('DATE: ', '')))
                        try:
                            meeting_date = get_date(tag.text.replace('DATE: ', '').replace(' ', ''))
                        except Exception:
                            logger.warning('Date format exception 3 for {}'.format(tag.text.replace('DATE: ', '')))
                            meeting_date = utcnow()

                comment_item['slugline'] = venue + ' Comment'
                comment_item['anpa_take_key'] = meeting_date.strftime('%A')
                comment_item['headline'] = venue + ' Trot Comment ' + meeting_date.strftime('%A')
                comment_item['firstcreated'] = utcnow()
                set_dateline(comment_item, 'Melbourne', 'AAP')

                selections_item['slugline'] = venue + ' Selections'
                selections_item['anpa_take_key'] = meeting_date.strftime('%A')
                selections_item['headline'] = venue + ' Trot Selections ' + meeting_date.strftime('%A')
                selections_item['firstcreated'] = utcnow()
                set_dateline(selections_item, 'Melbourne', 'AAP')
                selections_item['body_html'] = '<p>{} Selections for {}\'s {} trots.-</p>'.format(
                    selections_item.get('dateline').get('text'),
                    meeting_date.strftime('%A'), venue)
                selections_item['firstcreated'] = utcnow()
                break

    # second pass: build the selections story from Race/SELECTIONS paragraphs
    regex = r"Race ([1-9][0-9]|[1-9]):"
    for tag in parsed.xpath('/html/div/child::*'):
        if tag.tag == 'p':
            m = re.match(regex, tag.text)
            if m:
                selections_item['body_html'] += '<p>{} '.format(tag.text)
            if tag.text.startswith('SELECTIONS: '):
                sels = titlecase(tag.text.replace('SELECTIONS: ', ''))
                # In some cases there is no comma between the selections, apparently there should be!
                sels = sels.replace(') ', '), ')
                sels = re.sub(r'\s\(.*?\)', '', sels)
                # get rid of the trailing one
                sels = re.sub(r'(, $|,$)', ' ', sels)
                selections_item['body_html'] += '{}</p>'.format(sels)
    selections_item['body_html'] += '<p>AAP SELECTIONS</p>'

    # third pass: build the comment story; OVERVIEW text is accumulated until
    # the following EARLY SPEED paragraph flushes it
    comment_item['body_html'] = ''
    overview = ''
    regex = r"Race ([1-9][0-9]|[1-9]):"
    for tag in parsed.xpath('/html/div/child::*'):
        if tag.tag == 'p':
            m = re.match(regex, tag.text)
            if m:
                comment_item['body_html'] += '<p>Race {}:</p>'.format(race_number_to_words(tag.text))
            if tag.text.startswith('EARLY SPEED: '):
                comment_item['body_html'] += '<p>{}</p>'.format(overview.rstrip())
                overview = ''
                comment_item['body_html'] += '<p>{}</p>'.format(tag.text.rstrip())
            if tag.text.startswith('OVERVIEW: '):
                overview = tag.text
            elif overview:
                overview += tag.text

    # apply the ordinal/month/venue abbreviations to the comment body
    for i, j in substitution_map.items():
        comment_item['body_html'] = comment_item['body_html'].replace(i, j)
    comment_item['body_html'] += '<p>AAP COMMENT</p>'

    # post the selections story to the archive in progress state, inheriting
    # the task and profile of the source item
    service = get_resource_service('archive')
    selections_item['task'] = item.get('task')
    selections_item['profile'] = item.get('profile')
    selections_item[ITEM_STATE] = CONTENT_STATE.PROGRESS
    service.post([selections_item])

    # the comment story replaces the content of the triggering item
    item.update(comment_item)

    return item