Esempio n. 1
0
    def parse(self, file_path, provider=None):
        """
        Parse an AsiaNet media release file into an HTML text item.

        :param str file_path: path of the file to read (decoded as windows-1252)
        :param provider: not used by this method
        :return: dict describing the parsed item
        :raises: the error built by AAPParserError.AsiaNetParserError, wrapping any failure
        """
        try:
            item = {
                'guid': '{}-{}'.format(file_path, uuid.uuid4()),
                'pubstatus': 'usable',
                'versioncreated': utcnow(),
                ITEM_TYPE: CONTENT_TYPE.TEXT,
                FORMAT: FORMATS.HTML,
            }

            with open(file_path, 'r', encoding='windows-1252') as source_file:
                text = source_file.read().replace('\r', '')

            # Only the header is consumed here, but the unpacking also insists
            # on the file having three blank-line separated sections.
            header, _dateline, _remainder = text.split('\n\n', 2)
            self._process_header(item, header)

            # The body is everything from the 'MEDIA RELEASE ' marker onwards;
            # the unpacking fails if the marker is missing.
            marker = 'MEDIA RELEASE '
            _preamble, release = text.split(marker, 1)
            release = marker + release

            item['anpa_category'] = [{'qcode': 'j'}]
            item['original_source'] = 'AsiaNet'

            # Blank lines become paragraph breaks, remaining newlines become spaces
            escaped = to_ascii(html.escape(release))
            paragraphs = escaped.replace('\n\n', '</p><p>').replace('\n', ' ')
            item['body_html'] = '<p>' + paragraphs + '</p>'
            item['word_count'] = get_word_count(item['body_html'])

            return item
        except Exception as e:
            raise AAPParserError.AsiaNetParserError(file_path, e)
Esempio n. 2
0
    def parse(self, filename, provider=None):
        """
        Parse a NewsBites file into an item.

        The file content is matched against a regular expression assembled from
        ``self.field_list``. Each entry is used as: ``[0]`` the text placed in the
        pattern ahead of a capture group, ``[1]`` the item key to populate (or None
        to discard the value) and ``[2]`` the regex group holding the value.

        :param filename: path of the file to read (decoded as windows-1252)
        :param provider: only passed on to the error raised on failure
        :return: dict describing the parsed item
        :raises: the error built by AAPParserError.NewsBitesParserError, wrapping any failure
        """
        try:
            item = {}
            self.set_item_defaults(item, filename)

            with open(filename, 'r', encoding='windows-1252') as source_file:
                # read the whole file into a single string
                content = source_file.read()

            # Construct pattern for the regular expression: one leading capture
            # group followed by a label and capture group per field
            pattern = '(.*)\n' + ''.join(field[0] + '(.*)\n' for field in self.field_list)
            matched = re.match(pattern, content, re.MULTILINE | re.DOTALL)
            if matched:
                for field in self.field_list:
                    if field[1] is not None:
                        item[field[1]] = matched.group(field[2])

            # fix the formatting
            item[self.ITEM_VERSION_CREATED] = self.datetime(item[self.ITEM_VERSION_CREATED])
            escaped_body = html.escape(item[self.ITEM_BODY_HTML].strip())
            item[self.ITEM_BODY_HTML] = '<p>' + escaped_body.replace('\n', '</p><p>') + '</p>'
            item.setdefault('word_count', get_word_count(item['body_html']))

            return item
        except Exception as ex:
            raise AAPParserError.NewsBitesParserError(exception=ex, provider=provider)
Esempio n. 3
0
    def parse(self, file_path, provider=None):
        """
        Parse an AsiaNet file into a preserved-format text item.

        The file must consist of a header, a dateline and a body, separated by
        blank lines.

        :param str file_path: path of the file to read (decoded as windows-1252)
        :param provider: not used by this method
        :return: dict describing the parsed item
        :raises: the error built by AAPParserError.AsiaNetParserError, wrapping any failure
        """
        try:
            item = {
                'guid': '{}-{}'.format(file_path, uuid.uuid4()),
                'pubstatus': 'usable',
                'versioncreated': utcnow(),
                ITEM_TYPE: CONTENT_TYPE.TEXT,
                FORMAT: FORMATS.PRESERVED,
            }

            with open(file_path, 'r', encoding='windows-1252') as source_file:
                text = source_file.read().replace('\r', '')

            # Anything after the second blank line belongs to the body
            header, dateline, body = text.split('\n\n', 2)
            self._process_header(item, header)
            self._process_dateline(item, dateline)

            item.update({
                'original_source': 'AsiaNet',
                'word_count': get_text_word_count(body),
                'body_html': '<pre>{}</pre>'.format(html.escape(body)),
            })

            return item
        except Exception as e:
            raise AAPParserError.AsiaNetParserError(file_path, e)
Esempio n. 4
0
    def parse(self, filename, provider=None):
        """
        Parse a ZCZC formatted file into an item.

        Lines ahead of the START_OF_MESSAGE marker are ignored. After the marker,
        header lines are consumed; each is identified by its first character. The
        first non-blank line that is not a recognised header starts the body, which
        runs until a line containing END_OF_MESSAGE or the end of the file.

        :param filename: path of the file to read (decoded as latin-1)
        :param provider: passed to set_item_defaults and post_process_item
        :return: whatever post_process_item returns for the parsed item
        :raises: the error built by AAPParserError.ZCZCParserError, wrapping any failure
        """
        try:
            item = {}
            self.set_item_defaults(item, provider)

            with open(filename, 'r', encoding='latin-1') as f:
                lines = f.readlines()
                header = False
                body = False
                for line in lines:
                    if self.START_OF_MESSAGE in line and not header:
                        item['guid'] = filename + str(uuid.uuid4())
                        header = True
                        continue
                    if header:
                        # blank lines inside the header carry no information
                        if line == '\n':
                            continue
                        if line[0] in self.header_map:
                            # a falsy mapping means the header is recognised but discarded
                            if self.header_map[line[0]]:
                                # drop the identifying character and the trailing newline
                                item[self.header_map[line[0]]] = line[1:-1]
                            continue
                        if line[0] == self.CATEGORY:
                            item[self.ITEM_ANPA_CATEGORY] = [{'qcode': line[1]}]
                            continue
                        if line[0] == self.FORMAT:
                            if line[1] == self.TEXT:
                                item[ITEM_TYPE] = CONTENT_TYPE.TEXT
                                continue
                            if line[1] == self.TABULAR:
                                item[FORMAT] = FORMATS.PRESERVED
                                continue
                            continue
                        if line[0] == self.GENRE:
                            genre = line[1:-1]
                            if genre:
                                genre_map = get_resource_service('vocabularies').find_one(req=None, _id='genre')
                                # the vocabulary lookup can come back empty; in that case
                                # leave the genre unset rather than fail the whole file
                                if genre_map:
                                    item['genre'] = [x for x in genre_map.get('items', []) if
                                                     x['qcode'] == genre and x['is_active']]
                            continue
                        if line[0] == self.IPTC:
                            iptc_code = line[1:-1]
                            if iptc_code.isdigit():
                                item[self.ITEM_SUBJECT] = [{'qcode': iptc_code, 'name': subject_codes[iptc_code]}]
                            continue
                        # not a header line: this is the first line of the body
                        header = False
                        body = True
                        item['body_html'] = line
                    else:
                        if self.END_OF_MESSAGE in line:
                            break
                        if body:
                            item['body_html'] = item.get('body_html', '') + line
                # tabular content is escaped and wrapped so its layout is kept as is
                if item.get(FORMAT) == FORMATS.PRESERVED:
                    item['body_html'] = '<pre>' + html.escape(item['body_html']) + '</pre>'

            return self.post_process_item(item, provider)

        except Exception as ex:
            raise AAPParserError.ZCZCParserError(exception=ex, provider=provider)
Esempio n. 5
0
            item['body_html'] = '<pre>' + '\n'.join(lines[lines_to_remove:])

            # if the concatenation of the slugline and take key contain the phrase 'Brief Form' change the category to
            # h
            if (item.get(self.ITEM_SLUGLINE, '') + item.get(self.ITEM_TAKE_KEY, '')).lower().find('brief form') >= 0:
                item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'h'}]
            # Another exception
            if 'NZ/AUST FIELDS' in item.get('body_html', ''):
                item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'h'}]

            # if the item has been marked as convert to HTML then we need to use the racing reformat macro
            # to convert it.
            if lines[0] and lines[0].find('HH ') != -1:
                racing_reformat_macro(item)

            genre_map = get_resource_service('vocabularies').find_one(req=None, _id='genre')
            if genre_map:
                item['genre'] = [x for x in genre_map.get('items', []) if
                                 x['qcode'] == 'Racing Data' and x['is_active']]
            return item

        except Exception as ex:
            logger.exception(ex)


# Register the racing parser, tolerating it already being registered.
try:
    register_feed_parser(ZCZCRacingParser.NAME, ZCZCRacingParser())
except AlreadyExistsError:
    pass
register_feeding_service_error(
    'file',
    AAPParserError.ZCZCParserError().get_error_description())
Esempio n. 6
0
                item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'h'}]
            # Another exception
            if 'NZ/AUST FIELDS' in item.get('body_html', ''):
                item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'h'}]

            # if the item has been marked as convert to HTML then we need to use the racing reformat macro
            # to convert it.
            if lines[0] and lines[0].find('HH ') != -1:
                racing_reformat_macro(item)

            genre_map = get_resource_service('vocabularies').find_one(
                req=None, _id='genre')
            if genre_map:
                item['genre'] = [
                    x for x in genre_map.get('items', [])
                    if x['qcode'] == 'Racing Data' and x['is_active']
                ]
            return item

        except Exception as ex:
            logger.exception(ex)


# Register the racing parser, tolerating it already being registered.
try:
    register_feed_parser(ZCZCRacingParser.NAME, ZCZCRacingParser())
except AlreadyExistsError:
    pass
register_feeding_service_error('file', AAPParserError.ZCZCParserError().get_error_description())
Esempio n. 7
0
            'qcode': '04000000',
            'name': subject_codes['04000000']
        }]
        item[FORMAT] = FORMATS.HTML

    def datetime(self, string):
        """
        Convert a date string parsed from the source file into a UTC datetime.

        The string is read as Sydney, Australia local time, e.g.
        ``06 June 2016 14:00:00``; an abbreviated month name is accepted as well.

        :param string: the date string to convert
        :return: the equivalent timezone aware datetime in UTC
        """
        full_month_format = '%d %B %Y %H:%M:%S'
        short_month_format = '%d %b %Y %H:%M:%S'
        try:
            naive = datetime.datetime.strptime(string, full_month_format)
        except ValueError:
            naive = datetime.datetime.strptime(string, short_month_format)

        sydney = pytz.timezone('Australia/Sydney')
        return sydney.localize(naive, is_dst=None).astimezone(pytz.utc)


# Register the NewsBites parser, tolerating it already being registered.
try:
    register_feed_parser(NewsBitesFeedParser.NAME, NewsBitesFeedParser())
except AlreadyExistsError:
    pass
register_feeding_service_error('file', AAPParserError.NewsBitesParserError().get_error_description())
Esempio n. 8
0
        :param dict item: The item where the data will be stored
        :param str header: The header of the file
        """
        source = 'anpa_take_key'
        for line in header.split('\n'):
            if line.lower().startswith('media release'):
                break

            if source not in item:
                item[source] = line
            else:
                item[source] += line

        # Clean up the header entries
        item['anpa_take_key'] = item['anpa_take_key'][8:].replace('\n',
                                                                  '').strip()
        item['headline'] = 'Media Release: ' + item.get('anpa_take_key', '')
        item['slugline'] = 'AAP Medianet'
        self._truncate_headers(item)


# Register the AsiaNet parser, tolerating it already being registered.
try:
    register_feed_parser(AsiaNetFeedParser.NAME, AsiaNetFeedParser())
except AlreadyExistsError:
    pass

register_feeding_service_error('file', AAPParserError.AsiaNetParserError().get_error_description())
Esempio n. 9
0
    def parse(self, filename, provider=None):
        """
        Attempt to parse the file and return the item.

        The first line of the file is a control header: a 0x01 byte followed by
        six fields separated by 0x1f bytes, used here as the meeting name, the
        race number(s), a correction flag, an abandoned flag, the state and the
        day of the week. The remaining lines become the preformatted body.

        :param filename: path of the file to read (as bytes, decoded as ascii)
        :param provider: not used by this method
        :return: dict describing the parsed item, or None when parsing fails
                 (the failure is logged rather than raised)
        """
        try:
            with open(filename, 'rb') as f:
                lines = list(f)

            item = {
                'guid': filename + '-' + str(uuid.uuid4()),
                'urgency': 5,
                'pubstatus': 'usable',
                'versioncreated': utcnow(),
                ITEM_TYPE: CONTENT_TYPE.TEXT,
                FORMAT: FORMATS.PRESERVED
            }
            # NOTE(review): the [Y|N] classes also accept a literal '|'; kept as is
            # so that the set of accepted files does not change.
            m = re.match(
                b'\x01(.*)' + b'\x1f(.*)' + b'\x1f([Y|N])' +
                b'\x1f([Y|N])' + b'\x1f(.*)' +
                b'\x1f(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)',
                lines[0],
                flags=re.I)
            if not m:
                raise AAPParserError.PDAResulstParserError()

            meeting, races, correction, abandoned, state, day_of_week = (
                group.decode('ascii') for group in m.groups())

            # if abandoned then the city string might get shortened
            city = self.CityMap.get(state, '')
            if abandoned == 'Y':
                if state.upper() in {'NSW', 'TAS', 'NT', 'WA'}:
                    city = city[:4]
                elif state.upper() in {'VIC', 'QLD', 'SA'}:
                    city = city[:5]

            # a '-' in the race field indicates a range of races, hence the plural
            results_label = 'Result ' if '-' not in races else 'Results '
            item['slugline'] = titlecase(meeting) + ' Gallop'
            item['anpa_take_key'] = results_label + races + ' ' + city
            item['headline'] = item['slugline'] + ' ' + item['anpa_take_key'] + ' ' + day_of_week

            # a correction takes precedence over flagging the meeting as abandoned
            if correction == 'Y':
                item['headline'] = 'RPTG CRTG ' + item['headline']
            elif abandoned == 'Y':
                item['anpa_take_key'] += ' ABANDONED'
                item['headline'] += ' ABANDONED'

            item['body_html'] = '<pre>' + b'\n'.join(lines[1:]).decode('ascii') + '</pre>'
            # remove the sign off as received, it will get put back on when published.
            # Only record a sign off when the text is actually present: str.find()
            # returns -1 (which is truthy) when it is missing.
            if 'AAP RESULTS' in item['body_html']:
                item['body_html'] = item['body_html'].replace('AAP RESULTS', '')
                item['sign_off'] = 'RESULTS'

            item['subject'] = [{'qcode': '15030001'}]
            item['anpa_category'] = [{'qcode': 'r'}]
            genre_map = superdesk.get_resource_service(
                'vocabularies').find_one(req=None, _id='genre')
            if genre_map:
                item['genre'] = [
                    x for x in genre_map.get('items', [])
                    if x['qcode'] == 'Results (sport)' and x['is_active']
                ]
            self.truncate_fields(item)
            return item
        except Exception as ex:
            logging.exception(ex)