Beispiel #1
0
from superdesk.io.feed_parsers.nitf import NITFFeedParser
from superdesk.io import register_feed_parser


class PAFeedParser(NITFFeedParser):
    """
    Press Association flavour of the NITF parser.

    On top of the base parser it installs one mapping: the ``category`` meta
    tag is translated to an anpa category.
    """

    NAME = 'pa_nitf'

    def __init__(self):
        # The mapping has to exist before the parent initialiser runs.
        category_rule = {'xpath': "head/meta/[@name='category']", 'filter': self._category_mapping}
        self.MAPPING = {'anpa_category': category_rule}
        super().__init__()

    def _category_mapping(self, elem):
        """
        Translate the PA category into a best-guess anpa_category.

        Only the first character of the ``content`` value is looked at
        (case-insensitively): ``S``, ``R`` and ``F`` give ``S``, ``Z`` gives
        ``V``; anything else, or a missing ``content`` value, gives ``I``.

        :param elem: object whose ``get('content')`` yields the PA category
        :return: anpa category list holding a single qcode dict
        """
        content = elem.get('content')
        if content is not None:
            initial = content[:1].upper()
            if initial == 'Z':
                return [{'qcode': 'V'}]
            if initial in ('S', 'R', 'F'):
                return [{'qcode': 'S'}]
        return [{'qcode': 'I'}]

register_feed_parser(PAFeedParser.NAME, PAFeedParser())
    def parse_content_set(self, tree, item):
        """Copy headline and body from inline NITF content into ``item``.

        Every ``inlineXML`` child of the ``contentSet`` element whose
        ``contenttype`` is ``application/nitf+xml`` is processed: the text of
        its NITF ``head/title`` is stored as ``item['headline']`` and the body
        rendered by ``parse_inline_content`` is stored as ``item['body_html']``.
        ``item`` is updated in place and nothing is returned.

        :param tree: element holding the ``contentSet`` child
        :param item: dict that receives ``headline`` and ``body_html``
        """
        for child in tree.find(self.qname('contentSet')):
            if child.tag != self.qname('inlineXML'):
                continue
            if child.attrib['contenttype'] != 'application/nitf+xml':
                continue

            nitf = child.find(self.qname('nitf'))
            title = nitf.find(self.qname('head')).find(self.qname('title'))
            item['headline'] = title.text

            body = nitf.find(self.qname('body'))
            rendered = self.parse_inline_content(body)
            item['body_html'] = rendered.get('content')

    def parse_inline_content(self, tree):
        """Render the children of ``body.content`` as a simple HTML string.

        Each direct child of the ``body.content`` element that has non-empty
        text becomes ``<tag>text</tag>``, where ``tag`` is the child's tag name
        without any ``{namespace}`` prefix. Children without text are skipped;
        only each child's own ``text`` is used.

        :param tree: element holding the ``body.content`` child
        :return: dict with one key, ``content``, holding the fragments joined
            by newlines
        """
        body = tree.find(self.qname('body.content'))
        elements = []
        for elem in body:
            if elem.text:
                # Take what follows the last '}' so that tags without a
                # namespace prefix work too; indexing [1] raised IndexError
                # for them.
                tag = elem.tag.rsplit('}', 1)[-1]
                elements.append('<%s>%s</%s>' % (tag, elem.text, tag))

        return {'content': "\n".join(elements)}

register_feed_parser(ScoopNewsMLTwoFeedParser.NAME, ScoopNewsMLTwoFeedParser())
Beispiel #3
0
                                'desks').find_one(req=None, **query)
                            if desk:
                                item['task'] = {
                                    'desk': desk.get('_id'),
                                    'stage': desk.get('incoming_stage')
                                }

                            if 'Place' in mail_item:
                                locator_map = superdesk.get_resource_service(
                                    'vocabularies').find_one(req=None,
                                                             _id='locators')
                                place = [
                                    x for x in locator_map.get('items', [])
                                    if x['qcode'] == mail_item.get(
                                        'Place', '').upper()
                                ]
                                if place is not None:
                                    item['place'] = place

                            if mail_item.get('Legal flag', '') == 'LEGAL':
                                item['flags'] = {'marked_for_legal': True}

                            break

            return [item]
        except Exception as ex:
            raise IngestEmailError.emailParseError(ex, provider)


register_feed_parser(EMailRFC822FeedParser.NAME, EMailRFC822FeedParser())
Beispiel #4
0
                # slugline
                if len(header_lines) > 1:
                    m = re.match('[A-Z]{2}-[A-Z]{2}--([a-z-0-9.]+)', header_lines[0], flags=re.I)
                    if m:
                        item['slugline'] = m.group(1)

                # ednote
                for line in header_lines:
                    m = re.search("EDITOR'S NOTE _(.*)", line)
                    if m:
                        item['ednote'] = m.group(1).strip()

            return item
        except Exception as ex:
            raise ParserError.anpaParseFileError(file_path, ex)

    def map_priority(self, source_priority):
        """Translate a one-letter source priority code into a ``Priority`` value.

        The code is matched case-insensitively with surrounding whitespace
        removed: ``f`` is Flash, ``u`` is Urgent, ``b`` is Three_Paragraph and
        ``z`` is Ordinary. Unknown codes and non-string input fall back to
        ``Priority.Ordinary.value``.

        :param source_priority: priority code taken from the source
        :return: the ``value`` of the matching ``Priority`` member
        """
        mapping = {
            'f': Priority.Flash.value,
            'u': Priority.Urgent.value,
            'b': Priority.Three_Paragraph.value,
            'z': Priority.Ordinary.value
        }

        if isinstance(source_priority, str):
            code = source_priority.lower().strip()
        else:
            # Anything that is not a string is treated as "no code given".
            code = ''
        return mapping.get(code, Priority.Ordinary.value)


register_feed_parser(ANPAFeedParser.NAME, ANPAFeedParser())
Beispiel #5
0
        """Parse out the nitf like content.

        :param tree:
        :param item:
        :return: item populated with a headline and body_html
        """
        for content in tree.find(self.qname('contentSet')):
            if content.tag == self.qname('inlineXML') and content.attrib[
                    'contenttype'] == 'application/nitf+xml':
                nitf = content.find(self.qname('nitf'))
                head = nitf.find(self.qname('head'))
                item['headline'] = head.find(self.qname('title')).text
                body = nitf.find(self.qname('body'))
                content = self.parse_inline_content(body)
                item['body_html'] = content.get('content')

    def parse_inline_content(self, tree):
        """Build an HTML snippet from the children of ``body.content``.

        For every direct child of ``body.content`` with non-empty text a
        ``<tag>text</tag>`` fragment is produced, using the child's tag name
        with any ``{namespace}`` prefix removed. Children whose text is empty
        or missing contribute nothing.

        :param tree: element that contains the ``body.content`` child
        :return: dict whose ``content`` entry is the newline-joined fragments
        """
        body = tree.find(self.qname('body.content'))
        fragments = []
        for elem in body:
            if not elem.text:
                continue
            # The local name is whatever follows the final '}'. This also
            # covers tags that carry no namespace, where the former
            # ``rsplit('}')[1]`` raised IndexError.
            local_name = elem.tag.rsplit('}', 1)[-1]
            fragments.append('<%s>%s</%s>' % (local_name, elem.text, local_name))

        content = dict()
        content['content'] = "\n".join(fragments)
        return content


register_feed_parser(ScoopNewsMLTwoFeedParser.NAME, ScoopNewsMLTwoFeedParser())
Beispiel #6
0
        elements = []
        for elem in body:
            if elem.text:
                tag = elem.tag.rsplit('}')[1]
                elements.append('<%s>%s</%s>' % (tag, elem.text, tag))

        content = dict()
        content['contenttype'] = tree.attrib['contenttype']
        content['content'] = "\n".join(elements)
        return content

    def parse_remote_content(self, tree):
        """Collect the attributes of a remote content element into a dict.

        ``residref``, ``rendition`` and ``contenttype`` must be present on the
        element (a missing one raises ``KeyError``); ``size`` defaults to
        ``'0'`` and ``href`` to ``None``.

        :param tree: element whose ``attrib`` mapping is read
        :return: dict with ``residRef``, ``sizeinbytes``, ``rendition``,
            ``mimetype`` and ``href``
        """
        attrs = tree.attrib
        return {
            'residRef': attrs['residref'],
            'sizeinbytes': int(attrs.get('size', '0')),
            # Only the second ':'-separated segment of the rendition is kept.
            'rendition': attrs['rendition'].split(':')[1],
            'mimetype': attrs['contenttype'],
            'href': attrs.get('href', None),
        }

    def datetime(self, string):
        """Parse a timestamp such as ``2015-06-01T12:30:45.000Z``.

        The fractional-second part may be any one to six digits. It used to be
        matched as the literal ``.000``, so timestamps carrying real
        milliseconds were rejected with ``ValueError``.

        :param string: timestamp formatted as ``YYYY-MM-DDTHH:MM:SS.<fraction>Z``
        :return: naive ``datetime.datetime`` (no tzinfo is attached)
        :raises ValueError: if ``string`` does not match the format
        """
        # Imported locally under an alias because this method is itself named
        # ``datetime``; the alias keeps the reference unambiguous.
        from datetime import datetime as _datetime

        return _datetime.strptime(string, '%Y-%m-%dT%H:%M:%S.%fZ')

    def get_literal_name(self, item):
        """Return the name of ``item``, preferring its ``name`` child element.

        When ``item`` has no ``name`` child, the ``literal`` attribute is
        returned instead (``None`` if that attribute is absent as well).
        """
        name_elem = item.find(self.qname('name'))
        if name_elem is None:
            return item.attrib.get('literal')
        return name_elem.text


register_feed_parser(NewsMLTwoFeedParser.NAME, NewsMLTwoFeedParser())
Beispiel #7
0
                }]
            else:
                item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'f'}]
                item[self.ITEM_SUBJECT] = [{
                    'qcode': '04000000',
                    'name': subject_codes['04000000']
                }]
        elif provider.get('source') == 'BRA':
            # It is from the Racing system
            item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'r'}]
            item[self.ITEM_SUBJECT] = [{
                'qcode': '15030001',
                'name': subject_codes['15030001']
            }]
            lines = item['body_html'].split('\n')
            if lines[2] and lines[2].find(':SPORT -') != -1:
                item[self.ITEM_HEADLINE] = lines[2][9:]
            elif lines[1] and lines[1].find('RACING : ') != -1:
                item[self.ITEM_HEADLINE] = lines[1][8:]
            elif lines[0] and lines[0].find('YY FORM') != -1:
                item[self.ITEM_HEADLINE] = lines[1]
            elif lines[1] and lines[1].find(':POTTED :') != -1:
                item[self.ITEM_HEADLINE] = lines[1][9:]
            else:
                item[self.ITEM_HEADLINE] = lines[2]

        return item


register_feed_parser(ZCZCFeedParser.NAME, ZCZCFeedParser())
Beispiel #8
0
                            if BYLINE in user and user.get(BYLINE, ''):
                                item['byline'] = user.get(BYLINE)
                            item[SIGN_OFF] = user.get(SIGN_OFF)

                            # attempt to match the given desk name against the defined desks
                            query = {'name': re.compile('^{}$'.format(mail_item.get('Desk', '')), re.IGNORECASE)}
                            desk = superdesk.get_resource_service('desks').find_one(
                                req=None, **query)
                            if desk:
                                item['task'] = {'desk': desk.get('_id'), 'stage': desk.get('incoming_stage')}

                            if 'Place' in mail_item:
                                locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None,
                                                                                                      _id='locators')
                                place = [x for x in locator_map.get('items', []) if
                                         x['qcode'] == mail_item.get('Place', '').upper()]
                                if place is not None:
                                    item['place'] = place

                            if mail_item.get('Legal flag', '') == 'LEGAL':
                                item['flags'] = {'marked_for_legal': True}

                            break

            return [item]
        except Exception as ex:
            raise IngestEmailError.emailParseError(ex, provider)


register_feed_parser(EMailRFC822FeedParser.NAME, EMailRFC822FeedParser())
Beispiel #9
0
        elements = []
        for elem in body:
            if elem.text:
                tag = elem.tag.rsplit('}')[1]
                elements.append('<%s>%s</%s>' % (tag, elem.text, tag))

        content = dict()
        content['contenttype'] = tree.attrib['contenttype']
        content['content'] = "\n".join(elements)
        return content

    def parse_remote_content(self, tree):
        """Turn the attributes of a remote content element into a plain dict.

        Required attributes are ``residref``, ``rendition`` and
        ``contenttype``; ``size`` falls back to ``'0'`` and ``href`` to
        ``None`` when absent.

        :param tree: element whose ``attrib`` mapping supplies the values
        :return: dict keyed by ``residRef``, ``sizeinbytes``, ``rendition``,
            ``mimetype`` and ``href``
        """
        source = tree.attrib
        resid_ref = source['residref']
        size = int(source.get('size', '0'))
        # The rendition is reduced to its second ':'-separated segment.
        rendition = source['rendition'].split(':')[1]
        mimetype = source['contenttype']
        href = source.get('href', None)
        return dict(
            residRef=resid_ref,
            sizeinbytes=size,
            rendition=rendition,
            mimetype=mimetype,
            href=href,
        )

    def datetime(self, string):
        """Convert a ``YYYY-MM-DDTHH:MM:SS.<fraction>Z`` string to a datetime.

        Any fraction of one to six digits is accepted. The fraction was
        previously matched as the fixed text ``.000``, so a timestamp with
        non-zero milliseconds failed with ``ValueError``.

        :param string: timestamp text, e.g. ``2015-06-01T12:30:45.000Z``
        :return: naive ``datetime.datetime`` (tzinfo is not set)
        :raises ValueError: if ``string`` is not in the expected format
        """
        # This method shares its name with the ``datetime`` module, so the
        # class is imported locally under an alias to avoid any confusion.
        from datetime import datetime as _datetime

        timestamp_format = '%Y-%m-%dT%H:%M:%S.%fZ'
        return _datetime.strptime(string, timestamp_format)

    def get_literal_name(self, item):
        """Get the item name, falling back to its ``literal`` attribute.

        The text of the ``name`` child is used when that child exists;
        otherwise the ``literal`` attribute (or ``None``) is returned.
        """
        name = item.find(self.qname('name'))
        if name is not None:
            return name.text
        return item.attrib.get('literal')


register_feed_parser(NewsMLTwoFeedParser.NAME, NewsMLTwoFeedParser())
Beispiel #10
0
        raise SkipValue()

    def _get_slugline(self, elem):
        """
        Capitalize the first word of the slugline (Removing any leading digits's).
        :param elem:
        :return:
        """
        # Remove any leading numbers and split to list of words
        sluglineList = re.sub('^[\d.]+\W+', '', elem.text).split(' ')
        slugline = sluglineList[0].capitalize()
        if len(sluglineList) > 1:
            slugline = '{} {}'.format(slugline, ' '.join(sluglineList[1:]))
        return slugline

    def _get_pubstatus(self, elem):
        """
        Mark anything that is embargoed as usable, the editorial note still describes the embargo
        :param elem:
        :return:
        """
        return 'usable' if elem.attrib['management-status'] == 'embargoed' else elem.attrib['management-status']

    def __init__(self):
        """Set up the field mapping, then run the parent initialiser.

        Three fields are mapped, each with an xpath and a filter method:
        ``anpa_category``, ``slugline`` and ``pubstatus``.
        """
        mapping = {}
        mapping['anpa_category'] = {
            'xpath': "head/meta/[@name='category']",
            'filter': self._category_mapping,
        }
        mapping['slugline'] = {'xpath': 'head/title', 'filter': self._get_slugline}
        mapping['pubstatus'] = {'xpath': 'head/docdata', 'filter': self._get_pubstatus}
        # Assigned before delegating, as in the original ordering.
        self.MAPPING = mapping
        super().__init__()

register_feed_parser(PAFeedParser.NAME, PAFeedParser())
Beispiel #11
0
            item['body_html'] = get_content(xml)
            item['place'] = get_places(docdata)
            item['keywords'] = get_keywords(docdata)

            if docdata.find('ed-msg') is not None:
                item['ednote'] = docdata.find('ed-msg').attrib.get('info')

            if xml.find('body/body.head/hedline/hl1') is not None:
                item['headline'] = xml.find('body/body.head/hedline/hl1').text
            else:
                if xml.find('head/title') is not None:
                    item['headline'] = xml.find('head/title').text

            elem = xml.find('body/body.head/abstract')
            item['abstract'] = elem.text if elem is not None else ''

            elem = xml.find('body/body.head/dateline/location/city')
            if elem is not None:
                self.set_dateline(item, city=elem.text)

            item['byline'] = get_byline(xml)

            parse_meta(xml, item)
            item.setdefault('word_count', get_word_count(item['body_html']))
            return item
        except Exception as ex:
            raise ParserError.nitfParserError(ex, provider)


register_feed_parser(NITFFeedParser.NAME, NITFFeedParser())
Beispiel #12
0
                            or line.decode('latin-1', 'replace').find(
                                'The following information is not intended for publication') != -1:
                        inNote = True
                        inText = False
                        item['ednote'] = ''
                        continue
                    item['body_html'] += line.decode('latin-1', 'replace')
                if inNote:
                    item['ednote'] += line.decode('latin-1', 'replace')
                    continue
                if inHeader:
                    if 'slugline' not in item:
                        item['slugline'] = ''
                    item['slugline'] += line.decode('latin-1',
                                                    'replace').rstrip('/\r\n')
                    continue

            return item
        except Exception as ex:
            raise ParserError.IPTC7901ParserError(exception=ex,
                                                  provider=provider)

    def map_category(self, source_category):
        """Map the source category ``x``/``X`` to ``i``; return any other value unchanged."""
        return 'i' if source_category in ('x', 'X') else source_category


register_feed_parser(IPTC7901FeedParser.NAME, IPTC7901FeedParser())
Beispiel #13
0
                            "The following information is not intended for publication"
                        )
                        != -1
                    ):
                        inNote = True
                        inText = False
                        item["ednote"] = ""
                        continue
                    item["body_html"] += line.decode("latin-1", "replace")
                if inNote:
                    item["ednote"] += line.decode("latin-1", "replace")
                    continue
                if inHeader:
                    if "slugline" not in item:
                        item["slugline"] = ""
                    item["slugline"] += line.decode("latin-1", "replace").rstrip("/\r\n")
                    continue

            return item
        except Exception as ex:
            raise ParserError.IPTC7901ParserError(exception=ex, provider=provider)

    def map_category(self, source_category):
        """Return ``"i"`` for the source category ``x`` or ``X``, otherwise the input itself."""
        if source_category in ("x", "X"):
            return "i"
        return source_category


register_feed_parser(IPTC7901FeedParser.NAME, IPTC7901FeedParser())
Beispiel #14
0
        if lines:
            # expect the dateline in the first 5 lines; sometimes there is what appears to be a headline preceding it.
            for line_num in range(0, min(len(lines), 5)):
                city, source, the_rest = lines[line_num].partition(' (dpa) - ')
                # test if we found a candidate and ensure that the city starts the line and is not crazy long
                if source and lines[line_num].find(city) == 0 and len(
                        city.strip()) < 20:
                    cities = find_cities()
                    located = [
                        c for c in cities
                        if c['city'].lower() == city.strip().lower()
                    ]
                    if 'dateline' not in item:
                        item['dateline'] = {}
                    item['dateline']['located'] = located[0] if len(
                        located) > 0 else {
                            'city_code': city.strip(),
                            'city': city.strip(),
                            'tz': 'UTC',
                            'dateline': 'city'
                        }
                    item['dateline']['source'] = 'dpa'
                    item['dateline']['text'] = city.strip()
                    item['body_html'] = item['body_html'].replace(
                        city + source, '', 1)
                    break
        return item


register_feed_parser(DPAIPTC7901FeedParser.NAME, DPAIPTC7901FeedParser())
Beispiel #15
0
                if item.get(self.ITEM_SLUGLINE, '').find('Trot') != -1:
                    item[self.ITEM_HEADLINE] = item.get(self.ITEM_SLUGLINE) + ' ' + item.get(self.ITEM_TAKE_KEY, '')
                    item[self.ITEM_SUBJECT] = [{'qcode': '15030000', 'name': subject_codes['15030000']}]
                item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'r'}]
            elif item.get(self.ITEM_SLUGLINE, '').find('AFL') != -1:
                item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 't'}]
                item[self.ITEM_SUBJECT] = [{'qcode': '15084000', 'name': subject_codes['15084000']}]
            else:
                item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'f'}]
                item[self.ITEM_SUBJECT] = [{'qcode': '04000000', 'name': subject_codes['04000000']}]
        elif provider.get('source') == 'BRA':
            # It is from the Racing system
            item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'r'}]
            item[self.ITEM_SUBJECT] = [{'qcode': '15030001', 'name': subject_codes['15030001']}]
            lines = item['body_html'].split('\n')
            if lines[2] and lines[2].find(':SPORT -') != -1:
                item[self.ITEM_HEADLINE] = lines[2][9:]
            elif lines[1] and lines[1].find('RACING : ') != -1:
                item[self.ITEM_HEADLINE] = lines[1][8:]
            elif lines[0] and lines[0].find('YY FORM') != -1:
                item[self.ITEM_HEADLINE] = lines[1]
            elif lines[1] and lines[1].find(':POTTED :') != -1:
                item[self.ITEM_HEADLINE] = lines[1][9:]
            else:
                item[self.ITEM_HEADLINE] = lines[2]

        return item


register_feed_parser(ZCZCFeedParser.NAME, ZCZCFeedParser())
Beispiel #16
0
# For the full copyright and license information, please see the
# AUTHORS and LICENSE files distributed with this source code, or
# at https://www.sourcefabric.org/superdesk/license

from superdesk.io.feed_parsers.newsml_1_2 import NewsMLOneFeedParser
from superdesk.io import register_feed_parser
from superdesk.utc import utcnow
from pytz import utc


class AFPNewsMLOneFeedParser(NewsMLOneFeedParser):
    """AFP specific NewsML parser.

    The AFP feed is basically NewsML 1.2, so parsing is delegated to the
    parent class; afterwards the firstcreated and versioncreated times are
    localised to UTC.
    """

    NAME = 'afpnewsml12'

    def parse(self, xml, provider=None):
        """Parse ``xml`` with the parent parser and localise both timestamps.

        ``firstcreated`` and ``versioncreated`` are each passed through
        ``utc.localize``; a missing or falsy value is replaced by ``utcnow()``.

        :param xml: document handed on to the parent ``parse``
        :param provider: provider handed on to the parent ``parse``
        :return: the item returned by the parent parser, with both fields set
        """
        item = super().parse(xml, provider)
        for field in ('firstcreated', 'versioncreated'):
            if item.get(field):
                item[field] = utc.localize(item[field])
            else:
                item[field] = utcnow()
        return item


register_feed_parser(AFPNewsMLOneFeedParser.NAME, AFPNewsMLOneFeedParser())
Beispiel #17
0
    def parse_news_management(self, item, entry):
        """Copy publication dates and the article id from ``NewsManagement``.

        When ``entry`` has a ``NewsManagement`` child that itself has children,
        ``item`` receives ``firstcreated`` (from ``published``),
        ``versioncreated`` (from ``updated``) and ``guid`` (from
        ``original_article_id``). Otherwise ``item`` is left untouched.

        :param item: dict updated in place
        :param entry: element searched for the ``NewsManagement`` child
        """
        news_mgmt_el = entry.find(self.qname('NewsManagement', self.WENN_NM_NS))
        # Explicit check instead of ``if news_mgmt_el:``. Truth-testing an
        # Element is deprecated; this spells out what it meant (present and
        # has at least one child).
        if news_mgmt_el is None or len(news_mgmt_el) == 0:
            return

        def child_text(tag):
            # Text of the named child in the news-management namespace.
            return self.get_elem_content(news_mgmt_el.find(self.qname(tag, self.WENN_NM_NS)))

        item['firstcreated'] = self.datetime(child_text('published'))
        item['versioncreated'] = self.datetime(child_text('updated'))
        item['guid'] = child_text('original_article_id')

    def parse_content_management(self, item, entry):
        """Copy headline, abstract and keywords from ``ContentMetadata``.

        When ``entry`` has a ``ContentMetadata`` child that itself has
        children, ``item`` receives ``headline`` (from ``title``), ``abstract``
        (from ``first_line``) and ``keywords`` (the non-empty ``value``
        attributes of every ``tags/tag`` element). Otherwise ``item`` is left
        untouched.

        :param item: dict updated in place
        :param entry: element searched for the ``ContentMetadata`` child
        """
        content_mgmt_el = entry.find(self.qname('ContentMetadata', self.WENN_CM_NS))
        # Explicit check instead of ``if content_mgmt_el:``. Truth-testing an
        # Element is deprecated; this spells out what it meant (present and
        # has at least one child).
        if content_mgmt_el is None or len(content_mgmt_el) == 0:
            return

        item['headline'] = self.get_elem_content(content_mgmt_el.find(self.qname('title', self.WENN_CM_NS)))
        item['abstract'] = self.get_elem_content(
            content_mgmt_el.find(self.qname('first_line', self.WENN_CM_NS)))

        tag_path = self.qname('tags', self.WENN_CM_NS) + '/' + self.qname('tag', self.WENN_CM_NS)
        values = (element.attrib.get('value') for element in content_mgmt_el.findall(tag_path))
        # Tags with a missing or empty value are dropped.
        item['keywords'] = [value for value in values if value]

    def get_elem_content(self, elem):
        """Return ``elem.text``, or an empty string when ``elem`` is ``None``."""
        if elem is None:
            return ''
        return elem.text

    def datetime(self, string):
        """Parse a ``YYYY-MM-DDTHH:MM:SS+00:00`` timestamp and attach ``utc`` as tzinfo.

        :param string: timestamp text ending in the literal ``+00:00`` offset
        :return: ``datetime.datetime`` whose ``tzinfo`` is ``utc``
        :raises ValueError: if ``string`` does not match the format
        """
        parsed = datetime.datetime.strptime(string, '%Y-%m-%dT%H:%M:%S+00:00')
        return parsed.replace(tzinfo=utc)


register_feed_parser(WENNFeedParser.NAME, WENNFeedParser())
#
# Copyright 2013, 2014 Sourcefabric z.u. and contributors.
#
# For the full copyright and license information, please see the
# AUTHORS and LICENSE files distributed with this source code, or
# at https://www.sourcefabric.org/superdesk/license

from superdesk.io.feed_parsers.newsml_1_2 import NewsMLOneFeedParser
from superdesk.io import register_feed_parser
from superdesk.utc import utcnow
from pytz import utc


class AFPNewsMLOneFeedParser(NewsMLOneFeedParser):
    """AFP specific NewsML parser.

    The AFP feed is basically in NewsML 1.2 format, so the parent class does
    the parsing; this subclass then localises the firstcreated and
    versioncreated times.
    """

    NAME = 'afpnewsml12'

    def parse(self, xml, provider=None):
        """Run the parent parser, then make both timestamps UTC-localised.

        A truthy ``firstcreated`` / ``versioncreated`` value is passed through
        ``utc.localize``; a missing or falsy one is replaced with ``utcnow()``.

        :param xml: document passed to the parent ``parse``
        :param provider: provider passed to the parent ``parse``
        :return: the parsed item with both timestamp fields set
        """
        item = super().parse(xml, provider)

        first_created = item.get('firstcreated')
        item['firstcreated'] = utc.localize(item['firstcreated']) if first_created else utcnow()

        version_created = item.get('versioncreated')
        item['versioncreated'] = utc.localize(item['versioncreated']) if version_created else utcnow()

        return item


register_feed_parser(AFPNewsMLOneFeedParser.NAME, AFPNewsMLOneFeedParser())
Beispiel #19
0
                        if any(char.isdigit() for char in city):
                            return
                        cities = app.locators.find_cities()
                        located = [
                            c for c in cities
                            if c['city'].lower() == city.lower()
                        ]
                        item.setdefault('dateline', {})
                        item['dateline']['located'] = located[0] if len(
                            located) > 0 else {
                                'city_code': city,
                                'city': city,
                                'tz': 'UTC',
                                'dateline': 'city'
                            }
                        item['dateline']['source'] = item.get(
                            'original_source', 'AP')
                        item['dateline'][
                            'text'] = format_dateline_to_locmmmddsrc(
                                item['dateline']['located'],
                                get_date(item['firstcreated']),
                                source=item.get('original_source', 'AP'))
                        break

            return item
        except:
            logging.exception('AP dateline extraction exception')


register_feed_parser(AP_ANPAFeedParser.NAME, AP_ANPAFeedParser())
Beispiel #20
0
        """Parse dateline from item body.

        This function attempts to parse a dateline from the first few lines of
        the item body and populate the dateline location; it also populates the dateline source.
        If a dateline is matched, the corresponding string is removed from the article text.

        :param item:
        :return:
        """
        lines = item['body_html'].splitlines()
        if lines:
            # expect the dateline in the first 5 lines; sometimes there is what appears to be a headline preceding it.
            for line_num in range(0, min(len(lines), 5)):
                city, source, the_rest = lines[line_num].partition(' (dpa) - ')
                # test if we found a candidate and ensure that the city starts the line and is not crazy long
                if source and lines[line_num].find(city) == 0 and len(city.strip()) < 20:
                    cities = app.locators.find_cities()
                    located = [c for c in cities if c['city'].lower() == city.strip().lower()]
                    if 'dateline' not in item:
                        item['dateline'] = {}
                    item['dateline']['located'] = located[0] if len(located) > 0 else {'city_code': city.strip(),
                                                                                       'city': city.strip(),
                                                                                       'tz': 'UTC', 'dateline': 'city'}
                    item['dateline']['source'] = 'dpa'
                    item['dateline']['text'] = city.strip()
                    item['body_html'] = item['body_html'].replace(city + source, '', 1)
                    break
        return item

register_feed_parser(DPAIPTC7901FeedParser.NAME, DPAIPTC7901FeedParser())