Example #1
def init_app(app):
    # register new parsers
    register_feed_parser(BusinessWireParser.NAME, BusinessWireParser())
    register_feed_parser(GlobeNewswireParser.NAME, GlobeNewswireParser())

    # override core parsers
    registered_feed_parsers[CP_APMediaFeedParser.NAME] = CP_APMediaFeedParser()
Example #2
        item['urgency'] = 5
        item['pubstatus'] = 'usable'
        item['anpa_category'] = [{'qcode': 'e'}]
        item['subject'] = [{'qcode': '01000000', 'name': 'arts, culture and entertainment'}]

    def parse_news_management(self, item, entry):
        news_mgmt_el = entry.find(self.qname('NewsManagement', self.WENN_NM_NS))
        if news_mgmt_el is not None:
            item['firstcreated'] = self.datetime(self.get_elem_content(
                news_mgmt_el.find(self.qname('published', self.WENN_NM_NS))))
            item['versioncreated'] = self.datetime(self.get_elem_content(
                news_mgmt_el.find(self.qname('updated', self.WENN_NM_NS))))
            item['guid'] = self.get_elem_content(
                news_mgmt_el.find(self.qname('original_article_id', self.WENN_NM_NS)))

    def parse_content_management(self, item, entry):
        content_mgmt_el = entry.find(self.qname('ContentMetadata', self.WENN_CM_NS))
        if content_mgmt_el is not None:
            item['headline'] = self.get_elem_content(content_mgmt_el.find(self.qname('title', self.WENN_CM_NS)))
            item['abstract'] = self.get_elem_content(
                content_mgmt_el.find(self.qname('first_line', self.WENN_CM_NS)))

    def get_elem_content(self, elem):
        return elem.text if elem is not None else ''

    def datetime(self, string):
        return datetime.datetime.strptime(string, '%Y-%m-%dT%H:%M:%S+00:00').replace(tzinfo=utc)


register_feed_parser(WENNFeedParser.NAME, WENNFeedParser())
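
A minimal standalone sketch of the datetime helper above, assuming pytz supplies the utc object used in the snippet: the feed writes a literal "+00:00" suffix, so the format string treats it as fixed text and the result is explicitly tagged as UTC.

import datetime

from pytz import utc


def parse_wenn_timestamp(string):
    # same fixed format as WENNFeedParser.datetime above
    return datetime.datetime.strptime(
        string, '%Y-%m-%dT%H:%M:%S+00:00').replace(tzinfo=utc)


assert parse_wenn_timestamp('2016-06-06T14:00:00+00:00').tzname() == 'UTC'
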
Example #3
            'qcode': category,
            'name': category,
            'scheme': 'category'
        }]
        genre = "Nyheter"
        item['genre'] = [{
            'qcode': genre,
            'name': genre,
            'scheme': 'genre_custom'
        }]
        xhtml = [html.escape(article['biography']).replace('\n', '<br/>\n')]
        if photo_url is not None:
            label = "photo"
            xhtml.append('<a href="{url}">{label}</a>'.format(
                url=html.escape(photo_url), label=label))
        item['body_html'] = '<p>{}</p>'.format('\n<br/>\n'.join(xhtml))
        item['ednote'] = (
            "Kilder: \n" + article['further sources'] + '\n\n' +
            "Fødested: {}\n".format(article['birth place']) +
            "Sendt inn av: {}\n".format(article['author']) +
            "Godkjent: {}\n".format("Ja" if article['permission'] else "Nei") +
            "Epost: {}\n".format(article['email']) + "Tlf: {}").format(
                article['phone'])
        item['versioncreated'] = datetime.strptime(article['DateCreated'],
                                                   DATETIME_FORMAT)
        item['sign_off'] = '*****@*****.**'
        return item


register_feed_parser(WufooFeedParser.NAME, WufooFeedParser())
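
The body above is built by HTML-escaping the raw form text before wrapping it in markup; a small sketch with hypothetical input showing why the escape has to happen first:

import html

biography = 'First line\nA <quote> & more'  # hypothetical form input
xhtml = [html.escape(biography).replace('\n', '<br/>\n')]
body_html = '<p>{}</p>'.format('\n<br/>\n'.join(xhtml))
assert body_html == '<p>First line<br/>\nA &lt;quote&gt; &amp; more</p>'
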
Example #4
                                                                    **query)
            if user:
                return user.get('_id')
        raise SkipValue()

    def get_task(self, tree):
        desk_name = tree.find('head/meta[@name="aap-desk"]')
        if desk_name is not None:
            desk = superdesk.get_resource_service('desks').find_one(
                req=None, name=desk_name.get('content'))
            if desk:
                task = {'desk': desk.get('_id')}
                stage_name = tree.find('head/meta[@name="aap-stage"]')
                if stage_name is not None:
                    lookup = {
                        '$and': [{
                            'name': stage_name.get('content')
                        }, {
                            'desk': str(desk.get('_id'))
                        }]
                    }
                    stages = superdesk.get_resource_service('stages').get(
                        req=None, lookup=lookup)
                    if stages is not None and stages.count() == 1:
                        task['stage'] = stages[0].get('_id')
                return task
        raise SkipValue()


register_feed_parser(NITFFeedParser.NAME, NITFFeedParser())
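
The stage lookup in get_task matches on both the stage name and the owning desk; a hypothetical sketch of just the query construction (the names and id are invented):

def build_stage_lookup(stage_name, desk_id):
    # mirrors the $and lookup built in get_task above
    return {'$and': [{'name': stage_name}, {'desk': str(desk_id)}]}


assert build_stage_lookup('Incoming', 42) == {
    '$and': [{'name': 'Incoming'}, {'desk': '42'}]}
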
Example #5
            item['byline'] = element.text

        # headline
        element = newslines_el.find('HeadLine')
        if element is not None and element.text:
            item['headline'] = element.text.strip()

        # copyrightholder
        element = newslines_el.find('CopyrightLine')
        if element is not None and element.text:
            item['copyrightholder'] = element.text

        # line_type
        element = newslines_el.find('NewsLine/NewsLineType')
        if element is not None and element.get('FormalName'):
            item['line_type'] = element.get('FormalName')

        # line_text
        element = newslines_el.find('NewsLine/NewsLineText')
        if element is not None and element.text:
            item['line_text'] = element.text

        # keywords
        for element in newslines_el.findall('KeywordLine'):
            if element is not None and element.text:
                item.setdefault('keywords', []).append(element.text)


register_feed_parser(BelgaTipNewsMLOneFeedParser.NAME,
                     BelgaTipNewsMLOneFeedParser())
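
The KeywordLine loop relies on dict.setdefault to create the keyword list on first use; a tiny self-contained sketch with hypothetical values:

item = {}
for text in ('politics', 'economy'):  # hypothetical keyword texts
    item.setdefault('keywords', []).append(text)
assert item == {'keywords': ['politics', 'economy']}
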
Example #6
        # Now need to append the issue time
        item['anpa_take_key'] = item['anpa_take_key'] + ' ' + time[0]

    def _set_headline(self, item, lines, time):
        city_code = lines[0][2:3]
        item['headline'] = item['slugline'] + ' ' + self.city_code_map.get(city_code, {}).get('state', '') +\
            ': Issued ' + time[0] + ', ' + time[1]

    def parse(self, filename, provider=None):
        try:
            with open(filename, 'r', encoding='latin-1') as f:
                lines = f.readlines()
                item = {}
                time_date = self._get_time(lines)

                self.set_item_defaults(item, filename)
                self._set_slugline(item, lines, provider)
                self._set_take_key(item, lines, time_date)
                self._set_headline(item, lines, time_date)
                item['body_html'] = '<pre>' + ''.join(lines[1:]) + '</pre>'
            return item
        except Exception as ex:
            logging.exception(ex)


try:
    register_feed_parser(BOMParser.NAME, BOMParser())
except AlreadyExistsError:
    pass
Example #7
                item['authors'].append({
                    'uri': creator.get('uri'),
                    'role': role.text,
                })

    def _get_data_subject(self, subject_elt):
        qcode_parts = subject_elt.get('qcode', '').split(':')
        if len(qcode_parts) == 2 and qcode_parts[0] in self.SUBJ_QCODE_PREFIXES:
            scheme = self.SUBJ_QCODE_PREFIXES[qcode_parts[0]]
            if scheme:
                # we use the given name if it exists
                name_elt = subject_elt.find(self.qname('name'))
                name = name_elt.text if name_elt is not None and name_elt.text else ""
                try:
                    name = self.getVocabulary(scheme, qcode_parts[1], name)
                    subject_data = {
                        'qcode': qcode_parts[1],
                        'name': name,
                        "scheme": scheme
                    }
                    return subject_data
                except ValueError:
                    logger.info('Subject element rejected for "{code}"'.format(
                        code=qcode_parts[1]))
        return None


register_feed_parser(BelgaDPANewsMLTwoFeedParser.NAME,
                     BelgaDPANewsMLTwoFeedParser())
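
In _get_data_subject the qcode prefix selects the target scheme and the remainder is the code itself; a sketch with an assumed prefix mapping and a sample qcode:

SUBJ_QCODE_PREFIXES = {'dpasubject': 'dpa_subjects'}  # assumed mapping

qcode_parts = 'dpasubject:020'.split(':')
if len(qcode_parts) == 2 and qcode_parts[0] in SUBJ_QCODE_PREFIXES:
    scheme = SUBJ_QCODE_PREFIXES[qcode_parts[0]]
    subject_data = {'qcode': qcode_parts[1], 'scheme': scheme}
assert subject_data == {'qcode': '020', 'scheme': 'dpa_subjects'}
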
Example #8
                if item.get(FORMAT) == FORMATS.PRESERVED:
                    item['body_html'] = '<pre>' + html.escape(item['body_html']) + '</pre>'

            return self.post_process_item(item, provider)

        except Exception as ex:
            raise AAPParserError.ZCZCParserError(exception=ex, provider=provider)

    def set_item_defaults(self, item, provider):
        item['urgency'] = 5
        item['pubstatus'] = 'usable'
        item['versioncreated'] = utcnow()
        item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        item[FORMAT] = FORMATS.HTML

    def post_process_item(self, item, provider):
        """
        Applies the transformations required based on the provider of the content and the item itself
        :param item:
        :param provider:
        :return: item
        """
        return item


try:
    register_feed_parser(ZCZCFeedParser.NAME, ZCZCFeedParser())
except AlreadyExistsError:
    pass
register_feeding_service_error('file', AAPParserError.ZCZCParserError().get_error_description())
    def set_item_defaults(self, item, filename):
        item['guid'] = filename + ':' + str(uuid.uuid4())
        item['urgency'] = 5
        item['pubstatus'] = 'usable'
        item['versioncreated'] = utcnow()
        item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        item['anpa_category'] = [{'qcode': 'f'}]
        item['subject'] = [{'qcode': '04000000', 'name': subject_codes['04000000']}]
        item[FORMAT] = FORMATS.HTML

    def datetime(self, string):
        """
        Convert the date string parsed from the source file to a datetime, assuming that the
        time is local to Sydney, Australia
        :param string:
        :return:
        """
        # 06 June 2016 14:00:00
        local_dt = datetime.datetime.strptime(string, '%d %B %Y %H:%M:%S')
        local_tz = pytz.timezone('Australia/Sydney')
        aus_dt = local_tz.localize(local_dt, is_dst=None)
        return aus_dt.astimezone(pytz.utc)


try:
    register_feed_parser(NewsBitesFeedParser.NAME, NewsBitesFeedParser())
except AlreadyExistsError as ex:
    pass
register_feeding_service_error('file', AAPParserError.NewsBitesParserError().get_error_description())
            # private editorial note
            try:
                private_note = xml.xpath("//iptc:edNote[@role='sttnote:private']", namespaces={'iptc': IPTC_NS})[0].text
            except IndexError:
                pass
            else:
                if private_note:
                    item.setdefault('extra', {})['sttnote_private'] = private_note

            return [item]
        except Exception as ex:
            raise ParserError.newsmlTwoParserError(ex, provider)

    def parse_inline_content(self, tree, item):
        html_elt = tree.find(self.qname('html'))
        body_elt = html_elt.find(self.qname('body'))
        body_elt = sd_etree.clean_html(body_elt)

        content = dict()
        content['contenttype'] = tree.attrib['contenttype']
        if len(body_elt) > 0:
            contents = [sd_etree.to_string(e, encoding='unicode', method="html") for e in body_elt]
            content['content'] = '\n'.join(contents)
        elif body_elt.text:
            content['content'] = '<pre>' + body_elt.text + '</pre>'
            content['format'] = CONTENT_TYPE.PREFORMATTED
        return content


register_feed_parser(STTNewsMLFeedParser.NAME, STTNewsMLFeedParser())
    def parse_content_set(self, tree, item):
        """Parse out the nitf like content.

        :param tree:
        :param item:
        :return: item populated with a headline and body_html
        """
        for content in tree.find(self.qname('contentSet')):
            if content.tag == self.qname('inlineXML') and content.attrib['contenttype'] == 'application/nitf+xml':
                nitf = content.find(self.qname('nitf'))
                head = nitf.find(self.qname('head'))
                item['headline'] = head.find(self.qname('title')).text
                body = nitf.find(self.qname('body'))
                content = self.parse_inline_content(body)
                item['body_html'] = content.get('content')

    def parse_inline_content(self, tree):
        body = tree.find(self.qname('body.content'))
        elements = []
        for elem in body:
            if elem.text:
                tag = elem.tag.rsplit('}')[1]
                elements.append('<%s>%s</%s>' % (tag, elem.text, tag))

        content = dict()
        content['content'] = "\n".join(elements)
        return content


register_feed_parser(ScoopNewsMLTwoFeedParser.NAME, ScoopNewsMLTwoFeedParser())
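
In parse_inline_content above, ElementTree returns namespaced tags such as {urn:nitf}p, and rsplit('}') keeps only the local name; a runnable sketch with an invented namespace:

import xml.etree.ElementTree as ET

elem = ET.fromstring(
    '<body.content xmlns="urn:nitf"><p>Hello</p></body.content>')[0]
tag = elem.tag.rsplit('}')[1]  # '{urn:nitf}p' -> 'p'
assert '<%s>%s</%s>' % (tag, elem.text, tag) == '<p>Hello</p>'
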
        :return:
        """
        keywords = self.get_keywords(docdata)
        return keywords[0] if len(keywords) > 0 else None

    def get_subjects(self, tree):
        """Finds all the IPTC subject tags in the passed tree and returns the parsed subjects.

        All entries will have both the name and qcode populated.

        :param tree:
        :return: a list of subject dictionaries
        """
        subjects = []
        qcodes = []  # we check qcodes to avoid duplicates
        for elem in tree.findall('head/tobject/tobject.subject[@tobject.subject.ipr="IPTC"]'):
            qcode = elem.get('tobject.subject.refnum')
            if qcode in qcodes:
                # we ignore duplicates
                continue
            else:
                qcodes.append(qcode)

            # if the subject_fields are not specified.
            if not any(c['qcode'] == qcode for c in subjects) and subject_codes.get(qcode):
                subjects.append({'name': subject_codes[qcode], 'qcode': qcode})
        return subjects


register_feed_parser(EFEFeedParser.NAME, EFEFeedParser())
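
The qcodes list in get_subjects is a simple seen-list guarding against duplicate refnums; a minimal equivalent with a hypothetical one-entry subject table:

subject_codes = {'15000000': 'sport'}  # assumed subset of the IPTC table

subjects, qcodes = [], []
for qcode in ('15000000', '15000000', '99999999'):
    if qcode in qcodes:
        continue  # ignore duplicates, as above
    qcodes.append(qcode)
    if subject_codes.get(qcode):
        subjects.append({'name': subject_codes[qcode], 'qcode': qcode})
assert subjects == [{'name': 'sport', 'qcode': '15000000'}]
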
                            byline = item.get('byline') or ''
                            if byline:
                                byline_prefix = ''
                                if not byline.startswith('By '):
                                    byline_prefix = 'By '
                                byline_found = elem_text.lower().startswith('{}{}'.format(byline_prefix,
                                                                                          byline).lower())
                            else:
                                byline_found = elem_text.startswith('By ')
                                if byline_found:
                                    item['byline'] = elem_text

                    # remove the byline from the body text
                    if not byline_found:
                        elements.append('<%s>%s</%s>' % (tag, elem_text, tag))

                    line_counter += 1

            content = dict()
            content['contenttype'] = tree.attrib['contenttype']
            if len(elements) > 0:
                content['content'] = "\n".join(elements)
            elif body.text:
                content['content'] = '<pre>' + body.text + '</pre>'
                content['format'] = CONTENT_TYPE.PREFORMATTED
            return content


register_feed_parser(ReutersNewsMLTwoFeedParser.NAME, ReutersNewsMLTwoFeedParser())
register_feeding_service_parser(ReutersHTTPFeedingService.NAME, ReutersNewsMLTwoFeedParser.NAME)
# AUTHORS and LICENSE files distributed with this source code, or
# at https://www.sourcefabric.org/superdesk/license*.
from superdesk.io.registry import register_feed_parser
from .text_file import TextFileParser
from superdesk.errors import AlreadyExistsError
import time


class TickerFileParser(TextFileParser):
    """
    A simple parser for ticker files. The headline indicates that it is an AAP ticker story; the body of the
    story is the content for the ticker.
    """

    NAME = 'AAP Ticker File'

    def parse(self, filename, provider=None):
        item = super().parse(filename, provider)
        item['headline'] = 'AAP Ticker on {}'.format(time.strftime("%A %H:%M:%S", time.localtime()))
        return item

    def post_process_item(self, item):
        item['headline'] = item['headline'][:40]
        return item


try:
    register_feed_parser(TickerFileParser.NAME, TickerFileParser())
except AlreadyExistsError:
    pass
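
The ticker headline is just a local-time stamp, and post_process_item then trims it to 40 characters; a sketch of the two steps together:

import time

headline = 'AAP Ticker on {}'.format(
    time.strftime("%A %H:%M:%S", time.localtime()))
headline = headline[:40]  # same truncation as post_process_item
print(headline)  # e.g. 'AAP Ticker on Monday 14:00:00'
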
Example #15
                                     x['qcode'] == 'Results (sport)' and x['is_active']]
                self.truncate_fields(item)
                return item
        except Exception as ex:
            logging.exception(ex)

    def truncate_fields(self, item):
        """
        Given an item it will truncate the headline and slugline to the lengths defined in the auto publish validation
        schema
        :param item:
        :return:
        """
        lookup = {'act': 'auto_publish', 'type': CONTENT_TYPE.TEXT}
        validators = get_resource_service('validators').get(req=None, lookup=lookup)
        if validators.count():
            max_slugline_len = validators[0]['schema']['slugline']['maxlength']
            max_headline_len = validators[0]['schema']['headline']['maxlength']
            if 'headline' in item:
                item['headline'] = item['headline'][:max_headline_len] \
                    if len(item['headline']) > max_headline_len else item['headline']
            if 'slugline' in item:
                item['slugline'] = item['slugline'][:max_slugline_len] \
                    if len(item['slugline']) > max_slugline_len else item['slugline']


try:
    register_feed_parser(PDAResultsParser.NAME, PDAResultsParser())
except AlreadyExistsError:
    pass
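
The truncation in truncate_fields is a plain slice guarded by a length check; slicing already copes with short strings, so the guard only avoids a needless reassignment. A sketch with an invented schema length:

max_headline_len = 10  # hypothetical maxlength from the validation schema

for headline in ('short', 'a headline that is far too long'):
    truncated = headline[:max_headline_len] \
        if len(headline) > max_headline_len else headline
    assert len(truncated) <= max_headline_len
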
Example #16
        return item

    def _format_qcodes(self, items):
        return [{'name': item.get('name'), 'qcode': item.get('code')} for item in items]

    def datetime(self, string):
        try:
            return datetime.datetime.strptime(string, '%Y-%m-%dT%H:%M:%S+0000')
        except ValueError:
            return datetime.datetime.strptime(string, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=utc)

    def _parse_authors(self, authors):
        return [self._parse_author(author) for author in authors]

    def _parse_author(self, author):
        parsed = {
            'name': author['name'],
            'role': author.get('role', ''),
        }

        if author.get('avatar_url'):
            parsed['avatar_url'] = author['avatar_url']

        if author.get('biography'):
            parsed['biography'] = author['biography']

        return parsed


register_feed_parser(NINJSFeedParser.NAME, NINJSFeedParser())
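
The datetime helper above accepts two timestamp spellings, trying the numeric-offset form first and falling back to the Z form; a standalone sketch (assuming pytz for utc):

import datetime

from pytz import utc


def parse_ninjs_date(string):
    try:
        return datetime.datetime.strptime(string, '%Y-%m-%dT%H:%M:%S+0000')
    except ValueError:
        return datetime.datetime.strptime(
            string, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=utc)


parse_ninjs_date('2016-06-06T14:00:00+0000')  # first format matches
parse_ninjs_date('2016-06-06T14:00:00Z')      # falls back to the Z form
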
Example #17
                                'desks').find_one(req=None, **query)
                            if desk:
                                item['task'] = {
                                    'desk': desk.get('_id'),
                                    'stage': desk.get('incoming_stage')
                                }

                            if 'Place' in mail_item:
                                locator_map = superdesk.get_resource_service(
                                    'vocabularies').find_one(req=None,
                                                             _id='locators')
                                place = [
                                    x for x in locator_map.get('items', [])
                                    if x['qcode'] == mail_item.get(
                                        'Place', '').upper()
                                ]
                                if place is not None:
                                    item['place'] = place

                            if mail_item.get('Legal flag', '') == 'LEGAL':
                                item['flags'] = {'marked_for_legal': True}

                            break

            return [item]
        except Exception as ex:
            raise IngestEmailError.emailParseError(ex, provider)


register_feed_parser(EMailRFC822FeedParser.NAME, EMailRFC822FeedParser())
Example #19
    superdesk.privilege(name='planning_event_spike',
                        label='Planning - Spike Event Items',
                        description='Ability to spike an Event')

    superdesk.privilege(name='planning_event_unspike',
                        label='Planning - Unspike Event Items',
                        description='Ability to unspike an Event')

    superdesk.intrinsic_privilege(PlanningUnlockResource.endpoint_name,
                                  method=['POST'])
    superdesk.intrinsic_privilege(EventsUnlockResource.endpoint_name,
                                  method=['POST'])

    import planning.output_formatters  # noqa

    app.client_config['max_recurrent_events'] = get_max_recurrent_events(app)


register_feeding_service(EventFileFeedingService.NAME,
                         EventFileFeedingService(),
                         EventFileFeedingService.ERRORS)
register_feeding_service(EventHTTPFeedingService.NAME,
                         EventHTTPFeedingService(),
                         EventHTTPFeedingService.ERRORS)
register_feeding_service(EventEmailFeedingService.NAME,
                         EventEmailFeedingService(),
                         EventEmailFeedingService.ERRORS)

register_feed_parser(IcsTwoFeedParser.NAME, IcsTwoFeedParser())
register_feed_parser(NTBEventXMLFeedParser.NAME, NTBEventXMLFeedParser())
Example #20
        the item body and populate the dateline location; it also populates the dateline source.
        If a dateline is matched, the corresponding string is removed from the article text.

        :param item:
        :return:
        """
        lines = item["body_html"].splitlines()
        if lines:
            # expect the dateline in the first 5 lines; sometimes there is what appears to be a headline preceding it.
            for line_num in range(0, min(len(lines), 5)):
                city, source, the_rest = lines[line_num].partition(" (dpa) - ")
                # test if we found a candidate and ensure that the city starts the line and is not crazy long
                if source and lines[line_num].find(city) == 0 and len(city.strip()) < 20:
                    cities = app.locators.find_cities()
                    located = [c for c in cities if c["city"].lower() == city.strip().lower()]
                    if "dateline" not in item:
                        item["dateline"] = {}
                    item["dateline"]["located"] = (
                        located[0]
                        if len(located) > 0
                        else {"city_code": city.strip(), "city": city.strip(), "tz": "UTC", "dateline": "city"}
                    )
                    item["dateline"]["source"] = "dpa"
                    item["dateline"]["text"] = city.strip()
                    item["body_html"] = item["body_html"].replace(city + source, "", 1)
                    break
        return item


register_feed_parser(DPAIPTC7901FeedParser.NAME, DPAIPTC7901FeedParser())
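
str.partition keeps everything and returns an empty separator when the marker is absent, which is what makes the "if source" test above work; a sketch with a hypothetical dpa line:

line = 'Berlin (dpa) - Something happened.'
city, source, the_rest = line.partition(' (dpa) - ')
assert (city, source, the_rest) == (
    'Berlin', ' (dpa) - ', 'Something happened.')
assert 'No marker here'.partition(' (dpa) - ')[1] == ''
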
Example #21
                }
            elif self.missing_voc == "continue":
                return name
            else:
                raise RuntimeError("Unexpected missing_voc value: {}".format(
                    self.missing_voc))
        try:
            items = voc["items"]
        except KeyError:
            logger.warning(
                "Creating missing items for {qcode}".format(qcode=qcode))
            voc["items"] = items = []

        for item in items:
            if item["qcode"] == qcode:
                if item.get("is_active", True):
                    return item.get("name", name)
                else:
                    # the vocabulary exists but is disabled
                    raise ValueError

        items.append({"is_active": True, "name": name, "qcode": qcode})
        if create:
            vocabularies_service.post([voc])
        else:
            vocabularies_service.put(voc_id, voc)
        return name


register_feed_parser(NewsMLTwoFeedParser.NAME, NewsMLTwoFeedParser())
from aap.errors import AAPParserError
import superdesk


class ZCZCSportsResultsParser(ZCZCFeedParser):

    NAME = 'Sportsresults_zczc'

    def set_item_defaults(self, item, provider):
        super().set_item_defaults(item, provider)
        item['original_source'] = 'Sports Results'

    def post_process_item(self, item, provider):
        genre_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='genre')
        item['genre'] = [x for x in genre_map.get('items', []) if
                         x['qcode'] == 'Results (sport)' and x['is_active']]

        # If the format is HTML we need to convert the content
        if item[FORMAT] == FORMATS.HTML:
            item['body_html'] = '</p><p>'.join(item['body_html'].split('\n\n'))
            item['body_html'] = item['body_html'].replace('\n', '<br>').replace('\t', '')
            item['body_html'] = '<p>' + item['body_html'] + '</p>'
        return item


try:
    register_feed_parser(ZCZCSportsResultsParser.NAME, ZCZCSportsResultsParser())
except AlreadyExistsError:
    pass
register_feeding_service_error('file', AAPParserError.ZCZCParserError().get_error_description())
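
The HTML conversion above maps blank lines to paragraph breaks and single newlines to <br>; a self-contained sketch of that transformation on hypothetical wire text:

body = 'First par\n\nSecond par\nwith a break'  # hypothetical wire text
body = '</p><p>'.join(body.split('\n\n'))
body = body.replace('\n', '<br>').replace('\t', '')
body = '<p>' + body + '</p>'
assert body == '<p>First par</p><p>Second par<br>with a break</p>'
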
            item['body_html'] = '<pre>' + '\n'.join(lines[lines_to_remove:])

            # if the concatenation of the slugline and take key contains the phrase 'Brief Form',
            # change the category to 'h'
            if (item.get(self.ITEM_SLUGLINE, '') + item.get(self.ITEM_TAKE_KEY, '')).lower().find('brief form') >= 0:
                item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'h'}]
            # Another exception
            if 'NZ/AUST FIELDS' in item.get('body_html', ''):
                item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'h'}]

            # if the item has been marked as convert to HTML then we need to use the racing reformat macro
            # to convert it.
            if lines[0] and lines[0].find('HH ') != -1:
                racing_reformat_macro(item)

            genre_map = get_resource_service('vocabularies').find_one(req=None, _id='genre')
            if genre_map:
                item['genre'] = [x for x in genre_map.get('items', []) if
                                 x['qcode'] == 'Racing Data' and x['is_active']]
            return item

        except Exception as ex:
            logger.exception(ex)


try:
    register_feed_parser(ZCZCRacingParser.NAME, ZCZCRacingParser())
except AlreadyExistsError as ex:
    pass
register_feeding_service_error('file', AAPParserError.ZCZCParserError().get_error_description())
        :param elem:
        :return:
        """
        # Remove any leading numbers and split to list of words
        sluglineList = re.sub(r'^[\d.]+\W+', '', elem.text).split(' ')
        slugline = sluglineList[0].capitalize()
        if len(sluglineList) > 1:
            slugline = '{} {}'.format(slugline, ' '.join(sluglineList[1:]))
        return slugline

    def _get_pubstatus(self, elem):
        """Mark anything that is embargoed as usable, the editorial note still describes the embargo.

        :param elem:
        :return:
        """
        return 'usable' if elem.attrib['management-status'] == 'embargoed' else elem.attrib['management-status']

    def __init__(self):
        self.MAPPING = {'anpa_category': {'xpath': "head/meta[@name='category']", 'filter': self._category_mapping},
                        'slugline': {'xpath': 'head/title', 'filter': self._get_slugline},
                        'pubstatus': {'xpath': 'head/docdata', 'filter': self._get_pubstatus}}
        super().__init__()

    def parse(self, xml, provider=None):
        self.xml = xml
        return super().parse(xml, provider=provider)


register_feed_parser(PAFeedParser.NAME, PAFeedParser())
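
The slugline filter above strips any leading numbering and capitalises only the first word; a runnable sketch of the same steps on an invented title:

import re

text = '2. SOCCER Late goals'  # hypothetical head/title text
slugline_list = re.sub(r'^[\d.]+\W+', '', text).split(' ')
slugline = slugline_list[0].capitalize()
if len(slugline_list) > 1:
    slugline = '{} {}'.format(slugline, ' '.join(slugline_list[1:]))
assert slugline == 'Soccer Late goals'
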
        date = date_parser(dateline, fuzzy=True).replace(tzinfo=utc)
        item['dateline']['date'] = date

        item['dateline']['source'] = source[:-4].strip()
        item['dateline']['text'] = dateline.strip()

        # Attempt to set the city data to the dateline.location key
        cities = app.locators.find_cities()
        for city in dateline.replace(' and ', ',').split(','):
            located = [c for c in cities if c['city'].lower() == city.strip().lower()]
            if len(located) > 0:
                item['dateline']['located'] = located[0]
                break

        if 'located' not in item['dateline']:
            city = dateline.split(',')[0]
            item['dateline']['located'] = {
                'city_code': city,
                'city': city,
                'tz': 'UTC',
                'dateline': 'city'
            }


try:
    register_feed_parser(AsiaNetFeedParser.NAME, AsiaNetFeedParser())
except AlreadyExistsError as ex:
    pass

register_feeding_service_error('file', AAPParserError.AsiaNetParserError().get_error_description())
        return dateutil.parser.parse(value)

    def _subject_filter(self, qcode):
        try:
            subject = self.subjects_map[qcode]
        except KeyError:
            return None
        else:
            if not subject.get('is_active', False):
                return None
            name = subject.get('name', '')

        return {'qcode': qcode, 'name': name, 'scheme': 'subject_custom'}

    def _publish_date_filter(self, date_string):
        local = dateutil.parser.parse(date_string)
        return local_to_utc(self.TIMEZONE, local)

    def _set_headline(self, item, value):
        if not value:
            # if there is no headline, we use first 100 chars of body
            # cf. SDNTB-481
            value = text_utils.get_text(item.get('body_html', ''), 'html')[:100]
        item['headline'] = value

    def _ednote_filter(self, ednote):
        return text_utils.get_text(ednote, lf_on_block=True).strip()


register_feed_parser(RitzauFeedParser.NAME, RitzauFeedParser())
Example #27
        keywords = self.get_keywords(docdata)
        return keywords[0] if len(keywords) > 0 else None

    def get_subjects(self, tree):
        """Finds all the IPTC subject tags in the passed tree and returns the parsed subjects.

        All entries will have both the name and qcode populated.

        :param tree:
        :return: a list of subject dictionaries
        """
        subjects = []
        qcodes = []  # we check qcodes to avoid duplicates
        for elem in tree.findall(
                'head/tobject/tobject.subject[@tobject.subject.ipr="IPTC"]'):
            qcode = elem.get('tobject.subject.refnum')
            if qcode in qcodes:
                # we ignore duplicates
                continue
            else:
                qcodes.append(qcode)

            # if the subject_fields are not specified.
            if not any(c['qcode'] == qcode
                       for c in subjects) and subject_codes.get(qcode):
                subjects.append({'name': subject_codes[qcode], 'qcode': qcode})
        return subjects


register_feed_parser(EFEFeedParser.NAME, EFEFeedParser())
Example #28
                item[self.ITEM_SUBJECT] = [{'qcode': '15030001', 'name': subject_codes['15030001']}]
                item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'r'}]
            elif item.get(self.ITEM_SLUGLINE, '').find('AFL') != -1:
                item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 't'}]
                item[self.ITEM_SUBJECT] = [{'qcode': '15084000', 'name': subject_codes['15084000']}]
                self._set_results_genre(item)
            else:
                item[self.ITEM_ANPA_CATEGORY] = [{'qcode': 'f'}]
                item[self.ITEM_SUBJECT] = [{'qcode': '04000000', 'name': subject_codes['04000000']}]

            # truncate the slugline to the length defined in the validation schema
            lookup = {'act': 'auto_publish', 'type': CONTENT_TYPE.TEXT}
            validators = get_resource_service('validators').get(req=None, lookup=lookup)
            if validators.count():
                max_slugline_len = validators[0]['schema']['slugline']['maxlength']
                if 'slugline' in item:
                    item['slugline'] = item['slugline'][:max_slugline_len] \
                        if len(item['slugline']) > max_slugline_len else item['slugline']

            return item

        except Exception as ex:
            logger.exception(ex)


try:
    register_feed_parser(ZCZCPMFParser.NAME, ZCZCPMFParser())
except AlreadyExistsError as ex:
    pass
register_feeding_service_error('file', AAPParserError.ZCZCParserError().get_error_description())
Example #29
                    if line.decode('latin-1', 'replace')\
                            .find('The following information is not for publication') != -1 \
                            or line.decode('latin-1', 'replace').find(
                                'The following information is not intended for publication') != -1:
                        inNote = True
                        inText = False
                        item['ednote'] = ''
                        continue
                    item['body_html'] += line.decode('latin-1', 'replace')
                if inNote:
                    item['ednote'] += line.decode('latin-1', 'replace')
                    continue
                if inHeader:
                    if 'slugline' not in item:
                        item['slugline'] = ''
                    item['slugline'] += line.decode('latin-1', 'replace').rstrip('/\r\n')
                    continue

            return item
        except Exception as ex:
            raise ParserError.IPTC7901ParserError(exception=ex, provider=provider)

    def map_category(self, source_category):
        if source_category == 'x' or source_category == 'X':
            return 'i'
        else:
            return source_category


register_feed_parser(IPTC7901FeedParser.NAME, IPTC7901FeedParser())
Example #30
        item['guid'] = filename + str(uuid.uuid4())

    def parse(self, filename, provider=None):
        """
        Attempt to parse the text file and return the item
        :param filename:
        :param provider:
        :return:
        """
        try:
            with open(filename, 'r', encoding='latin-1') as f:
                lines = f.readlines()
                item = {}

                self.set_item_defaults(item, filename)
                text = StringIO()
                if len(lines) > 0:
                    item['headline'] = lines[0].strip()
                for line in lines:
                    text.write(line)
                item['body_html'] = '<pre>' + html.escape(text.getvalue()) + '</pre>'
            return item
        except Exception as ex:
            logging.exception(ex)


try:
    register_feed_parser(TextFileParser.NAME, TextFileParser())
except AlreadyExistsError as ex:
    pass
Example #31
            'subject'))  # check for sports using all ingested subjects
        item['subject'] = filter_missing_subjects(item.get('subject'))
        item['subject'].append(category)

        urgency = item.get('urgency', None)
        if urgency == 2:
            item['urgency'] = 3
        elif urgency == 4:
            item['urgency'] = 5

        set_default_service(item)

        if not item.get('headline') and item.get('body_html'):
            first_line = item.get('body_html').strip().split('\n')[0]
            parsed_headline = etree.parse_html(first_line, 'html')
            item['headline'] = etree.to_string(
                parsed_headline, method="text").strip().split('\n')[0]

        return item

    def parse_newslines(self, item, tree):
        super().parse_newslines(item, tree)
        newsline_type = tree.find(
            'NewsItem/NewsComponent/NewsLines/NewsLine/NewsLineType[@FormalName="AdvisoryLine"]'
        )
        if newsline_type is not None and newsline_type.getnext() is not None:
            item['ednote'] = newsline_type.getnext().text or ''


register_feed_parser(NTBAFPNewsMLParser.NAME, NTBAFPNewsMLParser())
        # just the video html tag with the source as the
        # video file path
        body = '''

<br>
<video controls="" height=400 width=500 src="%s"></video>
<br>

''' % video_path

        # create new file so that the file feeding service won't complain
        cmd = 'touch %s' % file_path
        cmd = shlex.split(cmd)
        output = subprocess.check_output(cmd)

        item = {}
        guid = str(uuid4())

        item = {
            'body_html': body,
            'headline': headline,
            'type': 'text',
            'versioncreated': utcnow(),
            'guid': guid
        }

        return item


register_feed_parser(SimpleVideoParser.NAME, SimpleVideoParser())
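
Shelling out to touch works, but the standard library can create the empty placeholder directly; a hedged alternative sketch (the path here is invented, the snippet's own file_path would be used in practice):

from pathlib import Path

file_path = '/tmp/example-video-placeholder.txt'  # hypothetical path
# equivalent to the touch subprocess above, without spawning a process
Path(file_path).touch(exist_ok=True)
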
                    item[self.ITEM_SLUGLINE] = lines[1][:(lines[1].find(' Comment ') + 8)]
                    item[self.ITEM_TAKE_KEY] = lines[1][(lines[1].find(' Comment ') + 9):]
            else:
                self._scan_lines(item, lines)

            # Truncate the slugline and headline to the lengths defined on the validators if required
            lookup = {'act': ITEM_PUBLISH, 'type': CONTENT_TYPE.TEXT}
            validators = superdesk.get_resource_service('validators').get(req=None, lookup=lookup)
            if validators.count():
                max_slugline_len = validators[0]['schema']['slugline']['maxlength']
                max_headline_len = validators[0]['schema']['headline']['maxlength']
                if self.ITEM_SLUGLINE in item and len(item[self.ITEM_SLUGLINE]) > max_slugline_len:
                    # the overflow of the slugline is dumped in the take key
                    item[self.ITEM_TAKE_KEY] = item.get(self.ITEM_SLUGLINE)[max_slugline_len:]
                    item[self.ITEM_SLUGLINE] = item[self.ITEM_SLUGLINE][:max_slugline_len]
                if self.ITEM_HEADLINE in item:
                    item[self.ITEM_HEADLINE] = item[self.ITEM_HEADLINE][:max_headline_len] \
                        if len(item[self.ITEM_HEADLINE]) > max_headline_len else item[self.ITEM_HEADLINE]

            return item

        except Exception as ex:
            logger.exception(ex)


try:
    register_feed_parser(ZCZCRacingParser.NAME, ZCZCRacingParser())
except AlreadyExistsError as ex:
    pass
register_feeding_service_error('file', AAPParserError.ZCZCParserError().get_error_description())
Example #34
                            if BYLINE in user and user.get(BYLINE, ''):
                                item['byline'] = user.get(BYLINE)
                            item[SIGN_OFF] = user.get(SIGN_OFF)

                            # attempt to match the given desk name against the defined desks
                            query = {'name': re.compile('^{}$'.format(mail_item.get('Desk', '')), re.IGNORECASE)}
                            desk = superdesk.get_resource_service('desks').find_one(
                                req=None, **query)
                            if desk:
                                item['task'] = {'desk': desk.get('_id'), 'stage': desk.get('incoming_stage')}

                            if 'Place' in mail_item:
                                locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None,
                                                                                                      _id='locators')
                                place = [x for x in locator_map.get('items', []) if
                                         x['qcode'] == mail_item.get('Place', '').upper()]
                                if place is not None:
                                    item['place'] = place

                            if mail_item.get('Legal flag', '') == 'LEGAL':
                                item['flags'] = {'marked_for_legal': True}

                            break

            return [item]
        except Exception as ex:
            raise IngestEmailError.emailParseError(ex, provider)


register_feed_parser(EMailRFC822FeedParser.NAME, EMailRFC822FeedParser())
Example #35
                                             content_type, rendition_spec, url_for_media)
            item['renditions'] = renditions

        try:
            date_created, time_created = metadata[TAG.DATE_CREATED], metadata[TAG.TIME_CREATED]
        except KeyError:
            pass
        else:
            # we format a proper ISO 8601 date so we can parse it with dateutil
            datetime_created = '{}-{}-{}T{}:{}:{}{}{}:{}'.format(date_created[0:4],
                                                                 date_created[4:6],
                                                                 date_created[6:8],
                                                                 time_created[0:2],
                                                                 time_created[2:4],
                                                                 time_created[4:6],
                                                                 time_created[6],
                                                                 time_created[7:9],
                                                                 time_created[9:])
            item['firstcreated'] = dateutil.parser.parse(datetime_created)

        # now we map IPTC metadata to superdesk metadata
        for source_key, dest_key in IPTC_MAPPING.items():
            try:
                item[dest_key] = metadata[source_key]
            except KeyError:
                continue
        return item


register_feed_parser(ImageIPTCFeedParser.NAME, ImageIPTCFeedParser())
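
The format string above rebuilds an ISO 8601 timestamp from the raw IPTC date and time fields; a sketch with hypothetical tag values showing what each slice contributes:

import dateutil.parser

date_created, time_created = '20160606', '140000+0000'  # hypothetical tags
datetime_created = '{}-{}-{}T{}:{}:{}{}{}:{}'.format(
    date_created[0:4], date_created[4:6], date_created[6:8],
    time_created[0:2], time_created[2:4], time_created[4:6],
    time_created[6], time_created[7:9], time_created[9:])
assert datetime_created == '2016-06-06T14:00:00+00:00'
dateutil.parser.parse(datetime_created)
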
Example #36
                elt.tag = 'blockquote'
            elif tag == 'MELLIS':
                elt.tag = 'h2'
            elif tag == 'FRAGA':
                elt.tag = 'p'
            elif tag == 'SVAR':
                elt.tag = 'p'
            elif tag == 'UL':
                elt.tag = 'ul'
            elif tag == 'LI':
                elt.tag = 'li'
            elif tag == 'TABELL':
                elt.tag = 'table'
            elif tag == 'TH':
                elt.tag = 'th'
            elif tag == 'TR':
                elt.tag = 'tr'
            elif tag == 'TD':
                elt.tag = 'td'
            else:
                logger.warning('unknown tag: {tag}'.format(tag=tag))
                elt.tag = 'p'

        div_elt = etree.Element('div')
        div_elt[:] = body_elt[:]
        contents = [etree.tostring(e, encoding='unicode', method='html') for e in div_elt]
        return {'content': '\n'.join(contents)}


register_feed_parser(TTNewsMLFeedParser.NAME, TTNewsMLFeedParser())
Example #37
    def get_datetime(self, value):
        return dateutil.parser.parse(value)

    def _subject_filter(self, qcode):
        try:
            subject = self.subjects_map[qcode]
        except KeyError:
            return None
        else:
            if not subject.get('is_active', False):
                return None
            name = subject.get('name', '')

        return {'qcode': qcode, 'name': name, 'scheme': 'subject_custom'}

    def _publish_date_filter(self, date_string):
        dt = dateutil.parser.parse(date_string)
        return dt.replace(tzinfo=timezone('CET'))

    def _set_headline(self, item, value):
        if not value:
            # if there is no headline, we use first 100 chars of body
            # cf. SDNTB-481
            value = text_utils.get_text(item.get('body_html', ''),
                                        'html')[:100]
        item['headline'] = value


register_feed_parser(RitzauFeedParser.NAME, RitzauFeedParser())
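
One caveat worth noting about the _publish_date_filter above: with pytz, localize() is the documented way to attach a DST-aware zone, while a bare replace(tzinfo=...) attaches the zone without making any DST decision. A small sketch of the difference (the printed offsets depend on the zone data):

import datetime

from pytz import timezone

cet = timezone('CET')
naive = datetime.datetime(2016, 7, 1, 12, 0)
print(naive.replace(tzinfo=cet).isoformat())  # zone attached as-is
print(cet.localize(naive).isoformat())        # DST-aware: CEST in July
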
        if lines:
            # expect the dateline in the first 5 lines; sometimes there is what appears to be a headline preceding it.
            for line_num in range(0, min(len(lines), 5)):
                city, source, the_rest = lines[line_num].partition(' (dpa) - ')
                # test if we found a candidate and ensure that the city starts the line and is not crazy long
                if source and lines[line_num].find(city) == 0 and len(
                        city.strip()) < 20:
                    cities = app.locators.find_cities()
                    located = [
                        c for c in cities
                        if c['city'].lower() == city.strip().lower()
                    ]
                    if 'dateline' not in item:
                        item['dateline'] = {}
                    item['dateline']['located'] = located[0] if len(
                        located) > 0 else {
                            'city_code': city.strip(),
                            'city': city.strip(),
                            'tz': 'UTC',
                            'dateline': 'city'
                        }
                    item['dateline']['source'] = 'dpa'
                    item['dateline']['text'] = city.strip()
                    item['body_html'] = item['body_html'].replace(
                        city + source, '', 1)
                    break
        return item


register_feed_parser(DPAIPTC7901FeedParser.NAME, DPAIPTC7901FeedParser())
        localities = [l for l in self.localityHierarchy if stadium.raw.get('address', {}).get(l)]
        areas = [a for a in self.areaHierarchy if stadium.raw.get('address', {}).get(a)]
        line = stadium.raw.get('address', {}).get('house_number', '')
        line = stadium.raw.get('address', {}).get('road', '') if line == '' else \
            line + ' ' + stadium.raw.get('address', {}).get('road', '')
        location['address'] = {
            'locality': stadium.raw.get('address', {}).get(localities[0], '') if len(localities) > 0 else '',
            'area': stadium.raw.get('address', {}).get(areas[0], '') if len(areas) > 0 else '',
            'country': stadium.raw.get('address', {}).get('country', ''),
            'postal_code': stadium.raw.get('address', {}).get('postcode', ''),
            'external': {'nominatim': stadium.raw},
            'line': [line]
        }
        location['name'] = stadiums[0].raw.get('address', {}).get(stadiums[0].raw.get('type', 'stadium'), '')
        ret = locations_service.post([location])
        location = locations_service.find_one(req=None, _id=ret[0])
        item['location'] = [{
            'name': location.get('name', location.get('name')),
            'address': {
                'line': location.get('address', {}).get('line', []),
                'area': location.get('address', {}).get('area', ''),
                'locality': location.get('address', {}).get('locality', ''),
                'postal_code': location.get('address', {}).get('postal_code', ''),
                'country': location.get('address', {}).get('country', ''),
            },
            'qcode': location.get('guid')
        }]


register_feed_parser(AAPSportsFixturesParser.NAME, AAPSportsFixturesParser())
            stadium.raw.get('address', {}).get(areas[0], '')
            if len(areas) > 0 else '',
            'country':
            stadium.raw.get('address', {}).get('country', ''),
            'postal_code':
            stadium.raw.get('address', {}).get('postcode', ''),
            'external': {
                'nominatim': stadium.raw
            },
            'line': [line]
        }
        location['name'] = stadiums[0].raw.get('address', {}).get(
            stadiums[0].raw.get('type', 'stadium'), '')
        ret = locations_service.post([location])
        location = locations_service.find_one(req=None, _id=ret[0])
        item['location'] = [{
            'name': location.get('name', location.get('name')),
            'address': {
                'line': location.get('address', {}).get('line', []),
                'area': location.get('address', {}).get('area', ''),
                'locality': location.get('address', {}).get('locality', ''),
                'postal_code': location.get('address',
                                            {}).get('postal_code', ''),
                'country': location.get('address', {}).get('country', ''),
            },
            'qcode': location.get('guid')
        }]


register_feed_parser(AAPSportsFixturesParser.NAME, AAPSportsFixturesParser())
        if json.get('embargotime'):
            main['embargo'] = json['embargotime']

        main['type'] = self._convert_type(json['type'])
        return main

    def _parse_date(self, string):
        """Attempts to parse BBC ninjs time in format YYYY-MM-DDTHH:MM:SS
        :param string:
        :return: datetime
        """
        return datetime.datetime.strptime(string, '%Y-%m-%dT%H:%M:%S')

    def _convert_type(self, content_type):
        """Attempts to convert BBC's types to standard ninjs types
        :param content_type:
        :return:
        """
        if content_type == 'image':
            return CONTENT_TYPE.PICTURE
        if content_type == 'story' or content_type == 'advisory':
            return CONTENT_TYPE.TEXT

        logger.error("could not find content type ({}), defaulting to text".format(content_type))

        return CONTENT_TYPE.TEXT


register_feed_parser(BBCNINJSFeedParser.NAME, BBCNINJSFeedParser())
        :param dict item: The item where the data will be stored
        :param str header: The header of the file
        """
        source = 'anpa_take_key'
        for line in header.split('\n'):
            if line.lower().startswith('media release'):
                break

            if source not in item:
                item[source] = line
            else:
                item[source] += line

        # Clean up the header entries
        item['anpa_take_key'] = item['anpa_take_key'][8:].replace('\n',
                                                                  '').strip()
        item['headline'] = 'Media Release: ' + item.get('anpa_take_key', '')
        item['slugline'] = 'AAP Medianet'
        self._truncate_headers(item)


try:
    register_feed_parser(AsiaNetFeedParser.NAME, AsiaNetFeedParser())
except AlreadyExistsError:
    pass

register_feeding_service_error(
    'file',
    AAPParserError.AsiaNetParserError().get_error_description())
            'qcode': '04000000',
            'name': subject_codes['04000000']
        }]
        item[FORMAT] = FORMATS.HTML

    def datetime(self, string):
        """
        Convert the date string parsed from the source file to a datetime, assuming that the
        time is local to Sydney, Australia
        :param string:
        :return:
        """
        # 06 June 2016 14:00:00
        try:
            local_dt = datetime.datetime.strptime(string, '%d %B %Y %H:%M:%S')
        except ValueError:
            local_dt = datetime.datetime.strptime(string, '%d %b %Y %H:%M:%S')

        local_tz = pytz.timezone('Australia/Sydney')
        aus_dt = local_tz.localize(local_dt, is_dst=None)
        return aus_dt.astimezone(pytz.utc)


try:
    register_feed_parser(NewsBitesFeedParser.NAME, NewsBitesFeedParser())
except AlreadyExistsError:
    pass
register_feeding_service_error(
    'file',
    AAPParserError.NewsBitesParserError().get_error_description())
Example #44
        place_strs = item.pop('place').split(' ')
        for place in place_strs:
            if place in self.place_map:
                replace = [
                    x for x in locator_map.get('items', [])
                    if x['qcode'] == self.place_map.get(place, '').upper()
                ]
                if replace is not None:
                    item[self.ITEM_PLACE] = replace

            if place in self.subject_map:
                if item.get(self.ITEM_SUBJECT) is None:
                    item[self.ITEM_SUBJECT] = []
                item['subject'].append({
                    'qcode':
                    self.subject_map.get(place),
                    'name':
                    subject_codes[self.subject_map.get(place)]
                })

        return item


try:
    register_feed_parser(ZCZCMedianetParser.NAME, ZCZCMedianetParser())
except AlreadyExistsError as ex:
    pass
register_feeding_service_error(
    'file',
    AAPParserError.ZCZCParserError().get_error_description())
Example #45
                    'qcode': qcode,
                    'name': sport,
                    'scheme': 'subject_custom'
                })

                service = {'qcode': SERVICE_QCODE, 'name': self.service_name}

                item = {
                    'guid': event['uid'],
                    ITEM_TYPE: CONTENT_TYPE.EVENT,
                    'dates': {
                        'start': event_start,
                        'end': event_end,
                        'tz': ''
                    },
                    'name': name,
                    'slugline': sport,
                    'subject': subject,
                    'anpa_category': [service],
                    'calendars': [self.calendar_item],
                    'firstcreated': utcnow(),
                    'versioncreated': utcnow()
                }
                items.append(item)
            return items
        except Exception as ex:
            raise ParserError.parseMessageError(ex, provider)


register_feed_parser(NTBNIFSFeedParser.NAME, NTBNIFSFeedParser())
Example #46
            item['body_html'] = '<p>{}</p>'.format(
                re.sub('<p>   ', '<p>', item.get('body_html', '').replace('\n\n', '\n').replace('\n', '</p><p>')))
            if self.ITEM_PLACE in item:
                if item[self.ITEM_PLACE]:
                    item['headline'] = '{}: {}'.format(item[self.ITEM_PLACE], item.get(self.ITEM_HEADLINE, ''))
                locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='locators')
                place = [x for x in locator_map.get('items', []) if
                         x['qcode'] == item.get(self.ITEM_PLACE, '').upper()]
                if place is not None:
                    item[self.ITEM_PLACE] = place
                else:
                    item.pop(self.ITEM_PLACE)
            genre_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='genre')
            item['genre'] = [x for x in genre_map.get('items', []) if
                             x['qcode'] == 'Broadcast Script' and x['is_active']]

            # Remove the attribution
            item['body_html'] = item.get('body_html', '').replace('<p>AAP RTV</p>', '')
            item['sign_off'] = 'RTV'
        except Exception as ex:
            logger.exception(ex)

        return item


try:
    register_feed_parser(ZCZCBOBParser.NAME, ZCZCBOBParser())
except AlreadyExistsError as ex:
    pass
register_feeding_service_error('file', AAPParserError.ZCZCParserError().get_error_description())
        return True

    def format_subjects(self, subjects):
        """Map the ingested Subject Codes to their corresponding names as per IPTC Specification.

        :param subjects: list of dicts where each dict gives the category the article is mapped to.
        :type subjects: list
        :returns [{"qcode": "01001000", "name": "archaeology"}, {"qcode": "01002000", "name": "architecture"}]
        :rtype list
        """

        formatted_subjects = []

        def is_not_formatted(qcode):
            for formatted_subject in formatted_subjects:
                if formatted_subject['qcode'] == qcode:
                    return False

            return True

        for subject in subjects:
            formal_name = subject.get('FormalName')
            if formal_name and is_not_formatted(formal_name):
                formatted_subjects.append({'qcode': formal_name, 'name': subject_codes.get(formal_name, '')})

        return formatted_subjects


register_feed_parser(NewsMLOneFeedParser.NAME, NewsMLOneFeedParser())
    def parse(self, filename, provider=None):
        default_item = self._set_default_item()
        items = []
        with open(filename, 'r', encoding='UTF-8') as f:
            csv_reader = csv.reader(f)
            for row in list(csv_reader)[1:]:
                if not len(row):
                    continue
                item = deepcopy(default_item)
                item[GUID_FIELD] = ('urn:www.abs.gov.au:' + row[0].split(' ')[0] +
                                    row[0].split(',')[-1]).replace('/', '-').replace(' ', '-')
                if row[5] == 'true':
                    start = datetime.strptime('{} 11:30'.format(row[1]), '%d/%m/%Y %H:%M')
                    end = datetime.strptime('{} 11:30'.format(row[1]), '%d/%m/%Y %H:%M')
                    item['dates'] = {
                        'start': local_to_utc(config.DEFAULT_TIMEZONE, start),
                        'end': local_to_utc(config.DEFAULT_TIMEZONE, end),
                        'tz': config.DEFAULT_TIMEZONE,
                    }
                    item['name'] = ' '.join(row[0].split(' ')[1:])
                    item['definition_short'] = row[0]
                    items.append(item)
        return items


register_feed_parser(ABSCalendarCSVParser.NAME, ABSCalendarCSVParser())
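To make the GUID construction above concrete, here is a hedged walk-through with an invented CSV row; the column layout is inferred from the indexing in parse and may not match the real ABS export:

# Invented row: column 0 is 'catalogue-number title, qualifier', column 1 the
# release date, column 5 an inclusion flag (all inferred from the code above).
row = ['8501.0 Retail Trade, Australia', '15/06/2021', '', '', '', 'true']
guid = ('urn:www.abs.gov.au:' + row[0].split(' ')[0] +
        row[0].split(',')[-1]).replace('/', '-').replace(' ', '-')
print(guid)  # urn:www.abs.gov.au:8501.0-Australia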
            html = etree.tostring(content, encoding="unicode")

        item['body_html'] = html

    def attachments_hook(self, item, attachments):
        """Attachment are parsed at the end

        if it's the first image found, it's used as feature media
        else it's used as embed and put at the end of body_html
        """
        for url in attachments:
            try:
                key, media_data = self._add_image(item, url)
            except Exception as e:
                logger.error(e)
                continue
            if key == 'featuremedia':
                # no need to embed the image for featuremedia
                continue
            embed_start = "<!--" + embed_TPL.format('START', key) + "-->"
            embed_end = "<!--" + embed_TPL.format('END', key) + "-->"
            new_url = media_data['renditions']['original']['href']
            img = '<img src={src} height="{height}" width="{width}">'.format(
                src=quoteattr(new_url),
                height=media_data['renditions']['original']['height'],
                width=media_data['renditions']['original']['width'])
            item['body_html'] += '<div>' + embed_start + img + embed_end + '</div>'


register_feed_parser(WPWXRFeedParser.NAME, WPWXRFeedParser())
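The HTML comments around each inline image let downstream tooling find and replace the embed later. embed_TPL is defined outside this excerpt; a plausible shape, offered purely as an assumption, is:

# Assumed definition of embed_TPL (not visible in the excerpt); the real
# template may differ, but it must accept a START/END marker and the key.
embed_TPL = 'EMBED {} {}'
key = 'embedded1'
embed_start = '<!--' + embed_TPL.format('START', key) + '-->'
embed_end = '<!--' + embed_TPL.format('END', key) + '-->'
# body_html then gains:
# <div><!--EMBED START embedded1--><img ...><!--EMBED END embedded1--></div>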
Example #50
0
            return self.post_process_item(item, provider)

        except Exception as ex:
            raise AAPParserError.ZCZCParserError(exception=ex,
                                                 provider=provider)

    def set_item_defaults(self, item, provider):
        item['urgency'] = 5
        item['pubstatus'] = 'usable'
        item['versioncreated'] = utcnow()
        item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        item[FORMAT] = FORMATS.HTML

    def post_process_item(self, item, provider):
        """
        Applies the transormations required based on the provider of the content and the item it's self
        :param item:
        :param provider:
        :return: item
        """
        return item


try:
    register_feed_parser(ZCZCFeedParser.NAME, ZCZCFeedParser())
except AlreadyExistsError:
    pass
register_feeding_service_error(
    'file',
    AAPParserError.ZCZCParserError().get_error_description())
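Since post_process_item is a deliberate no-op, provider-specific behaviour comes from subclassing. A hypothetical subclass, as a sketch only (the class name and sign-off value are invented):

# Hypothetical subclass; not part of the code above.
class MyZCZCFeedParser(ZCZCFeedParser):
    NAME = 'my_zczc'

    def post_process_item(self, item, provider):
        # stamp a provider-specific sign-off before the item is returned
        item['sign_off'] = 'MYDESK'
        return item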
Example #51
0
        item['body_html'] = html

    def attachments_hook(self, item, attachments):
        """Attachment are parsed at the end

        if it's the first image found, it's used as feature media
        else it's used as embed and put at the end of body_html
        """
        for url in attachments:
            try:
                key, media_data = self._add_image(item, url)
            except Exception as e:
                logger.error(e)
                continue
            if key == 'featuremedia':
                # no need to embed the image for featuremedia
                continue
            embed_start = "<!--" + embed_TPL.format('START', key) + "-->"
            embed_end = "<!--" + embed_TPL.format('END', key) + "-->"
            _id = media_data['_id']
            new_url = url_for_media(_id)
            img = '<img src={src} height="{height}" width="{width}">'.format(
                src=quoteattr(new_url),
                height=media_data['renditions']['original']['height'],
                width=media_data['renditions']['original']['width'])
            item['body_html'] += '<div>' + embed_start + img + embed_end + '</div>'


register_feed_parser(WPWXRFeedParser.NAME, WPWXRFeedParser())
Example #52
0
# at https://www.sourcefabric.org/superdesk/license

from superdesk.io.feed_parsers.newsml_1_2 import NewsMLOneFeedParser
from superdesk.io.registry import register_feed_parser
from superdesk.utc import utcnow
from pytz import utc


class AFPNewsMLOneFeedParser(NewsMLOneFeedParser):
    """AFP specific NewsML parser.

    Feed Parser which can parse the AFP feed basically it is in NewsML 1.2 format,
    but the firstcreated and versioncreated times are localised.
    """

    NAME = "afpnewsml12"

    label = "AFP News ML 1.2 Parser"

    def parse(self, xml, provider=None):
        item = super().parse(xml, provider)
        item["firstcreated"] = utc.localize(
            item["firstcreated"]) if item.get("firstcreated") else utcnow()
        item["versioncreated"] = utc.localize(
            item["versioncreated"]) if item.get(
                "versioncreated") else utcnow()
        return item


register_feed_parser(AFPNewsMLOneFeedParser.NAME, AFPNewsMLOneFeedParser())
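Note that utc.localize only attaches the UTC tzinfo to a naive datetime; it does not shift the wall-clock value. A small illustration (the sample value is invented):

from datetime import datetime
from pytz import utc

naive = datetime(2021, 6, 15, 9, 30)   # naive, as parsed from NewsML 1.2
aware = utc.localize(naive)
print(aware.isoformat())               # 2021-06-15T09:30:00+00:00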
Example #53
0
        item['priority'] = item['urgency']
        item['byline'] = ', '.join(article.get('authors', []))

        for category_qcode in [c for c in article.get('categories', [])
                               if c in self._vocabularies['anp_genres']]:
            item.setdefault('subject', []).append({
                'name': self._vocabularies['anp_genres'][category_qcode]['name'],
                'qcode': category_qcode,
                'scheme': 'anp_genres',
            })

        for keyword in article.get('keywords', []):
            item.setdefault('keywords', []).append(keyword)

        # fetch media if item contains a media_link
        if article.get('media_link'):
            self._add_featuremedia(provider, item, article['media_link'])

        return item

    def _parse_date(self, string):
        return datetime.datetime.strptime(string, '%Y-%m-%dT%H:%M:%SZ')


register_feed_parser(ANPNewsApiFeedParser.NAME, ANPNewsApiFeedParser())
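_parse_date only accepts Zulu-suffixed timestamps; the 'Z' is matched literally by the format string, so the result is a naive datetime and anything else raises ValueError. For example (input value invented):

import datetime

dt = datetime.datetime.strptime('2021-06-15T09:30:00Z', '%Y-%m-%dT%H:%M:%SZ')
print(dt)   # 2021-06-15 09:30:00 (naive; no tzinfo attached)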
Example #54
0
        return item

    def parse_date_time(self, date, time):
        if not date or not time:
            return

        datetime_string = '{}T{}'.format(date, time)
        try:
            return datetime.strptime(datetime_string, self.DATETIME_FORMAT)
        except ValueError:
            try:
                # fall back to arrow's more lenient parser and return the result
                return arrow.get(datetime_string).datetime
            except ValueError:
                return

    def parse_meta(self, item, metadata):
        datetime_created = self.parse_date_time(metadata.get(TAG.DATE_CREATED), metadata.get(TAG.TIME_CREATED))
        if datetime_created:
            item['firstcreated'] = datetime_created

        # now we map IPTC metadata to superdesk metadata
        for source_key, dest_key in self.IPTC_MAPPING.items():
            try:
                item[dest_key] = metadata[source_key]
            except KeyError:
                continue
        return item


register_feed_parser(ImageIPTCFeedParser.NAME, ImageIPTCFeedParser())
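The mapping loop simply renames keys, skipping absent ones via the KeyError handler. IPTC_MAPPING itself is not shown in this excerpt; the sketch below assumes a plausible shape:

# Assumed mapping (IPTC_MAPPING is defined outside the excerpt).
IPTC_MAPPING = {'Caption/Abstract': 'description_text', 'By-line': 'byline'}
metadata = {'Caption/Abstract': 'A sample caption'}   # no By-line present
item = {}
for source_key, dest_key in IPTC_MAPPING.items():
    try:
        item[dest_key] = metadata[source_key]
    except KeyError:
        continue
print(item)   # {'description_text': 'A sample caption'}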
    Feed Parser which can parse the STT variant of NewsML
    """

    NAME = 'ntb_sttnewsml'
    label = "NTB STT NewsML"

    def can_parse(self, xml):
        return xml.tag.endswith('newsItem')

    def parse(self, xml, provider=None):
        try:
            item = super().parse(xml, provider)[0]
            # SDNTB-462 requires that slugline is removed
            del item['slugline']
            sport = bool(
                self.root.xpath(
                    '//iptc:subject[@type="cpnat:abstract" and @qcode="sttsubj:15000000"]',
                    namespaces={'iptc': IPTC_NS}))
            cat = utils.SPORT_CATEGORY if sport else utils.DEFAULT_CATEGORY
            category = {'qcode': cat, 'name': cat, 'scheme': 'category'}
            item['subject'] = utils.filter_missing_subjects(
                item.get('subject'))
            item['subject'].append(category)
            utils.set_default_service(item)
            return [item]
        except Exception as ex:
            raise ParserError.newsmlTwoParserError(ex, provider)


register_feed_parser(NTBSTTNewsMLFeedParser.NAME, NTBSTTNewsMLFeedParser())
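The XPath test above is what flips the item into the sport category. A minimal demonstration with a made-up document (the IPTC_NS value here is assumed to be the standard NewsML-G2 namespace and may differ from the module's constant):

from lxml import etree

IPTC_NS = 'http://iptc.org/std/nar/2006-10-01/'   # assumed namespace value
xml = etree.fromstring(
    '<newsItem xmlns="{ns}"><subject type="cpnat:abstract" '
    'qcode="sttsubj:15000000"/></newsItem>'.format(ns=IPTC_NS))
sport = bool(xml.xpath(
    '//iptc:subject[@type="cpnat:abstract" and @qcode="sttsubj:15000000"]',
    namespaces={'iptc': IPTC_NS}))
print(sport)   # True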
Example #56
0
                    m = re.match('[A-Z]{2}-[A-Z]{2}--([a-z-0-9.]+)', header_lines[0], flags=re.I)
                    if m:
                        item['slugline'] = m.group(1)

                # ednote
                self._parse_ednote(header_lines, item)

            return item
        except Exception as ex:
            raise ParserError.anpaParseFileError(file_path, ex)

    def _parse_ednote(self, header_lines, item):
        for line in header_lines:
            m = re.search("EDITOR'S NOTE _(.*)", line)
            if m:
                item['ednote'] = m.group(1).strip()

    def map_priority(self, source_priority):
        mapping = {
            'f': Priority.Flash.value,
            'u': Priority.Urgent.value,
            'b': Priority.Three_Paragraph.value,
            'z': Priority.Ordinary.value
        }

        source_priority = source_priority.lower().strip() if isinstance(source_priority, str) else ''
        return mapping.get(source_priority, Priority.Ordinary.value)


register_feed_parser(ANPAFeedParser.NAME, ANPAFeedParser())
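map_priority is forgiving by design: casing and padding are stripped, non-string input is tolerated, and unknown codes fall back to Ordinary. Illustrative calls:

parser = ANPAFeedParser()
parser.map_priority('F ')    # Priority.Flash.value
parser.map_priority('u')     # Priority.Urgent.value
parser.map_priority(None)    # Priority.Ordinary.value (non-string input)
parser.map_priority('x')     # Priority.Ordinary.value (unknown code)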
Example #57
0
    def parse_content_set(self, tree, item):
        """Parse out the nitf like content.

        :param tree:
        :param item:
        :return: item populated with a headline and body_html
        """
        for content in tree.find(self.qname('contentSet')):
            if content.tag == self.qname('inlineXML') and content.attrib['contenttype'] == 'application/nitf+xml':
                nitf = content.find(self.qname('nitf'))
                head = nitf.find(self.qname('head'))
                item['headline'] = head.find(self.qname('title')).text
                body = nitf.find(self.qname('body'))
                content = self.parse_inline_content(body)
                item['body_html'] = content.get('content')

    def parse_inline_content(self, tree):
        body = tree.find(self.qname('body.content'))
        elements = []
        for elem in body:
            if elem.text:
                tag = elem.tag.rsplit('}')[1]
                elements.append('<%s>%s</%s>' % (tag, elem.text, tag))

        content = dict()
        content['content'] = "\n".join(elements)
        return content


register_feed_parser(ScoopNewsMLTwoFeedParser.NAME, ScoopNewsMLTwoFeedParser())
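parse_inline_content flattens each child of body.content into a bare '<tag>text</tag>' string, so attributes and nested markup are discarded. A toy illustration (namespace handling is omitted here, unlike the parser above):

from lxml import etree

body = etree.fromstring('<body><body.content><p>First par</p>'
                        '<p>Second par</p></body.content></body>')
elements = ['<%s>%s</%s>' % (e.tag, e.text, e.tag)
            for e in body.find('body.content') if e.text]
print('\n'.join(elements))
# <p>First par</p>
# <p>Second par</p>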
        item['calendars'] = [c for c in calendars.get('items', []) if c.get('qcode').lower() == 'abs statistics']

        return item

    def parse(self, filename, provider=None):
        default_item = self._set_default_item()
        items = []
        with open(filename, 'r', encoding='UTF-8') as f:
            csv_reader = csv.reader(f)
            for row in list(csv_reader)[1:]:
                if not row:
                    continue
                item = deepcopy(default_item)
                item[GUID_FIELD] = ('urn:www.abs.gov.au:' + row[0].split(' ')[0] +
                                    row[0].split(',')[-1]).replace('/', '-').replace(' ', '-')
                if row[5] == 'true':
                    start = datetime.strptime('{} 11:30'.format(row[1]), '%d/%m/%Y %H:%M')
                    end = datetime.strptime('{} 11:30'.format(row[1]), '%d/%m/%Y %H:%M')
                    item['dates'] = {
                        'start': local_to_utc(config.DEFAULT_TIMEZONE, start),
                        'end': local_to_utc(config.DEFAULT_TIMEZONE, end),
                        'tz': config.DEFAULT_TIMEZONE,
                    }
                    item['name'] = ' '.join(row[0].split(' ')[1:])
                    item['definition_short'] = row[0]
                    items.append(item)
        return items


register_feed_parser(ABSCalendarCSVParser.NAME, ABSCalendarCSVParser())
Example #59
0
                    if m:
                        item["slugline"] = m.group(1)

                # ednote
                self._parse_ednote(header_lines, item)

            return item
        except Exception as ex:
            raise ParserError.anpaParseFileError(file_path, ex)

    def _parse_ednote(self, header_lines, item):
        for line in header_lines:
            m = re.search("EDITOR'S NOTE _(.*)", line)
            if m:
                item["ednote"] = m.group(1).strip()

    def map_priority(self, source_priority):
        mapping = {
            "f": Priority.Flash.value,
            "u": Priority.Urgent.value,
            "b": Priority.Three_Paragraph.value,
            "z": Priority.Ordinary.value,
        }

        source_priority = source_priority.lower().strip() if isinstance(source_priority, str) else ""
        return mapping.get(source_priority, Priority.Ordinary.value)


register_feed_parser(ANPAFeedParser.NAME, ANPAFeedParser())
#
# For the full copyright and license information, please see the
# AUTHORS and LICENSE files distributed with this source code, or
# at https://www.sourcefabric.org/superdesk/license

from superdesk.io.feed_parsers.newsml_1_2 import NewsMLOneFeedParser
from superdesk.io.registry import register_feed_parser
from superdesk.utc import utcnow
from pytz import utc


class AFPNewsMLOneFeedParser(NewsMLOneFeedParser):
    """AFP specific NewsML parser.

    Feed Parser which can parse the AFP feed basically it is in NewsML 1.2 format,
    but the firstcreated and versioncreated times are localised.
    """

    NAME = 'afpnewsml12'

    label = 'AFP News ML 1.2 Parser'

    def parse(self, xml, provider=None):
        item = super().parse(xml, provider)
        item['firstcreated'] = utc.localize(item['firstcreated']) if item.get('firstcreated') else utcnow()
        item['versioncreated'] = utc.localize(item['versioncreated']) if item.get('versioncreated') else utcnow()
        return item


register_feed_parser(AFPNewsMLOneFeedParser.NAME, AFPNewsMLOneFeedParser())