Esempio n. 1
0
 def parse_message(self, tree, provider):
     """Parse every item found in the itemSets of a NewsMessage.

     :param tree: root element of the message; also stored on ``self.root``
     :param provider: ingest provider, handed to the error on failure
     :return: list with the ``self.parse_item`` result for each itemSet child
     :raises ParserError: wraps any exception raised while parsing
     """
     try:
         self.root = tree
         item_set_tag = self.qname("itemSet")
         return [
             self.parse_item(child)
             for group in tree.findall(item_set_tag)
             for child in group
         ]
     except Exception as ex:
         raise ParserError.newsmlTwoParserError(ex, provider)
Esempio n. 2
0
 def parse(self, xml, provider=None):
     """Parse all itemSet children and stamp each with the header priority.

     :param xml: root element of the message; also stored on ``self.root``
     :param provider: ingest provider, handed to the error on failure
     :return: list of parsed items, each with ``priority`` taken from the header
     :raises ParserError: wraps any exception raised while parsing
     """
     self.root = xml
     try:
         header = self.parse_header(xml)
         # Flatten "every child of every itemSet" into one lazy stream.
         item_trees = (
             node
             for group in xml.findall(self.qname('itemSet'))
             for node in group
         )
         parsed_items = []
         for node in item_trees:
             entry = self.parse_item(node)
             # Looked up per item, so a header without priority only fails
             # when there is at least one item to stamp.
             entry['priority'] = header['priority']
             parsed_items.append(entry)
         return parsed_items
     except Exception as ex:
         raise ParserError.newsmlTwoParserError(ex, provider)
    def parse(self, xml, provider=None):
        """Parse a NewsML-G2 message into a list of BusinessDesk items.

        Every itemSet child that carries a ``guid`` attribute is parsed with
        ``self.parse_item`` and then given fixed metadata (priority 6,
        urgency 3, ANPA category ``f``, subject ``04000000``), a Wellington
        dateline when exactly one matching city is found, and the ``NZ``
        entries of the ``locators`` vocabulary as place.  When the item has
        both a body and a dateline, the body is cleaned up: a leading
        ``By ...`` paragraph becomes the byline, a ``<date> (BusinessDesk) - ``
        prefix becomes the dateline date, and a final ``(BusinessDesk)``
        sign-off paragraph is removed.

        :param xml: root element of the message; also stored on ``self.root``
        :param provider: ingest provider, handed to the error on failure
        :return: list of parsed item dicts
        :raises ParserError: wraps any exception raised while parsing
        """
        self.root = xml
        items = []
        try:
            for item_set in xml.findall(self.qname('itemSet')):
                for item_tree in item_set:
                    # Ignore the packageItem, it has no guid
                    if 'guid' not in item_tree.attrib:
                        continue

                    item = self.parse_item(item_tree)
                    item['priority'] = 6
                    item['anpa_category'] = [{'qcode': 'f'}]
                    item['subject'] = [{'qcode': '04000000', 'name': subject_codes['04000000']}]
                    item.setdefault('word_count', get_word_count(item['body_html']))
                    # Hard code the urgency
                    item['urgency'] = 3
                    # Dateline is always Wellington in NZ
                    located = [c for c in app.locators.find_cities(country_code='NZ', state_code='NZ.G2') if
                               c.get('city', '').lower() == 'wellington']
                    if len(located) == 1:
                        item['dateline'] = {'located': located[0]}

                    # .get() on dateline: the key is absent when the city
                    # lookup did not yield exactly one match, and a plain
                    # subscript would abort the whole message with KeyError.
                    if item.get('body_html') and item.get('dateline'):
                        parsed = parse_html(item.get('body_html'), content='xml')
                        pars = parsed.xpath('//p')
                        last_index = len(pars) - 1
                        for index, par in enumerate(pars):
                            if not par.text:
                                continue
                            # check the first par for a byline
                            if index == 0 and par.text.startswith('By '):
                                # drop only the leading marker, not every
                                # later 'By ' inside the name
                                item['byline'] = par.text[len('By '):]
                                par.getparent().remove(par)
                            date, source, the_rest = par.text.partition(' (BusinessDesk) - ')
                            if source:
                                item['dateline']['date'] = date_parser(date, fuzzy=True)
                                par.text = the_rest
                            # remove the signoff if in the last par
                            if par.text == '(BusinessDesk)' and index == last_index:
                                par.getparent().remove(par)
                        item['body_html'] = to_string(parsed, remove_root_div=True)

                    locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='locators')
                    if locator_map:
                        item['place'] = [x for x in locator_map.get('items', []) if x['qcode'].upper() == 'NZ']

                    items.append(item)
            return items
        except Exception as ex:
            raise ParserError.newsmlTwoParserError(ex, provider)
Esempio n. 4
0
 def test_raise_newsmlTwoParserError(self):
     """Check the error built by ``ParserError.newsmlTwoParserError``.

     Verifies the code, the message, the wrapped original exception and
     that exactly one error line is captured by the mock log handler.
     """
     with assert_raises(ParserError) as error_context:
         try:
             raise Exception("Testing newsmlTwoParserError")
         except Exception as ex:
             raise ParserError.newsmlTwoParserError(ex, self.provider)
     exception = error_context.exception
     # assertEqual reports both values on failure, unlike assertTrue(a == b)
     self.assertEqual(exception.code, 1005)
     self.assertEqual(exception.message, "NewsML2 input could not be processed")
     self.assertIsNotNone(exception.system_exception)
     self.assertEqual(exception.system_exception.args[0], "Testing newsmlTwoParserError")
     self.assertEqual(len(self.mock_logger_handler.messages['error']), 1)
     self.assertEqual(self.mock_logger_handler.messages['error'][0],
                      "ParserError Error 1005 - NewsML2 input could not be processed: "
                      "Testing newsmlTwoParserError on channel TestProvider")
Esempio n. 5
0
    def parse(self, xml, provider=None):
        """Parse a single item, filling in a fallback headline and the abstract.

        The headline defaults to the first 100 characters of the body text;
        the abstract is the text of the first ``description`` element with
        role ``drol:summary``, when that text is non-empty.

        :param xml: element to parse; also stored on ``self.root``
        :param provider: ingest provider, handed to the error on failure
        :return: one-element list holding the parsed item
        :raises ParserError: wraps any exception raised while parsing
        """
        self.root = xml
        try:
            item = self.parse_item(xml)
            if not item.get('headline'):
                body = item.get('body_html', '')
                item['headline'] = text_utils.get_text(body, 'html')[:100]

            summaries = xml.xpath("//iptc:description[@role='drol:summary']", namespaces={'iptc': IPTC_NS})
            summary = summaries[0].text if summaries else None
            if summary:
                item['abstract'] = summary
            return [item]
        except Exception as ex:
            raise ParserError.newsmlTwoParserError(ex, provider)
Esempio n. 6
0
 def parse(self, xml, provider=None):
     """Parse itemSet children that carry a guid and add fixed metadata.

     Each parsed item receives the header priority, ANPA category ``f``,
     subject ``04000000`` and, unless already present, a word count
     computed from its body.

     :param xml: root element of the message; also stored on ``self.root``
     :param provider: ingest provider, handed to the error on failure
     :return: list of parsed item dicts
     :raises ParserError: wraps any exception raised while parsing
     """
     self.root = xml
     parsed_items = []
     try:
         header = self.parse_header(xml)
         for group in xml.findall(self.qname('itemSet')):
             for node in group:
                 # The packageItem has no guid and is skipped
                 if 'guid' not in node.attrib:
                     continue
                 entry = self.parse_item(node)
                 entry.update({
                     'priority': header['priority'],
                     'anpa_category': [{'qcode': 'f'}],
                     'subject': [{'qcode': '04000000', 'name': subject_codes['04000000']}],
                 })
                 # The count is computed even when the key already exists
                 computed_count = get_word_count(entry['body_html'])
                 entry.setdefault('word_count', computed_count)
                 parsed_items.append(entry)
         return parsed_items
     except Exception as ex:
         raise ParserError.newsmlTwoParserError(ex, provider)
Esempio n. 7
0
 def parse(self, xml, provider=None):
     """Parse either a message with itemSets or a bare newsItem/packageItem.

     Every itemSet child is parsed and stamped with the header priority.
     In addition, when the root element itself is a ``newsItem`` or
     ``packageItem`` it is parsed too, keeping any priority it already has.

     :param xml: root element; also stored on ``self.root``
     :param provider: ingest provider, handed to the error on failure
     :return: list of parsed item dicts
     :raises ParserError: wraps any exception raised while parsing
     """
     self.root = xml
     items = []
     try:
         header = self.parse_header(xml)
         for item_set in xml.findall(self.qname('itemSet')):
             for item_tree in item_set:
                 item = self.parse_item(item_tree)
                 item['priority'] = header['priority']
                 items.append(item)

         # Checked unconditionally after the loop: the loop never breaks, so
         # a `for ... else` here would run every time anyway and only
         # obscure that fact.
         if xml.tag.endswith(('newsItem', 'packageItem')):
             item = self.parse_item(xml)
             item.setdefault('priority', header['priority'])
             items.append(item)
         return items
     except Exception as ex:
         raise ParserError.newsmlTwoParserError(ex, provider)
Esempio n. 8
0
    def parse(self, xml, provider=None):
        """Parse a single NewsML-G2 item and add STT-specific metadata.

        On top of what ``self.parse_item`` returns, this fills in a fallback
        headline, ``firstpublished``, the abstract, genre and version data,
        places, and the public/private editorial notes.

        :param xml: element to parse; also stored on ``self.root``
        :param provider: ingest provider, handed to the error on failure
        :return: one-element list holding the parsed item
        :raises ParserError: wraps any exception raised while parsing
        """
        self.root = xml
        try:
            item = self.parse_item(xml)
            # fall back to the first 100 characters of the body text
            if not item.get('headline'):
                item['headline'] = text_utils.get_text(item.get('body_html', ''), 'html')[:100]

            # populate published for newsroom archive
            item.setdefault('firstpublished', item.get('versioncreated'))

            # abstract: text of the first description with role drol:summary
            try:
                abstract = xml.xpath("//iptc:description[@role='drol:summary']", namespaces={'iptc': IPTC_NS})[0].text
            except IndexError:
                # no such element, leave the abstract unset
                pass
            else:
                if abstract:
                    item['abstract'] = abstract

            # genre: "sttgenre:" qcodes go to item['genre'], "sttversion:"
            # qcodes go to item['subject'] with scheme "sttversion"
            for genre_elt in xml.xpath("//iptc:genre", namespaces={'iptc': IPTC_NS}):
                qcode = genre_elt.get('qcode')
                if qcode is None:
                    continue
                elif qcode.startswith('sttgenre:'):
                    # 9 == len('sttgenre:')
                    qcode = qcode[9:]
                    genre_data = {'qcode': qcode}
                    name_elt = genre_elt.find(self.qname('name'))
                    name = name_elt.text if name_elt is not None and name_elt.text else ""
                    try:
                        name = self.getVocabulary("genre", qcode, name)
                    except ValueError:
                        # getVocabulary rejected this entry, skip it
                        continue
                    else:
                        genre_data['name'] = name
                        item.setdefault('genre', []).append(genre_data)
                elif qcode.startswith('sttversion:'):
                    # 11 == len('sttversion:')
                    qcode = qcode[11:]
                    version_data = {'qcode': qcode, 'scheme': 'sttversion'}
                    name_elt = genre_elt.find(self.qname('name'))
                    name = name_elt.text if name_elt is not None and name_elt.text else ""
                    try:
                        # NOTE(review): the version name is looked up under
                        # "sttgenre", not "sttversion" - confirm this is intended
                        name = self.getVocabulary("sttgenre", qcode, name)
                    except ValueError:
                        continue
                    else:
                        version_data['name'] = name
                        item.setdefault('subject', []).append(version_data)

            # location: one place entry per assert element whose qcode
            # starts with "sttlocmeta:"
            for location_elt in xml.xpath("//iptc:assert", namespaces={'iptc': IPTC_NS}):
                qcode = location_elt.get("qcode")
                if not qcode or not qcode.startswith("sttlocmeta:"):
                    continue
                # keep only the part after the last colon
                qcode = qcode.split(':')[-1]
                location_data = {"scheme": "sttlocmeta", "qcode": qcode}
                location_name = location_elt.find(self.qname('name'))
                if location_name is not None:
                    location_data['name'] = location_name.text
                # broader geoArea elements add further keys as described by
                # STT_LOCATION_MAP
                for broader_elt in location_elt.xpath(".//iptc:broader[@type='cpnat:geoArea']",
                                                      namespaces={'iptc': IPTC_NS}):
                    # from here on `qcode` is the broader element's qcode; the
                    # location qcode is already stored in location_data
                    qcode = broader_elt.get('qcode')
                    if not qcode:
                        continue
                    for key, mapping in STT_LOCATION_MAP.items():
                        if qcode.startswith(key + ":"):
                            if "qcode" in mapping:
                                # strip the "<key>:" prefix
                                qcode = qcode[len(key) + 1:]
                            try:
                                name = broader_elt.find(self.qname('name')).text
                            except AttributeError:
                                # find() returned None: no name child
                                name = ""
                            try:
                                name = self.getVocabulary(key, qcode, name)
                            except ValueError:
                                continue
                            else:
                                # NOTE(review): mapping["qcode"] is read
                                # unconditionally here although its presence
                                # is tested above - confirm every mapping has it
                                location_data[mapping["qcode"]] = qcode
                                if "name" in mapping:
                                    location_data[mapping["name"]] = name
                item.setdefault('place', []).append(location_data)

            # public editorial note
            if 'ednote' in item:
                # stt has specific roles for public and private editorial notes
                # so we remove ednote found by parent parser, as it takes first one
                # as a public note
                del item['ednote']
            try:
                ednote = xml.xpath("//iptc:edNote[@role='sttnote:public']", namespaces={'iptc': IPTC_NS})[0].text
            except IndexError:
                pass
            else:
                if ednote:
                    item['ednote'] = ednote

            # private editorial note, stored under item['extra']
            try:
                private_note = xml.xpath("//iptc:edNote[@role='sttnote:private']", namespaces={'iptc': IPTC_NS})[0].text
            except IndexError:
                pass
            else:
                if private_note:
                    item.setdefault('extra', {})['sttnote_private'] = private_note

            return [item]
        except Exception as ex:
            raise ParserError.newsmlTwoParserError(ex, provider)
    def parse(self, xml, provider=None):
        """Parse a single NewsML-G2 item and add STT-specific metadata.

        On top of what ``self.parse_item`` returns, this fills in a fallback
        headline, the abstract, genre and version data, places, and the
        public/private editorial notes.

        :param xml: element to parse; also stored on ``self.root``
        :param provider: ingest provider, handed to the error on failure
        :return: one-element list holding the parsed item
        :raises ParserError: wraps any exception raised while parsing
        """
        self.root = xml
        try:
            item = self.parse_item(xml)
            # fall back to the first 100 characters of the body text
            if not item.get('headline'):
                item['headline'] = text_utils.get_text(item.get('body_html', ''), 'html')[:100]

            # abstract: text of the first description with role drol:summary
            try:
                abstract = xml.xpath("//iptc:description[@role='drol:summary']", namespaces={'iptc': IPTC_NS})[0].text
            except IndexError:
                # no such element, leave the abstract unset
                pass
            else:
                if abstract:
                    item['abstract'] = abstract

            # genre: "sttgenre:" qcodes go to item['genre'], "sttversion:"
            # qcodes go to item['subject'] with scheme "sttversion"
            for genre_elt in xml.xpath("//iptc:genre", namespaces={'iptc': IPTC_NS}):
                qcode = genre_elt.get('qcode')
                if qcode is None:
                    continue
                elif qcode.startswith('sttgenre:'):
                    # 9 == len('sttgenre:')
                    qcode = qcode[9:]
                    genre_data = {'qcode': qcode}
                    name_elt = genre_elt.find(self.qname('name'))
                    name = name_elt.text if name_elt is not None and name_elt.text else ""
                    try:
                        name = self.getVocabulary("genre", qcode, name)
                    except ValueError:
                        # getVocabulary rejected this entry, skip it
                        continue
                    else:
                        genre_data['name'] = name
                        item.setdefault('genre', []).append(genre_data)
                elif qcode.startswith('sttversion:'):
                    # 11 == len('sttversion:')
                    qcode = qcode[11:]
                    version_data = {'qcode': qcode, 'scheme': 'sttversion'}
                    name_elt = genre_elt.find(self.qname('name'))
                    name = name_elt.text if name_elt is not None and name_elt.text else ""
                    try:
                        # NOTE(review): the version name is looked up under
                        # "sttgenre", not "sttversion" - confirm this is intended
                        name = self.getVocabulary("sttgenre", qcode, name)
                    except ValueError:
                        continue
                    else:
                        version_data['name'] = name
                        item.setdefault('subject', []).append(version_data)

            # location: one place entry per assert element whose qcode
            # starts with "sttlocmeta:default:"
            for location_elt in xml.xpath("//iptc:assert", namespaces={'iptc': IPTC_NS}):
                qcode = location_elt.get("qcode")
                if not qcode or not qcode.startswith("sttlocmeta:default:"):
                    continue
                # 19 == len('sttlocmeta:default:')
                qcode = qcode[19:]
                location_data = {"scheme": "sttlocmeta:default", "qcode": qcode}
                # broader geoArea elements add further keys as described by
                # STT_LOCATION_MAP
                for broader_elt in location_elt.xpath(".//iptc:broader[@type='cpnat:geoArea']",
                                                      namespaces={'iptc': IPTC_NS}):
                    # from here on `qcode` is the broader element's qcode; the
                    # location qcode is already stored in location_data
                    qcode = broader_elt.get('qcode')
                    if not qcode:
                        continue
                    for key, mapping in STT_LOCATION_MAP.items():
                        if qcode.startswith(key + ":"):
                            if "qcode" in mapping:
                                # strip the "<key>:" prefix
                                qcode = qcode[len(key) + 1:]
                            try:
                                name = broader_elt.find(self.qname('name')).text
                            except AttributeError:
                                # find() returned None: no name child
                                name = ""
                            try:
                                name = self.getVocabulary(key, qcode, name)
                            except ValueError:
                                continue
                            else:
                                # NOTE(review): mapping["qcode"] is read
                                # unconditionally here although its presence
                                # is tested above - confirm every mapping has it
                                location_data[mapping["qcode"]] = qcode
                                if "name" in mapping:
                                    location_data[mapping["name"]] = name
                item.setdefault('place', []).append(location_data)

            # public editorial note
            if 'ednote' in item:
                # stt has specific roles for public and private editorial notes
                # so we remove ednote found by parent parser, as it takes first one
                # as a public note
                del item['ednote']
            try:
                ednote = xml.xpath("//iptc:edNote[@role='sttnote:public']", namespaces={'iptc': IPTC_NS})[0].text
            except IndexError:
                pass
            else:
                if ednote:
                    item['ednote'] = ednote

            # private editorial note, stored under item['extra']
            try:
                private_note = xml.xpath("//iptc:edNote[@role='sttnote:private']", namespaces={'iptc': IPTC_NS})[0].text
            except IndexError:
                pass
            else:
                if private_note:
                    item.setdefault('extra', {})['sttnote_private'] = private_note

            return [item]
        except Exception as ex:
            raise ParserError.newsmlTwoParserError(ex, provider)
    def parse(self, xml, provider=None):
        """Parse a NewsML-G2 message into a list of BusinessDesk items.

        Every itemSet child that carries a ``guid`` attribute is parsed with
        ``self.parse_item`` and then given fixed metadata (priority 6,
        urgency 3, ANPA category ``f``, subject ``04000000``), a Wellington
        dateline when exactly one matching city is found, and the ``NZ``
        entries of the ``locators`` vocabulary as place.  When the item has
        both a body and a dateline, the body is cleaned up: a leading
        ``By ...`` paragraph becomes the byline, a ``<date> (BusinessDesk) - ``
        prefix becomes the dateline date, and a final ``(BusinessDesk)``
        sign-off paragraph is removed.

        :param xml: root element of the message; also stored on ``self.root``
        :param provider: ingest provider, handed to the error on failure
        :return: list of parsed item dicts
        :raises ParserError: wraps any exception raised while parsing
        """
        self.root = xml
        items = []
        try:
            for item_set in xml.findall(self.qname('itemSet')):
                for item_tree in item_set:
                    # Ignore the packageItem, it has no guid
                    if 'guid' not in item_tree.attrib:
                        continue

                    item = self.parse_item(item_tree)
                    item['priority'] = 6
                    item['anpa_category'] = [{'qcode': 'f'}]
                    item['subject'] = [{
                        'qcode': '04000000',
                        'name': subject_codes['04000000']
                    }]
                    item.setdefault('word_count',
                                    get_word_count(item['body_html']))
                    # Hard code the urgency
                    item['urgency'] = 3
                    # Dateline is always Wellington in NZ
                    located = [
                        c for c in app.locators.find_cities(
                            country_code='NZ', state_code='NZ.G2')
                        if c.get('city', '').lower() == 'wellington'
                    ]
                    if len(located) == 1:
                        item['dateline'] = {'located': located[0]}

                    # .get() on dateline: the key is absent when the city
                    # lookup did not yield exactly one match, and a plain
                    # subscript would abort the whole message with KeyError.
                    if item.get('body_html') and item.get('dateline'):
                        parsed = parse_html(item.get('body_html'),
                                            content='xml')
                        pars = parsed.xpath('//p')
                        last_index = len(pars) - 1
                        for index, par in enumerate(pars):
                            if not par.text:
                                continue
                            # check the first par for a byline
                            if index == 0 and par.text.startswith('By '):
                                # drop only the leading marker, not every
                                # later 'By ' inside the name
                                item['byline'] = par.text[len('By '):]
                                par.getparent().remove(par)
                            date, source, the_rest = par.text.partition(
                                ' (BusinessDesk) - ')
                            if source:
                                item['dateline']['date'] = date_parser(
                                    date, fuzzy=True)
                                par.text = the_rest
                            # remove the signoff if in the last par
                            if (par.text == '(BusinessDesk)'
                                    and index == last_index):
                                par.getparent().remove(par)
                        item['body_html'] = to_string(parsed,
                                                      remove_root_div=True)

                    locator_map = superdesk.get_resource_service(
                        'vocabularies').find_one(req=None, _id='locators')
                    if locator_map:
                        item['place'] = [
                            x for x in locator_map.get('items', [])
                            if x['qcode'].upper() == 'NZ'
                        ]

                    items.append(item)
            return items
        except Exception as ex:
            raise ParserError.newsmlTwoParserError(ex, provider)
Esempio n. 11
0
    def parse(self, xml, provider=None):
        """Parse a NewsML-G2 message into a list of DPA items.

        For every itemSet child the parsed item gets: ``firstcreated`` from
        the body's ``publicationDate`` time element (falling back to
        ``versioncreated``), both timestamps converted to UTC, urgency 4
        lowered to 3, a ``services-products`` subject derived from the first
        ANPA category, fixed ``sources`` and ``distribution`` subjects, an
        empty slugline and keyword list, and a headline prefixed with the
        display name of each genre other than ``dpatextgenre:1``.

        :param xml: root element of the message; also stored on ``self.root``
        :param provider: ingest provider, handed to the error on failure
        :return: list of parsed item dicts
        :raises ParserError: wraps any exception raised while parsing
        """
        self.root = xml
        items = []
        try:
            for item_set in xml.findall(self.qname('itemSet')):
                for item_tree in item_set:
                    item = self.parse_item(item_tree)
                    try:
                        published = item_tree.xpath('.//xhtml:body/xhtml:header/'
                                                    'xhtml:time[@class="publicationDate"]/@data-datetime',
                                                    namespaces=NS)[0]
                    except IndexError:
                        # no publication date in the body
                        item['firstcreated'] = item['versioncreated']
                    else:
                        item['firstcreated'] = dateutil.parser.parse(published)
                    item['firstcreated'] = item['firstcreated'].astimezone(pytz.utc)
                    item['versioncreated'] = item['versioncreated'].astimezone(pytz.utc)

                    if item['urgency'] == 4:
                        item['urgency'] = 3

                    # mapping services-products: only the first category is
                    # used (unconditional break); the else branch covers an
                    # item without any category
                    for cat in item.get('anpa_category', []):
                        qcode = self.MAPPING_CATEGORY.get(
                            cat.get('qcode', '').upper(),
                            'NEWS/GENERAL'
                        )
                        item.setdefault('subject', []).append({
                            'name': qcode,
                            'qcode': qcode,
                            'parent': 'NEWS',
                            'scheme': 'services-products'
                        })
                        break
                    else:
                        item.setdefault('subject', []).append({
                            'name': 'NEWS/GENERAL',
                            'qcode': 'NEWS/GENERAL',
                            'parent': 'NEWS',
                            'scheme': 'services-products'
                        })

                    # Source is DPA
                    credit = {"name": 'DPA', "qcode": 'DPA', "scheme": "sources"}
                    item.setdefault('subject', []).append(credit)
                    # Distribution is default
                    dist = {"name": 'default', "qcode": 'default', "scheme": "distribution"}
                    item.setdefault('subject', []).append(dist)
                    # Slugline and keywords are empty
                    item['slugline'] = None
                    item['keywords'] = []
                    # Find genres and verify their roles and qcodes to acceptance criteria.
                    # The path is relative ('.//'): an absolute '//' is
                    # evaluated from the document root and would return the
                    # genres of every item in the message.
                    genres = item_tree.xpath('.//iptc:genre', namespaces=NS)
                    for genre in genres:
                        genre_qcode = genre.get('qcode')
                        if genre_qcode and genre_qcode != 'dpatextgenre:1':
                            genre_names = genre.findall(self.qname('name'))
                            if genre_names:
                                for genre_name in genre_names:
                                    try:
                                        genre_role = genre_name.attrib['role']
                                        if genre_role == 'nrol:display':
                                            item['headline'] = "({genre}): {headline}".format(
                                                genre=genre_name.text, headline=item['headline']
                                            )
                                            break
                                    except KeyError:
                                        # name without a role (or item without
                                        # a headline): try the next name
                                        continue

                    # remove duplicated subject; the result stays sorted by
                    # qcode.  Membership is checked against everything kept
                    # so far, so equal subjects are dropped even when another
                    # subject with the same qcode sits between them.
                    unique_subjects = []
                    for subject in sorted(item['subject'], key=lambda k: k['qcode']):
                        if subject not in unique_subjects:
                            unique_subjects.append(dict(subject))
                    item['subject'] = unique_subjects
                    items.append(item)
            return items
        except Exception as ex:
            raise ParserError.newsmlTwoParserError(ex, provider)
Esempio n. 12
0
    def parse(self, xml, provider=None):
        """Parse a NewsML-G2 message into a list of BusinessDesk items.

        Every itemSet child that carries a ``guid`` attribute is parsed with
        ``self.parse_item`` and then given fixed metadata (priority 6,
        urgency 3, ANPA category ``f``, subject ``04000000``), a Wellington
        dateline when exactly one matching city is found, and the ``NZ``
        entries of the ``locators`` vocabulary as place.  When the item has
        both a body and a dateline, the body is cleaned up: a leading
        ``By ...`` paragraph becomes the byline, a ``<date> (BusinessDesk) - ``
        prefix becomes the dateline date, and a final ``(BusinessDesk)``
        sign-off paragraph is removed.

        :param xml: root element of the message; also stored on ``self.root``
        :param provider: ingest provider, handed to the error on failure
        :return: list of parsed item dicts
        :raises ParserError: wraps any exception raised while parsing
        """
        self.root = xml
        items = []
        try:
            for item_set in xml.findall(self.qname("itemSet")):
                for item_tree in item_set:
                    # Ignore the packageItem, it has no guid
                    if "guid" not in item_tree.attrib:
                        continue

                    item = self.parse_item(item_tree)
                    item["priority"] = 6
                    item["anpa_category"] = [{"qcode": "f"}]
                    item["subject"] = [{
                        "qcode": "04000000",
                        "name": subject_codes["04000000"]
                    }]
                    item.setdefault("word_count",
                                    get_word_count(item["body_html"]))
                    # Hard code the urgency
                    item["urgency"] = 3
                    # Dateline is always Wellington in NZ
                    located = [
                        c for c in app.locators.find_cities(
                            country_code="NZ", state_code="NZ.G2")
                        if c.get("city", "").lower() == "wellington"
                    ]
                    if len(located) == 1:
                        item["dateline"] = {"located": located[0]}

                    # .get() on dateline: the key is absent when the city
                    # lookup did not yield exactly one match, and a plain
                    # subscript would abort the whole message with KeyError.
                    if item.get("body_html") and item.get("dateline"):
                        parsed = parse_html(item.get("body_html"),
                                            content="xml")
                        pars = parsed.xpath("//p")
                        last_index = len(pars) - 1
                        for index, par in enumerate(pars):
                            if not par.text:
                                continue
                            # check the first par for a byline
                            if index == 0 and par.text.startswith("By "):
                                # drop only the leading marker, not every
                                # later "By " inside the name
                                item["byline"] = par.text[len("By "):]
                                par.getparent().remove(par)
                            date, source, the_rest = par.text.partition(
                                " (BusinessDesk) - ")
                            if source:
                                item["dateline"]["date"] = date_parser(
                                    date, fuzzy=True)
                                par.text = the_rest
                            # remove the signoff if in the last par
                            if (par.text == "(BusinessDesk)"
                                    and index == last_index):
                                par.getparent().remove(par)
                        item["body_html"] = to_string(parsed,
                                                      remove_root_div=True)

                    locator_map = superdesk.get_resource_service(
                        "vocabularies").find_one(req=None, _id="locators")
                    if locator_map:
                        item["place"] = [
                            x for x in locator_map.get("items", [])
                            if x["qcode"].upper() == "NZ"
                        ]

                    items.append(item)
            return items
        except Exception as ex:
            raise ParserError.newsmlTwoParserError(ex, provider)
    def parse(self, xml, provider=None):
        """Parse a NewsML-G2 message into a list of DPA items.

        For every itemSet child the parsed item gets: ``firstcreated`` from
        the body's ``publicationDate`` time element (falling back to
        ``versioncreated``), both timestamps converted to UTC, urgency 4
        lowered to 3, a ``services-products`` subject derived from the first
        ANPA category, fixed ``sources`` and ``distribution`` subjects, and
        an empty slugline and keyword list.

        :param xml: root element of the message; also stored on ``self.root``
        :param provider: ingest provider, handed to the error on failure
        :return: list of parsed item dicts
        :raises ParserError: wraps any exception raised while parsing
        """
        self.root = xml
        results = []
        try:
            for group in xml.findall(self.qname('itemSet')):
                for node in group:
                    entry = self.parse_item(node)

                    stamps = node.xpath(
                        './/xhtml:body/xhtml:header/'
                        'xhtml:time[@class="publicationDate"]/@data-datetime',
                        namespaces=NS)
                    if stamps:
                        created = dateutil.parser.parse(stamps[0])
                    else:
                        # no publication date in the body
                        created = entry['versioncreated']
                    entry['firstcreated'] = created
                    entry['firstcreated'] = created.astimezone(pytz.utc)
                    entry['versioncreated'] = entry['versioncreated'].astimezone(
                        pytz.utc)

                    if entry['urgency'] == 4:
                        entry['urgency'] = 3

                    # mapping services-products: the first category decides,
                    # the default applies when there is none
                    service = 'NEWS/GENERAL'
                    for category in item_categories(entry) if False else entry.get('anpa_category', []):
                        service = self.MAPPING_CATEGORY.get(
                            category.get('qcode', '').upper(), 'NEWS/GENERAL')
                        break

                    subjects = entry.setdefault('subject', [])
                    subjects.append({
                        'name': service,
                        'qcode': service,
                        'parent': 'NEWS',
                        'scheme': 'services-products',
                    })
                    # Source is DPA
                    subjects.append({
                        "name": 'DPA',
                        "qcode": 'DPA',
                        "scheme": "sources",
                    })
                    # Distribution is default
                    subjects.append({
                        "name": 'default',
                        "qcode": 'default',
                        "scheme": "distribution",
                    })
                    # Slugline and keywords are empty
                    entry['slugline'] = None
                    entry['keywords'] = []
                    results.append(entry)
            return results
        except Exception as ex:
            raise ParserError.newsmlTwoParserError(ex, provider)