def update_to_pass_validation(item, **kwargs):
    """
    Test macro that adjusts a text item so that it will pass publication validation.

    It is intended to be used to test auto publishing, that is publishing directly from ingest.
    At the moment virtually all content received from Reuters fails validation.

    The slugline and headline are cut to the ``maxlength`` declared in the schema of the first
    publish validator found for text content. If the item has no dateline, a Sydney (AU/NSW)
    dateline is built from the item's ``firstcreated`` and ``source``; when the locators do not
    return Sydney the item is returned without a dateline instead of failing.

    :param item: item to update in place
    :param kwargs: not used
    :return: the updated item, or None when an unexpected error was logged
    """
    try:
        lookup = {'act': ITEM_PUBLISH, 'type': CONTENT_TYPE.TEXT}
        validators = superdesk.get_resource_service('validators').get(req=None, lookup=lookup)
        if validators.count():
            schema = validators[0]['schema']
            max_slugline_len = schema['slugline']['maxlength']
            max_headline_len = schema['headline']['maxlength']
            # Slicing leaves values that are already short enough untouched.
            item['slugline'] = item['slugline'][:max_slugline_len]
            item['headline'] = item['headline'][:max_headline_len]
        if 'dateline' not in item:
            cities = app.locators.find_cities(country_code='AU', state_code='NSW')
            located = [c for c in cities if c['city'].lower() == 'sydney']
            # Everything below needs the located city; without this guard an empty lookup
            # raised on item['dateline'] / located[0] and the macro returned None.
            if located:
                item['dateline'] = {'date': item['firstcreated'], 'located': located[0]}
                item['dateline']['source'] = item['source']
                item['dateline']['text'] = format_dateline_to_locmmmddsrc(located[0],
                                                                          get_date(item['firstcreated']),
                                                                          source=item['source'])
        return item
    except Exception:
        # Exception (not a bare except) so SystemExit/KeyboardInterrupt still propagate.
        logging.exception('Test update to pass validation macro exception')
Example #2
0
    def test_format_dateline_to_format_when_city_state_and_country_are_present(self):
        """A 'city,state,country' dateline format yields 'SYDNEY, NSW, AU <date> <source> -'."""
        located, formatted_date, current_ts = self._get_located_and_current_utc_ts()
        located['dateline'] = "city,state,country"

        actual = format_dateline_to_locmmmddsrc(located, current_ts)
        expected = 'SYDNEY, NSW, AU %s %s -' % (formatted_date, ORGANIZATION_NAME_ABBREVIATION)

        self.assertEqual(actual, expected)
def noise11_derive_metadata(item, **kwargs):
    """
    By definition anything from NOISE11 will be entertainment, so set the category, subject and
    dateline appropriately.

    * ``anpa_category`` is set to the active 'e' entry of the ``categories`` vocabulary when the
      item has no category and such an entry exists.
    * ``subject`` is set to code ``01000000`` when the item has no subject.
    * The dateline source and text are set for Sydney (AU/NSW); a dateline is created first when
      the item has none. Nothing is changed if the city lookup does not return Sydney.

    :param item: item to update in place
    :param kwargs: not used
    :return: the updated item, or None when an unexpected error was logged
    """
    try:
        if "anpa_category" not in item:
            category_map = superdesk.get_resource_service("vocabularies").find_one(req=None, _id="categories")
            if category_map:
                map_entry = next(
                    (code for code in category_map["items"] if code["qcode"] == "e" and code["is_active"]), None
                )
                # No active 'e' entry: leave the category unset rather than failing on None["name"].
                if map_entry is not None:
                    item["anpa_category"] = [{"qcode": "e", "name": map_entry["name"]}]

        if "subject" not in item:
            qcode = "01000000"
            item["subject"] = [{"qcode": qcode, "name": subject_codes[qcode]}]

        cities = find_cities(country_code="AU", state_code="NSW")
        located = [c for c in cities if c["city"].lower() == "sydney"]

        # The dateline can only be filled in when Sydney was found; previously an empty lookup
        # raised on located[0] (or on the missing dateline) and the macro returned None.
        if located:
            if "dateline" not in item:
                item["dateline"] = {"date": item["firstcreated"], "located": located[0]}
            item["dateline"]["source"] = item["source"]
            item["dateline"]["text"] = format_dateline_to_locmmmddsrc(
                located[0], get_date(item["firstcreated"]), source=item["source"]
            )

        return item
    except Exception as ex:
        logger.exception(ex)
Example #4
0
    def ap_derive_dateline(self, item):
        """
        Look for a dateline in the article body and use it.

        Every ``<p>`` of ``body_html`` is checked for the ' (AP) _ ' marker. For the first paragraph
        that contains it, the text before the marker (up to the first comma) is taken as the city
        and looked up in the locators; an unknown city gets a minimal UTC location instead.

        :param item: item whose ``body_html`` is scanned; its dateline is updated in place
        :return: the item; None when the candidate city contains a digit or an error was logged
        """
        try:
            html = item.get('body_html')
            if html:
                soup = BeautifulSoup(html, "html.parser")
                pars = soup.findAll('p')
                for par in pars:
                    city, source, the_rest = par.get_text().partition(' (AP) _ ')
                    if source:
                        # sometimes the city is followed by a comma and either a date or a state
                        city = city.split(',')[0]
                        if any(char.isdigit() for char in city):
                            return
                        cities = app.locators.find_cities()
                        located = [c for c in cities if c['city'].lower() == city.lower()]
                        dateline_source = item.get('original_source', 'AP')
                        item.setdefault('dateline', {})
                        item['dateline']['located'] = located[0] if located else {'city_code': city,
                                                                                   'city': city,
                                                                                   'tz': 'UTC',
                                                                                   'dateline': 'city'}
                        item['dateline']['source'] = dateline_source
                        item['dateline']['text'] = format_dateline_to_locmmmddsrc(item['dateline']['located'],
                                                                                  get_date(item['firstcreated']),
                                                                                  source=dateline_source)
                        # only the first paragraph carrying the marker is used
                        break

            return item
        except Exception:
            # Exception (not a bare except) so SystemExit/KeyboardInterrupt still propagate.
            logging.exception('AP dateline extraction exception')
 def derive_dateline(self, item):
     """
     Try to derive a dateline from the item's place.

     A dateline is only set when the item has exactly one place and its name matches exactly one
     known city, since only then can we be sure we have the correct country. The ``place`` key is
     always removed from the item afterwards, whether or not a dateline was set.

     :param item: item to update in place
     :return: None
     """
     try:
         places = item.get('place', [])
         if len(places) != 1:
             return
         cities = app.locators.find_cities()
         city = places[0].get('name', '')
         if not city:
             return
         matches = [candidate for candidate in cities if candidate['city'].lower() == city.lower()]
         if len(matches) != 1:
             return
         source = item.get('original_source', 'EFE')
         dateline = item.setdefault('dateline', {})
         dateline['located'] = matches[0]
         dateline['source'] = source
         dateline['text'] = format_dateline_to_locmmmddsrc(dateline['located'],
                                                           get_date(item['firstcreated']),
                                                           source=source)
     except Exception as ex:
         logging.exception('EFE dateline extraction exception {}'.format(ex))
     finally:
         # runs on every exit path, including the early returns above
         item.pop('place', None)
def reuters_derive_dateline(item, **kwargs):
    """
    It seems that most locations injected into the item by the parser are Bangalore.
    This function looks for a dateline in the article body and uses that.

    The first paragraph of ``body_html`` (or the second one, when the first contains the item's
    byline) is split on ' (Reuters) - '; the text before it, up to the first comma, is taken as
    the city. An existing dateline is only replaced when it is located in Bangalore.

    :param item: item whose ``body_html`` is scanned; its dateline is updated in place
    :param kwargs: not used
    :return: the item; None when the candidate city contains a digit, when the item already has
        a located dateline that is not Bangalore, or when an error was logged
    """
    try:
        html = item.get("body_html")
        if html:
            soup = BeautifulSoup(html, "html.parser")
            pars = soup.findAll("p")
            if len(pars) >= 2:
                byline = item.get(BYLINE)
                # A non-string byline (e.g. None) cannot be searched for; treat it as absent
                # instead of raising TypeError.
                if isinstance(byline, str) and byline in pars[0].get_text():
                    first = pars[1].get_text()
                else:
                    first = pars[0].get_text()
                city, source, the_rest = first.partition(" (Reuters) - ")
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(",")[0]
                    if any(char.isdigit() for char in city):
                        return
                    cities = find_cities()
                    located = [c for c in cities if c["city"].lower() == city.lower()]
                    # if not dateline we create one
                    if "dateline" not in item:
                        item["dateline"] = {}
                    # there is already a dateline that is not Bangalore don't do anything just return
                    elif (
                        "located" in item["dateline"]
                        # a missing/None city is simply "not Bangalore"
                        and "BANGALORE" != (item["dateline"]["located"].get("city") or "").upper()
                    ):
                        return

                    dateline_source = item.get("original_source", "Reuters")
                    item["dateline"]["located"] = (
                        located[0]
                        if located
                        else {"city_code": city, "city": city, "tz": "UTC", "dateline": "city"}
                    )
                    item["dateline"]["source"] = dateline_source
                    item["dateline"]["text"] = format_dateline_to_locmmmddsrc(
                        item["dateline"]["located"],
                        get_date(item["firstcreated"]),
                        source=dateline_source,
                    )

        return item
    except Exception:
        # Exception (not a bare except) so SystemExit/KeyboardInterrupt still propagate.
        logging.exception("Reuters dateline macro exception")
def reuters_derive_dateline(item, **kwargs):
    """
    It seems that most locations injected into the item by the parser are Bangalore.
    This function looks for a dateline in the article body and uses that.

    The text of the first ``<p>`` of ``body_html`` (or of the second one, when the first contains
    the item's byline) is split on ' (Reuters) - '; the text before it, up to the first comma, is
    taken as the city. An existing dateline is only replaced when it is located in Bangalore.

    :param item: item whose ``body_html`` is scanned; its dateline is updated in place
    :param kwargs: not used
    :return: the item; None when the candidate city contains a digit, when the item already has
        a located dateline that is not Bangalore, or when an error was logged
    """
    try:
        html = item.get('body_html')
        if html:
            parsed = parse_html(html, content='xml')
            pars = parsed.xpath('//p')
            if len(pars) >= 2:
                byline = item.get(BYLINE)
                # A non-string byline (e.g. None) cannot be searched for; treat it as absent
                # instead of raising TypeError.
                if isinstance(byline, str) and byline in ''.join(pars[0].itertext()):
                    first = ''.join(pars[1].itertext())
                else:
                    first = ''.join(pars[0].itertext())
                city, source, the_rest = first.partition(' (Reuters) - ')
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(',')[0]
                    if any(char.isdigit() for char in city):
                        return
                    cities = app.locators.find_cities()
                    located = [c for c in cities if c['city'].lower() == city.lower()]
                    # if not dateline we create one
                    if 'dateline' not in item:
                        item['dateline'] = {}
                    # there is already a dateline that is not Bangalore don't do anything just return
                    # (a missing/None city is simply "not Bangalore")
                    elif 'located' in item['dateline'] and 'BANGALORE' != (
                            item['dateline']['located'].get('city') or '').upper():
                        return

                    dateline_source = item.get('original_source', 'Reuters')
                    item['dateline']['located'] = located[0] if located else {'city_code': city,
                                                                               'city': city,
                                                                               'tz': 'UTC',
                                                                               'dateline': 'city'}
                    item['dateline']['source'] = dateline_source
                    item['dateline']['text'] = format_dateline_to_locmmmddsrc(item['dateline']['located'],
                                                                              get_date(item['firstcreated']),
                                                                              source=dateline_source)

        return item
    except Exception:
        # Exception (not a bare except) so SystemExit/KeyboardInterrupt still propagate.
        logging.exception('Reuters dateline macro exception')
Example #8
0
    def ap_derive_dateline(self, item):
        """Look for a dateline in the article body and use it.

        The elements selected by ``/html/div/child::*`` in the parsed ``body_html`` are checked for
        the ' (AP) _ ' marker in their leading text. For the first element that contains it, the
        text before the marker (up to the first comma) is taken as the city. The locators entry is
        used only when exactly one city matches; otherwise a minimal UTC location is built.

        :param item: item whose ``body_html`` is scanned; its dateline is updated in place
        :return: the item; None when the candidate city contains a digit or an error was logged
        """
        try:
            html = item.get('body_html')
            if html:
                parsed = parse_html(html, content='html')
                for par in parsed.xpath('/html/div/child::*'):
                    if not par.text:
                        continue
                    city, source, the_rest = par.text.partition(' (AP) _ ')
                    if source:
                        # sometimes the city is followed by a comma and either a date or a state
                        city = city.split(',')[0]
                        if any(char.isdigit() for char in city):
                            return
                        cities = app.locators.find_cities()
                        located = [
                            c for c in cities
                            if c['city'].lower() == city.lower()
                        ]
                        dateline_source = item.get('original_source', 'AP')
                        item.setdefault('dateline', {})
                        # an ambiguous match (several cities) is treated like no match
                        item['dateline']['located'] = located[0] if len(located) == 1 else {
                            'city_code': city,
                            'city': city,
                            'tz': 'UTC',
                            'dateline': 'city'
                        }
                        item['dateline']['source'] = dateline_source
                        item['dateline']['text'] = format_dateline_to_locmmmddsrc(
                            item['dateline']['located'],
                            get_date(item['firstcreated']),
                            source=dateline_source)
                        # only the first element carrying the marker is used
                        break

            return item
        except Exception:
            # Exception (not a bare except) so SystemExit/KeyboardInterrupt still propagate.
            logging.exception('AP dateline extraction exception')
Example #9
0
def get_item_from_template(template):
    """Build an item dict from the data stored on a template.

    The returned item is the template's own ``data`` dict when it has one, so that dict is
    modified in place: state, task and template id are set and the creation timestamps are
    removed. An existing dateline gets the current time as its date and, if it is located,
    a freshly formatted text.

    :param dict template
    """
    item = template.get('data', {})
    item.update({
        ITEM_STATE: CONTENT_STATE.SUBMITTED,
        'task': {'desk': template.get('template_desk'), 'stage': template.get('template_stage')},
        'template': template.get('_id'),
    })
    for stamp in ('firstcreated', 'versioncreated'):
        item.pop(stamp, None)

    # handle dateline (when the item has none, the temporary dict below is not stored on it)
    dateline = item.get('dateline', {})
    now = utcnow()
    dateline['date'] = now
    located = dateline.get('located')
    if located:
        dateline['text'] = format_dateline_to_locmmmddsrc(located, now)

    return item
Example #10
0
def set_dateline(item, city, source, set_date=False, text=None):
    """Set the dateline for item.

    Does nothing when ``city`` is empty. Otherwise the city is looked up (case-insensitively) in
    the locators; the first match is used, or a minimal UTC location when there is none.

    :param item: item to update in place
    :param city: city name to locate
    :param source: value stored as the dateline source
    :param set_date: when true, also store the item's ``firstcreated`` converted to the
        located timezone as the dateline date
    :param text: dateline text to use; when empty the text is formatted from the location,
        ``firstcreated`` and source
    """
    if not city:
        return

    matches = [entry for entry in app.locators.find_cities() if entry['city'].lower() == city.lower()]
    dateline = item.setdefault('dateline', {})
    if len(matches) > 0:
        located = matches[0]
    else:
        located = {'city_code': city, 'city': city, 'tz': 'UTC', 'dateline': 'city'}
    dateline['located'] = located

    if set_date:
        created_ts = get_date(item['firstcreated']).timestamp()
        dateline['date'] = datetime.fromtimestamp(created_ts, tz=timezone(located['tz']))

    dateline['source'] = source
    if not text:
        text = format_dateline_to_locmmmddsrc(located, get_date(item['firstcreated']), source=source)
    dateline['text'] = text
def noise11_derive_metadata(item, **kwargs):
    """
    By definition anything from NOISE11 will be entertainment, so set the category, subject and
    dateline appropriately.

    * ``anpa_category`` is set to the active 'e' entry of the ``categories`` vocabulary when the
      item has no category and such an entry exists.
    * ``subject`` is set to code ``01000000`` when the item has no subject.
    * The dateline source and text are set for Sydney (AU/NSW); a dateline is created first when
      the item has none. Nothing is changed if the city lookup does not return Sydney.

    :param item: item to update in place
    :param kwargs: not used
    :return: the updated item, or None when an unexpected error was logged
    """
    try:
        if 'anpa_category' not in item:
            category_map = superdesk.get_resource_service(
                'vocabularies').find_one(req=None, _id='categories')
            if category_map:
                map_entry = next(
                    (code for code in category_map['items']
                     if code['qcode'] == 'e' and code['is_active']), None)
                # No active 'e' entry: leave the category unset rather than failing on None['name'].
                if map_entry is not None:
                    item['anpa_category'] = [{
                        'qcode': 'e',
                        'name': map_entry['name']
                    }]

        if 'subject' not in item:
            qcode = '01000000'
            item['subject'] = [{'qcode': qcode, 'name': subject_codes[qcode]}]

        cities = find_cities(country_code='AU', state_code='NSW')
        located = [c for c in cities if c['city'].lower() == 'sydney']

        # The dateline can only be filled in when Sydney was found; previously an empty lookup
        # raised on located[0] (or on the missing dateline) and the macro returned None.
        if located:
            if 'dateline' not in item:
                item['dateline'] = {
                    'date': item['firstcreated'],
                    'located': located[0]
                }
            item['dateline']['source'] = item['source']
            item['dateline']['text'] = format_dateline_to_locmmmddsrc(
                located[0], get_date(item['firstcreated']), source=item['source'])

        return item
    except Exception as ex:
        logger.exception(ex)
Example #12
0
    def ap_derive_dateline(self, item):
        """Derive the item's dateline from the article body.

        The elements selected by ``/div/child::*`` in the parsed ``body_html`` are checked for the
        " (AP) _ " marker in their leading text; the first one that has it supplies the city (the
        text before the marker, up to the first comma). The locators entry is used only when
        exactly one city matches, otherwise a minimal UTC location is built.

        :param item: item whose ``body_html`` is scanned; its dateline is updated in place
        :return: the item; None when the candidate city contains a digit or an error was logged
        """
        try:
            body = item.get("body_html")
            if not body:
                return item

            for par in parse_html(body, content="html").xpath("/div/child::*"):
                if not par.text:
                    continue
                city, marker, _rest = par.text.partition(" (AP) _ ")
                if not marker:
                    continue

                # sometimes the city is followed by a comma and either a date or a state
                city = city.split(",")[0]
                if any(ch.isdigit() for ch in city):
                    return

                known_cities = app.locators.find_cities()
                matches = [entry for entry in known_cities if entry["city"].lower() == city.lower()]
                dateline = item.setdefault("dateline", {})
                if len(matches) == 1:
                    dateline["located"] = matches[0]
                else:
                    dateline["located"] = {"city_code": city, "city": city, "tz": "UTC", "dateline": "city"}
                origin = item.get("original_source", "AP")
                dateline["source"] = origin
                dateline["text"] = format_dateline_to_locmmmddsrc(
                    dateline["located"],
                    get_date(item["firstcreated"]),
                    source=origin,
                )
                # only the first element carrying the marker is used
                break

            return item
        except Exception:
            logging.exception("AP dateline extraction exception")
Example #13
0
 def derive_dateline(self, item):
     """
     Try to derive a dateline from the item's place.

     A dateline is only set when the item has exactly one place and its name matches exactly one
     known city, since only then can we be sure we have the correct country. When the item has
     exactly one place, the ``place`` key is removed after the lookup; otherwise the item is
     left untouched.

     :param item: item to update in place
     :return: None
     """
     try:
         places = item.get('place', [])
         if len(places) != 1:
             return
         cities = app.locators.find_cities()
         city = places[0].get('name', '')
         matches = [candidate for candidate in cities if candidate['city'].lower() == city.lower()]
         if len(matches) == 1:
             source = item.get('original_source', 'EFE')
             dateline = item.setdefault('dateline', {})
             dateline['located'] = matches[0]
             dateline['source'] = source
             dateline['text'] = format_dateline_to_locmmmddsrc(dateline['located'],
                                                               get_date(item['firstcreated']),
                                                               source=source)
         item.pop('place')
     except Exception:
         logging.exception('EFE dateline extraction exception')
Example #14
0
def update_to_pass_validation(item, **kwargs):
    """
    Test macro that adjusts a text item so that it will pass publication validation.

    It is intended to be used to test auto publishing, that is publishing directly from ingest.
    At the moment virtually all content received from Reuters fails validation.

    The slugline and headline are cut to the ``maxlength`` declared in the schema of the first
    publish validator found for text content. If the item has no dateline, a Sydney (AU/NSW)
    dateline is built from the item's ``firstcreated`` and ``source``; when the city lookup does
    not return Sydney the item is returned without a dateline instead of failing.

    :param item: item to update in place
    :param kwargs: not used
    :return: the updated item, or None when an unexpected error was logged
    """
    try:
        lookup = {'act': ITEM_PUBLISH, 'type': CONTENT_TYPE.TEXT}
        validators = superdesk.get_resource_service('validators').get(
            req=None, lookup=lookup)
        if validators.count():
            schema = validators[0]['schema']
            max_slugline_len = schema['slugline']['maxlength']
            max_headline_len = schema['headline']['maxlength']
            # Slicing leaves values that are already short enough untouched.
            item['slugline'] = item['slugline'][:max_slugline_len]
            item['headline'] = item['headline'][:max_headline_len]
        if 'dateline' not in item:
            cities = find_cities(country_code='AU', state_code='NSW')
            located = [c for c in cities if c['city'].lower() == 'sydney']
            # Everything below needs the located city; without this guard an empty lookup
            # raised on item['dateline'] / located[0] and the macro returned None.
            if located:
                item['dateline'] = {
                    'date': item['firstcreated'],
                    'located': located[0]
                }
                item['dateline']['source'] = item['source']
                item['dateline']['text'] = format_dateline_to_locmmmddsrc(
                    located[0],
                    get_date(item['firstcreated']),
                    source=item['source'])
        return item
    except Exception:
        # Exception (not a bare except) so SystemExit/KeyboardInterrupt still propagate.
        logging.exception('Test update to pass validation macro exception')
    def parse(self, xml, provider=None):
        """Build an item from a NewsML document and attach a dateline for its city.

        Reads the original source, priority, language, IPTC subjects, body and word count from
        ``xml``, delegates identifier/newslines/management parsing to the ``parse_*`` helpers and,
        when the document's city and country match exactly one known city, fills in the located
        dateline.

        :param xml: root element of the document (queried with ``find``/``findall``)
        :param provider: ingest provider; its ``source`` is used for the dateline
        :return: the result of ``self.populate_fields`` for the parsed item
        :raises: whatever ``ParserError.newsmlOneParserError`` builds, for any exception raised
            while parsing
        """
        item = {}
        try:
            self.root = xml

            party = xml.find("NewsItem/NewsComponent/AdministrativeMetadata/Source/Party")
            if party is not None:
                item["original_source"] = party.attrib.get("FormalName", "ANA")

            priority_el = xml.find("NewsEnvelope/Priority")
            priority_text = None if priority_el is None else priority_el.text
            item["priority"] = self.map_priority(priority_text)

            self.parse_news_identifier(item, xml)
            self.parse_newslines(item, xml)
            self.parse_news_management(item, xml)

            language_els = xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/Language")
            # NOTE(review): findall presumably returns a list, so this check looks always true - confirm
            if language_els is not None:
                languages = self.parse_attributes_as_dictionary(language_els)
                if len(languages):
                    item["language"] = languages[0]["FormalName"]
                else:
                    item["language"] = ""

            subject_paths = (
                'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail[@Scheme="IptcSubjectCodes"]',
                'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter[@Scheme="IptcSubjectCodes"]',
                'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject[@Scheme="IptcSubjectCodes"]',
            )
            subjects = []
            for path in subject_paths:
                subjects += xml.findall(path)
            item["subject"] = self.format_subjects(subjects)

            # Serialise the content element, then strip its wrapper tag and lower-case paragraph tags.
            content_el = xml.find("NewsItem/NewsComponent/NewsComponent/ContentItem/DataContent")
            body = html.unescape(etree.tostring(content_el, encoding="unicode"))
            for old, new in (
                ("<DataContent>", ""),
                ("</DataContent>", ""),
                ("<P>", "<p>"),
                ("</P>", "</p>"),
            ):
                body = body.replace(old, new)

            # Drop the agency's fixed copyright paragraph from the body.
            copyright_par = (
                "<p>© ΑΠΕ-ΜΠΕ ΑΕ. Τα πνευματικά δικαιώματα ανήκουν στο "
                "ΑΠΕ-ΜΠΕ ΑΕ και παραχωρούνται σε συνδρομητές μόνον "
                "για συγκεκριμένη χρήση.</p>"
            )
            item["body_html"] = body.replace(copyright_par, "").strip()

            properties = xml.findall(
                "NewsItem/NewsComponent/NewsComponent/ContentItem/Characteristics/Property")
            word_counts = self.parse_attribute_values(properties, "WordCount")
            if len(word_counts):
                item["word_count"] = word_counts[0]
            else:
                item["word_count"] = None

            # Extract the city for setting into the dateline
            city = xml.find(
                'NewsItem/NewsComponent/DescriptiveMetadata/Property[@FormalName="City"]'
            ).attrib.get("Value")
            # Anglicise the greek for Athens if required
            if city == "Αθήνα":
                city = "Athens"
            country = xml.find(
                'NewsItem/NewsComponent/DescriptiveMetadata/Property[@FormalName="Country"]'
            ).attrib.get("Value")
            # Normalise the country code
            if country == "GRC":
                country = "GR"

            matches = [
                entry for entry in app.locators.find_cities()
                if entry["city"] == city and entry["country_code"] == country
            ]
            if len(matches) == 1:
                # presumably one of the parse_* helpers above has created item["dateline"] - verify
                dateline = item["dateline"]
                dateline["located"] = matches[0]
                source = provider.get("source")
                dateline["source"] = source
                dateline["text"] = format_dateline_to_locmmmddsrc(
                    dateline["located"], dateline.get("date"), source)
            return self.populate_fields(item)
        except Exception as ex:
            raise ParserError.newsmlOneParserError(ex, provider)
Example #16
0
    def _parse_formatted_email(self, data, provider):
        """Construct an item from an email that was constructed as a notification from a google form submission.

        The google form submits to a google sheet, this sheet creates the email as a notification

        :param data:
        :param provider:
        :return: A list of 1 item
        """
        try:
            item = dict()
            item[ITEM_TYPE] = CONTENT_TYPE.TEXT
            item["versioncreated"] = utcnow()
            for response_part in data:
                if isinstance(response_part, tuple):
                    msg = email.message_from_bytes(response_part[1])
                    # Check that the subject line matches what we expect, ignore it if not
                    if self.parse_header(
                            msg["subject"]) != "Formatted Editorial Story":
                        return []

                    item["guid"] = msg["Message-ID"]
                    date_tuple = email.utils.parsedate_tz(msg["Date"])
                    if date_tuple:
                        dt = datetime.datetime.utcfromtimestamp(
                            email.utils.mktime_tz(date_tuple))
                        dt = dt.replace(tzinfo=timezone("utc"))
                        item["firstcreated"] = dt

                    for part in msg.walk():
                        if part.get_content_type() == "text/plain":
                            body = part.get_payload(decode=True)
                            # if we don't know the charset just have a go!
                            if part.get_content_charset() is None:
                                json_str = body.decode().replace("\r\n",
                                                                 "").replace(
                                                                     "  ", " ")
                            else:
                                charset = part.get_content_charset()
                                json_str = body.decode(charset).replace(
                                    "\r\n", "").replace("  ", " ")

                            mail_item = dict(
                                (k, v[0])
                                for k, v in json.loads(json_str).items())

                            self._expand_category(item, mail_item)

                            item["original_source"] = mail_item.get(
                                "Username", mail_item.get("Email Address", ""))
                            item["headline"] = mail_item.get("Headline", "")
                            item["abstract"] = mail_item.get("Abstract", "")
                            item["slugline"] = mail_item.get("Slugline", "")
                            item["body_html"] = "<p>" + mail_item.get(
                                "Body", "").replace("\n", "</p><p>") + "</p>"

                            default_source = app.config.get(
                                "DEFAULT_SOURCE_VALUE_FOR_MANUAL_ARTICLES")
                            city = mail_item.get("Dateline", "")
                            cities = app.locators.find_cities()
                            located = [
                                c for c in cities
                                if c["city"].lower() == city.lower()
                            ]
                            item.setdefault("dateline", {})
                            item["dateline"]["located"] = (
                                located[0] if len(located) > 0 else {
                                    "city_code": city,
                                    "city": city,
                                    "tz": "UTC",
                                    "dateline": "city"
                                })
                            item["dateline"]["source"] = default_source
                            item["dateline"][
                                "text"] = format_dateline_to_locmmmddsrc(
                                    item["dateline"]["located"],
                                    get_date(item["firstcreated"]),
                                    source=default_source)

                            if mail_item.get("Priority") != "":
                                if mail_item.get("Priority", "3").isdigit():
                                    item["priority"] = int(
                                        mail_item.get("Priority", "3"))
                                else:
                                    priority_map = superdesk.get_resource_service(
                                        "vocabularies").find_one(
                                            req=None, _id="priority")
                                    priorities = [
                                        x
                                        for x in priority_map.get("items", [])
                                        if x["name"].upper() == mail_item.get(
                                            "Priority", "").upper()
                                    ]
                                    if priorities is not None and len(
                                            priorities) > 0:
                                        item["priority"] = int(
                                            priorities[0].get("qcode", "3"))
                                    else:
                                        item["priority"] = 3
                            if mail_item.get("News Value") != "":
                                item["urgency"] = int(
                                    mail_item.get("News Value", "3"))

                            # We expect the username passed corresponds to a superdesk user
                            query = {
                                "email":
                                re.compile(
                                    "^{}$".format(
                                        mail_item.get(
                                            "Username",
                                            mail_item.get("Email Address",
                                                          ""))),
                                    re.IGNORECASE,
                                )
                            }
                            user = superdesk.get_resource_service(
                                "users").find_one(req=None, **query)
                            if not user:
                                logger.error(
                                    "Failed to find user for email {}".format(
                                        mail_item.get(
                                            "Username",
                                            mail_item.get("Email Address",
                                                          ""))))
                                raise UserNotRegisteredException()
                            item["original_creator"] = user.get("_id")
                            if BYLINE in user and user.get(BYLINE, ""):
                                item["byline"] = user.get(BYLINE)
                            item[SIGN_OFF] = user.get(SIGN_OFF)

                            # attempt to match the given desk name against the defined desks
                            query = {
                                "name":
                                re.compile(
                                    "^{}$".format(mail_item.get("Desk", "")),
                                    re.IGNORECASE)
                            }
                            desk = superdesk.get_resource_service(
                                "desks").find_one(req=None, **query)
                            if desk:
                                item["task"] = {
                                    "desk": desk.get("_id"),
                                    "stage": desk.get("incoming_stage")
                                }

                            if "Place" in mail_item:
                                locator_map = superdesk.get_resource_service(
                                    "vocabularies").find_one(req=None,
                                                             _id="locators")
                                place = [
                                    x for x in locator_map.get("items", [])
                                    if x["qcode"] == mail_item.get(
                                        "Place", "").upper()
                                ]
                                if place is not None:
                                    item["place"] = place

                            if mail_item.get("Legal flag", "") == "LEGAL":
                                item["flags"] = {"marked_for_legal": True}

                            break

            return [item]
        except Exception as ex:
            raise IngestEmailError.emailParseError(ex, provider)
Example #17
0
    def parse(self, s_json, provider=None):
        """Build an item dict from a decoded AP media API response.

        :param s_json: dict holding the item metadata under ``data.item`` and optionally the
            story markup under ``nitf.body_html`` and linked items under ``associations``
        :param provider: ingest provider dict; when falsy the source falls back to 'AP'
        :return: the populated item dict
        """
        in_item = s_json.get('data', {}).get('item')
        nitf_item = s_json.get('nitf', {})
        # Resolved once so the dateline below gets the same fallback as item['source']
        # instead of dereferencing a provider that may be None.
        source = provider.get('source') if provider else 'AP'
        item = {
            'guid': in_item.get('altids', {}).get('itemid') + ':' + str(in_item.get('version')),
            'source': source,
        }

        for copy_property in self.direct_copy_properties:
            if in_item.get(copy_property) is not None:
                item[copy_property] = in_item[copy_property]

        if in_item.get('version'):
            item['version'] = in_item['version']

        if in_item.get('versioncreated'):
            item['versioncreated'] = self.datetime(in_item.get('versioncreated'))

        if in_item.get('firstcreated'):
            item['firstcreated'] = self.datetime(in_item.get('firstcreated'))

        infosource = in_item.get('infosource', [])
        if infosource:
            item['original_source'] = ','.join([n.get('name') for n in infosource])

        dateline_location = in_item.get('datelinelocation')
        if dateline_location:
            city = dateline_location.get('city')
            country = dateline_location.get('countryname')
            # Try to find a single matching city either by city and country or city country and state
            located = [c for c in app.locators.find_cities()
                       if c['city'] == city and c['country'] == country]
            if len(located) > 1:
                state = dateline_location.get('countryareaname')
                located = [c for c in located if c['state'] == state]
            if len(located) == 1:
                item['dateline'] = dict()
                item['dateline']['located'] = located[0]
                item['dateline']['source'] = source
                item['dateline']['text'] = format_dateline_to_locmmmddsrc(
                    located[0], get_date(item['firstcreated']), source)

        bylines = in_item.get('bylines', [])
        if bylines:
            names = []
            for entry in bylines:
                if entry.get('name'):
                    # a 'name' is used verbatim; the title is only appended to the 'by' form
                    names.append(entry.get('name'))
                else:
                    title = entry.get('title')
                    names.append(entry.get('by', '') + (' ({})'.format(title) if title else ''))
            item['byline'] = ','.join(names)
            if item['byline'].startswith('By '):
                item['byline'] = item['byline'][3:]

        usageterms = in_item.get('usageterms', [])
        if usageterms:
            item['usageterms'] = ', '.join(usageterms)

        if in_item.get('type') == 'picture':
            if in_item.get('renditions'):
                self._parse_renditions(in_item['renditions'], item, provider)

            if in_item.get('description_caption'):
                item['description_text'] = in_item.get('description_caption')
                item['archive_description'] = in_item.get('description_caption')

            if in_item.get('description_creditline'):
                item['credit'] = in_item.get('description_creditline')

            if in_item.get('photographer', {}).get('name'):
                item['byline'] = in_item.get('photographer', {}).get('name')

        if in_item.get('type') == 'text':
            # The slugline is only present when it was copied from the source item
            slugline = item.get('slugline')
            if slugline:
                # Peel off the take key if possible
                if ',' in slugline:
                    parts = slugline.split(',')
                    item['anpa_take_key'] = parts[1]
                    slugline = parts[0]
                if slugline.startswith('BC-'):
                    slugline = slugline[3:]
                item['slugline'] = slugline
            if item.get('ednote', '').startswith('Eds:'):
                # drops the 'Eds:' marker and the single character following it
                item['ednote'] = item['ednote'][5:]
            if in_item.get('headline_extended'):
                item['abstract'] = in_item.get('headline_extended')

            self.categorisation_mapping(in_item, item)

            # Map the urgency to urgency and priority
            if in_item.get('urgency'):
                # convert before comparing so a numeric string does not raise TypeError
                item[ITEM_URGENCY] = min(int(in_item['urgency']), 5)
                item[ITEM_PRIORITY] = self.priority_map.get(in_item['urgency'], 5)

            if nitf_item.get('body_html'):
                # strip the wrapping <block id="Main"> element, keeping its content
                item['body_html'] = nitf_item.get('body_html').replace(
                    '<block id="Main">', '').replace('</block>', '')

        if s_json.get('associations'):
            self._parse_associations(s_json['associations'], item, provider)

        return item
Example #18
0
    def test_format_dateline_to_format_when_city_state_and_country_are_present(self):
        """A located entry flagged "city,state,country" yields city, state and country in the text."""
        place, date_text, now = self._get_located_and_current_utc_ts()
        place['dateline'] = "city,state,country"

        actual = format_dateline_to_locmmmddsrc(place, now)

        expected = 'SYDNEY, NSW, AU, %s %s -' % (date_text, get_default_source())
        self.assertEqual(actual, expected)
Example #19
0
 def test_format_dateline_to_format_when_only_city_is_present(self):
     """With the located entry used as returned by the fixture, only the city appears in the text."""
     place, date_text, now = self._get_located_and_current_utc_ts()

     actual = format_dateline_to_locmmmddsrc(place, now)

     expected = 'SYDNEY, %s %s -' % (date_text, get_default_source())
     self.assertEqual(actual, expected)
Example #20
0
    def _parse_formatted_email(self, data, provider):
        """Construct an item from an email that was constructed as a notification from a google form submission.

        The google form submits to a google sheet, this sheet creates the email as a notification.
        The first text/plain part of the message is decoded as a JSON object and the first element
        of each of its values supplies the story fields.

        :param data: iterable of response parts; only tuples are used and their second element
            is parsed as the raw message
        :param provider: passed through to the raised IngestEmailError
        :return: A list of 1 item, or an empty list when the subject is not 'Formatted Editorial Story'
        :raises IngestEmailError: wraps any exception raised while parsing
        """
        try:
            item = dict()
            item[ITEM_TYPE] = CONTENT_TYPE.TEXT
            item['versioncreated'] = utcnow()
            for response_part in data:
                if not isinstance(response_part, tuple):
                    continue

                msg = email.message_from_bytes(response_part[1])
                # Check that the subject line matches what we expect, ignore it if not
                if self.parse_header(msg['subject']) != 'Formatted Editorial Story':
                    return []

                item['guid'] = msg['Message-ID']
                date_tuple = email.utils.parsedate_tz(msg['Date'])
                if date_tuple:
                    dt = datetime.datetime.utcfromtimestamp(email.utils.mktime_tz(date_tuple))
                    dt = dt.replace(tzinfo=timezone('utc'))
                    item['firstcreated'] = dt

                for part in msg.walk():
                    if part.get_content_type() != "text/plain":
                        continue

                    body = part.get_payload(decode=True)
                    charset = part.get_content_charset()
                    # if we don't know the charset just have a go!
                    decoded = body.decode() if charset is None else body.decode(charset)
                    json_str = decoded.replace('\r\n', '').replace('  ', ' ')

                    # every value in the payload is a list, only its first element is used
                    mail_item = {k: v[0] for k, v in json.loads(json_str).items()}

                    self._expand_category(item, mail_item)

                    item['original_source'] = mail_item.get('Username', '')
                    item['headline'] = mail_item.get('Headline', '')
                    item['abstract'] = mail_item.get('Abstract', '')
                    item['slugline'] = mail_item.get('Slugline', '')
                    item['body_html'] = '<p>' + mail_item.get('Body', '').replace('\n', '</p><p>') + '</p>'

                    default_source = app.config.get('DEFAULT_SOURCE_VALUE_FOR_MANUAL_ARTICLES')
                    city = mail_item.get('Dateline', '')
                    located = [c for c in app.locators.find_cities() if c['city'].lower() == city.lower()]
                    dateline = item.setdefault('dateline', {})
                    # fall back to a minimal located entry built from the submitted city name
                    dateline['located'] = located[0] if located else {'city_code': city,
                                                                      'city': city,
                                                                      'tz': 'UTC',
                                                                      'dateline': 'city'}
                    dateline['source'] = default_source
                    dateline['text'] = format_dateline_to_locmmmddsrc(dateline['located'],
                                                                      get_date(item['firstcreated']),
                                                                      source=default_source)

                    if mail_item.get('Priority') != '':
                        priority = mail_item.get('Priority', '3')
                        if priority.isdigit():
                            item['priority'] = int(priority)
                        else:
                            # a non numeric priority is looked up by name in the priority vocabulary
                            priority_map = superdesk.get_resource_service('vocabularies').find_one(
                                req=None, _id='priority')
                            priorities = [x for x in priority_map.get('items', []) if
                                          x['name'].upper() == priority.upper()]
                            item['priority'] = int(priorities[0].get('qcode', '3')) if priorities else 3
                    if mail_item.get('News Value') != '':
                        item['urgency'] = int(mail_item.get('News Value', '3'))

                    # We expect the username passed corresponds to a superdesk user.
                    # The value comes from the email body, so it is escaped before being used as a
                    # pattern: otherwise '.', '+' etc. would be treated as regex operators.
                    # str() keeps the same text that format() used to insert.
                    username = str(mail_item.get('Username'))
                    query = {'email': re.compile('^{}$'.format(re.escape(username)), re.IGNORECASE)}
                    user = superdesk.get_resource_service('users').find_one(req=None, **query)
                    if not user:
                        logger.error('Failed to find user for email {}'.format(mail_item.get('Username')))
                        raise UserNotRegisteredException()
                    item['original_creator'] = user.get('_id')
                    if BYLINE in user and user.get(BYLINE, ''):
                        item['byline'] = user.get(BYLINE)
                    item[SIGN_OFF] = user.get(SIGN_OFF)

                    # attempt to match the given desk name against the defined desks
                    desk_name = str(mail_item.get('Desk', ''))
                    query = {'name': re.compile('^{}$'.format(re.escape(desk_name)), re.IGNORECASE)}
                    desk = superdesk.get_resource_service('desks').find_one(req=None, **query)
                    if desk:
                        item['task'] = {'desk': desk.get('_id'), 'stage': desk.get('incoming_stage')}

                    if 'Place' in mail_item:
                        locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None,
                                                                                              _id='locators')
                        # the place is always set when a Place was submitted, to an empty list if
                        # no locator has that qcode
                        item['place'] = [x for x in locator_map.get('items', []) if
                                         x['qcode'] == mail_item.get('Place', '').upper()]

                    if mail_item.get('Legal flag', '') == 'LEGAL':
                        item['flags'] = {'marked_for_legal': True}

                    # only the first text/plain part is processed
                    break

            return [item]
        except Exception as ex:
            raise IngestEmailError.emailParseError(ex, provider)
Example #21
0
    def parse(self, s_json, provider=None):
        """Build an item dict from a decoded AP media API response.

        :param s_json: dict holding the item metadata under ``data.item`` and optionally the
            story markup under ``nitf.body_html`` and linked items under ``associations``
        :param provider: ingest provider dict; when falsy the source falls back to "AP"
        :return: the populated item dict
        """
        in_item = s_json.get("data", {}).get("item")
        nitf_item = s_json.get("nitf", {})
        # Resolved once so the dateline below gets the same fallback as item["source"]
        # instead of dereferencing a provider that may be None.
        source = provider.get("source") if provider else "AP"
        item = {
            "guid": in_item.get("altids", {}).get("itemid") + ":" + str(in_item.get("version")),
            "source": source,
        }

        for copy_property in self.direct_copy_properties:
            if in_item.get(copy_property) is not None:
                item[copy_property] = in_item[copy_property]

        if in_item.get("version"):
            item["version"] = in_item["version"]

        if in_item.get("versioncreated"):
            item["versioncreated"] = self.datetime(in_item.get("versioncreated"))

        if in_item.get("firstcreated"):
            item["firstcreated"] = self.datetime(in_item.get("firstcreated"))

        infosource = in_item.get("infosource", [])
        if infosource:
            item["original_source"] = ",".join([n.get("name") for n in infosource])

        dateline_location = in_item.get("datelinelocation")
        if dateline_location:
            city = dateline_location.get("city")
            country = dateline_location.get("countryname")
            # Try to find a single matching city either by city and country or city country and state
            located = [c for c in app.locators.find_cities()
                       if c["city"] == city and c["country"] == country]
            if len(located) > 1:
                state = dateline_location.get("countryareaname")
                located = [c for c in located if c["state"] == state]
            if len(located) == 1:
                item["dateline"] = dict()
                item["dateline"]["located"] = located[0]
                item["dateline"]["source"] = source
                item["dateline"]["text"] = format_dateline_to_locmmmddsrc(
                    located[0], get_date(item["firstcreated"]), source)

        bylines = in_item.get("bylines", [])
        if bylines:
            names = []
            for entry in bylines:
                if entry.get("name"):
                    # a "name" is used verbatim; the title is only appended to the "by" form
                    names.append(entry.get("name"))
                else:
                    title = entry.get("title")
                    names.append(entry.get("by", "") + (" ({})".format(title) if title else ""))
            item["byline"] = ",".join(names)
            if item["byline"].startswith("By "):
                item["byline"] = item["byline"][3:]

        usageterms = in_item.get("usageterms", [])
        if usageterms:
            item["usageterms"] = ", ".join(usageterms)

        if in_item.get("type") == "picture":
            if in_item.get("renditions"):
                self._parse_renditions(in_item["renditions"], item, provider)

            if in_item.get("description_caption"):
                item["description_text"] = in_item.get("description_caption")
                item["archive_description"] = in_item.get("description_caption")

            if in_item.get("description_creditline"):
                item["credit"] = in_item.get("description_creditline")

            if in_item.get("photographer", {}).get("name"):
                item["byline"] = in_item.get("photographer", {}).get("name")

        if in_item.get("type") == "text":
            # The slugline is only present when it was copied from the source item
            slugline = item.get("slugline")
            if slugline:
                # Peel off the take key if possible
                if "," in slugline:
                    parts = slugline.split(",")
                    item["anpa_take_key"] = parts[1]
                    slugline = parts[0]
                if slugline.startswith("BC-"):
                    slugline = slugline[3:]
                item["slugline"] = slugline
            if item.get("ednote", "").startswith("Eds:"):
                # drops the "Eds:" marker and the single character following it
                item["ednote"] = item["ednote"][5:]
            if in_item.get("headline_extended"):
                item["abstract"] = in_item.get("headline_extended")

            self.categorisation_mapping(in_item, item)

            # Map the urgency to urgency and priority
            if in_item.get("urgency"):
                # convert before comparing so a numeric string does not raise TypeError
                item[ITEM_URGENCY] = min(int(in_item["urgency"]), 5)
                item[ITEM_PRIORITY] = self.priority_map.get(in_item["urgency"], 5)

            if nitf_item.get("body_html"):
                # strip the wrapping <block id="Main"> element, keeping its content
                item["body_html"] = nitf_item.get("body_html").replace(
                    '<block id="Main">', "").replace("</block>", "")

        if s_json.get("associations"):
            self._parse_associations(s_json["associations"], item, provider)

        return item
    def test_format_dateline_to_format_when_city_state_and_country_are_present(self):
        """A located entry flagged "city,state,country" yields city, state and country in the text."""
        place, date_text, now = self._get_located_and_current_utc_ts()
        place['dateline'] = "city,state,country"

        actual = format_dateline_to_locmmmddsrc(place, now)

        expected = 'SYDNEY, NSW, AU, %s %s -' % (date_text, get_default_source())
        self.assertEqual(actual, expected)
def update_dateline(item):
    """Refresh the date, and where possible the text, of the item's dateline in place.

    The dateline's "date" is set to the value returned by ``utcnow()``; when the dateline
    has a "located" entry its "text" is regenerated from that location and the new date.

    :param item: item dict, modified in place
    :return: None
    """
    dateline = item.get("dateline")
    if dateline is None:
        # No dateline to update. A dateline explicitly stored as None used to raise a
        # TypeError here; a missing key was already a no-op on a throwaway dict.
        return

    dateline["date"] = utcnow()
    if dateline.get("located"):
        dateline["text"] = format_dateline_to_locmmmddsrc(dateline["located"], dateline["date"])
Example #24
0
def ap_weather_format(item, **kwargs):
    """Reformat an AP global weather table (Celsius) into a padded 'WORLD WEATHER' story.

    The item is updated in place: slugline, dateline, headline, subject, place and body_html
    are overwritten.

    :param item: item dict; must have source 'AP' and a slugline starting 'WEA--GlobalWeather-Ce'
    :param kwargs: unused
    :return: the updated item
    :raises SuperdeskApiError: bad request error if the item is not an AP weather table or the
        first line of its body is not the Celsius table header
    """
    if not item.get('slugline', '').startswith('WEA--GlobalWeather-Ce') or item.get('source', '') != 'AP':
        raise SuperdeskApiError.badRequestError("Article should be an AP sourced weather table")
    item['slugline'] = 'WORLD WEATHER'

    text = get_text(item['body_html'], content='html')
    lines = text.splitlines()
    # an empty body has no first line to inspect, reject it like any other unexpected table
    if not lines or lines[0] != 'BC-WEA--Global Weather-Celsius,<':
        raise SuperdeskApiError.badRequestError("Table should be in Celsius only")

    # columns to extract, keyed by column index, with the substitutions to apply to each
    substitutions = {
        0: (),
        1: (),
        2: (),
        3: (('COND', 'CONDITIONS'), ('pc', 'partly cloudy'), ('clr', 'clear'),
            ('cdy', 'cloudy'), ('rn', 'rain'), ('sn', 'snow')),
    }
    # story preamble
    preamble = 'Temperatures and conditions in world centres:\r\n'

    # story is always datelined New York
    city = 'New York City'
    located = [c for c in app.locators.find_cities() if c['city'].lower() == city.lower()]
    dateline = item.setdefault('dateline', {})
    dateline['located'] = located[0] if located else {'city_code': city, 'city': city,
                                                      'tz': 'UTC', 'dateline': 'city'}
    created = get_date(item['firstcreated'])
    # express the creation time in the timezone of the dateline location
    dateline['date'] = datetime.fromtimestamp(created.timestamp(), tz=timezone(dateline['located']['tz']))
    dateline['source'] = 'AP'
    dateline['text'] = format_dateline_to_locmmmddsrc(dateline['located'], created,
                                                      source=item.get('original_source', 'AP'))

    # day of month without zero padding; '%-d' is not supported by every platform's strftime
    item['headline'] = 'World Weather for ' + dateline['date'].strftime('%b') + ' ' + str(dateline['date'].day)

    item['subject'] = [{"name": "weather", "qcode": "17000000"}]
    locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='locators')
    item['place'] = [x for x in locator_map.get('items', []) if x['qcode'] == 'US']

    item['body_html'] = '<pre>' + preamble + _format_weather_table(lines, substitutions) + '</pre>'
    return item


def _format_weather_table(lines, substitutions):
    """Return the delimited table rows found in lines as space padded columns.

    :param lines: the lines of the story; a line is a table row if splitting it on runs of
        ';' and '<' gives more than two cells
    :param substitutions: dict of column index to a sequence of (old, new) replacements; only
        the columns present in this dict are written out
    :return: the rows, each terminated by '\\r\\n', every column padded to its widest cell plus two
    """
    splitter = re.compile(r'[;<]+')

    rows = []
    for line in lines:
        cells = splitter.split(line)
        if len(cells) <= 2:
            continue
        row = []
        for index, cell in enumerate(cells):
            for old, new in substitutions.get(index, ()):
                cell = cell.replace(old, new)
            row.append(cell)
        rows.append(row)

    # widest cell of every column, measured after substitution
    widths = []
    for row in rows:
        for index, cell in enumerate(row):
            if index < len(widths):
                widths[index] = max(widths[index], len(cell))
            else:
                widths.append(len(cell))

    output = []
    for row in rows:
        for index, cell in enumerate(row):
            if index in substitutions:
                output.append(cell.lstrip('\t').ljust(widths[index] + 2))
        output.append('\r\n')
    return ''.join(output)
Example #25
0
    def parse(self, xml, provider=None):
        """Parse a NewsML document into an item dict.

        Fills original source, priority, language, subjects, body, word count and, when the
        City/Country properties match exactly one known city, the dateline location.

        :param xml: element tree of the document; also stored on ``self.root``
        :param provider: ingest provider dict, may be None
        :return: the result of ``self.populate_fields(item)``
        :raises ParserError: wraps any exception raised while parsing
        """
        item = {}
        try:
            self.root = xml
            parsed_el = xml.find('NewsItem/NewsComponent/AdministrativeMetadata/Source/Party')
            if parsed_el is not None:
                item['original_source'] = parsed_el.attrib.get('FormalName', 'ANA')

            parsed_el = xml.find('NewsEnvelope/Priority')
            item['priority'] = self.map_priority(parsed_el.text if parsed_el is not None else None)

            self.parse_news_identifier(item, xml)
            self.parse_newslines(item, xml)
            self.parse_news_management(item, xml)

            # findall always returns a list, so no None check is needed
            language = self.parse_attributes_as_dictionary(
                xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/Language'))
            item['language'] = language[0]['FormalName'] if len(language) else ''

            subject_path = 'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/{}[@Scheme="IptcSubjectCodes"]'
            subjects = xml.findall(subject_path.format('SubjectDetail'))
            subjects += xml.findall(subject_path.format('SubjectMatter'))
            subjects += xml.findall(subject_path.format('Subject'))

            item['subject'] = self.format_subjects(subjects)

            content = xml.find('NewsItem/NewsComponent/NewsComponent/ContentItem/DataContent')
            body = html.unescape(etree.tostring(content, encoding='unicode'))
            # drop the wrapping element and lower-case the paragraph tags
            body = body.replace('<DataContent>', '').replace('</DataContent>', '')
            item['body_html'] = body.replace('<P>', '<p>').replace('</P>', '</p>')

            # remove the agency's copyright paragraph from the body
            item['body_html'] = item.get('body_html').replace(
                '<p>© ΑΠΕ-ΜΠΕ ΑΕ. Τα πνευματικά δικαιώματα ανήκουν στο '
                'ΑΠΕ-ΜΠΕ ΑΕ και παραχωρούνται σε συνδρομητές μόνον '
                'για συγκεκριμένη χρήση.</p>', '').strip()

            parsed_el = xml.findall('NewsItem/NewsComponent/NewsComponent/ContentItem/Characteristics/Property')
            characteristics = self.parse_attribute_values(parsed_el, 'WordCount')
            item['word_count'] = characteristics[0] if len(characteristics) else None

            # Extract the city for setting into the dateline
            city_el = xml.find('NewsItem/NewsComponent/DescriptiveMetadata/Property[@FormalName="City"]')
            country_el = xml.find('NewsItem/NewsComponent/DescriptiveMetadata/Property[@FormalName="Country"]')
            # without both properties no location can be matched, the dateline is left as parsed
            if city_el is not None and country_el is not None:
                city = city_el.attrib.get('Value')
                # Anglicise the greek for Athens if required
                city = 'Athens' if city == 'Αθήνα' else city
                country = country_el.attrib.get('Value')
                # Normalise the country code
                country = 'GR' if country == 'GRC' else country

                located = [c for c in app.locators.find_cities()
                           if c['city'] == city and c['country_code'] == country]
                if len(located) == 1:
                    # the dateline may not have been created by the steps above
                    dateline = item.setdefault('dateline', {})
                    dateline['located'] = located[0]
                    dateline['source'] = provider.get('source') if provider else None
                    if provider:
                        dateline['text'] = format_dateline_to_locmmmddsrc(
                            located[0], dateline.get('date'), provider.get('source'))
                    else:
                        # no provider supplied: let the formatter apply its own default source
                        dateline['text'] = format_dateline_to_locmmmddsrc(located[0], dateline.get('date'))
            return self.populate_fields(item)
        except Exception as ex:
            raise ParserError.newsmlOneParserError(ex, provider)
Example #26
0
    def _parse_formatted_email(self, data, provider):
        """Construct an item from an email that was constructed as a notification from a google form submission.

        The google form submits to a google sheet, this sheet creates the email as a notification

        :param data:
        :param provider:
        :return: A list of 1 item
        """
        try:
            item = dict()
            item[ITEM_TYPE] = CONTENT_TYPE.TEXT
            item['versioncreated'] = utcnow()
            for response_part in data:
                if isinstance(response_part, tuple):
                    msg = email.message_from_bytes(response_part[1])
                    # Check that the subject line matches what we expect, ignore it if not
                    if self.parse_header(
                            msg['subject']) != 'Formatted Editorial Story':
                        return []

                    item['guid'] = msg['Message-ID']
                    date_tuple = email.utils.parsedate_tz(msg['Date'])
                    if date_tuple:
                        dt = datetime.datetime.utcfromtimestamp(
                            email.utils.mktime_tz(date_tuple))
                        dt = dt.replace(tzinfo=timezone('utc'))
                        item['firstcreated'] = dt

                    for part in msg.walk():
                        if part.get_content_type() == "text/plain":
                            body = part.get_payload(decode=True)
                            # if we don't know the charset just have a go!
                            if part.get_content_charset() is None:
                                json_str = body.decode().replace('\r\n',
                                                                 '').replace(
                                                                     '  ', ' ')
                            else:
                                charset = part.get_content_charset()
                                json_str = body.decode(charset).replace(
                                    '\r\n', '').replace('  ', ' ')

                            mail_item = dict(
                                (k, v[0])
                                for k, v in json.loads(json_str).items())

                            self._expand_category(item, mail_item)

                            item['original_source'] = mail_item.get(
                                'Username', '')
                            item['headline'] = mail_item.get('Headline', '')
                            item['abstract'] = mail_item.get('Abstract', '')
                            item['slugline'] = mail_item.get('Slugline', '')
                            item['body_html'] = '<p>' + mail_item.get(
                                'Body', '').replace('\n', '</p><p>') + '</p>'

                            default_source = app.config.get(
                                'DEFAULT_SOURCE_VALUE_FOR_MANUAL_ARTICLES')
                            city = mail_item.get('Dateline', '')
                            cities = app.locators.find_cities()
                            located = [
                                c for c in cities
                                if c['city'].lower() == city.lower()
                            ]
                            item.setdefault('dateline', {})
                            item['dateline']['located'] = located[0] if len(
                                located) > 0 else {
                                    'city_code': city,
                                    'city': city,
                                    'tz': 'UTC',
                                    'dateline': 'city'
                                }
                            item['dateline']['source'] = default_source
                            item['dateline'][
                                'text'] = format_dateline_to_locmmmddsrc(
                                    item['dateline']['located'],
                                    get_date(item['firstcreated']),
                                    source=default_source)

                            if mail_item.get('Priority') != '':
                                if mail_item.get('Priority', '3').isdigit():
                                    item['priority'] = int(
                                        mail_item.get('Priority', '3'))
                                else:
                                    priority_map = superdesk.get_resource_service(
                                        'vocabularies').find_one(
                                            req=None, _id='priority')
                                    priorities = [
                                        x
                                        for x in priority_map.get('items', [])
                                        if x['name'].upper() == mail_item.get(
                                            'Priority', '').upper()
                                    ]
                                    if priorities is not None and len(
                                            priorities) > 0:
                                        item['priority'] = int(
                                            priorities[0].get('qcode', '3'))
                                    else:
                                        item['priority'] = 3
                            if mail_item.get('News Value') != '':
                                item['urgency'] = int(
                                    mail_item.get('News Value', '3'))

                            # We expect the username passed corresponds to a superdesk user
                            query = {
                                'email':
                                re.compile(
                                    '^{}$'.format(mail_item.get('Username')),
                                    re.IGNORECASE)
                            }
                            user = superdesk.get_resource_service(
                                'users').find_one(req=None, **query)
                            if not user:
                                logger.error(
                                    'Failed to find user for email {}'.format(
                                        mail_item.get('Username')))
                                raise UserNotRegisteredException()
                            item['original_creator'] = user.get('_id')
                            if BYLINE in user and user.get(BYLINE, ''):
                                item['byline'] = user.get(BYLINE)
                            item[SIGN_OFF] = user.get(SIGN_OFF)

                            # attempt to match the given desk name against the defined desks
                            query = {
                                'name':
                                re.compile(
                                    '^{}$'.format(mail_item.get('Desk', '')),
                                    re.IGNORECASE)
                            }
                            desk = superdesk.get_resource_service(
                                'desks').find_one(req=None, **query)
                            if desk:
                                item['task'] = {
                                    'desk': desk.get('_id'),
                                    'stage': desk.get('incoming_stage')
                                }

                            if 'Place' in mail_item:
                                locator_map = superdesk.get_resource_service(
                                    'vocabularies').find_one(req=None,
                                                             _id='locators')
                                place = [
                                    x for x in locator_map.get('items', [])
                                    if x['qcode'] == mail_item.get(
                                        'Place', '').upper()
                                ]
                                if place is not None:
                                    item['place'] = place

                            if mail_item.get('Legal flag', '') == 'LEGAL':
                                item['flags'] = {'marked_for_legal': True}

                            break

            return [item]
        except Exception as ex:
            raise IngestEmailError.emailParseError(ex, provider)
def ap_weather_format(item, **kwargs):
    """Reformat an AP global weather table into a fixed-width 'WORLD WEATHER' story.

    The item is updated in place: ``slugline``, ``dateline``, ``headline``, ``subject``,
    ``place`` and ``body_html`` are overwritten. The text of the body is read as rows of
    fields delimited by runs of ``;`` and ``<``. For every row with more than two fields
    the first four columns are written out, left-justified and padded to the widest
    value seen in that column plus two spaces; abbreviations in the fourth column
    (conditions) are expanded.

    :param item: the article to reformat; must have ``source`` equal to ``AP``, a slugline
        starting with ``WEA--GlobalWeather-Ce``, and ``body_html`` / ``firstcreated`` set
    :param kwargs: unused
    :return: the same ``item``, updated
    :raises: the error built by ``SuperdeskApiError.badRequestError`` when the item is not
        an AP weather table, or when its body is empty or does not start with the
        Celsius table header line
    """
    if not item.get('slugline', '').startswith('WEA--GlobalWeather-Ce') or not item.get('source', '') == 'AP':
        raise SuperdeskApiError.badRequestError("Article should be an AP sourced weather table")
    item['slugline'] = 'WORLD WEATHER'

    text = get_text(item['body_html'], content='html')
    lines = text.splitlines()
    # An empty body has no header line either, so reject it here instead of
    # failing with an IndexError on lines[0].
    if not lines or lines[0] != 'BC-WEA--Global Weather-Celsius,<':
        raise SuperdeskApiError.badRequestError("Table should be in Celsius only")

    # Columns to extract, keyed by column index, with the substitutions to apply
    # to each. Columns that are not listed here are measured but never written.
    column_map = {
        0: (),
        1: (),
        2: (),
        3: (('COND', 'CONDITIONS'),
            ('pc', 'partly cloudy'), ('clr', 'clear'),
            ('cdy', 'cloudy'), ('rn', 'rain'),
            ('sn', 'snow')),
    }
    # story preamble
    preamble = 'Temperatures and conditions in world centres:\r\n'

    # story is always datelined New York
    city = 'New York City'
    cities = app.locators.find_cities()
    located = [c for c in cities if c['city'].lower() == city.lower()]
    dateline = item.setdefault('dateline', {})
    dateline['located'] = located[0] if located else {'city_code': city, 'city': city,
                                                      'tz': 'UTC', 'dateline': 'city'}
    created = get_date(item['firstcreated'])
    dateline['date'] = datetime.fromtimestamp(created.timestamp(),
                                              tz=timezone(dateline['located']['tz']))
    dateline['source'] = 'AP'
    dateline['text'] = format_dateline_to_locmmmddsrc(dateline['located'], created,
                                                      source=item.get('original_source', 'AP'))

    # '%-d' (day without zero padding) is a platform-specific strftime extension,
    # so the day number is formatted separately.
    item['headline'] = 'World Weather for ' + '{} {}'.format(dateline['date'].strftime('%b'),
                                                             dateline['date'].day)

    item['subject'] = [{"name": "weather", "qcode": "17000000"}]
    locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='locators')
    item['place'] = [x for x in locator_map.get('items', []) if x['qcode'] == 'US']

    # First pass: split each line into fields, apply the substitutions and record
    # the widest value seen in every column.
    delimiter = re.compile(r'[;<]+')
    rows = []
    widths = []
    for line in lines:
        fields = delimiter.split(line)
        # only consider lines that split into more than two fields
        if len(fields) <= 2:
            continue
        for index, field in enumerate(fields):
            for abbreviation, expansion in column_map.get(index, ()):
                field = field.replace(abbreviation, expansion)
            fields[index] = field
            if index < len(widths):
                widths[index] = max(widths[index], len(field))
            else:
                widths.append(len(field))
        rows.append(fields)

    # Second pass: emit the mapped columns padded to their column width plus two.
    parts = [preamble]
    for fields in rows:
        for index, field in enumerate(fields):
            if index in column_map:
                parts.append(field.lstrip('\t').ljust(widths[index] + 2))
        parts.append('\r\n')

    item['body_html'] = '<pre>' + ''.join(parts) + '</pre>'
    return item
 def test_format_dateline_to_format_when_only_city_is_present(self):
     # Uses the location from the fixture helper unmodified and expects the
     # city, a comma, the formatted date and the default source.
     place, date_text, now = self._get_located_and_current_utc_ts()
     actual = format_dateline_to_locmmmddsrc(place, now)
     expected = 'SYDNEY, %s %s -' % (date_text, get_default_source())
     self.assertEqual(actual, expected)
Example #29
0
 def test_format_dateline_to_format_when_only_city_is_present(self):
     # Uses the location from the fixture helper unmodified and expects the
     # city, the formatted date and the organization abbreviation.
     place, date_text, now = self._get_located_and_current_utc_ts()
     actual = format_dateline_to_locmmmddsrc(place, now)
     expected = 'SYDNEY %s %s -' % (date_text, ORGANIZATION_NAME_ABBREVIATION)
     self.assertEqual(actual, expected)
Example #30
0
    def test_format_dateline_to_format_when_only_city_and_state_are_present(self):
        # Switch the fixture location to the "city,state" dateline format and
        # expect the state code to follow the city.
        place, date_text, now = self._get_located_and_current_utc_ts()
        place["dateline"] = "city,state"

        actual = format_dateline_to_locmmmddsrc(place, now)
        expected = "SYDNEY, NSW %s %s -" % (date_text, ORGANIZATION_NAME_ABBREVIATION)
        self.assertEqual(actual, expected)
Example #31
0
    def test_format_dateline_to_format_when_only_city_and_country_are_present(self):
        # Switch the fixture location to the "city,country" dateline format and
        # expect the country code to follow the city.
        place, date_text, now = self._get_located_and_current_utc_ts()
        place['dateline'] = "city,country"

        actual = format_dateline_to_locmmmddsrc(place, now)
        expected = 'SYDNEY, AU %s %s -' % (date_text, ORGANIZATION_NAME_ABBREVIATION)
        self.assertEqual(actual, expected)