def dpa_derive_dateline(item, **kwargs):
    """
    Derive the dateline of a DPA item from the opening lines of its body.

    DPA content is received in IPTC7901 format. The first five lines of ``item["body_html"]`` are scanned for a
    line of the form ``<city> (dpa) - <text>``; more than one line is checked because there is sometimes what
    appears to be a headline preceding the dateline. On the first match the dateline location, source and text
    are set on the item and the dateline prefix is removed from that line of the body (the body lines are then
    re-joined with ``\\r\\n``).

    :param item: article to update in place; a missing or empty ``body_html`` leaves it untouched
    :param kwargs: not used by this function
    :return: the same ``item`` object
    """
    marker = " (dpa) - "
    lines = (item.get("body_html") or "").splitlines()
    # expect the dateline in the first 5 lines, sometimes there is what appears to be a headline preceding it.
    for line_num, line in enumerate(lines[:5]):
        city, source, the_rest = line.partition(marker)
        # partition() returns the text in front of the marker, so the city always starts the line; it is
        # enough to check that the marker was found and that the candidate is not crazy long
        if not source or len(city) >= 20:
            continue

        wanted = city.lower()
        located = [c for c in find_cities() if c["city"].lower() == wanted]
        dateline = item.setdefault("dateline", {})
        # fall back to a minimal UTC location when the city is not in the cities list
        dateline["located"] = (
            located[0] if located else {"city_code": city, "city": city, "tz": "UTC", "dateline": "city"}
        )
        dateline["source"] = "dpa"
        dateline["text"] = city
        # keep exactly what follows the leading dateline; a plain str.replace() would also strip any later
        # repeat of the same "<city> (dpa) - " text from the line
        lines[line_num] = the_rest
        item["body_html"] = "\r\n".join(lines)
        break
    return item
Beispiel #2
0
    def set_dateline(self, item, city=None, text=None):
        """
        Fill in the 'dateline' of the given article, creating the dateline dict if the article has none.

        When a city is given it is looked up (exact, case-sensitive name match) in the result of
        ``find_cities()``. The first match becomes the dateline's 'located' value; when nothing matches a
        default location built from the city name with the 'UTC' timezone is used instead.

        :param item: article.
        :type item: dict
        :param city: Name of the city, if passed the system will search in Cities collection.
        :type city: str
        :param text: dateline in full. For example, "STOCKHOLM, Aug 29, 2014"
        :type text: str
        """

        dateline = item.setdefault('dateline', {})

        if city:
            matches = [entry for entry in find_cities() if entry['city'] == city]
            if matches:
                dateline['located'] = matches[0]
            else:
                # unknown city: store a placeholder location
                dateline['located'] = {'city_code': city, 'city': city, 'tz': 'UTC', 'dateline': 'city'}

        if text:
            dateline['text'] = text
Beispiel #3
0
def dpa_derive_dateline(item, **kwargs):
    """
    Parse a dateline out of the first few lines of a DPA item body.

    DPA content is received in IPTC7901 format. This macro looks at the first five lines of
    ``item['body_html']`` for a line shaped like ``<city> (dpa) - <text>`` (a headline sometimes precedes the
    dateline, so the first line alone is not enough). For the first such line it populates the dateline
    location, source and text, and removes the dateline prefix from that line; the body lines are then joined
    back together with ``\\r\\n``.

    :param item: article to update in place; nothing happens when ``body_html`` is missing or empty
    :param kwargs: not used by this function
    :return: the same ``item`` object
    """
    lines = (item.get('body_html') or '').splitlines()
    # expect the dateline in the first 5 lines, sometimes there is what appears to be a headline preceding it.
    for line_num in range(min(len(lines), 5)):
        city, source, the_rest = lines[line_num].partition(' (dpa) - ')
        # source is empty when the marker is absent; the city is whatever precedes the marker, so it always
        # starts the line and only its length needs checking (must not be crazy long)
        if source and len(city) < 20:
            city_key = city.lower()
            located = [c for c in find_cities() if c['city'].lower() == city_key]
            if 'dateline' not in item:
                item['dateline'] = {}
            if located:
                item['dateline']['located'] = located[0]
            else:
                # city not in the cities list: use a minimal UTC location
                item['dateline']['located'] = {
                    'city_code': city,
                    'city': city,
                    'tz': 'UTC',
                    'dateline': 'city'
                }
            item['dateline']['source'] = 'dpa'
            item['dateline']['text'] = city
            # drop only the leading dateline; replacing every occurrence of the text would also remove
            # a later repeat of "<city> (dpa) - " from the same line
            lines[line_num] = the_rest
            item['body_html'] = '\r\n'.join(lines)
            break
    return item
def noise11_derive_metadata(item, **kwargs):
    """
    Set the category, subject and dateline of an item received from NOISE11.

    By definition anything from NOISE11 will be entertainment, so:

    * when the item has no ``anpa_category``, the active ``e`` entry of the ``categories`` vocabulary is used;
    * when the item has no ``subject``, subject code ``01000000`` is used;
    * the dateline is located in Sydney (looked up among the AU/NSW cities) and its source and text are
      derived from the item's ``source`` and ``firstcreated`` values. An existing dateline keeps its other
      fields but has its source and text overwritten.

    :param item: article to update in place
    :param kwargs: not used by this function
    :return: the updated ``item``, or ``None`` when an exception was raised (it is logged, not propagated)
    """
    try:
        if "anpa_category" not in item:
            category_map = superdesk.get_resource_service("vocabularies").find_one(req=None, _id="categories")
            if category_map:
                map_entry = next(
                    (code for code in category_map["items"] if code["qcode"] == "e" and code["is_active"]), None
                )
                # no active "e" entry: leave the category unset rather than failing on None
                if map_entry:
                    item["anpa_category"] = [{"qcode": "e", "name": map_entry["name"]}]

        if "subject" not in item:
            qcode = "01000000"
            item["subject"] = [{"qcode": qcode, "name": subject_codes[qcode]}]

        cities = find_cities(country_code="AU", state_code="NSW")
        located = [c for c in cities if c["city"].lower() == "sydney"]

        # the dateline can only be built when Sydney is present in the cities list
        if located:
            if "dateline" not in item:
                item["dateline"] = {"date": item["firstcreated"], "located": located[0]}
            item["dateline"]["source"] = item["source"]
            item["dateline"]["text"] = format_dateline_to_locmmmddsrc(
                located[0], get_date(item["firstcreated"]), source=item["source"]
            )

        return item
    except Exception as ex:
        logger.exception(ex)
def update_to_pass_validation(item, **kwargs):
    """
    Adjust a text item so that it will pass publication validation.

    This is a test macro intended for testing auto publishing, that is publishing directly from ingest
    (at the moment virtually all content received from Reuters fails validation). It truncates the slugline
    and headline to the maximum lengths of the first publish validator for text content, and gives an item
    without a dateline a Sydney dateline built from its ``firstcreated`` and ``source`` values.

    :param item: article to update in place
    :param kwargs: not used by this function
    :return: the updated ``item``, or ``None`` when an exception was raised (it is logged, not propagated)
    """
    try:
        lookup = {'act': ITEM_PUBLISH, 'type': CONTENT_TYPE.TEXT}
        validators = superdesk.get_resource_service('validators').get(req=None, lookup=lookup)
        if validators.count():
            schema = validators[0]['schema']
            max_slugline_len = schema['slugline']['maxlength']
            max_headline_len = schema['headline']['maxlength']
            # slicing leaves values that are already short enough unchanged
            item['slugline'] = item['slugline'][:max_slugline_len]
            item['headline'] = item['headline'][:max_headline_len]
        if 'dateline' not in item:
            cities = find_cities(country_code='AU', state_code='NSW')
            located = [c for c in cities if c['city'].lower() == 'sydney']
            # without a Sydney entry there is neither a dateline dict nor a location to fill it from,
            # so the whole dateline is skipped
            if located:
                item['dateline'] = {'date': item['firstcreated'], 'located': located[0]}
                item['dateline']['source'] = item['source']
                item['dateline']['text'] = format_dateline_to_locmmmddsrc(located[0],
                                                                          get_date(item['firstcreated']),
                                                                          source=item['source'])
        return item
    except Exception:
        logging.exception('Test update to pass validation macro exception')
def update_to_pass_validation(item, **kwargs):
    """
    Do what is required to ensure that a text item will pass publication validation.

    This is a test macro used to test auto publishing, that is publishing directly from ingest; at the moment
    virtually all content received from Reuters fails validation. The slugline and headline are cut to the
    ``maxlength`` values found in the schema of the first validator returned for publishing text content. An
    item that has no dateline gets one located in Sydney, with source and text derived from the item's
    ``source`` and ``firstcreated`` values.

    :param item: article to update in place
    :param kwargs: not used by this function
    :return: the updated ``item``, or ``None`` when an exception was raised (it is logged, not propagated)
    """
    try:
        lookup = {'act': ITEM_PUBLISH, 'type': CONTENT_TYPE.TEXT}
        validators = superdesk.get_resource_service('validators').get(req=None, lookup=lookup)
        if validators.count():
            max_slugline_len = validators[0]['schema']['slugline']['maxlength']
            max_headline_len = validators[0]['schema']['headline']['maxlength']
            # a slice never lengthens a string, so no explicit length comparison is needed
            item['slugline'] = item['slugline'][:max_slugline_len]
            item['headline'] = item['headline'][:max_headline_len]

        if 'dateline' not in item:
            cities = find_cities(country_code='AU', state_code='NSW')
            sydney = next((c for c in cities if c['city'].lower() == 'sydney'), None)
            # only create the dateline when Sydney was found; with no location there is no 'dateline'
            # dict to write the source and text into
            if sydney:
                item['dateline'] = {
                    'date': item['firstcreated'],
                    'located': sydney,
                    'source': item['source'],
                }
                item['dateline']['text'] = format_dateline_to_locmmmddsrc(sydney, get_date(item['firstcreated']),
                                                                          source=item['source'])
        return item
    except Exception:
        logging.exception('Test update to pass validation macro exception')
 def dpa_derive_dateline(self, item):
     """
     Try to read a dateline from the first few lines of the item body.

     A line of the form ``<city> (dpa) - <text>`` among the first five body lines supplies the dateline
     location (looked up case-insensitively via ``find_cities()``, with a UTC placeholder when unknown),
     the dateline source ('dpa') and the dateline text (the city name with surrounding whitespace removed).
     The first occurrence of the matched dateline string is removed from the article text.

     :param item: article to update in place
     :return: the same item
     """
     marker = ' (dpa) - '
     # expect the dateline in the first 5 lines, sometimes there is what appears to be a headline preceding it.
     for candidate in item['body_html'].splitlines()[:5]:
         city, source, _ = candidate.partition(marker)
         name = city.strip()
         # skip lines without the marker, not starting with the city, or with a crazy long city
         if not source or not candidate.startswith(city) or len(name) >= 20:
             continue

         matches = [c for c in find_cities() if c['city'].lower() == name.lower()]
         if 'dateline' not in item:
             item['dateline'] = {}
         if matches:
             item['dateline']['located'] = matches[0]
         else:
             item['dateline']['located'] = {'city_code': name, 'city': name, 'tz': 'UTC', 'dateline': 'city'}
         item['dateline']['source'] = 'dpa'
         item['dateline']['text'] = name
         item['body_html'] = item['body_html'].replace(city + source, '', 1)
         break
     return item
Beispiel #8
0
    def ap_derive_dateline(self, item):
        """
        Look for a dateline in the article body and use it.

        The text of the first ``<p>`` of ``body_html`` is expected to look like
        ``<city>[, <date or state>] (AP) _ <text>``; the body must contain at least two paragraphs for the
        lookup to be attempted. The city is matched case-insensitively against ``find_cities()``, falling back
        to a placeholder location with the 'UTC' timezone. The dateline source is the item's
        ``original_source`` (default 'AP') and the dateline text is formatted from the location, the item's
        ``firstcreated`` date and that source.

        :param item: article to update in place
        :return: item populated with a dateline; ``None`` when the candidate city contains a digit or when an
            exception was raised (it is logged, not propagated)
        """
        try:
            html = item.get('body_html')
            if html:
                soup = BeautifulSoup(html, "html.parser")
                pars = soup.findAll('p')
                if len(pars) >= 2:
                    first = pars[0].get_text()
                    city, source, the_rest = first.partition(' (AP) _ ')
                    if source:
                        # sometimes the city is followed by a comma and either a date or a state
                        city = city.split(',')[0]
                        if any(char.isdigit() for char in city):
                            # NOTE(review): this returns None rather than item; kept as is because callers
                            # may rely on it - confirm
                            return
                        cities = find_cities()
                        located = [c for c in cities if c['city'].lower() == city.lower()]
                        source_name = item.get('original_source', 'AP')
                        item.setdefault('dateline', {})
                        item['dateline']['located'] = located[0] if located else {'city_code': city,
                                                                                  'city': city,
                                                                                  'tz': 'UTC',
                                                                                  'dateline': 'city'}
                        item['dateline']['source'] = source_name
                        item['dateline']['text'] = format_dateline_to_locmmmddsrc(item['dateline']['located'],
                                                                                  get_date(item['firstcreated']),
                                                                                  source=source_name)

            return item
        except Exception:
            # only ordinary errors are logged and swallowed; KeyboardInterrupt/SystemExit propagate
            logging.exception('AP dateline extraction exception')
def reuters_derive_dateline(item, **kwargs):
    """
    Derive the dateline from the article body.

    It seems that most locations injected into the item by the parser are Bangalore, so this function looks
    for a dateline of the form ``<city>[, <date or state>] (Reuters) - `` in the article body and uses that.
    The first paragraph is inspected, or the second one when the first contains the item's byline; the body
    must contain at least two paragraphs. An existing dateline located anywhere other than Bangalore is left
    untouched.

    :param item: article to update in place
    :param kwargs: not used by this function
    :return: ``item`` when processing ran to the end; ``None`` when the candidate city contains a digit, when
        a non-Bangalore dateline already exists, or when an exception was raised (it is logged, not propagated)
    """
    try:
        html = item.get('body_html')
        if not html:
            return item

        pars = BeautifulSoup(html, "html.parser").findAll('p')
        if len(pars) < 2:
            return item

        # the byline may occupy the first paragraph, in which case the dateline is in the second
        if BYLINE in item and item.get(BYLINE) in pars[0].get_text():
            first = pars[1].get_text()
        else:
            first = pars[0].get_text()

        city, source, the_rest = first.partition(' (Reuters) - ')
        if not source:
            return item

        # sometimes the city is followed by a comma and either a date or a state
        city = city.split(',')[0]
        if any(char.isdigit() for char in city):
            # NOTE(review): returns None rather than item; kept because callers may rely on it - confirm
            return

        cities = find_cities()
        located = [
            c for c in cities if c['city'].lower() == city.lower()
        ]
        # if not dateline we create one
        if 'dateline' not in item:
            item['dateline'] = {}
        # there is already a dateline that is not Bangalore don't do anything just return
        elif 'located' in item['dateline']:
            # a location without a city counts as "not Bangalore" instead of raising on None.upper()
            current_city = item['dateline']['located'].get('city') or ''
            if current_city.upper() != 'BANGALORE':
                return

        source_name = item.get('original_source', 'Reuters')
        item['dateline']['located'] = located[0] if located else {
            'city_code': city,
            'city': city,
            'tz': 'UTC',
            'dateline': 'city'
        }
        item['dateline']['source'] = source_name
        item['dateline']['text'] = format_dateline_to_locmmmddsrc(
            item['dateline']['located'],
            get_date(item['firstcreated']),
            source=source_name)

        return item
    except Exception:
        # only ordinary errors are logged and swallowed; KeyboardInterrupt/SystemExit propagate
        logging.exception('Reuters dateline macro exception')
def reuters_derive_dateline(item, **kwargs):
    """
    Replace the parser-supplied dateline with one found in the article body.

    It seems that most locations injected into the item by the parser are Bangalore. This function looks for
    a ``<city>[, <date or state>] (Reuters) - `` prefix in the article body and uses that city instead. It
    reads the first paragraph, or the second when the first contains the item's byline, and only does so
    when the body has at least two paragraphs. A dateline that already has a location other than Bangalore
    is not modified.

    :param item: article to update in place
    :param kwargs: not used by this function
    :return: ``item`` when processing ran to the end; ``None`` when the candidate city contains a digit, when
        a non-Bangalore dateline already exists, or when an exception was raised (it is logged, not propagated)
    """
    try:
        html = item.get("body_html")
        if html:
            soup = BeautifulSoup(html, "html.parser")
            pars = soup.findAll("p")
            if len(pars) >= 2:
                # the byline may occupy the first paragraph, in which case the dateline is in the second
                if BYLINE in item and item.get(BYLINE) in pars[0].get_text():
                    first = pars[1].get_text()
                else:
                    first = pars[0].get_text()
                city, source, the_rest = first.partition(" (Reuters) - ")
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(",")[0]
                    if any(char.isdigit() for char in city):
                        # NOTE(review): returns None rather than item; kept because callers may rely on it
                        return
                    cities = find_cities()
                    located = [c for c in cities if c["city"].lower() == city.lower()]
                    # if not dateline we create one
                    if "dateline" not in item:
                        item["dateline"] = {}
                    # there is already a dateline that is not Bangalore don't do anything just return
                    elif "located" in item["dateline"]:
                        # a location without a city counts as "not Bangalore" instead of raising on None
                        current_city = item["dateline"]["located"].get("city") or ""
                        if current_city.upper() != "BANGALORE":
                            return

                    source_name = item.get("original_source", "Reuters")
                    item["dateline"]["located"] = (
                        located[0] if located else {"city_code": city, "city": city, "tz": "UTC", "dateline": "city"}
                    )
                    item["dateline"]["source"] = source_name
                    item["dateline"]["text"] = format_dateline_to_locmmmddsrc(
                        item["dateline"]["located"],
                        get_date(item["firstcreated"]),
                        source=source_name,
                    )

        return item
    except Exception:
        # only ordinary errors are logged and swallowed; KeyboardInterrupt/SystemExit propagate
        logging.exception("Reuters dateline macro exception")
def reuters_derive_dateline(item, **kwargs):
    """
    Use the dateline written in the article body rather than the one supplied by the parser.

    It seems that most locations injected into the item by the parser are Bangalore. This function looks for
    a dateline of the form ``<city>[, <date or state>] (Reuters) - `` in the article body and uses that. The
    first paragraph is examined, or the second when the first contains the item's byline; nothing is done
    unless the body has at least two paragraphs. An existing dateline whose location is not Bangalore is
    kept as it is.

    :param item: article to update in place
    :param kwargs: not used by this function
    :return: ``item`` when processing ran to the end; ``None`` when the candidate city contains a digit, when
        a non-Bangalore dateline already exists, or when an exception was raised (it is logged, not propagated)
    """
    try:
        html = item.get('body_html')
        if not html:
            return item

        # name the parser explicitly so the result does not depend on which parsers are installed
        soup = BeautifulSoup(html, 'html.parser')
        pars = soup.findAll('p')
        if len(pars) < 2:
            return item

        # the byline may occupy the first paragraph, in which case the dateline is in the second
        if BYLINE in item and item.get(BYLINE) in pars[0].get_text():
            first = pars[1].get_text()
        else:
            first = pars[0].get_text()

        city, source, the_rest = first.partition(' (Reuters) - ')
        if not source:
            return item

        # sometimes the city is followed by a comma and either a date or a state
        city = city.split(',')[0]
        if any(char.isdigit() for char in city):
            # NOTE(review): returns None rather than item; kept because callers may rely on it - confirm
            return

        cities = find_cities()
        located = [c for c in cities if c['city'].lower() == city.lower()]
        # if not dateline we create one
        if 'dateline' not in item:
            item['dateline'] = {}
        # there is already a dateline that is not Bangalore don't do anything just return
        elif 'located' in item['dateline']:
            # a location without a city counts as "not Bangalore" instead of raising on None.upper()
            current_city = item['dateline']['located'].get('city') or ''
            if current_city.upper() != 'BANGALORE':
                return

        source_name = item.get('original_source', 'Reuters')
        item['dateline']['located'] = located[0] if located else {'city_code': city,
                                                                  'city': city,
                                                                  'tz': 'UTC',
                                                                  'dateline': 'city'}
        item['dateline']['source'] = source_name
        item['dateline']['text'] = format_dateline_to_locmmmddsrc(item['dateline']['located'],
                                                                  get_date(item['firstcreated']),
                                                                  source=source_name)

        return item
    except Exception:
        # only ordinary errors are logged and swallowed; KeyboardInterrupt/SystemExit propagate
        logging.exception('Reuters dateline macro exception')
    def set_dateline(self, item, city=None, text=None):
        """
        Replace the 'dateline' of the given article with a freshly built one.

        Any existing dateline is discarded. When a city is given it is looked up (exact, case-sensitive name
        match) in the result of ``find_cities()``; the first match becomes the dateline's 'located' value, and
        when nothing matches a default location built from the city name with the 'UTC' timezone is used.

        :param item: article.
        :param city: Name of the city, if passed the system will search in Cities collection.
        :param text: dateline in full. For example, "STOCKHOLM, Aug 29, 2014"
        """

        dateline = {}
        item['dateline'] = dateline

        if city:
            matches = [entry for entry in find_cities() if entry['city'] == city]
            if matches:
                dateline['located'] = matches[0]
            else:
                # unknown city: store a placeholder location
                dateline['located'] = {'city_code': city, 'city': city, 'tz': 'UTC', 'dateline': 'city'}

        if text:
            dateline['text'] = text
def noise11_derive_metadata(item, **kwargs):
    """
    Derive the category, subject and dateline for an item received from NOISE11.

    By definition anything from NOISE11 will be entertainment, so the metadata is set accordingly:

    * an item without ``anpa_category`` gets the active ``e`` entry of the ``categories`` vocabulary;
    * an item without ``subject`` gets subject code ``01000000``;
    * the dateline is located in Sydney (looked up among the AU/NSW cities), with source and text derived
      from the item's ``source`` and ``firstcreated`` values. An existing dateline keeps its other fields
      but has its source and text overwritten.

    :param item: article to update in place
    :param kwargs: not used by this function
    :return: the updated ``item``, or ``None`` when an exception was raised (it is logged, not propagated)
    """
    try:
        if 'anpa_category' not in item:
            category_map = superdesk.get_resource_service(
                'vocabularies').find_one(req=None, _id='categories')
            if category_map:
                map_entry = next(
                    (code for code in category_map['items']
                     if code['qcode'] == 'e' and code['is_active']), None)
                # no active 'e' entry: leave the category unset rather than failing on None
                if map_entry is not None:
                    item['anpa_category'] = [{
                        'qcode': 'e',
                        'name': map_entry['name']
                    }]

        if 'subject' not in item:
            qcode = '01000000'
            item['subject'] = [{'qcode': qcode, 'name': subject_codes[qcode]}]

        cities = find_cities(country_code='AU', state_code='NSW')
        sydney = next((c for c in cities if c['city'].lower() == 'sydney'), None)

        # the dateline can only be built when Sydney is present in the cities list
        if sydney is not None:
            if 'dateline' not in item:
                item['dateline'] = {
                    'date': item['firstcreated'],
                    'located': sydney
                }
            item['dateline']['source'] = item['source']
            item['dateline']['text'] = format_dateline_to_locmmmddsrc(
                sydney, get_date(item['firstcreated']), source=item['source'])

        return item
    except Exception as ex:
        logger.exception(ex)