def _create_item(self, data, field_aliases=None):
    """Build a content item out of a single parsed RSS feed entry.

    Every field listed in ``self.item_fields`` yields one key in the
    result (named after ``field.name``). The value is looked up in
    ``data`` under the field's ``name_in_data``, or under its alias if
    one is configured; entries missing from ``data`` are stored as
    ``None``.

    :param dict data: parsed data of a single feed entry
    :param field_aliases: (optional) field name aliases. Used for content
        fields that are named differently in retrieved data.
    :type field_aliases: dict or None

    :return: created content item
    :rtype: dict
    """
    # NOTE(review): a non-None value is handed to merge_dicts(), which
    # suggests a collection of dicts rather than a single dict - confirm
    # against the callers.
    aliases = {} if field_aliases is None else merge_dicts(field_aliases)

    item = {"type": "text"}

    for spec in self.item_fields:
        source_key = aliases.get(spec.name_in_data, spec.name_in_data)
        value = data.get(source_key)

        # Non-empty values of datetime fields are converted with
        # timegm() + utcfromtimestamp(); empty ones are stored untouched.
        is_timestamp = spec.type is datetime
        if is_timestamp and value:
            value = utcfromtimestamp(timegm(value))

        item[spec.name] = value

    return item
def _create_item(self, data, field_aliases=None):
    """Turn one parsed RSS feed entry into a content item dictionary.

    The resulting item always carries ``type='text'`` plus one key per
    entry of ``self.item_fields``. Values that are absent from ``data``
    end up as ``None``.

    :param dict data: parsed data of a single feed entry
    :param field_aliases: (optional) field name aliases. Used for content
        fields that are named differently in retrieved data.
    :type field_aliases: dict or None

    :return: created content item
    :rtype: dict
    """
    # NOTE(review): the aliases are normalised through merge_dicts(), so
    # the argument is presumably a collection of dicts - verify with the
    # callers.
    if field_aliases is not None:
        alias_map = merge_dicts(field_aliases)
    else:
        alias_map = {}

    def read_value(field):
        """Fetch (and, for datetime fields, convert) a field's value."""
        key = alias_map.get(field.name_in_data, field.name_in_data)
        raw = data.get(key)
        if field.type is datetime and raw:
            return utcfromtimestamp(timegm(raw))
        return raw

    item = {'type': 'text'}
    for field in self.item_fields:
        item[field.name] = read_value(field)
    return item
def _create_item(self, data, field_aliases=None):
    """Create a new content item from RSS feed data.

    One key is written to the item for every considered entry of
    ``self.item_fields``; values missing from ``data`` are stored as
    ``None``. Non-empty values of ``datetime`` fields are converted with
    ``utcfromtimestamp(timegm(value))``.

    :param dict data: parsed data of a single feed entry
    :param field_aliases: (optional) field name aliases. Used for content
        fields that are named differently in retrieved data.
    :type field_aliases: list of {field_name: alias} dictionaries or None

    :return: created content item
    :rtype: dict
    """
    if field_aliases is None:
        field_aliases = {}
    else:
        field_aliases = merge_dicts(field_aliases)

    aliased_fields = set(field_aliases.values())

    item = dict(type=CONTENT_TYPE.TEXT)

    # Only consider fields that are not used as an alias (i.e. used to
    # populate another field) - unless those fields have their own
    # aliases, too.
    # The idea is that if e.g. the main text field is aliased to use the
    # parsed data's summary field, that summary should not be used to
    # populate the field it was originally meant for.
    fields_to_consider = (
        f for f in self.item_fields
        if f.name_in_data not in aliased_fields
        or f.name_in_data in field_aliases
    )

    for field in fields_to_consider:
        data_field_name = field_aliases.get(
            field.name_in_data, field.name_in_data
        )
        field_value = data.get(data_field_name)

        if (field.type is datetime) and field_value:
            field_value = utcfromtimestamp(timegm(field_value))

        item[field.name] = field_value

        # Some feeds use <content:encoded> tag for storing the main content,
        # and that tag is parsed differently. If the body_html has not been
        # found in its default data field and is not aliased, try to
        # populate it using the aforementioned content field as a fallback.
        if (
            field.name == 'body_html' and
            not field_value and
            field.name_in_data not in field_aliases
        ):
            try:
                item['body_html'] = data.content[0].value
            except Exception:
                # Deliberate best-effort: content is either non-existent
                # or parsed differently. ``Exception`` (rather than a bare
                # ``except``) lets KeyboardInterrupt/SystemExit propagate.
                pass

    return item
def _create_item(self, data, field_aliases=None, source='source'):
    """Create a new content item from RSS feed data.

    Besides copying the configured ``self.item_fields`` from ``data``,
    the method:

    * caps converted ``datetime`` values at the current UTC time,
    * sets ``uri`` and ``guid`` from the entry's link (when the entry has
      a link and its guid is not flagged as being the link),
    * prepends a link back to the original article to ``body_html``,
    * fills in ``dateline`` with the source and the creation date.

    :param dict data: parsed data of a single feed entry
    :param field_aliases: (optional) field name aliases. Used for content
        fields that are named differently in retrieved data.
    :type field_aliases: list of {field_name: alias} dictionaries or None
    :param str source: the source of provider

    :return: created content item
    :rtype: dict
    """
    # Imported locally so that this method does not depend on a change to
    # the module-level imports.
    from html import escape

    if field_aliases is None:
        field_aliases = {}
    else:
        field_aliases = merge_dicts(field_aliases)

    aliased_fields = set(field_aliases.values())

    item = dict(type=CONTENT_TYPE.TEXT)

    # Only consider fields that are not used as an alias (i.e. used to
    # populate another field) - unless those fields have their own
    # aliases, too.
    # The idea is that if e.g. the main text field is aliased to use the
    # parsed data's summary field, that summary should not be used to
    # populate the field it was originally meant for.
    fields_to_consider = (
        f for f in self.item_fields
        if f.name_in_data not in aliased_fields
        or f.name_in_data in field_aliases
    )

    utc_now = datetime.utcnow()

    for field in fields_to_consider:
        data_field_name = field_aliases.get(field.name_in_data, field.name_in_data)
        field_value = data.get(data_field_name)

        if (field.type is datetime) and field_value:
            field_value = utcfromtimestamp(timegm(field_value))
            # Dates lying in the future are replaced with "now".
            field_value = utc_now if field_value > utc_now else field_value

        item[field.name] = field_value

        # Some feeds use <content:encoded> tag for storing the main content,
        # and that tag is parsed differently. If the body_html has not been
        # found in its default data field and is not aliased, try to
        # populate it using the aforementioned content field as a fallback.
        if (field.name == 'body_html' and
                not field_value and
                field.name_in_data not in field_aliases):
            try:
                item['body_html'] = data.content[0].value
            except Exception:
                pass  # content either non-existent or parsed differently

    if not data.get('guidislink') and data.get('link'):
        item['uri'] = data['link']
        # Only the network location is needed to build the guid tag.
        netloc = urlsplit(item['uri']).netloc
        if data.get('guid'):
            item['guid'] = generate_tag(domain=netloc, id=data.get('guid'))
        else:
            item['guid'] = generate_tag_from_url(data['link'])

    if item.get('uri', None):
        # The URI and the source name are interpolated into markup, so they
        # are escaped to keep quotes/angle brackets coming from the feed
        # from breaking out of the attribute or the element.
        link_html = '<p><a href="%s" target="_blank">%s</a></p>' % (
            escape(str(item['uri'])), escape(str(source)))
        item['body_html'] = link_html + (item.get('body_html', None) or '')

    # ``firstcreated`` may be present in the item with an empty value (the
    # loop above stores missing data as None), hence ``or`` instead of a
    # ``dict.get`` default, which would only apply to a missing key.
    item['dateline'] = {
        'source': source,
        'date': item.get('firstcreated') or item.get('versioncreated')
    }

    return item
def _create_item(self, data, field_aliases=None, source='source'):
    """Create a new content item from RSS feed data.

    The configured ``self.item_fields`` are copied from ``data`` (with
    converted ``datetime`` values capped at the current UTC time). After
    that ``uri``/``guid`` are derived from the entry's link when the entry
    has one and its guid is not flagged as being the link, a link back to
    the original article is prepended to ``body_html``, and ``dateline``
    is filled in.

    :param dict data: parsed data of a single feed entry
    :param field_aliases: (optional) field name aliases. Used for content
        fields that are named differently in retrieved data.
    :type field_aliases: list of {field_name: alias} dictionaries or None
    :param str source: the source of provider

    :return: created content item
    :rtype: dict
    """
    # Imported locally so that this method does not depend on a change to
    # the module-level imports.
    from html import escape

    if field_aliases is None:
        field_aliases = {}
    else:
        field_aliases = merge_dicts(field_aliases)

    aliased_fields = set(field_aliases.values())

    item = dict(type=CONTENT_TYPE.TEXT)

    # Only consider fields that are not used as an alias (i.e. used to
    # populate another field) - unless those fields have their own
    # aliases, too.
    # The idea is that if e.g. the main text field is aliased to use the
    # parsed data's summary field, that summary should not be used to
    # populate the field it was originally meant for.
    fields_to_consider = (
        f for f in self.item_fields
        if f.name_in_data not in aliased_fields
        or f.name_in_data in field_aliases
    )

    utc_now = datetime.utcnow()

    for field in fields_to_consider:
        data_field_name = field_aliases.get(
            field.name_in_data, field.name_in_data
        )
        field_value = data.get(data_field_name)

        if (field.type is datetime) and field_value:
            field_value = utcfromtimestamp(timegm(field_value))
            # Dates lying in the future are replaced with "now".
            field_value = utc_now if field_value > utc_now else field_value

        item[field.name] = field_value

        # Some feeds use <content:encoded> tag for storing the main content,
        # and that tag is parsed differently. If the body_html has not been
        # found in its default data field and is not aliased, try to
        # populate it using the aforementioned content field as a fallback.
        if (
            field.name == 'body_html' and
            not field_value and
            field.name_in_data not in field_aliases
        ):
            try:
                item['body_html'] = data.content[0].value
            except Exception:
                pass  # content either non-existent or parsed differently

    if not data.get('guidislink') and data.get('link'):
        item['uri'] = data['link']
        # Only the network location is needed to build the guid tag.
        netloc = urlsplit(item['uri']).netloc
        if data.get('guid'):
            item['guid'] = generate_tag(domain=netloc, id=data.get('guid'))
        else:
            item['guid'] = generate_tag_from_url(data['link'])

    if item.get('uri', None):
        # The URI and the source name end up inside markup; escape them so
        # that quotes/angle brackets coming from the feed cannot break out
        # of the attribute or the element.
        link_html = '<p><a href="%s" target="_blank">%s</a></p>' % (
            escape(str(item['uri'])), escape(str(source))
        )
        item['body_html'] = link_html + (item.get('body_html', None) or '')

    # The loop above stores missing data as None, so ``firstcreated`` can
    # exist with an empty value; a ``dict.get`` default would then never be
    # used. ``or`` falls back to ``versioncreated`` in that case as well.
    item['dateline'] = {
        'source': source,
        'date': item.get('firstcreated') or item.get('versioncreated')
    }

    return item