def create_tag_items(self, task_id, item_id):
    """Build the FeedEntryTag items for one feed entry.

    The first item always carries the fixed tag 'NRC'. It is followed by one
    item per row returned by ``self.db.loadNrcTags(task_id)``; each of those
    takes its 'tag' and 'comment' values from the row.

    Returns the loaded items as a list, with the 'NRC' item first.
    """
    def build(fields):
        # Fill a fresh loader with (field, value) pairs in the given order.
        loader = ItemLoader(FeedEntryTag())
        for field, value in fields:
            loader.add_value(field, value)
        return loader.load_item()

    # The fixed 'NRC' tag is loaded without any 'comment' value.
    items = [build([('feed_entry_id', item_id), ('tag', 'NRC')])]

    for row in self.db.loadNrcTags(task_id):
        items.append(build([
            ('feed_entry_id', item_id),
            ('tag', row['tag']),
            ('comment', row['comment']),
        ]))
    return items
def create_tag(self, feed_entry_id, tag, comment=''):
    """Return a loaded FeedEntryTag item for the given entry, tag and comment.

    ``comment`` defaults to the empty string and is always handed to the
    loader, even when the caller does not supply one.
    """
    # TODO: create tags  (original note kept; its intent is not evident here)
    loader = ItemLoader(FeedEntryTag())
    for field, value in (('feed_entry_id', feed_entry_id),
                         ('tag', tag),
                         ('comment', comment)):
        loader.add_value(field, value)
    return loader.load_item()
def process_item(self, task_id):
    """Read the RSS feed for ``task_id`` and yield items for each new entry.

    The feed record comes from ``self.db.getRssFeeds(task_id)`` and its URL
    is parsed with ``feedparser``. For every entry for which
    ``self.db.rssFeedItemExists`` is false, this generator yields, in order:

    * an RssFeedItem holding the pickled raw entry;
    * a FeedEntry (id, title, incident_datetime, content, lat, lng,
      kml_url, link, source_id);
    * one FeedEntryTag per entry in ``item['tags']`` (if present), followed
      by one FeedEntryTag carrying ``feed['tag']``.

    An entry with no georeference, or with one that does not split into at
    least two non-empty components, is logged at WARNING level and skipped.
    Its RssFeedItem has already been yielded by then, but no FeedEntry or
    tags are produced for it.

    ``self.item_completed(task_id)`` is called once after the last entry.
    """
    feed = self.db.getRssFeeds(task_id)
    self.log('processsing rss feed %s (%s)' % (feed['id'], feed['url']),
             log.INFO)

    # parse feed
    feed_data = feedparser.parse(feed['url'])
    self.log('reading %s feed items' % (len(feed_data['items'])), log.INFO)

    # update last read time
    self.db.updateRssFeedLastRead(task_id)

    for item in feed_data['items']:
        # Skip entries that have already been processed.
        if self.db.rssFeedItemExists(item['id']):
            self.log(
                '%s - feed item already exists - skipping' % (item['id']),
                log.INFO)
            continue

        # Store the full raw entry.
        # NOTE(review): the entry comes from an external feed and is pickled
        # here; whatever reads it back should only unpickle from a trusted
        # store.
        raw_loader = ItemLoader(RssFeedItem())
        raw_loader.add_value('item_id', item['id'])
        raw_loader.add_value('content', psycopg2.Binary(pickle.dumps(item)))
        raw_loader.add_value('feed_id', task_id)
        yield raw_loader.load_item()

        feed_entry_id = self.db.uuid5_str(name=str(item['id']))

        entry_loader = ItemLoader(FeedEntry())
        entry_loader.add_value('id', feed_entry_id)
        entry_loader.add_value('title', item['title'])
        entry_loader.add_value('incident_datetime',
                               format_datetime(item['updated_parsed']))

        # Prefer the full content parts; fall back to the summary.
        if 'content' in item:
            for part in item['content']:
                entry_loader.add_value('content', part['value'])
        elif 'summary' in item:
            entry_loader.add_value('content', item['summary'])

        embedded_fields = self.extractContentFields(
            entry_loader.get_output_value('content'))

        # A location embedded in the content wins over the georss point.
        pt = embedded_fields.get('location') or item.get('georss_point')
        if not pt:
            self.log('%s - No georeference found' % (item['id']),
                     log.WARNING)
            continue

        # Split "lat lng" / "lat,lng". Empty pieces produced by leading or
        # trailing separators are dropped so they are never taken for a
        # coordinate, and a point with fewer than two components is skipped
        # instead of raising IndexError and aborting the rest of the feed.
        coords = [piece for piece in re.split("[, ]+", pt) if piece]
        if len(coords) < 2:
            self.log('%s - Malformed georeference %r' % (item['id'], pt),
                     log.WARNING)
            continue
        entry_loader.add_value('lat', coords[0])
        entry_loader.add_value('lng', coords[1])

        entry_loader.add_value('kml_url', embedded_fields.get('kml') or '')

        for link in item['links']:
            if link['rel'] == 'alternate':
                entry_loader.add_value('link', link['href'])

        entry_loader.add_value('source_id', feed['source_id'])
        yield entry_loader.load_item()

        # One tag item per tag carried by the entry itself.
        if 'tags' in item:
            for entry_tag in item['tags']:
                tag_loader = ItemLoader(FeedEntryTag())
                tag_loader.add_value('feed_entry_id', feed_entry_id)
                tag_loader.add_value('tag', entry_tag['term'])
                tag_loader.add_value('comment', entry_tag['label'])
                yield tag_loader.load_item()

        # Every entry also gets the feed's own tag (no comment value).
        tag_loader = ItemLoader(FeedEntryTag())
        tag_loader.add_value('feed_entry_id', feed_entry_id)
        tag_loader.add_value('tag', feed['tag'])
        yield tag_loader.load_item()

    # update task status
    self.item_completed(task_id)