import json
import logging
import time

import feedparser

# MinerJobManager, FeedManager, WorkerJob, the FEEDTAG_*/FEEDTYPE_*/JOB*
# constants, hp, getHtmlStatic, loadPageAndGetHtml and logger are assumed
# to be provided by the surrounding package.


def pushFeedJobs():
    """
    Push feed processing jobs to job queue.
    """

    jobManager = MinerJobManager()
    feedManager = FeedManager()

    # avoid flooding the queue; 50 is the backlog threshold used here
    if jobManager.count() > 50:
        logging.info("Skipping: too many jobs already queued.")
        return

    logging.info("Getting stale feeds.")
    staleFeeds = feedManager.getStaleFeeds()

    nStaleFeeds = 0
    for feedId in staleFeeds:
        processFeedJob = WorkerJob(
            JOB_PROCESSFEED,
            {JOBARG_PROCESSFEED_FEEDID: feedId})
        jobManager.enqueueJob(processFeedJob)
        logging.info(
            "Process feed job put for feedId: %s. Job id: %s",
            feedId,
            processFeedJob.jobId)
        nStaleFeeds += 1

    logging.info("Number of stale feeds: %i", nStaleFeeds)
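# A hypothetical sketch of the consuming side of this queue, showing how the
# jobs enqueued above are assumed to be dispatched. dequeueJob() and the
# job.jobName/job.jobParams attributes are assumptions inferred from how
# WorkerJob and enqueueJob() are used in this module, not confirmed API.
def _feedWorkerLoopSketch():
    jobManager = MinerJobManager()
    while True:
        job = jobManager.dequeueJob()  # assumed: blocks until a job arrives
        if job.jobName == JOB_PROCESSFEED:
            processFeed(job.jobId, job.jobParams[JOBARG_PROCESSFEED_FEEDID])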
def processWebFeed(jobId, feed):
    """
    Processes a webpage feed: scrapes the feed page, extracts entry links
    and enqueues jobs for the newly discovered links.
    """

    feedAndJobId = "Feed id: " + feed.id + ". Job id: " + jobId

    # get page html
    pageHtml = ""
    if FEEDTAG_IS_FEEDPAGE_STATIC in feed.tags:
        pageHtml = getHtmlStatic(feed.tags[FEEDTAG_URL])
    else:
        pageHtml = loadPageAndGetHtml(feed.tags[FEEDTAG_URL])
    logger.info("Got html for web page. %s.", feedAndJobId)

    # load entry selectors
    entrySelectors = json.loads(feed.tags[FEEDTAG_ENTRY_SELECTORS])
    logger.info(
        "Will use %i entry selectors. %s",
        len(entrySelectors),
        feedAndJobId)

    # use each entry selector to get entries
    linksToAdd = []
    for entrySelector in entrySelectors:
        entries = hp.getSubHtmlEntries(jobId, pageHtml, entrySelector['overall'])
        logger.info(
            "Got %i entries for entry selector %s. %s",
            len(entries),
            entrySelector['overall'],
            feedAndJobId)

        # consider only the top 30 entries to reduce load
        for entry in entries[:30]:
            link = _linkFromWebPageEntry(jobId, entry, feed, entrySelector)
            if link:
                linksToAdd.append(link)
                logger.info("Discovered link: %s. %s", link.id, feedAndJobId)

    if not linksToAdd:
        logger.warning("No links found while processing webPage. %s", feedAndJobId)
    else:
        logger.info("Number of links found: %i. %s", len(linksToAdd), feedAndJobId)

    # put links and processLink jobs
    latestPubTime = _putNewLinks(feedAndJobId, linksToAdd)
    if latestPubTime > 0:
        feed.tags[FEEDTAG_LASTPUBDATE] = latestPubTime

    # update feed on successful poll
    feedManager = FeedManager()
    feedManager.updateFeedOnSuccessfullPoll(feed)
    logger.info(
        "Feed updated after being successfully processed. %s.", feedAndJobId)

    logger.info("Completed processing webPage feed. %s.", feedAndJobId)
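# A minimal sketch of the _putNewLinks helper used above and in
# processRssFeed below, inferred from its call sites: it is assumed to store
# each discovered link, enqueue a processLink job for it, and return the most
# recent publication time seen (0 when no links carry one). LinkManager,
# JOB_PROCESSLINK, JOBARG_PROCESSLINK_LINKID, LINKTAG_PUBTIME and the
# dict-like link.tags shape are illustrative assumptions, not confirmed
# identifiers from this codebase.
def _putNewLinksSketch(feedAndJobId, links):
    linkManager = LinkManager()
    jobManager = MinerJobManager()

    latestPubTime = 0
    for link in links:
        linkManager.put(link)
        processLinkJob = WorkerJob(
            JOB_PROCESSLINK,
            {JOBARG_PROCESSLINK_LINKID: link.id})
        jobManager.enqueueJob(processLinkJob)
        logger.info("Process link job put for linkId: %s. %s", link.id, feedAndJobId)
        latestPubTime = max(latestPubTime, link.tags.get(LINKTAG_PUBTIME, 0))

    return latestPubTime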
def processFeed(jobId, feedId):
    """
    Fetches the feed with the given id and dispatches it to the processor
    matching its type.
    """

    feedAndJobId = "Feed id: " + feedId + ". Job id: " + jobId
    logger.info("Started processing feed. %s.", feedAndJobId)

    # get the feed
    feedManager = FeedManager()
    feed = feedManager.get(feedId)
    logger.info("Got feed from database. %s.", feedAndJobId)

    if feed.tags[FEEDTAG_TYPE] == FEEDTYPE_RSS:
        processRssFeed(jobId, feed)
    elif feed.tags[FEEDTAG_TYPE] == FEEDTYPE_WEBPAGE:
        processWebFeed(jobId, feed)
    else:
        logger.warning(
            "Unknown feed type: %s. %s", feed.tags[FEEDTAG_TYPE], feedAndJobId)
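# An illustrative example of the minimal tag shape processFeed() relies on
# for dispatch; the URL value is a made-up placeholder, not a real feed.
_EXAMPLE_RSS_FEED_TAGS = {
    FEEDTAG_TYPE: FEEDTYPE_RSS,
    FEEDTAG_URL: "https://example.com/feed.xml",
}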
def processRssFeed(jobId, feed):
    """
    Processes an RSS feed (takes the feed object as input).
    """

    feedAndJobId = "Feed id: " + feed.id + ". Job id: " + jobId

    # compute the last pubDate
    lastPubDate = 0
    if FEEDTAG_LASTPUBDATE in feed.tags:
        lastPubDate = feed.tags[FEEDTAG_LASTPUBDATE]

    # get all feed entries published since the last poll; entries without a
    # published date are always treated as new
    parsedFeed = feedparser.parse(feed.tags[FEEDTAG_URL])
    newEntries = [
        entry for entry in parsedFeed.entries
        if not entry.get('published_parsed')
        or entry.published_parsed > time.gmtime(lastPubDate)]
    logger.info("Got %i new entries. %s", len(newEntries), feedAndJobId)

    # for each entry, add a link to the link database and a process link job
    linksToAdd = []
    for entry in newEntries:
        link = _linkFromFeedEntry(jobId, entry, feed)
        if link:
            linksToAdd.append(link)

    latestPubTime = _putNewLinks(feedAndJobId, linksToAdd)

    # last step: update the feed on successful completion of the poll
    if latestPubTime > 0:
        feed.tags[FEEDTAG_LASTPUBDATE] = latestPubTime

    feedManager = FeedManager()
    feedManager.updateFeedOnSuccessfullPoll(feed)
    logger.info(
        "Feed updated after being successfully processed. %s.", feedAndJobId)

    logger.info("Completed processing rss feed. %s.", feedAndJobId)
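# A small illustration of the struct_time comparison used in the entry filter
# above: feedparser's published_parsed and time.gmtime() both yield
# time.struct_time, which compares field by field like a tuple, so a later
# UTC timestamp compares greater. The epoch values are arbitrary examples.
def _pubTimeComparisonExample():
    older = time.gmtime(1000000000)
    newer = time.gmtime(1000000600)
    assert newer > older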