Example No. 1
def pushFeedJobs():
    """
    Push feed processing jobs to job queue.
    """

    jobManager = MinerJobManager()
    feedManager = FeedManager()

    if jobManager.count() > 50:
        logging.info("Skipping. Too many jobs queued already!!")
        return

    logging.info("Getting stale  feeds.")
    staleFeeds = feedManager.getStaleFeeds()

    nStaleFeeds = 0
    for feed in staleFeeds:
        processFeedJob = WorkerJob(JOB_PROCESSFEED,
                                   {JOBARG_PROCESSFEED_FEEDID: feed})
        jobManager.enqueueJob(processFeedJob)
        logging.info("Process feed job put for feedId: %s. Job id: %s", feed,
                     processFeedJob.jobId)
        nStaleFeeds += 1

    logging.info("Number of stale feeds are: %i", nStaleFeeds)
Example No. 2
def processWebFeed(jobId, feed):
  """
  Processes a web-page feed: scrape the page, extract entry links, store them.
  """

  feedAndJobId = "Feed id: " + feed.id + ". Job id: " + jobId

  # get page html
  if FEEDTAG_IS_FEEDPAGE_STATIC in feed.tags:
    pageHtml = getHtmlStatic(feed.tags[FEEDTAG_URL])
  else:
    pageHtml = loadPageAndGetHtml(feed.tags[FEEDTAG_URL])
  logger.info("Got html for web page. %s.", feedAndJobId)

  # load entry selectors
  entrySelectors = json.loads(feed.tags[FEEDTAG_ENTRY_SELECTORS])
  logger.info(
    "Will use %i entry selectors. %s",
    len(entrySelectors),
    feedAndJobId)

  # Use entry selector to get entries
  linksToAdd = []
  for entrySelector in entrySelectors:
    entries = hp.getSubHtmlEntries(jobId, pageHtml, entrySelector['overall'])
    logger.info(
      "Got %i entries for entry selector %s. %s",
      len(entries),
      entrySelector['overall'],
      feedAndJobId)

    # considering only the top 30 entries to reduce load
    for entry in entries[:30]:
      link = _linkFromWebPageEntry(jobId, entry, feed, entrySelector)
      if link:
        linksToAdd.append(link)
        logger.info("Discovered link: %s. %s", link.id, feedAndJobId)

  if not linksToAdd:
    logger.warning("No links found while processing webPage. %s", feedAndJobId)
  else:
    logger.info("Number of links found: %i. %s", len(linksToAdd), feedAndJobId)

  # put links and processLink jobs
  latestPubTime = _putNewLinks(feedAndJobId, linksToAdd)
  if latestPubTime > 0:
    feed.tags[FEEDTAG_LASTPUBDATE] = latestPubTime

  # update the feed after a successful poll
  feedManager = FeedManager()
  feedManager.updateFeedOnSuccessfullPoll(feed)
  logger.info(
    "Feed updated after being successfully processed. %s.",
    feedAndJobId)

  logger.info("Completed processing webPage feed. %s.", feedAndJobId)
Example No. 3
def processFeed(jobId, feedId):
  """
  Fetches the feed from the database and dispatches it to the RSS or web-page
  processor based on its type tag.
  """

  feedAndJobId = "Feed id: " + feedId + ". Job id: " + jobId
  logger.info("Started processing feed. %s.", feedAndJobId)

  # get the feed
  feedManager = FeedManager()
  feed = feedManager.get(feedId)
  logger.info("Got feed from database. %s.", feedAndJobId)

  if feed.tags[FEEDTAG_TYPE] == FEEDTYPE_RSS:
    processRssFeed(jobId, feed)
  elif feed.tags[FEEDTAG_TYPE] == FEEDTYPE_WEBPAGE:
    processWebFeed(jobId, feed)
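
processFeed is the consumer side of the jobs that Example No. 1 enqueues as WorkerJob(JOB_PROCESSFEED, {JOBARG_PROCESSFEED_FEEDID: feed}). A minimal worker-loop sketch; the dequeueJob method and the jobName/jobParams attributes are assumptions, since only count, enqueueJob, and jobId appear in these examples:

def workerLoop():
  jobManager = MinerJobManager()
  while True:
    job = jobManager.dequeueJob()  # assumed API, not shown in these examples
    if job is None:
      break
    if job.jobName == JOB_PROCESSFEED:  # jobName/jobParams names are assumptions
      processFeed(job.jobId, job.jobParams[JOBARG_PROCESSFEED_FEEDID])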
Example No. 4
def processRssFeed(jobId, feed):
  """
  Processes an RSS feed (takes the feed object as input).
  """

  feedAndJobId = "Feed id: " + feed.id + ". Job id: " + jobId

  # compute the last pubDate
  lastPubDate = 0
  if FEEDTAG_LASTPUBDATE in feed.tags:
    lastPubDate = feed.tags[FEEDTAG_LASTPUBDATE]

  # get all feed entries since last poll time
  parsedFeed = feedparser.parse(feed.tags[FEEDTAG_URL])
  newEntries = [entry for entry in parsedFeed.entries
                if not entry.get('published_parsed')
                or entry.published_parsed > time.gmtime(lastPubDate)]
  logger.info("Got %i new entries. %s", len(newEntries), feedAndJobId)

  # for each new entry, add a link to the link database and a processLink job
  linksToAdd = []
  for entry in newEntries:
    link = _linkFromFeedEntry(jobId, entry, feed)
    if link:
      linksToAdd.append(link)
  latestPubTime = _putNewLinks(feedAndJobId, linksToAdd)

  # last step: update the feed on successful completion of the poll
  if latestPubTime > 0:
    feed.tags[FEEDTAG_LASTPUBDATE] = latestPubTime

  feedManager = FeedManager()
  feedManager.updateFeedOnSuccessfullPoll(feed)
  logger.info(
    "Feed updated after being successfully processed. %s.",
    feedAndJobId)

  logger.info("Completed processing rss feed. %s.", feedAndJobId)