def _processHtmlForLink(jobId, link, publisher):
    """
    Extracts text and images for a link, trying static html first and falling
    back to dynamic html when the static result is too thin.
    """

    linkAndJobId = "Link id: " + link.id + ". Job id: " + jobId
    linkId = link.id

    pageStaticHtml = link.getHtmlStatic()
    logger.info("Got static html for the link. %s.", linkAndJobId)
    resultStatic = hp.processHtml(
        jobId, pageStaticHtml, publisher.tags[PUBLISHERTAG_TEXTSELECTOR],
        json.loads(publisher.tags[PUBLISHERTAG_IMAGESELECTORS]), linkId)

    # Use the static result only if it yielded enough text and at least one image.
    if len(resultStatic[0]) > 500 and len(resultStatic[1]) > 0:
        logger.info("Text and images extracted using static html. %s.",
                    linkAndJobId)
        return resultStatic

    pageDynamicHtml = link.getHtmlDynamic()
    logger.info("Got dynamic html for the link. %s.", linkAndJobId)
    resultDynamic = hp.processHtml(
        jobId, pageDynamicHtml, publisher.tags[PUBLISHERTAG_TEXTSELECTOR],
        json.loads(publisher.tags[PUBLISHERTAG_IMAGESELECTORS]), linkId)

    # Keep whichever extraction produced more text and merge the image lists.
    text = resultDynamic[0]
    if len(text) < len(resultStatic[0]):
        text = resultStatic[0]
    images = list(set(resultStatic[1] + resultDynamic[1]))
    return (text, images)
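
# The calls above assume hp.processHtml returns a (text, imageUrls) tuple, taking a
# CSS text selector and a list of image selectors (the publisher tags hold the text
# selector directly and the image selectors as a JSON-encoded list). The sketch below
# is a hypothetical illustration of that assumed contract using BeautifulSoup; it is
# not the real hp module.
from bs4 import BeautifulSoup


def _exampleProcessHtml(html, textSelector, imageSelectors):
    soup = BeautifulSoup(html, "html.parser")
    # Concatenate the text of every node matching the text selector.
    text = " ".join(
        node.get_text(" ", strip=True) for node in soup.select(textSelector))
    # Collect the src of every node matched by any of the image selectors.
    images = [img.get("src")
              for selector in imageSelectors
              for img in soup.select(selector)
              if img.get("src")]
    return (text, images)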
def _retrieveNewTagsFromFeedEntry(jobId, entry):
  """
  Process the summary detail of rss feed entry.
  Computes tags for the link object being prepared from the feed entry.
  """

  newTags = {};

  # add title
  newTags[LINKTAG_TITLE] = entry.title

  # add summary and image tags
  processingResult = hp.processHtml(
      jobId,
      entry.summary,
      ":not(script)",
      ["img"])
  newTags[LINKTAG_SUMMARY] = entry.summary
  newTags[LINKTAG_SUMMARYTEXT] = processingResult[0]
  newTags[LINKTAG_SUMMARYIMAGES] = processingResult[1]

  if entry.published_parsed:
    newTags[LINKTAG_PUBTIME] = calendar.timegm(entry.published_parsed)
  else:
    newTags[LINKTAG_PUBTIME] = int(time.time())

  newTags[LINKTAG_ISPROCESSED] = 'false'
  return newTags
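
# A minimal usage sketch for the function above, assuming this module's constants and
# the hp helper are importable. feedparser is assumed to provide entries exposing the
# title, summary and published_parsed attributes read above; the feed url is a
# placeholder.
def _exampleRetrieveTagsForFeed():
  import uuid

  import feedparser

  jobId = str(uuid.uuid4())
  feed = feedparser.parse("http://example.com/rss")
  # Compute the new link tags for every entry in the feed.
  return [_retrieveNewTagsFromFeedEntry(jobId, entry) for entry in feed.entries]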
def _linkFromWebPageEntry(jobId, entry, feed, entrySelector):
  """
  Creates a link from a web page entry.
  """

  # Propagate tags from feed to link object
  linkTags = dict(feed.tags)
  _deleteUnecessaryFeedTags(linkTags)

  # Try and extract the title. If unsuccessful, just return None.
  extractTitleResult = hp.extractLink(
    jobId,
    entry,
    entrySelector['title'],
    feed.tags[FEEDTAG_URL])
  if not extractTitleResult:
    return None
  link = extractTitleResult[0]
  linkTags[LINKTAG_TITLE] = extractTitleResult[1]

  if 'titleText' in entrySelector:
    logger.info("titleText selector specified. Using it. %s", jobId)
    linkTags[LINKTAG_TITLE] = hp.extractText(
      jobId,
      entry,
      entrySelector['titleText'],
      None)
    if not linkTags[LINKTAG_TITLE]:
      return None

  # add summary and image tags
  processingResult = hp.processHtml(
      jobId,
      entry,
      entrySelector['summary'],
      entrySelector['image'],
      feed.tags[FEEDTAG_URL])
  linkTags[LINKTAG_SUMMARY] = entry
  linkTags[LINKTAG_SUMMARYTEXT] = processingResult[0]
  linkTags[LINKTAG_SUMMARYIMAGES] = processingResult[1]

  linkTags[LINKTAG_PUBTIME] = int(time.time())
  linkTags[LINKTAG_ISPROCESSED] = 'false'

  try:
    # Return the final link object
    return Link(link, linkTags)
  except Exception as e:
    logger.info("Could not open link %s. Error: %s. Job id: %s", link, str(e), jobId)
    return None
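
# For reference, the entrySelector consumed above is assumed to be a dict of CSS
# selectors keyed by 'title', 'summary' and 'image', with an optional 'titleText'
# override. The selector values below are purely illustrative.
_exampleEntrySelector = {
  'title': "h2 a",
  'titleText': "h2 a span",  # optional: overrides the title extracted from the anchor
  'summary': "div.teaser",
  'image': ["img.lead"],
}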
def processLink(jobId, linkId):
    """
  Processes a link(takes as input the linkId)

  Steps:
  1. get link from database
  2. get publisher for that link from database
  3. get html for that link
  4. process that html to generate doc
  5. save that doc in docstore.
  6. update the link's is processed tag.
  """

    linkAndJobId = "Link id: " + linkId + ". Job id: " + jobId
    logger.info("Started processing link. %s.", linkAndJobId)

    # get the link
    linkManager = LinkManager()
    link = linkManager.get(linkId)
    logger.info("Got link from database. %s.", linkAndJobId)

    # get the publisher
    publisherManager = PublisherManager()
    publisher = publisherManager.get(link.tags[TAG_PUBLISHER])
    logger.info("Got publisher from database. Publisher id: %s. %s.",
                link.tags[TAG_PUBLISHER], linkAndJobId)

    # get html for the link
    pageHtml = link.getHtml()
    logger.info("Got html for the link. %s.", linkAndJobId)

    # process that html
    processingResult = hp.processHtml(
        jobId, pageHtml, publisher.tags[PUBLISHERTAG_TEXTSELECTOR],
        json.loads(publisher.tags[PUBLISHERTAG_IMAGESELECTORS]), linkId)
    if not processingResult[0]:
        logger.warning("No text extracted for the link. %s.", linkAndJobId)

    # generate corresponding doc
    doc = Doc(_getDocKey(link), processingResult[0], link.tags)
    doc.tags[TAG_IMAGES] = processingResult[1]
    doc.tags[DOCTAG_URL] = linkId
    doc.tags[TAG_PUBLISHER_DETAILS] = _getPublisherDetails(publisher)
    if LINKTAG_SUMMARYTEXT not in doc.tags:
        doc.tags[LINKTAG_SUMMARYTEXT] = doc.content[:200]
    doc = _addTranslationTags(jobId, doc)

    # save the doc
    docManager = DocManager()
    docManager.put(doc)
    logger.info("Document generated and saved for link. Doc key %s. %s.",
                doc.key, linkAndJobId)

    # update the doc key in links table
    link.tags[LINKTAG_DOCKEY] = doc.key
    linkManager.put(link)

    # put parse doc job
    parseDocJob = WorkerJob(JOB_PARSEDOC, {JOBARG_PARSEDOC_DOCID: doc.key})
    jobManager = MinerJobManager()
    jobManager.enqueueJob(parseDocJob)
    logger.info("Parse doc job with with jobId '%s' put. %s.",
                parseDocJob.jobId, linkAndJobId)

    if FEEDTAG_DO_NOT_CLUSTER not in doc.tags:
        newCluster = Cluster([doc.key])
        processNewClusterJob = WorkerJob(
            JOB_PROCESSNEWCLUSTER,
            {JOBARG_PROCESSNEWCLUSTER_CLUSTER: list(newCluster)})
        clusterJobManager = ClusterJobManager()
        clusterJobManager.enqueueJob(processNewClusterJob)
        logger.info(
            "Put process new cluster job for new doc. Cluster id: %s. %s",
            newCluster.id, linkAndJobId)

    # update the link
    link.tags[LINKTAG_ISPROCESSED] = 'true'
    linkManager.put(link)
    logger.info("Link updated after being successfully processed. %s.",
                linkAndJobId)

    logger.info("Completed processing link. %s.", linkAndJobId)