def _processHtmlForLink(jobId, link, publisher):
    linkAndJobId = "Link id: " + link.id + ". Job id: " + jobId
    linkId = link.id

    # First try extracting text and images from the static html
    pageStaticHtml = link.getHtmlStatic()
    logger.info("Got static html for the link. %s.", linkAndJobId)

    resultStatic = hp.processHtml(
        jobId,
        pageStaticHtml,
        publisher.tags[PUBLISHERTAG_TEXTSELECTOR],
        json.loads(publisher.tags[PUBLISHERTAG_IMAGESELECTORS]),
        linkId)

    if len(resultStatic[0]) > 500 and len(resultStatic[1]) > 0:
        logger.info("Text and images extracted using static html. %s.", linkAndJobId)
        return resultStatic

    # Static extraction was insufficient; fall back to the dynamic html
    pageDynamicHtml = link.getHtmlDynamic()
    logger.info("Got dynamic html for the link. %s.", linkAndJobId)

    resultDynamic = hp.processHtml(
        jobId,
        pageDynamicHtml,
        publisher.tags[PUBLISHERTAG_TEXTSELECTOR],
        json.loads(publisher.tags[PUBLISHERTAG_IMAGESELECTORS]),
        linkId)

    # Keep the longer of the two extracted texts and merge the image lists
    text = resultDynamic[0]
    if len(text) < len(resultStatic[0]):
        text = resultStatic[0]
    images = list(set(resultStatic[1] + resultDynamic[1]))

    return (text, images)

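# Illustrative sketch (not part of the original module): the publisher tags read
# above are assumed to look roughly like the following. Only the tag names and
# the fact that PUBLISHERTAG_IMAGESELECTORS holds a JSON-encoded list (it is
# passed through json.loads) come from the code; the selector values are made up.
#
#   publisher.tags = {
#       PUBLISHERTAG_TEXTSELECTOR: "div.article-body",               # example text selector
#       PUBLISHERTAG_IMAGESELECTORS: '["img.lead", "figure img"]',   # example JSON list of image selectors
#   }
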
def _retrieveNewTagsFromFeedEntry(jobId, entry):
    """
    Processes the summary detail of an rss feed entry.
    Computes tags for the link object being prepared from the feed entry.
    """

    newTags = {}

    # add title
    newTags[LINKTAG_TITLE] = entry.title

    # add summary and image tags
    processingResult = hp.processHtml(
        jobId,
        entry.summary,
        ":not(script)",
        ["img"])
    newTags[LINKTAG_SUMMARY] = entry.summary
    newTags[LINKTAG_SUMMARYTEXT] = processingResult[0]
    newTags[LINKTAG_SUMMARYIMAGES] = processingResult[1]

    # add publication time; fall back to the current time if the entry
    # does not carry one
    if entry.published_parsed:
        newTags[LINKTAG_PUBTIME] = calendar.timegm(entry.published_parsed)
    else:
        newTags[LINKTAG_PUBTIME] = int(time.time())

    newTags[LINKTAG_ISPROCESSED] = 'false'

    return newTags

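# Usage sketch (assumption, not part of the original module): the entry passed in
# only needs to expose .title, .summary and .published_parsed, which matches what
# the feedparser library provides for each item of a parsed feed.
#
#   import feedparser
#
#   parsed = feedparser.parse(feed.tags[FEEDTAG_URL])
#   for entry in parsed.entries:
#       newTags = _retrieveNewTagsFromFeedEntry(jobId, entry)
#       # newTags now carries title, summary text/images and publication time
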
def _linkFromWebPageEntry(jobId, entry, feed, entrySelector):
    """
    Creates a link from a web page entry.
    """

    # Propagate tags from feed to link object
    linkTags = dict(feed.tags)
    _deleteUnecessaryFeedTags(linkTags)

    # Try to extract the title. If unsuccessful, just return None.
    extractTitleResult = hp.extractLink(
        jobId,
        entry,
        entrySelector['title'],
        feed.tags[FEEDTAG_URL])
    if not extractTitleResult:
        return None
    link = extractTitleResult[0]
    linkTags[LINKTAG_TITLE] = extractTitleResult[1]

    # If a dedicated titleText selector is configured, prefer it for the title
    if 'titleText' in entrySelector:
        logger.info("titleText selector specified. Using it. %s", jobId)
        linkTags[LINKTAG_TITLE] = hp.extractText(
            jobId,
            entry,
            entrySelector['titleText'],
            None)

    if not linkTags[LINKTAG_TITLE]:
        return None

    # add summary and image tags
    processingResult = hp.processHtml(
        jobId,
        entry,
        entrySelector['summary'],
        entrySelector['image'],
        feed.tags[FEEDTAG_URL])
    linkTags[LINKTAG_SUMMARY] = entry
    linkTags[LINKTAG_SUMMARYTEXT] = processingResult[0]
    linkTags[LINKTAG_SUMMARYIMAGES] = processingResult[1]

    linkTags[LINKTAG_PUBTIME] = int(time.time())
    linkTags[LINKTAG_ISPROCESSED] = 'false'

    try:
        # Return the final link object
        return Link(link, linkTags)
    except Exception:
        logger.info("Could not open link %s. Job id: %s", link, jobId)
        return None

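# Illustrative sketch (assumption): the entrySelector dictionary is expected to
# carry 'title', 'summary' and 'image' keys, plus an optional 'titleText' key,
# as read by the function above. The selector values shown are hypothetical.
#
#   entrySelector = {
#       'title': "h3 a",            # selector used to extract the link and title
#       'titleText': "h3 a span",   # optional, overrides the extracted title text
#       'summary': "p.teaser",      # selector for the summary html
#       'image': ["img"],           # image selector(s)
#   }
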
def processLink(jobId, linkId):
    """
    Processes a link (takes as input the linkId).

    Steps:
    1. get link from database
    2. get publisher for that link from database
    3. get html for that link
    4. process that html to generate doc
    5. save that doc in docstore
    6. update the link's is processed tag
    """

    linkAndJobId = "Link id: " + linkId + ". Job id: " + jobId
    logger.info("Started processing link. %s.", linkAndJobId)

    # get the link
    linkManager = LinkManager()
    link = linkManager.get(linkId)
    logger.info("Got link from database. %s.", linkAndJobId)

    # get the publisher
    publisherManager = PublisherManager()
    publisher = publisherManager.get(link.tags[TAG_PUBLISHER])
    logger.info(
        "Got publisher from database. Publisher id: %s. %s.",
        link.tags[TAG_PUBLISHER],
        linkAndJobId)

    # get html for the link
    pageHtml = link.getHtml()
    logger.info("Got html for the link. %s.", linkAndJobId)

    # process that html
    processingResult = hp.processHtml(
        jobId,
        pageHtml,
        publisher.tags[PUBLISHERTAG_TEXTSELECTOR],
        json.loads(publisher.tags[PUBLISHERTAG_IMAGESELECTORS]),
        linkId)
    if not processingResult[0]:
        logger.warning("No text extracted for the link. %s.", linkAndJobId)

    # generate corresponding doc
    doc = Doc(_getDocKey(link), processingResult[0], link.tags)
    doc.tags[TAG_IMAGES] = processingResult[1]
    doc.tags[DOCTAG_URL] = linkId
    doc.tags[TAG_PUBLISHER_DETAILS] = _getPublisherDetails(publisher)
    if LINKTAG_SUMMARYTEXT not in doc.tags:
        doc.tags[LINKTAG_SUMMARYTEXT] = doc.content[:200]
    doc = _addTranslationTags(jobId, doc)

    # save the doc
    docManager = DocManager()
    docManager.put(doc)
    logger.info(
        "Document generated and saved for link. Doc key %s. %s.",
        doc.key,
        linkAndJobId)

    # update the doc key in links table
    link.tags[LINKTAG_DOCKEY] = doc.key
    linkManager.put(link)

    # put parse doc job
    parseDocJob = WorkerJob(JOB_PARSEDOC, {JOBARG_PARSEDOC_DOCID: doc.key})
    jobManager = MinerJobManager()
    jobManager.enqueueJob(parseDocJob)
    logger.info(
        "Parse doc job with jobId '%s' put. %s.",
        parseDocJob.jobId,
        linkAndJobId)

    # put process new cluster job unless clustering is disabled for this feed
    if FEEDTAG_DO_NOT_CLUSTER not in doc.tags:
        newCluster = Cluster([doc.key])
        processNewClusterJob = WorkerJob(
            JOB_PROCESSNEWCLUSTER,
            {JOBARG_PROCESSNEWCLUSTER_CLUSTER: list(newCluster)})
        clusterJobManager = ClusterJobManager()
        clusterJobManager.enqueueJob(processNewClusterJob)
        logger.info(
            "Put process new cluster job for new doc. Cluster id: %s. %s",
            newCluster.id,
            linkAndJobId)

    # update the link
    link.tags[LINKTAG_ISPROCESSED] = 'true'
    linkManager.put(link)
    logger.info("Link updated after being successfully processed. %s.", linkAndJobId)

    logger.info("Completed processing link. %s.", linkAndJobId)

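# Usage sketch (assumption, not part of the original module): a worker that has
# picked up a link-processing job would call processLink with the job's id and
# the stored link id, which doubles as the link's URL (it is written to
# DOCTAG_URL above). The values shown are placeholders.
#
#   processLink(jobId="some-job-id", linkId="http://example.com/some-article")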