def getCandidateDocsThroughClusters(jobId):
    jobInfo = "Job id: " + jobId

    distanceTableManager = DistanceTableManager()
    clusterManager = ClusterManager()
    jobManager = MinerJobManager()

    distances = distanceTableManager.getDistanceMatrix()
    logger.info("Got the distance matrix. %s.", jobInfo)

    clusters = list(clusterManager.getCurrentClusters())
    logger.info("Got the clusters. %s.", jobInfo)

    for cluster in clusters:
        if len(cluster) > 1:
            # collect the docs close to any doc in this cluster
            closeDocs = []
            for doc in cluster:
                closeDocs = closeDocs + distanceTableManager.getCloseDocs(doc)
            closeDocs = list(set(closeDocs))

            for (doc1, doc2) in itertools.product(cluster, closeDocs):
                try:
                    _tryGetDocDistance(distances, doc1, doc2)
                    logger.info(
                        "Docs %s and %s already compared. %s",
                        doc1,
                        doc2,
                        jobInfo)
                except KeyError:
                    # no distance stored yet; queue a compare-docs job for the pair
                    if doc1 != doc2:
                        job = WorkerJob(
                            JOB_COMPAREDOCS,
                            {
                                JOBARG_COMPAREDOCS_DOC1ID: doc1,
                                JOBARG_COMPAREDOCS_DOC2ID: doc2
                            })
                        jobManager.enqueueJob(job)
                        logger.info(
                            "Put compare docs job with jobid: %s. doc1: %s. doc2: %s. %s",
                            job.jobId,
                            doc1,
                            doc2,
                            jobInfo)

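# The _tryGetDocDistance helper used above is defined elsewhere in this module. The
# sketch below is a hypothetical illustration of its contract, not the module's actual
# implementation: it assumes getDistanceMatrix() returns a nested dict keyed by doc id
# and lets a KeyError propagate when the pair has not been compared in either order.
def _tryGetDocDistanceSketch(distances, doc1, doc2):
    try:
        # distances is assumed to look like {docId: {otherDocId: distance}}
        return distances[doc1][doc2]
    except KeyError:
        # fall back to the reverse ordering of the pair; if this also fails, the
        # KeyError propagates to the caller, which treats it as "not compared yet"
        return distances[doc2][doc1]
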
def pushLinkJobs():
    """
    Push process-link jobs for unprocessed links in the links table.

    Run this job periodically.
    """

    jobManager = MinerJobManager()
    linkManager = LinkManager()

    if jobManager.count() > 100:
        logging.info("Skipping. Too many jobs queued already!!")
        return

    logging.info("Getting unprocessed links.")
    links = linkManager.getUnprocessedLinks()

    nLinks = 0
    for linkId in links:
        processLinkJob = WorkerJob(
            JOB_PROCESSLINK,
            {JOBARG_PROCESSLINK_LINKID: linkId})
        jobManager.enqueueJob(processLinkJob)
        logging.info(
            "Process link job with jobId '%s' put for linkId: %s.",
            processLinkJob.jobId,
            linkId)
        nLinks = nLinks + 1

    logging.info("Number of process link jobs put: %i", nLinks)

def cleanUpDistanceTable(jobId):
    jobInfo = "Job id: " + jobId

    distanceTableManager = DistanceTableManager()
    clusterManager = ClusterManager()
    jobManager = MinerJobManager()

    docList = list(clusterManager.getCurrentDocs())
    distances = list(distanceTableManager.getEntries())

    # collect docs that appear in the distance table but not in the current working set
    staleDocs = []
    for entry in distances:
        if entry[0] not in docList:
            staleDocs.append(entry[0])
        elif entry[1] not in docList:
            staleDocs.append(entry[1])
    staleDocs = list(set(staleDocs))

    for docKey in staleDocs:
        job = WorkerJob(JOB_CLEANUPDOC, {JOBARG_CLEANUPDOC_DOCID: docKey})
        jobManager.enqueueJob(job)
        logging.info(
            "Put cleanup doc job with id %s for docId: %s. %s",
            job.jobId,
            docKey,
            jobInfo)

    logging.info(
        "Number of stale entries in distances table: %i. %s",
        len(staleDocs),
        jobInfo)

def pushFeedJobs():
    """
    Push feed processing jobs to the job queue.
    """

    jobManager = MinerJobManager()
    feedManager = FeedManager()

    if jobManager.count() > 50:
        logging.info("Skipping. Too many jobs queued already!!")
        return

    logging.info("Getting stale feeds.")
    staleFeeds = feedManager.getStaleFeeds()

    nStaleFeeds = 0
    for feed in staleFeeds:
        processFeedJob = WorkerJob(
            JOB_PROCESSFEED,
            {JOBARG_PROCESSFEED_FEEDID: feed})
        jobManager.enqueueJob(processFeedJob)
        logging.info(
            "Process feed job put for feedId: %s. Job id: %s",
            feed,
            processFeedJob.jobId)
        nStaleFeeds = nStaleFeeds + 1

    logging.info("Number of stale feeds queued: %i", nStaleFeeds)

def deepCleanStaleDocs():
    """
    Puts cleanup doc jobs for stale entries in the shingles table.

    Run this job rarely.
    """

    docManager = DocManager()
    jobManager = MinerJobManager()
    shingleTableManager = ShingleTableManager()
    docsToBeCleanedUp = []

    logging.info("Started scanning the shingle table")
    scanResults = shingleTableManager.scan()

    for entry in scanResults:
        try:
            docManager.get(entry[0])
        except S3ResponseError:
            # the shingle entry points at a doc that no longer exists in the doc store
            staleDocId = entry[0]
            staleShingle = entry[1]
            logging.info(
                "Stale entry found -> docId: %s, shingle: %s",
                staleDocId,
                staleShingle)

            if staleDocId not in docsToBeCleanedUp:
                docsToBeCleanedUp.append(staleDocId)
                job = WorkerJob(
                    JOB_CLEANUPDOC,
                    {JOBARG_CLEANUPDOC_DOCID: staleDocId})
                jobManager.enqueueJob(job)
                logging.info(
                    "Put cleanup doc job for docId: %s. Job id: %s",
                    staleDocId,
                    job.jobId)

    logging.info(
        "Number of stale docs queued for cleanup: %i",
        len(docsToBeCleanedUp))

def putComareDocJobs(docId, matches, docAndJobId):
    jobManager = MinerJobManager()

    for match in matches:
        if match != docId:
            job = WorkerJob(
                JOB_COMPAREDOCS,
                {
                    JOBARG_COMPAREDOCS_DOC1ID: docId,
                    JOBARG_COMPAREDOCS_DOC2ID: match
                })
            jobManager.enqueueJob(job)
            logging.info(
                "Put compare docs job with jobid: %s. compared docId: %s. %s",
                job.jobId,
                match,
                docAndJobId)

def parseDoc(jobId, docId):
    docAndJobId = "Doc id: " + docId + ". Job id: " + jobId
    logger.info("Started parsing doc. %s.", docAndJobId)

    docManager = DocManager()
    doc = docManager.get(docId)

    # compute and put shingles
    if doc.tags[FEEDTAG_LANG] == LANG_ENGLISH:
        shingles = th.getShingles(getDocEnglishSummaryText(doc), 3, 3)
        shingles = shingles + th.getShingles(getDocEnglishContent(doc), 3, 3)
        logger.info("Completed getting shingles. %s.", docAndJobId)

        shingles = list(set(shingles))
        logger.info(
            "Number of unique shingles: %i. %s.",
            len(shingles),
            docAndJobId)

        shingleTableManager = ShingleTableManager()
        shingleTableManager.addEntries(docId, shingles)
        logger.info("Added shingles to shingle table. %s.", docAndJobId)

    # compute and put entities
    entities = th.getEntities(getDocEnglishTitle(doc)) + \
        th.getEntities(getDocEnglishSummaryText(doc)) + \
        th.getEntities(getDocEnglishContent(doc))
    entities = list(set(entities))
    logger.info("Completed getting entities. %s.", docAndJobId)
    logger.info(
        "Number of unique entities: %i. %s.",
        len(entities),
        docAndJobId)

    entityTableManager = EntityTableManager()
    entityTableManager.addEntries(docId, entities)
    logger.info("Added entities to entity table. %s.", docAndJobId)

    # store entity weights in the doc
    entityWeights = {}
    for entity in entities:
        entityWeight = entityTableManager.getEntityWeight(entity)
        entityWeights[entity] = entityWeight
    doc.tags[DOCTAG_ENTITY_WEIGHTS] = json.dumps(entityWeights)
    docManager.put(doc)
    logger.info("Added entity weights to doc. %s.", docAndJobId)

    # queue up the next stage: finding candidate docs to compare against
    job = WorkerJob(
        JOB_GETCANDIDATEDOCS,
        {JOBARG_GETCANDIDATEDOCS_DOCID: docId})
    jobManager = MinerJobManager()
    jobManager.enqueueJob(job)
    logger.info(
        "Put get candidate doc job with jobId: %s. %s",
        job.jobId,
        docAndJobId)

    logger.info("Completed parsing doc. %s.", docAndJobId)

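# For reference, th.getShingles(text, 3, 3) above comes from the module's text-handling
# helper. The function below is a hypothetical sketch of word shingling with the same
# (text, minLength, maxLength) signature, added only to illustrate what a shingle is;
# the real th.getShingles may tokenize and normalize text differently.
def _getWordShinglesSketch(text, minLength, maxLength):
    words = text.lower().split()
    shingles = []
    for length in range(minLength, maxLength + 1):
        for start in range(len(words) - length + 1):
            # each shingle is a run of `length` consecutive words
            shingles.append(" ".join(words[start:start + length]))
    return shingles
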
def _putNewLinks(feedAndJobId, linksToAdd):
    linkManager = LinkManager()
    jobManager = MinerJobManager()

    latestPubTime = 0
    for link in linksToAdd:
        # skip links that are already present in the links database
        try:
            existingLink = linkManager.get(link.id)
            logger.info(
                "Link with id '%s' already exists. Not processing it. %s",
                link.id,
                feedAndJobId)
            continue
        except:
            pass

        linkManager.put(link)
        logger.info(
            "Put link with id '%s' in links database. %s.",
            link.id,
            feedAndJobId)

        if latestPubTime < link.tags[LINKTAG_PUBTIME]:
            latestPubTime = link.tags[LINKTAG_PUBTIME]

    return latestPubTime

def archiveStaleDocs():
    """
    Remove stale docs from the current working set.

    Run this job periodically.
    """

    clusterManager = ClusterManager()
    jobManager = MinerJobManager()

    logging.info("Archiving old clusters.")
    staleClusters = clusterManager.archiveOldClusters()

    for cluster in staleClusters:
        for docKey in cluster:
            job = WorkerJob(JOB_CLEANUPDOC, {JOBARG_CLEANUPDOC_DOCID: docKey})
            jobManager.enqueueJob(job)
            logging.info(
                "Put cleanup doc job for docId: %s. Job id: %s",
                docKey,
                job.jobId)

    logging.info("Archived old clusters and cleaned up docs in them from working set.")

def cleanUpDoc(jobId, docId):
    docAndJobId = "Doc id: " + docId + ". Job id: " + jobId
    logger.info("Started cleaning up doc. %s.", docAndJobId)

    jobManager = MinerJobManager()

    job = WorkerJob(
        JOB_CLEANUPDOCSHINGLES,
        {JOBARG_CLEANUPDOCSHINGLES_DOCID: docId})
    jobManager.enqueueJob(job)
    logger.info("Put cleanup doc shingles job. %s.", docAndJobId)

    job = WorkerJob(
        JOB_CLEANUPDOCENTITIES,
        {JOBARG_CLEANUPDOCENTITIES_DOCID: docId})
    jobManager.enqueueJob(job)
    logger.info("Put cleanup doc entities job. %s.", docAndJobId)

    job = WorkerJob(
        JOB_CLEANUPDOCDISTANCES,
        {JOBARG_CLEANUPDOCDISTANCES_DOCID: docId})
    jobManager.enqueueJob(job)
    logger.info("Put cleanup doc distances job. %s.", docAndJobId)

def processLink(jobId, linkId):
    """
    Processes a link (takes as input the linkId).

    Steps:
    1. Get link from database.
    2. Get publisher for that link from database.
    3. Get html for that link.
    4. Process that html to generate doc.
    5. Save that doc in docstore.
    6. Update the link's is-processed tag.
    """

    linkAndJobId = "Link id: " + linkId + ". Job id: " + jobId
    logger.info("Started processing link. %s.", linkAndJobId)

    # get the link
    linkManager = LinkManager()
    link = linkManager.get(linkId)
    logger.info("Got link from database. %s.", linkAndJobId)

    # get the publisher
    publisherManager = PublisherManager()
    publisher = publisherManager.get(link.tags[TAG_PUBLISHER])
    logger.info(
        "Got publisher from database. Publisher id: %s. %s.",
        link.tags[TAG_PUBLISHER],
        linkAndJobId)

    # get html for the link
    processingResult = _processHtmlForLink(jobId, link, publisher)
    if not processingResult[0]:
        logger.warning("No text extracted for the link. %s.", linkAndJobId)

    # generate corresponding doc
    doc = Doc(_getDocKey(link), processingResult[0], link.tags)
    doc.tags[TAG_IMAGES] = processingResult[1]
    doc.tags[DOCTAG_URL] = linkId
    doc.tags[TAG_PUBLISHER_DETAILS] = _getPublisherDetails(publisher)
    doc = _addTranslationTags(jobId, doc)
    doc = _addSummaryIfNotPresent(doc)
    doc.tags[LINKTAG_HIGHLIGHTS] = _getDocHighlights(doc)

    # save the doc
    docManager = DocManager()
    docManager.put(doc)
    logger.info(
        "Document generated and saved for link. Doc key %s. %s.",
        doc.key,
        linkAndJobId)

    # update the doc key in links table
    link.tags[LINKTAG_DOCKEY] = doc.key
    linkManager.put(link)

    # put parse doc job
    parseDocJob = WorkerJob(JOB_PARSEDOC, {JOBARG_PARSEDOC_DOCID: doc.key})
    jobManager = MinerJobManager()
    jobManager.enqueueJob(parseDocJob)
    logger.info(
        "Parse doc job with jobId '%s' put. %s.",
        parseDocJob.jobId,
        linkAndJobId)

    # put the doc in a new cluster unless clustering is disabled for its feed
    if FEEDTAG_DO_NOT_CLUSTER not in doc.tags:
        newCluster = Cluster([doc.key])
        processNewClusterJob = WorkerJob(
            JOB_PROCESSNEWCLUSTER,
            {JOBARG_PROCESSNEWCLUSTER_CLUSTER: list(newCluster)})
        clusterJobManager = ClusterJobManager()
        clusterJobManager.enqueueJob(processNewClusterJob)
        logger.info(
            "Put process new cluster job for new doc. Cluster id: %s. %s",
            newCluster.id,
            linkAndJobId)

    # update the link
    link.tags[LINKTAG_ISPROCESSED] = 'true'
    linkManager.put(link)
    logger.info(
        "Link updated after being successfully processed. %s.",
        linkAndJobId)

    logger.info("Completed processing link. %s.", linkAndJobId)

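# The underscore-prefixed helpers used in processLink (_processHtmlForLink, _getDocKey,
# _getPublisherDetails, _addTranslationTags, _addSummaryIfNotPresent, _getDocHighlights)
# are defined elsewhere in this module. The stub below is a hypothetical sketch of the
# one contract processLink depends on most: _processHtmlForLink is assumed to return a
# (text, imageUrls) tuple, which is how processingResult[0] and processingResult[1] are
# consumed above. The real helper does publisher-aware text and image extraction.
def _processHtmlForLinkSketch(jobId, link, publisher):
    import urllib.request

    # fetch the link's html; the link id doubles as the url in this sketch, mirroring
    # doc.tags[DOCTAG_URL] = linkId above (an assumption about the real data model)
    html = urllib.request.urlopen(link.id).read().decode("utf-8", errors="ignore")

    # the real helper extracts the article body and image urls using publisher-specific
    # rules; this sketch just returns the raw html and no images to show the tuple shape
    return (html, [])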