def compareDocs(jobId, doc1Key, doc2Key):
    jobInfo = "Doc1 id: " + doc1Key + " Doc2 id: " + doc2Key \
              + ". Job id: " + jobId
    logger.info("Started comparing docs. %s", jobInfo)

    docManager = DocManager()
    doc1 = docManager.get(doc1Key)
    doc2 = docManager.get(doc2Key)

    score = 0
    if (doc1.tags[FEEDTAG_LANG] == LANG_ENGLISH) and \
       (doc2.tags[FEEDTAG_LANG] == LANG_ENGLISH):
        score = computeEnglishDocsSimScore(doc1, doc2)
        logger.info("Comparing using shingles. %s", jobInfo)
    else:
        score = computeDocSimScoreUsingEntities(doc1, doc2)
        logger.info("Comparing using entities. %s", jobInfo)

    if FEEDTAG_LOCALE in doc1.tags and FEEDTAG_LOCALE in doc2.tags and \
       doc1.tags[FEEDTAG_LOCALE] != doc2.tags[FEEDTAG_LOCALE]:
        logger.info(
            "The two docs are from different locations. Adding penalty. %s",
            jobInfo)
        score = score - 0.4

    if score < 0:
        score = 0
    logger.info("Comparison score: %s. %s", str(score), jobInfo)

    if score > SIMSCORE_MIN_THRESHOLD:
        distanceTableManager = DistanceTableManager()
        distanceTableManager.addEntry(doc1Key, doc2Key, score)
        logger.info("Added comparison score to distances table. %s", jobInfo)

    logger.info("Completed comparing docs. %s", jobInfo)
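# Hedged sketch, not the project's implementation: the "Comparing using
# shingles" branch above delegates to computeEnglishDocsSimScore, which per
# the log line is shingle based. One plausible reading is a plain Jaccard
# overlap of the two docs' shingle sets; the helper below only illustrates
# that idea and takes the shingle lists directly as arguments.
def _jaccardSimilaritySketch(shingles1, shingles2):
    s1, s2 = set(shingles1), set(shingles2)
    if not s1 or not s2:
        return 0.0

    # overlap divided by union of the two shingle sets
    return float(len(s1 & s2)) / len(s1 | s2)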
def deepCleanStaleDocs():
    """
    Puts cleanup doc jobs for stale entries in the shingles table.
    Run this job rarely.
    """

    docManager = DocManager()
    jobManager = MinerJobManager()
    shingleTableManager = ShingleTableManager()
    docsToBeCleanedUp = []

    logging.info("Started scanning the shingle table")
    scanResults = shingleTableManager.scan()

    for entry in scanResults:
        try:
            docManager.get(entry[0])
        except S3ResponseError:
            staleDocId = entry[0]
            staleShingle = entry[1]
            logging.info("Stale entry found -> docId: %s, shingle: %s",
                         staleDocId, staleShingle)

            if staleDocId not in docsToBeCleanedUp:
                docsToBeCleanedUp.append(staleDocId)
                job = WorkerJob(JOB_CLEANUPDOC,
                                {JOBARG_CLEANUPDOC_DOCID: staleDocId})
                jobManager.enqueueJob(job)
                logging.info("Put cleanup doc job for docId: %s. Job id: %s",
                             staleDocId, job.jobId)

    logging.info("Number of stale docs queued for cleanup: %i",
                 len(docsToBeCleanedUp))
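# Hedged sketch only: the worker that consumes JOB_CLEANUPDOC is not shown
# here. A cleanup handler would plausibly remove the stale doc's rows from
# the derived tables; the deleteEntries method name below is an assumption
# used for illustration, not a confirmed API of these table managers.
def cleanUpDocSketch(jobId, docId):
    shingleTableManager = ShingleTableManager()
    entityTableManager = EntityTableManager()

    # drop any rows keyed by the stale doc id from the derived tables
    shingleTableManager.deleteEntries(docId)  # hypothetical method
    entityTableManager.deleteEntries(docId)   # hypothetical method

    logging.info("Cleaned up stale doc. Doc id: %s. Job id: %s", docId, jobId)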
def __init__(self):
    """
    Instantiates a new instance of ClusterManager class
    """

    self.clusterTableManager = ClusterTableManager()
    self.docManager = DocManager()
def process(self):
    """
    Processes the cluster to include metadata of constituent documents,
    and overall metadata of the cluster like category, location, feeds, etc.
    """

    self.categories = []
    self.countries = []
    self.locales = []
    self.publishers = []
    self.languages = []
    self.articles = []  # contains non-duplicate articles
    self.duplicates = []
    self.lastPubTime = 0

    docManager = DocManager()
    docsAdded = []

    for docKey in super(Cluster, self).__iter__():
        doc = docManager.get(docKey)

        if not _isDuplicateArticle(docKey, docsAdded):
            self.articles.append({
                'title': doc.tags.get(LINKTAG_TITLE, ""),
                'publisher': doc.tags.get(TAG_PUBLISHER_DETAILS, ""),
                'link': doc.tags.get(DOCTAG_URL, "#"),
                'summaryText': doc.tags.get(LINKTAG_SUMMARYTEXT, ""),
                'images': _getImagesForDoc(doc),
                'lang': doc.tags.get(FEEDTAG_LANG, ""),
                'publishedOn': doc.tags.get(LINKTAG_PUBTIME, 0)
            })
            docsAdded.append(docKey)
        else:
            self.duplicates.append(docKey)

        if doc.tags.get(FEEDTAG_CATEGORY):
            self.categories.append(doc.tags[FEEDTAG_CATEGORY])

        if doc.tags.get(FEEDTAG_COUNTRY):
            self.countries.append(doc.tags[FEEDTAG_COUNTRY])

        if doc.tags.get(FEEDTAG_LOCALE):
            self.locales.append(doc.tags[FEEDTAG_LOCALE])

        if doc.tags.get(TAG_PUBLISHER):
            self.publishers.append(doc.tags[TAG_PUBLISHER])

        if doc.tags.get(FEEDTAG_LANG):
            self.languages.append(doc.tags[FEEDTAG_LANG])

        if doc.tags.get(LINKTAG_PUBTIME, 0) > self.lastPubTime:
            self.lastPubTime = doc.tags.get(LINKTAG_PUBTIME)

    nArticles = len(self.articles) + len(self.duplicates)

    # remove duplicates
    self.categories = _removeDuplicatesAndOutliers(self.categories, nArticles)
    self.countries = _removeDuplicatesAndOutliers(self.countries, nArticles)
    self.locales = _removeDuplicatesAndOutliers(self.locales, nArticles)
    self.publishers = list(set(self.publishers))
    self.languages = list(set(self.languages))
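# Hedged sketch: _removeDuplicatesAndOutliers is used above but not defined
# in this snippet. One plausible reading of the name is "deduplicate, and
# drop values that only a small fraction of the cluster's articles agree
# on". The 0.2 cutoff below is an illustrative assumption, not a value taken
# from the project.
def _removeDuplicatesAndOutliersSketch(values, nArticles, minFraction=0.2):
    result = []
    for value in set(values):
        # keep a value only if enough articles in the cluster carry it
        if values.count(value) >= max(1, minFraction * nArticles):
            result.append(value)
    return result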
def parseDoc(jobId, docId):
    docAndJobId = "Doc id: " + docId + ". Job id: " + jobId
    logger.info("Started parsing doc. %s.", docAndJobId)

    docManager = DocManager()
    doc = docManager.get(docId)

    # compute and put shingles
    if doc.tags[FEEDTAG_LANG] == LANG_ENGLISH:
        shingles = th.getShingles(getDocEnglishSummaryText(doc), 3, 3)
        shingles = shingles + th.getShingles(getDocEnglishContent(doc), 3, 3)
        logger.info("Completed getting shingles. %s.", docAndJobId)

        shingles = list(set(shingles))
        logger.info("Number of unique shingles is %i. %s.",
                    len(shingles), docAndJobId)

        shingleTableManager = ShingleTableManager()
        shingleTableManager.addEntries(docId, shingles)
        logger.info("Added shingles to shingle table. %s.", docAndJobId)

    # compute and put entities
    entities = th.getEntities(getDocEnglishTitle(doc)) + \
               th.getEntities(getDocEnglishSummaryText(doc)) + \
               th.getEntities(getDocEnglishContent(doc))
    entities = list(set(entities))
    logger.info("Completed getting entities. %s.", docAndJobId)
    logger.info("Number of unique entities is %i. %s.",
                len(entities), docAndJobId)

    entityTableManager = EntityTableManager()
    entityTableManager.addEntries(docId, entities)
    logger.info("Added entities to entity table. %s.", docAndJobId)

    # store entity weights in the doc
    entityWeights = {}
    for entity in entities:
        entityWeight = entityTableManager.getEntityWeight(entity)
        entityWeights[entity] = entityWeight
    doc.tags[DOCTAG_ENTITY_WEIGHTS] = json.dumps(entityWeights)
    docManager.put(doc)
    logger.info("Added entity weights to doc. %s.", docAndJobId)

    job = WorkerJob(JOB_GETCANDIDATEDOCS,
                    {JOBARG_GETCANDIDATEDOCS_DOCID: docId})
    jobManager = MinerJobManager()
    jobManager.enqueueJob(job)
    logger.info("Put get candidate docs job with jobId: %s. %s",
                job.jobId, docAndJobId)

    logger.info("Completed parsing doc. %s.", docAndJobId)
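# Hedged sketch: th.getShingles(text, 3, 3) above is called with two numeric
# arguments. A common reading is "word shingles of length 3 to 3", i.e.
# contiguous 3-word windows over the text. The implementation below is an
# assumption for illustration, not the project's th module.
def getShinglesSketch(text, minLength, maxLength):
    words = text.lower().split()
    shingles = []
    for length in range(minLength, maxLength + 1):
        for i in range(len(words) - length + 1):
            shingles.append(" ".join(words[i:i + length]))
    return shingles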
def compareDocs(jobId, doc1Key, doc2Key):
    jobInfo = "Doc1 id: " + doc1Key + " Doc2 id: " + doc2Key \
              + ". Job id: " + jobId
    logger.info("Started comparing docs. %s", jobInfo)

    docManager = DocManager()
    doc1 = docManager.get(doc1Key)
    doc2 = docManager.get(doc2Key)

    score = getDocComparisionScore(jobInfo, doc1, doc2)

    if score > SIMSCORE_MIN_THRESHOLD:
        distanceTableManager = DistanceTableManager()
        distanceTableManager.addEntry(doc1Key, doc2Key, score)
        logger.info("Added comparison score to distances table. %s", jobInfo)

    logger.info("Completed comparing docs. %s", jobInfo)
def _getDocsInParallel(docKeys):
    que = Queue.Queue()
    threads_list = list()
    docManager = DocManager()

    for docKey in docKeys:
        t = Thread(
            target=lambda q, arg1: q.put(docManager.get(arg1)),
            args=(que, docKey))
        t.start()
        threads_list.append(t)

    for t in threads_list:
        t.join()

    docs = list()
    while not que.empty():
        docs.append(que.get())

    return docs
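# Example usage with made-up doc keys: fetch a batch of documents
# concurrently instead of issuing one blocking DocManager.get call at a
# time. Note that the worker threads complete in arbitrary order, so the
# returned docs are not guaranteed to match the input ordering of docKeys.
if __name__ == "__main__":
    exampleDocKeys = ["doc-key-1", "doc-key-2", "doc-key-3"]  # hypothetical
    fetchedDocs = _getDocsInParallel(exampleDocKeys)
    print(len(fetchedDocs))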
class ClusterManager:
    """
    Manage clusters stored in cloud.
    """

    def __init__(self):
        """
        Instantiates a new instance of ClusterManager class
        """

        self.clusterTableManager = ClusterTableManager()
        self.docManager = DocManager()
        self.processedClusterStore = ProcessedClusterStore()

    def getProcessedCluster(self, cluster):
        return self.processedClusterStore.getProcessedCluster(cluster)

    def processNewCluster(self, cluster):
        cluster.isCurrent = 'true'
        cluster = self.processedClusterStore.processAndSaveCluster(cluster)
        self.clusterTableManager.addCluster(cluster)

    def __getProcessedClusterArticles(self, cluster):
        cluster = self.getProcessedCluster(cluster)
        return cluster.articles

    def __getClusterResponse(self, cluster, filters=None):
        articles = self.__getProcessedClusterArticles(
            self.__filterDocsInCluster(cluster, filters))

        title = articles[0]['title']
        description = articles[0]['title'] + " - " + \
            articles[0]['publisher'][PUBLISHER_DETAILS_NAME] + "."
        if len(articles) > 1:
            description += " " + articles[1]['title'] + " - " + \
                articles[1]['publisher'][PUBLISHER_DETAILS_NAME] + "."

        return {
            "articles": articles,
            "title": title,
            "description": description,
            "locales": cluster.locales,
            "languages": cluster.languages,
            "importance": self.__computeClusterRankingScore(cluster)
        }

    def __computeClusterRankingScore(self, cluster):
        return (0.3 * (len(cluster) - len(cluster.duplicates))) + \
               (0.7 * len(cluster.publishers))

    def __sortClustersByImportance(self, clusters):
        clusterList = list(clusters)
        clusterList.sort(key=self.__computeClusterRankingScore, reverse=True)
        return clusterList

    def __filterClusters(self, clusterList, filters):
        if not filters:
            return clusterList

        if CLUSTERS_FILTER_LANGUAGES in filters:
            clusterList = [cluster for cluster in clusterList if not
                           set(filters[CLUSTERS_FILTER_LANGUAGES])
                           .isdisjoint(cluster.languages)]

        return clusterList

    def __filterDocsInCluster(self, cluster, filters):
        if not filters:
            return cluster

        filteredDocs = []
        for docKey in cluster:
            isDocAllowed = True
            doc = self.docManager.get(docKey)

            if CLUSTERS_FILTER_LANGUAGES in filters:
                if doc.tags[FEEDTAG_LANG] not in filters[
                        CLUSTERS_FILTER_LANGUAGES]:
                    isDocAllowed = False

            if isDocAllowed:
                filteredDocs.append(docKey)

        return Cluster(filteredDocs)

    def __constructQueryResponse(self, clusters, skip, top, filters=None):
        response = []

        clusterList = list(clusters)
        clusterList = self.__filterClusters(clusterList, filters)
        clusterList = self.__sortClustersByImportance(clusterList)

        for cluster in clusterList[skip:(skip + top)]:
            try:
                response.append(self.__getClusterResponse(cluster, filters))
            except Exception, e:
                logging.exception(
                    "Could not construct query response for cluster id %s",
                    cluster.id)
                continue

        return response
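# Worked example of the language-filter predicate used by __filterClusters
# above: a cluster survives when its languages overlap the requested set.
# The literal language codes are illustrative only, not values from the
# project; the filters dict is keyed by CLUSTERS_FILTER_LANGUAGES in the
# same way when passed into __constructQueryResponse.
requestedLanguages = {'en', 'fr'}
clusterLanguages = ['en', 'hi']
keepCluster = not requestedLanguages.isdisjoint(clusterLanguages)  # True here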
def processLink(jobId, linkId):
    """
    Processes a link (takes as input the linkId)

    Steps:
    1. get link from database
    2. get publisher for that link from database
    3. get html for that link
    4. process that html to generate doc
    5. save that doc in docstore
    6. update the link's is processed tag
    """

    linkAndJobId = "Link id: " + linkId + ". Job id: " + jobId
    logger.info("Started processing link. %s.", linkAndJobId)

    # get the link
    linkManager = LinkManager()
    link = linkManager.get(linkId)
    logger.info("Got link from database. %s.", linkAndJobId)

    # get the publisher
    publisherManager = PublisherManager()
    publisher = publisherManager.get(link.tags[TAG_PUBLISHER])
    logger.info("Got publisher from database. Publisher id: %s. %s.",
                link.tags[TAG_PUBLISHER], linkAndJobId)

    # get html for the link
    processingResult = _processHtmlForLink(jobId, link, publisher)
    if not processingResult[0]:
        logger.warning("No text extracted for the link. %s.", linkAndJobId)

    # generate corresponding doc
    doc = Doc(_getDocKey(link), processingResult[0], link.tags)
    doc.tags[TAG_IMAGES] = processingResult[1]
    doc.tags[DOCTAG_URL] = linkId
    doc.tags[TAG_PUBLISHER_DETAILS] = _getPublisherDetails(publisher)
    doc = _addTranslationTags(jobId, doc)
    doc = _addSummaryIfNotPresent(doc)
    doc.tags[LINKTAG_HIGHLIGHTS] = _getDocHighlights(doc)

    # save the doc
    docManager = DocManager()
    docManager.put(doc)
    logger.info("Document generated and saved for link. Doc key: %s. %s.",
                doc.key, linkAndJobId)

    # update the doc key in links table
    link.tags[LINKTAG_DOCKEY] = doc.key
    linkManager.put(link)

    # put parse doc job
    parseDocJob = WorkerJob(JOB_PARSEDOC, {JOBARG_PARSEDOC_DOCID: doc.key})
    jobManager = MinerJobManager()
    jobManager.enqueueJob(parseDocJob)
    logger.info("Parse doc job with jobId '%s' put. %s.",
                parseDocJob.jobId, linkAndJobId)

    if FEEDTAG_DO_NOT_CLUSTER not in doc.tags:
        newCluster = Cluster([doc.key])
        processNewClusterJob = WorkerJob(
            JOB_PROCESSNEWCLUSTER,
            {JOBARG_PROCESSNEWCLUSTER_CLUSTER: list(newCluster)})
        clusterJobManager = ClusterJobManager()
        clusterJobManager.enqueueJob(processNewClusterJob)
        logger.info(
            "Put process new cluster job for new doc. Cluster id: %s. %s",
            newCluster.id, linkAndJobId)

    # update the link
    link.tags[LINKTAG_ISPROCESSED] = 'true'
    linkManager.put(link)
    logger.info("Link updated after being successfully processed. %s.",
                linkAndJobId)

    logger.info("Completed processing link. %s.", linkAndJobId)
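# Hedged sketch: _getPublisherDetails above is not defined in this snippet.
# Elsewhere the stored value is read as a dict containing
# PUBLISHER_DETAILS_NAME (see the ClusterManager description building), so a
# plausible shape is shown below; reading the name from a 'name' attribute
# on the publisher object is an assumption, not the project's code.
def _getPublisherDetailsSketch(publisher):
    return {
        PUBLISHER_DETAILS_NAME: getattr(publisher, 'name', "")
    }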
class ClusterManager:
    """
    Manage clusters stored in cloud.
    """

    def __init__(self):
        """
        Instantiates a new instance of ClusterManager class
        """

        self.clusterTableManager = ClusterTableManager()
        self.docManager = DocManager()

    def processNewCluster(self, cluster):
        cluster.process()
        cluster.isCurrent = 'true'
        self.clusterTableManager.addCluster(cluster)

    def __getProcessedClusterArticles(self, cluster):
        cluster.process()
        return cluster.articles

    def __computeClusterRankingScore(self, cluster):
        return (0.4 * (len(cluster) - len(cluster.duplicates))) + \
               (0.6 * len(cluster.publishers))

    def __filterClusters(self, clusterList, filters):
        if not filters:
            return clusterList

        if CLUSTERS_FILTER_LANGUAGES in filters:
            clusterList = [cluster for cluster in clusterList if not
                           set(filters[CLUSTERS_FILTER_LANGUAGES])
                           .isdisjoint(cluster.languages)]

        return clusterList

    def __filterDocsInCluster(self, cluster, filters):
        if not filters:
            return cluster

        filteredDocs = []
        for docKey in cluster:
            isDocAllowed = True
            doc = self.docManager.get(docKey)

            if CLUSTERS_FILTER_LANGUAGES in filters:
                if doc.tags[FEEDTAG_LANG] not in filters[
                        CLUSTERS_FILTER_LANGUAGES]:
                    isDocAllowed = False

            if isDocAllowed:
                filteredDocs.append(docKey)

        return Cluster(filteredDocs)

    def __constructQueryResponse(self, clusters, skip, top, filters=None):
        response = []

        clusterList = list(clusters)
        clusterList = self.__filterClusters(clusterList, filters)
        clusterList.sort(key=self.__computeClusterRankingScore, reverse=True)

        for cluster in clusterList[skip:(skip + top)]:
            try:
                response.append({
                    "articles": self.__getProcessedClusterArticles(
                        self.__filterDocsInCluster(cluster, filters)),
                    "importance": self.__computeClusterRankingScore(cluster)
                })
            except Exception, e:
                logging.exception(
                    "Could not construct query response for cluster id %s",
                    cluster.id)
                continue

        return response
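# Worked example of __computeClusterRankingScore in this variant (made-up
# numbers): a cluster of 6 docs with 2 duplicates and 3 distinct publishers
# scores 0.4 * (6 - 2) + 0.6 * 3 = 1.6 + 1.8 = 3.4, so publisher breadth
# carries slightly more weight than raw non-duplicate article count.
exampleImportance = (0.4 * (6 - 2)) + (0.6 * 3)  # = 3.4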