def notifyTwitterForLocale(locale):
    """Tweet up to two not-yet-notified clusters for the given locale.

    Skips entirely when the locale has no twitter handle or when it is
    night time in that locale. Each successfully tweeted cluster is
    recorded in the notification table so it is not tweeted again.

    :param locale: locale identifier used to pick the twitter handle.
    """
    jobId = "notifyTwitterForLocale" + locale
    nt = NotifierTwitter()
    notificationTableManager = NotificationTableManager()
    clusterManager = ClusterManager()

    if not nt.doesLocaleExist(locale):
        logging.info("No twitter handle exists for locale %s. %s", locale, jobId)
        return  # skip
    if nt.isNightTime(locale):
        logging.info("Night time for locale %s. %s", locale, jobId)
        return  # skip

    logging.info("Fetching notifiable clusters for locale %s. %s", locale, jobId)
    clusters = clusterManager.getNotfiableClustersForLocale(jobId, locale)
    logging.info("Fetched notifiable clusters for locale %s. %s", locale, jobId)

    clustersToNotify = [cluster for cluster in clusters if
                        not notificationTableManager.isClusterNotified(cluster, Notifier.twitter)]
    logging.info("Number of unnotified clusters are: %i. %s",
                 len(clustersToNotify), jobId)

    # Tweet at most two clusters per run to avoid flooding the handle.
    for cluster in clustersToNotify[:2]:
        cluster = clusterManager.getProcessedCluster(cluster)
        try:
            nt.notifyForLocales(jobId, cluster)
            notificationTableManager.setClusterNotified(cluster, Notifier.twitter)
        # Fix: was a bare 'except:', which also swallows SystemExit and
        # KeyboardInterrupt. Narrowed to Exception so real interrupts propagate;
        # a failed tweet is still logged and the loop moves on best-effort.
        except Exception:
            logging.exception('Failed to tweet story for cluster %s. %s',
                              cluster.id, jobId)
def getCandidateDocsThroughClusters(jobId):
    """Enqueue compare-docs jobs for cluster/close-doc pairs not yet compared.

    For each multi-doc cluster, gathers the docs close to any of its members
    and, for every (cluster doc, close doc) pair missing from the distance
    matrix, enqueues a JOB_COMPAREDOCS worker job.

    :param jobId: id of the driving job, used only for log correlation.
    """
    jobInfo = "Job id: " + jobId
    distanceTableManager = DistanceTableManager()
    clusterManager = ClusterManager()
    jobManager = MinerJobManager()

    distances = distanceTableManager.getDistanceMatrix()
    logger.info("Got the distance matrix. %s.", jobInfo)

    clusters = list(clusterManager.getCurrentClusters())
    logger.info("Got the clusters. %s.", jobInfo)

    for cluster in clusters:
        if len(cluster) > 1:
            # Fix: the original built this with repeated list concatenation
            # (quadratic) and then deduped via list(set(...)). Accumulate in a
            # set directly — iteration order was already nondeterministic.
            closeDocs = set()
            for doc in cluster:
                closeDocs.update(distanceTableManager.getCloseDocs(doc))

            for (doc1, doc2) in itertools.product(cluster, closeDocs):
                try:
                    # Raises KeyError when the pair is absent from the matrix.
                    _tryGetDocDistance(distances, doc1, doc2)
                    logging.info("Docs %s and %s already compared. %s",
                                 doc1, doc2, jobInfo)
                except KeyError:
                    if doc1 != doc2:
                        job = WorkerJob(
                            JOB_COMPAREDOCS,
                            {
                                JOBARG_COMPAREDOCS_DOC1ID: doc1,
                                JOBARG_COMPAREDOCS_DOC2ID: doc2
                            })
                        jobManager.enqueueJob(job)
                        logging.info(
                            "Put compare docs job with jobid: %s. doc1: %s. doc2: %s. %s",
                            job.jobId, doc1, doc2, jobInfo)
def cleanUpDistanceTable(jobId):
    """Queue cleanup jobs for distance-table docs no longer in the working set.

    Scans every distance entry; any endpoint doc that is not in the current
    doc set is considered stale and gets a JOB_CLEANUPDOC worker job.

    :param jobId: id of the driving job, used only for log correlation.
    """
    jobInfo = "Job id: " + jobId
    distanceTableManager = DistanceTableManager()
    clusterManager = ClusterManager()
    jobManager = MinerJobManager()

    # Fix: use a set for O(1) membership tests (was a list, O(n) per entry).
    currentDocs = set(clusterManager.getCurrentDocs())
    distances = list(distanceTableManager.getEntries())

    # Fix: dropped the dead local 'staleDoc = ""' from the original, and
    # accumulate directly in a set instead of list + list(set(...)) dedupe.
    # As before, when both endpoints are stale only entry[0] is recorded
    # on this pass; entry[1] is picked up by a later run.
    staleDocs = set()
    for entry in distances:
        if entry[0] not in currentDocs:
            staleDocs.add(entry[0])
        elif entry[1] not in currentDocs:
            staleDocs.add(entry[1])

    for docKey in staleDocs:
        job = WorkerJob(JOB_CLEANUPDOC, {JOBARG_CLEANUPDOC_DOCID: docKey})
        jobManager.enqueueJob(job)
        logging.info("Put cleanup doc job with id %s for docId: %s. %s",
                     job.jobId, docKey, jobInfo)

    logging.info("Number of stale entries in distances table: %i. %s",
                 len(staleDocs), jobInfo)
def processNewCluster(jobId, clusterDocs):
    """Build a Cluster from the given docs and hand it to the cluster manager.

    :param jobId: id of the driving job, used only for log correlation.
    :param clusterDocs: docs that make up the new cluster.
    """
    freshCluster = Cluster(clusterDocs)
    contextTag = "Cluster id: " + freshCluster.id + ". Job id: " + jobId

    logger.info("Started processing new cluster. %s.", contextTag)
    ClusterManager().processNewCluster(freshCluster)
    logger.info("Completed processing new cluster. %s.", contextTag)
def get_story(docId):
    """Return the JSON response for the story containing docId, or 404."""
    filters = validateFilters(request.args)
    matchedCluster = ClusterManager().queryByDocId(docId.upper(), filters)
    # Guard clause: no matching cluster means the story does not exist.
    if matchedCluster:
        return getJsonResponse(matchedCluster)
    abort(404)
def clusterDocs(jobId):
    """Run hierarchical clustering over the current clusters and persist them.

    :param jobId: id of the driving job, used only for log correlation.
    """
    context = "Job id: " + jobId
    logger.info("Started clustering docs. %s.", context)

    distanceStore = DistanceTableManager()
    clusterStore = ClusterManager()

    distanceMatrix = distanceStore.getDistanceMatrix()
    logger.info("Got the distance matrix. %s.", context)

    currentClusters = list(clusterStore.getCurrentClusters())
    logger.info("Got the clusters. %s.", context)

    logger.info("Started clustering. %s.", context)
    # Mutates currentClusters in place with the merged hierarchy.
    clusterHierarchical(context, currentClusters, distanceMatrix)
    logger.info("Finished clustering. %s.", context)

    clusterStore.putCurrentClusters(currentClusters)
    logger.info("Put the computed clusters. %s.", context)
def loadStoryPage(docId):
    """Render the story page for docId; redirect home when no story matches."""
    story = ClusterManager().queryByDocId(docId.upper())
    if not story:
        return redirect(url_for('home'))

    # Flatten every article's image list and take the first image, if any,
    # as the sharing preview image.
    flatImages = [
        img
        for article in story['articles']
        for img in article['images']
    ]
    leadImage = flatImages[0] if flatImages else None

    return render_template(
        'home.html',
        title=story['title'],
        description=story['description'],
        locationsMetadata=json.dumps(LOCATION_METADATA),
        languagesMetadata=json.dumps(AVAILABLE_LANGUAGES),
        metaTags=getSharingMetaTags(story['title'],
                                    story['description'],
                                    leadImage,
                                    'article'))
def archiveStaleDocs():
    """Archive old clusters and queue cleanup jobs for their docs.

    Removes the docs from the current working set. Run this job periodically.
    """
    clusterManager = ClusterManager()
    jobManager = MinerJobManager()

    logging.info("Archiving old clusters.")
    for archivedCluster in clusterManager.archiveOldClusters():
        for docKey in archivedCluster:
            cleanupJob = WorkerJob(JOB_CLEANUPDOC, {JOBARG_CLEANUPDOC_DOCID: docKey})
            jobManager.enqueueJob(cleanupJob)
            logging.info(
                "Put cleanup doc job for docId: %s. Job id: %s",
                docKey,
                cleanupJob.jobId)

    logging.info("Archived old clusters and cleaned up docs in them from working set.")
def get_stories():
    """Return stories filtered either by locale or by country+category.

    Exactly one of the two filter combinations must be supplied;
    anything else is rejected with HTTP 400.
    """
    countryFilter = request.args.get('country')
    categoryFilter = request.args.get('category')
    localeFilter = request.args.get('locale')
    (skip, top) = validateSkipAndTop(request.args.get('skip'),
                                     request.args.get('top'))
    filters = validateFilters(request.args)
    clusterManager = ClusterManager()

    # Locale-only query.
    if localeFilter and not (countryFilter or categoryFilter):
        validLocale = validateLocale(localeFilter)
        stories = clusterManager.queryByLocale(validLocale, skip, top, filters)
        return getJsonResponse(stories)

    # Country + category query.
    if (countryFilter and categoryFilter) and not localeFilter:
        validCountry = validateCountry(countryFilter)
        validCategory = validateCategory(categoryFilter)
        stories = clusterManager.queryByCategoryAndCountry(
            validCategory, validCountry, skip, top, filters)
        return getJsonResponse(stories)

    abort(400, "Invalid query")
def reprocessCurrentClusters():
    """Re-run processing over all current clusters via the cluster manager."""
    clusterManager = ClusterManager()
    logging.info("Started Reprocessing current clusters.")
    # Fix: the original bound the return value to an unused (and misleadingly
    # named) local 'staleClusters'; the call is made for its side effects.
    clusterManager.reprocessCurrentClusters()
    logging.info("Completed Reprocessing current clusters")