Ejemplo n.º 1
0
def notifyTwitterForLocale(locale):
    """Tweet the first two not-yet-notified clusters for ``locale``.

    The job is skipped when ``NotifierTwitter.doesLocaleExist`` is false for
    the locale or when ``NotifierTwitter.isNightTime`` is true for it.
    Otherwise the notifiable clusters are fetched, those already marked as
    notified for ``Notifier.twitter`` are dropped, and at most two of the
    remaining clusters are processed and tweeted. A cluster is marked as
    notified only after ``notifyForLocales`` returns without raising.

    Args:
        locale: Locale identifier; concatenated onto the job id string, so it
            must be a ``str``.
    """
    jobId = "notifyTwitterForLocale" + locale
    nt = NotifierTwitter()
    notificationTableManager = NotificationTableManager()
    clusterManager = ClusterManager()

    if not nt.doesLocaleExist(locale):
        logging.info("No twitter handle exists for locale %s. %s", locale,
                     jobId)
        return  # skip

    if nt.isNightTime(locale):
        logging.info("Night time for locale %s. %s", locale, jobId)
        return  # skip

    logging.info("Fetching notifiable clusters for locale %s. %s", locale,
                 jobId)
    clusters = clusterManager.getNotfiableClustersForLocale(jobId, locale)
    logging.info("Fetched notifiable clusters for locale %s. %s", locale,
                 jobId)
    clustersToNotify = [
        cluster for cluster in clusters
        if not notificationTableManager.isClusterNotified(
            cluster, Notifier.twitter)
    ]
    logging.info("Number of unnotified clusters are: %i. %s",
                 len(clustersToNotify), jobId)

    # Only the first two unnotified clusters are handled per run.
    for cluster in clustersToNotify[:2]:
        cluster = clusterManager.getProcessedCluster(cluster)
        try:
            nt.notifyForLocales(jobId, cluster)
            notificationTableManager.setClusterNotified(
                cluster, Notifier.twitter)
        except Exception:
            # Best effort: a failure for one cluster is logged and the loop
            # moves on. ``Exception`` (rather than a bare ``except``) lets
            # KeyboardInterrupt / SystemExit propagate.
            logging.exception('Failed to tweet story for cluster %s. %s',
                              cluster.id, jobId)
Ejemplo n.º 2
0
def getCandidateDocsThroughClusters(jobId):
    """Enqueue compare-docs jobs for cluster members and their close docs.

    For every current cluster holding more than one doc, the close docs of
    each member are collected through ``DistanceTableManager.getCloseDocs``
    and every member is paired with every collected doc. A pair for which
    ``_tryGetDocDistance`` raises ``KeyError`` is treated as not compared
    yet; unless both ids are equal, a ``JOB_COMPAREDOCS`` worker job is
    enqueued for it.

    Args:
        jobId: Id of the calling job; only used to tag log messages. It is
            concatenated onto a string, so it must be a ``str``.
    """
    jobInfo = "Job id: " + jobId
    distanceTableManager = DistanceTableManager()
    clusterManager = ClusterManager()
    jobManager = MinerJobManager()

    distances = distanceTableManager.getDistanceMatrix()
    logger.info("Got the distance matrix. %s.", jobInfo)

    clusters = list(clusterManager.getCurrentClusters())
    logger.info("Got the clusters. %s.", jobInfo)

    for cluster in clusters:
        if len(cluster) <= 1:
            continue

        # Accumulate directly into a set: avoids re-copying a growing list
        # for every member and de-duplicates as we go.
        closeDocs = set()
        for doc in cluster:
            closeDocs.update(distanceTableManager.getCloseDocs(doc))

        for doc1, doc2 in itertools.product(cluster, closeDocs):
            try:
                # Only the KeyError matters here; the distance is not used.
                _tryGetDocDistance(distances, doc1, doc2)
            except KeyError:
                if doc1 == doc2:
                    continue
                job = WorkerJob(
                    JOB_COMPAREDOCS, {
                        JOBARG_COMPAREDOCS_DOC1ID: doc1,
                        JOBARG_COMPAREDOCS_DOC2ID: doc2
                    })
                jobManager.enqueueJob(job)
                logging.info(
                    "Put compare docs job with jobid: %s. doc1: %s. doc2: %s. %s",
                    job.jobId, doc1, doc2, jobInfo)
            else:
                logging.info("Docs %s and %s already compared. %s", doc1,
                             doc2, jobInfo)
Ejemplo n.º 3
0
def cleanUpDistanceTable(jobId):
    """Enqueue cleanup jobs for docs in the distance table that are stale.

    A doc id taken from a distance-table entry (``entry[0]`` / ``entry[1]``)
    is considered stale when it is not among the docs returned by
    ``ClusterManager.getCurrentDocs``. One ``JOB_CLEANUPDOC`` worker job is
    enqueued per distinct stale doc id.

    Args:
        jobId: Id of the calling job; only used to tag log messages. It is
            concatenated onto a string, so it must be a ``str``.
    """
    jobInfo = "Job id: " + jobId
    distanceTableManager = DistanceTableManager()
    clusterManager = ClusterManager()
    jobManager = MinerJobManager()

    # A set gives O(1) membership checks for the per-entry lookups below.
    currentDocs = set(clusterManager.getCurrentDocs())

    staleDocs = set()
    for entry in distanceTableManager.getEntries():
        if entry[0] not in currentDocs:
            staleDocs.add(entry[0])
        # NOTE(review): because of the ``elif``, entry[1] is not examined
        # when entry[0] is already stale. Kept as in the original; confirm
        # whether both ids should be checked independently.
        elif entry[1] not in currentDocs:
            staleDocs.add(entry[1])

    for docKey in staleDocs:
        job = WorkerJob(JOB_CLEANUPDOC, {JOBARG_CLEANUPDOC_DOCID: docKey})
        jobManager.enqueueJob(job)
        logging.info("Put cleanup doc job with id %s for docId: %s. %s",
                     job.jobId, docKey, jobInfo)

    # The count logged here is the number of distinct stale doc ids.
    logging.info("Number of stale entries in distances table: %i. %s",
                 len(staleDocs), jobInfo)
Ejemplo n.º 4
0
def processNewCluster(jobId, clusterDocs):
    """Build a ``Cluster`` from ``clusterDocs`` and hand it to ``ClusterManager``.

    Args:
        jobId: Id of the calling job; used only in the log context string.
        clusterDocs: Value passed straight to the ``Cluster`` constructor.
    """
    newCluster = Cluster(clusterDocs)
    logContext = "Cluster id: " + newCluster.id + ". Job id: " + jobId

    logger.info("Started processing new cluster. %s.", logContext)
    ClusterManager().processNewCluster(newCluster)
    logger.info("Completed processing new cluster. %s.", logContext)
Ejemplo n.º 5
0
def get_story(docId):
    """Return the JSON response for the cluster found by ``docId``.

    The doc id is upper-cased before the lookup and the filters come from
    the current request's query arguments. Calls ``abort(404)`` when the
    lookup yields a falsy result.
    """
    filters = validateFilters(request.args)
    cluster = ClusterManager().queryByDocId(docId.upper(), filters)

    if cluster:
        return getJsonResponse(cluster)

    abort(404)
Ejemplo n.º 6
0
def clusterDocs(jobId):
    """Run hierarchical clustering over the current clusters and store them.

    Fetches the distance matrix and the current clusters, passes both to
    ``clusterHierarchical`` and then writes the same cluster list back via
    ``ClusterManager.putCurrentClusters``.

    Args:
        jobId: Id of the calling job; only used to tag log messages.
    """
    jobTag = "Job id: " + jobId
    logger.info("Started clustering docs. %s.", jobTag)

    distanceTable = DistanceTableManager()
    clusterStore = ClusterManager()

    distanceMatrix = distanceTable.getDistanceMatrix()
    logger.info("Got the distance matrix. %s.", jobTag)

    currentClusters = list(clusterStore.getCurrentClusters())
    logger.info("Got the clusters. %s.", jobTag)

    logger.info("Started clustering. %s.", jobTag)
    # The return value is ignored and the same list is stored afterwards;
    # presumably clusterHierarchical updates it in place - TODO confirm.
    clusterHierarchical(jobTag, currentClusters, distanceMatrix)
    logger.info("Finished clustering. %s.", jobTag)

    clusterStore.putCurrentClusters(currentClusters)
    logger.info("Put the computed clusters. %s.", jobTag)
Ejemplo n.º 7
0
def loadStoryPage(docId):
    """Render ``home.html`` for the cluster found by ``docId``.

    The doc id is upper-cased before the lookup. When no cluster is found
    the request is redirected to the ``home`` endpoint. The first image
    across all of the cluster's articles (or ``None`` when there is none)
    is passed to ``getSharingMetaTags`` together with the title and
    description.
    """
    cluster = ClusterManager().queryByDocId(docId.upper())
    if not cluster:
        return redirect(url_for('home'))

    # Flatten the per-article image lists, keeping article order.
    flattenedImages = [
        picture
        for article in cluster['articles']
        for picture in article['images']
    ]
    if flattenedImages:
        leadImage = flattenedImages[0]
    else:
        leadImage = None

    storyTitle = cluster['title']
    storyDescription = cluster['description']
    locationsJson = json.dumps(LOCATION_METADATA)
    languagesJson = json.dumps(AVAILABLE_LANGUAGES)
    sharingTags = getSharingMetaTags(storyTitle, storyDescription, leadImage,
                                     'article')

    return render_template('home.html',
                           title=storyTitle,
                           description=storyDescription,
                           locationsMetadata=locationsJson,
                           languagesMetadata=languagesJson,
                           metaTags=sharingTags)
Ejemplo n.º 8
0
def archiveStaleDocs():
  """
  Remove the docs from the current working set.
  Run this job periodically.

  Archives old clusters through ``ClusterManager.archiveOldClusters`` and
  enqueues one ``JOB_CLEANUPDOC`` worker job for every doc key contained in
  the clusters it returns.
  """

  clusterStore = ClusterManager()
  jobQueue = MinerJobManager()

  logging.info("Archiving old clusters.")
  archivedClusters = clusterStore.archiveOldClusters()

  # Walk every doc key of every archived cluster, in cluster order.
  staleDocKeys = (
    key for archivedCluster in archivedClusters for key in archivedCluster)
  for key in staleDocKeys:
    cleanupJob = WorkerJob(JOB_CLEANUPDOC, {JOBARG_CLEANUPDOC_DOCID: key})
    jobQueue.enqueueJob(cleanupJob)
    logging.info(
      "Put cleanup doc job for docId: %s. Job id: %s", key, cleanupJob.jobId)

  logging.info("Archived old clusters and cleaned up docs in them from working set.")
Ejemplo n.º 9
0
def get_stories():
    """Return stories as JSON, queried by locale or by category and country.

    Exactly one of two query shapes is accepted:
      * ``locale`` alone (no ``country`` and no ``category``), or
      * ``country`` together with ``category`` (and no ``locale``).
    Any other combination ends in ``abort(400, "Invalid query")``.
    ``skip``/``top`` and the remaining filters are validated from the
    request's query arguments before the query runs.
    """
    args = request.args
    countryFilter = args.get('country')
    categoryFilter = args.get('category')
    localeFilter = args.get('locale')
    skip, top = validateSkipAndTop(args.get('skip'), args.get('top'))
    filters = validateFilters(args)

    clusterManager = ClusterManager()

    localeOnly = (localeFilter and not countryFilter
                  and not categoryFilter)
    if localeOnly:
        locale = validateLocale(localeFilter)
        stories = clusterManager.queryByLocale(locale, skip, top, filters)
        return getJsonResponse(stories)

    countryAndCategoryOnly = (countryFilter and categoryFilter
                              and not localeFilter)
    if countryAndCategoryOnly:
        country = validateCountry(countryFilter)
        category = validateCategory(categoryFilter)
        stories = clusterManager.queryByCategoryAndCountry(
            category, country, skip, top, filters)
        return getJsonResponse(stories)

    abort(400, "Invalid query")
Ejemplo n.º 10
0
def reprocessCurrentClusters():
  """Reprocess the current clusters via ``ClusterManager``.

  Logs before and after delegating to
  ``ClusterManager.reprocessCurrentClusters``.
  """
  clusterManager = ClusterManager()

  logging.info("Started Reprocessing current clusters.")
  # The return value was previously bound to an unused local; nothing in
  # this function consumes it, so it is simply discarded.
  clusterManager.reprocessCurrentClusters()
  logging.info("Completed Reprocessing current clusters")