def getCandidateDocsThroughClusters(jobId):
    """Queue comparison jobs for uncompared (cluster doc, close doc) pairs.

    For every current cluster holding more than one doc, gathers the union of
    the close docs of its members (as returned by
    ``DistanceTableManager.getCloseDocs``) and pairs each cluster member with
    each of them. A pair for which ``_tryGetDocDistance`` raises ``KeyError``
    is treated as not yet compared and, unless both ids are equal, a
    ``JOB_COMPAREDOCS`` worker job is enqueued for it.

    Args:
        jobId: Id of the job driving this run. It is concatenated into the
            log prefix, so it must be a string.

    Returns:
        None. The only effects are the enqueued jobs and the log records.
    """
    jobInfo = "Job id: " + jobId
    distanceTableManager = DistanceTableManager()
    clusterManager = ClusterManager()
    jobManager = MinerJobManager()

    distances = distanceTableManager.getDistanceMatrix()
    logger.info("Got the distance matrix. %s.", jobInfo)

    clusters = list(clusterManager.getCurrentClusters())
    logger.info("Got the clusters. %s.", jobInfo)

    for cluster in clusters:
        # Clusters with zero or one doc are not expanded.
        if len(cluster) <= 1:
            continue

        # Union of the close docs of every member; a set de-duplicates as we
        # go instead of concatenating lists and de-duplicating afterwards.
        closeDocs = set()
        for doc in cluster:
            closeDocs.update(distanceTableManager.getCloseDocs(doc))

        for (doc1, doc2) in itertools.product(cluster, closeDocs):
            try:
                # Only the lookup is guarded: a KeyError here is what marks
                # the pair as not compared yet.
                _tryGetDocDistance(distances, doc1, doc2)
            except KeyError:
                # A doc is never compared against itself.
                if doc1 != doc2:
                    job = WorkerJob(
                        JOB_COMPAREDOCS, {
                            JOBARG_COMPAREDOCS_DOC1ID: doc1,
                            JOBARG_COMPAREDOCS_DOC2ID: doc2
                        })
                    jobManager.enqueueJob(job)
                    # Use the module logger (not the root logger) so these
                    # records follow the same configuration as the rest.
                    logger.info(
                        "Put compare docs job with jobid: %s. doc1: %s. doc2: %s. %s",
                        job.jobId, doc1, doc2, jobInfo)
            else:
                logger.info("Docs %s and %s already compared. %s", doc1,
                            doc2, jobInfo)
def clusterDocs(jobId):
    """Run hierarchical clustering over the current clusters and store them.

    Fetches the distance matrix and the current clusters, hands both to
    ``clusterHierarchical`` and then writes the same cluster list back through
    ``ClusterManager.putCurrentClusters``. The return value of
    ``clusterHierarchical`` is not used, so it presumably updates the list in
    place (to be confirmed against its definition).

    Args:
        jobId: Id of the job driving this run. It is concatenated into the
            log prefix, so it must be a string.
    """
    jobInfo = "Job id: " + jobId
    info = logger.info
    info("Started clustering docs. %s.", jobInfo)

    # Both managers are created up front, before any data is fetched.
    distanceSource = DistanceTableManager()
    clusterStore = ClusterManager()

    docDistances = distanceSource.getDistanceMatrix()
    info("Got the distance matrix. %s.", jobInfo)

    # Materialise the clusters into a list before clustering starts.
    currentClusters = list(clusterStore.getCurrentClusters())
    info("Got the clusters. %s.", jobInfo)

    info("Started clustering. %s.", jobInfo)
    clusterHierarchical(jobInfo, currentClusters, docDistances)
    info("Finished clustering. %s.", jobInfo)

    # Persist the list that was handed to clusterHierarchical.
    clusterStore.putCurrentClusters(currentClusters)
    info("Put the computed clusters. %s.", jobInfo)