Example 1
def run(self):
    '''Main method that drives SpectralClusterer.

    Refer to Ulrike von Luxburg's tutorial on spectral clustering
    for the notation used below.

    Returns:
        clusters: A dict mapping cluster IDs to lists of vector IDs.
    '''
    self.logger.debug('Generating degree matrix')
    self.getD()
    self.logger.debug('Generating unnormalized Laplacian matrix')
    self.getL()
    self.logger.debug('Generating normalized Laplacian matrix')
    self.getLsym()
    self.logger.debug('Generating eigenvector matrix')
    self.getU()
    self.logger.debug('Generating row-normalized eigenvector matrix')
    self.getT()
    self.logger.debug('Running KMeans')
    data = self.T.T.tocsc()
    if self.k is None:
        self.k = corpusutil.find_no_clusters(X=data,
                                             samplesize=self.SAMPLE_SIZE_PERCENT,
                                             mink=self.MIN_K,
                                             maxk=self.MAX_K,
                                             classical=self.classical,
                                             verbose=self.verbose)
        self.logger.debug('k found to be %d', self.k)
    kmeans = corpusutil.KMeans(data=data, k=self.k, n=self.n,
                               delta=self.delta,
                               randomcentroids=self.randomcentroids,
                               verbose=self.verbose,
                               classical=self.classical)
    result = kmeans.run()
    return result['clusters']
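
The helper methods getD, getL, getLsym, getU, and getT are not shown here. A minimal sketch of the quantities they correspond to, following the normalized spectral clustering recipe in von Luxburg's tutorial; dense NumPy is used for clarity (the real class works on sparse matrices), and spectral_embedding and its affinity argument A are illustrative names, not part of corpusutil:

import numpy as np
from scipy.linalg import eigh

def spectral_embedding(A, k):
    # A: symmetric affinity matrix, k: number of clusters (illustrative helper).
    d = A.sum(axis=1)                          # degrees
    D = np.diag(d)                             # degree matrix D
    L = D - A                                  # unnormalized Laplacian L
    Dih = np.diag(1.0 / np.sqrt(d))            # D^(-1/2); assumes no isolated nodes
    Lsym = Dih.dot(L).dot(Dih)                 # normalized Laplacian L_sym
    vals, vecs = eigh(Lsym)                    # eigenvalues in ascending order
    U = vecs[:, :k]                            # eigenvectors of the k smallest eigenvalues
    T = U / np.linalg.norm(U, axis=1, keepdims=True)  # row-normalize: the T matrix
    return T                                   # rows of T are then clustered with KMeans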
Example 2
import logging
import cPickle

import scipy.io as spio

import corpusutil

def main():
    parser = gen_args()
    args = parser.parse_args()
    sessionid = args.sessionid
    logger = logging.getLogger(__name__)
    logger.addHandler(logging.StreamHandler())
    if args.verbose:
        logger.setLevel(logging.DEBUG)
    normalize = bool(args.classical)
    if args.stopwords is None:
        stopwords = None
    else:
        stopwords = args.stopwords.read().split()
    if args.opinion or args.corpus:
        if args.opinion:
            corpus = corpusutil.create(args.opinion)
        else:
            corpus = cPickle.load(args.corpus)
        logger.debug("Number of documents in corpus: %d", len(corpus))
        datacreator = corpusutil.GenerateVectors(corpus=corpus,
                                                 mindfpercent=args.mindfpercent,
                                                 maxdfpercent=args.maxdfpercent,
                                                 minfrequency=args.minfrequency,
                                                 verbose=args.verbose,
                                                 usebigrams=args.usebigrams,
                                                 normalize=normalize,
                                                 tf=args.tf,
                                                 stopwords=stopwords)
        result = datacreator.create()
        docids = result['docids']
        featuredict = result['featuredict']
    else:
        index = cPickle.load(args.indexstuff[0])
        featuredict = cPickle.load(args.indexstuff[1])
        docids = cPickle.load(args.indexstuff[2])
        # Bug in the original: ndocs_content was used without being defined.
        # Assumed here to be pickled as the fourth indexstuff file.
        ndocs_content = cPickle.load(args.indexstuff[3])
        datacreator = corpusutil.GenerateVectors(index=index,
                                                 featuredict=featuredict,
                                                 docids=docids,
                                                 ndocs_content=ndocs_content,
                                                 normalize=normalize,
                                                 tf=args.tf)
        result = datacreator.create()

    data = result['data']
    if args.k is None:
        SAMPLE_SIZE_PERCENT = 50
        MIN_K = 2
        MAX_K = 50
        logger.debug('k not set; finding k using sample size: %d%%',
                     SAMPLE_SIZE_PERCENT)
        k = corpusutil.find_no_clusters(X=data,
                                        samplesize=SAMPLE_SIZE_PERCENT,
                                        mink=MIN_K,
                                        maxk=MAX_K,
                                        verbose=args.verbose,
                                        classical=args.classical)
    else:
        k = args.k
    if args.saveint:
        cPickle.dump(docids, open("data_key_" + sessionid + '.pck', 'wb'))
        spio.mmwrite(open("data_" + sessionid + '.mtx', 'w'),
                     data, comment="CSC Matrix", field='real')
    kmeans = corpusutil.KMeans(data=data, k=k, n=args.n,
                               delta=args.delta,
                               randomcentroids=args.randomcentroids,
                               verbose=args.verbose,
                               classical=args.classical)
    result = kmeans.run()
    clusters = result['clusters']
    centroids = result['centroids']
    centroiddict = result['centroiddict']
    if args.saveint:
        cPickle.dump(clusters, open("data_clusters_" + sessionid + '.pck', 'wb'))
        spio.mmwrite(open("data_centroids_" + sessionid + '.mtx', 'w'),
                     centroids, comment="CSC Matrix", field='real')
    logger.info("%d clusters generated", len(clusters))
    # Note: corpus is defined only in the opinion/corpus branch above.
    vis_output = corpusutil.genconceptclouds(centroids=centroids,
                                             centroiddict=centroiddict,
                                             featuredict=featuredict,
                                             corpus=corpus,
                                             clusters=clusters,
                                             docids=docids,
                                             sessionid=sessionid)
    kmeansvis = open("kmeans-concept_clouds_" + sessionid + '.html', 'w')
    kmeansvis.write(vis_output)
    kmeansvis.close()
    vis_output = corpusutil.genfeatureclouds(centroids.todense(), centroiddict,
                                             featuredict, sessionid)
    kmeansvis = open("kmeans-feature_clusters_" + sessionid + '.html', 'w')
    kmeansvis.write(vis_output)
    kmeansvis.close()
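
gen_args() is defined elsewhere in the project. A rough sketch of an argparse setup that would supply the attributes main() reads; the option names mirror those attributes, but the defaults, types, and help strings are assumptions, not the project's actual interface:

import argparse

def gen_args():
    # Hypothetical parser covering the options main() consumes.
    parser = argparse.ArgumentParser(description='KMeans clustering of a corpus')
    parser.add_argument('-sessionid', default='session', help='tag for output files')
    parser.add_argument('-k', type=int, help='number of clusters; estimated if omitted')
    parser.add_argument('-n', type=int, default=300, help='max KMeans iterations')
    parser.add_argument('-delta', type=float, default=0.005, help='convergence threshold')
    parser.add_argument('-opinion', type=argparse.FileType('r'), help='raw opinion file')
    parser.add_argument('-corpus', type=argparse.FileType('r'), help='pickled corpus')
    parser.add_argument('-indexstuff', nargs='+', type=argparse.FileType('r'),
                        help='pickled index, featuredict, docids, ndocs_content')
    parser.add_argument('-stopwords', type=argparse.FileType('r'))
    parser.add_argument('-mindfpercent', type=float, default=0.05)
    parser.add_argument('-maxdfpercent', type=float, default=99.0)
    parser.add_argument('-minfrequency', type=int, default=2)
    parser.add_argument('-tf', action='store_true', help='use raw term frequency')
    parser.add_argument('-usebigrams', action='store_true')
    parser.add_argument('-classical', action='store_true')
    parser.add_argument('-randomcentroids', action='store_true')
    parser.add_argument('-saveint', action='store_true', help='save intermediate files')
    parser.add_argument('-verbose', action='store_true')
    return parser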