Ejemplo n.º 1
0
def main():
    """Cluster documents with KMeans on a reduced-rank (SVD) approximation.

    Steps, in order:

    1. Parse the command line (``gen_args``) and configure a stream logger.
    2. Build the vector matrix with ``corpusutil.GenerateVectors`` from one
       of three sources: ``args.opinion`` (corpus built by
       ``corpusutil.create``), ``args.corpus`` (pickled corpus) or
       ``args.indexstuff`` (pickled index, feature dict and doc ids).
    3. Compute a rank-``r`` approximation of the matrix with ``sparsesvd``
       and, when ``args.classical`` is set, scale each column to unit
       Euclidean norm.
    4. Run ``corpusutil.KMeans`` on the reduced matrix and recompute the
       centroids on the original matrix with ``corpusutil.getcentroids``.
    5. Write two HTML visualisations named after ``args.sessionid``; when
       ``args.saveint`` is set, also dump the intermediate results.

    Returns:
        None. All results are written to files in the current directory.
    """
    parser = gen_args()
    args = parser.parse_args()
    sessionid = args.sessionid
    logger = logging.getLogger(__name__)
    logger.addHandler(logging.StreamHandler())
    if args.verbose:
        logger.setLevel(logging.DEBUG)
    normalize = bool(args.classical)

    if args.opinion or args.corpus:
        # args.opinion takes precedence over args.corpus when both are given.
        if args.opinion:
            corpus = corpusutil.create(args.opinion)
        else:
            # NOTE(review): unpickling executes arbitrary code; only load
            # corpus files that come from a trusted source.
            corpus = cPickle.load(args.corpus)
        logger.debug("Number of documents in corpus: %d ", len(corpus))
        if args.stopwords:
            stopwords = args.stopwords.read().split()
        else:
            stopwords = None
        datacreator = corpusutil.GenerateVectors(
            corpus=corpus,
            mindfpercent=args.mindfpercent,
            maxdfpercent=args.maxdfpercent,
            minfrequency=args.minfrequency,
            verbose=args.verbose,
            usebigrams=args.usebigrams,
            normalize=normalize,
            tf=args.tf,
            stopwords=stopwords)
        result = datacreator.create()
        docids = result['docids']
        featuredict = result['featuredict']
    else:
        index = cPickle.load(args.indexstuff[0])
        featuredict = cPickle.load(args.indexstuff[1])
        docids = cPickle.load(args.indexstuff[2])
        # NOTE(review): `ndocs_content` is never assigned in this function;
        # unless it is a module-level global this call raises NameError.
        datacreator = corpusutil.GenerateVectors(
            index=index,
            featuredict=featuredict,
            docids=docids,
            ndocs_content=ndocs_content,
            normalize=normalize,
            tf=args.tf)
        result = datacreator.create()

    data = result['data']
    p = data.shape[0]
    n = data.shape[1]
    logger.debug(" Vectors are of dimensions: (%d,%d)", p, n)
    if args.saveint:
        with open("tfidfvectors_key_" + sessionid + '.pck', 'w') as handle:
            cPickle.dump(docids, handle)
        with open("tfidfvectors_" + sessionid + ".mtx", 'w') as handle:
            spio.mmwrite(handle, data, comment="CSC Matrix", field='real')

    # DEFAULT_RANK chosen because it works well in practice.
    DEFAULT_RANK = 250
    r = args.r
    maxr = min(p, n)
    logger.debug(" Data can have rank not greate than : %d", maxr)
    if maxr >= DEFAULT_RANK:
        # A missing, too small or too large requested rank falls back to
        # the default.
        if r is None or DEFAULT_RANK > r or r > maxr:
            r = DEFAULT_RANK
    else:
        # Matrix too small for the default rank: use half the maximum,
        # regardless of the requested rank.
        r = maxr // 2
    logger.debug(" Going to generate rank %d approximation", r)
    ut, s, vt = sparsesvd(data, r)
    red_data = ssp.csc_matrix(np.dot(ut.T, np.dot(np.diag(s), vt)))
    logger.debug(" Generated rank %d approximation", r)

    if normalize:
        logger.debug(" Normalizing columns of reduced rank matrix...")
        # Euclidean norm of every column, computed in one pass.
        col_norms = np.sqrt(
            np.asarray(red_data.multiply(red_data).sum(axis=0)).ravel())
        # Compare by value: the previous `is not 0` identity test was always
        # true, so zero-norm columns were divided by zero.
        nonzero = col_norms != 0
        invnorms = np.zeros(n)
        invnorms[nonzero] = 1.0 / col_norms[nonzero]
        diag_idx = np.arange(0, n, 1)
        diag = ssp.coo_matrix((invnorms, (diag_idx, diag_idx)),
                              shape=(n, n)).tocsc()
        # Right-multiplying by the diagonal matrix scales each column.
        red_data = red_data * diag

    logger.debug(" Doing KMeans on reduced rank matrix...")
    kmeans = corpusutil.KMeans(data=red_data, k=args.k, n=args.n,
                               delta=args.delta,
                               randomcentroids=args.randomcentroids,
                               verbose=args.verbose,
                               classical=args.classical)
    result = kmeans.run()
    clusters = result['clusters']
    centroids = result['centroids']
    centroiddict = result['centroiddict']
    if args.saveint:
        with open("redrank_clusters_" + sessionid + '.pck', 'w') as handle:
            cPickle.dump(clusters, handle)
        with open("redrank_centroids_" + sessionid + '.mtx', 'w') as handle:
            spio.mmwrite(handle, centroids, comment="CSC Matrix",
                         field='real')
    logger.info(" %d Clusters Generated ", len(clusters))

    # Centroids of the same clusters, recomputed on the original matrix.
    result = corpusutil.getcentroids(data, clusters)
    originalmat_centroids = result['centroids']
    originalmat_centroiddict = result['centroiddict']
    if args.saveint:
        with open("originalmat_centroids_" + sessionid + '.mtx', 'w') as handle:
            spio.mmwrite(handle, originalmat_centroids,
                         comment="CSC Matrix", field='real')

    # NOTE(review): `corpus` is only bound in the opinion/corpus branch; when
    # the vectors come from args.indexstuff this reference raises
    # UnboundLocalError.
    vis_output = corpusutil.genconceptclouds(centroids=centroids,
                                             centroiddict=centroiddict,
                                             featuredict=featuredict,
                                             corpus=corpus,
                                             clusters=clusters,
                                             docids=docids,
                                             sessionid=sessionid)
    concept_path = "svdkmeans-concept_clouds_" + str(sessionid) + '.html'
    with open(concept_path, 'w') as svdkmeansvis:
        svdkmeansvis.write(vis_output)

    vis_output = corpusutil.genfeatureclouds(originalmat_centroids.todense(),
                                             originalmat_centroiddict,
                                             featuredict, sessionid)
    feature_path = "svdkmeans-feature_clusters_" + str(sessionid) + '.html'
    with open(feature_path, 'w') as svdkmeansvis:
        svdkmeansvis.write(vis_output)
Ejemplo n.º 2
0
def main():
    """Cluster documents with ``SpectralClusterer`` and write HTML reports.

    Steps, in order:

    1. Parse the command line (``gen_args``) and configure a stream logger.
    2. Build the vector matrix with ``corpusutil.GenerateVectors`` from
       ``args.opinion`` (corpus built by ``corpusutil.create``),
       ``args.corpus`` (pickled corpus) or ``args.indexstuff`` (pickled
       index, feature dict and doc ids).
    3. Run ``SpectralClusterer`` on the matrix, with ``args.k`` clusters when
       given, otherwise with the ``MIN_K`` / ``MAX_K`` /
       ``SAMPLE_SIZE_PERCENT`` settings below.
    4. Compute centroids with ``corpusutil.getcentroids`` and write two HTML
       visualisations named after ``args.sessionid``.

    Returns:
        None. All results are written to files in the current directory.
    """
    parser = gen_args()
    args = parser.parse_args()
    sessionid = args.sessionid
    logger = logging.getLogger(__name__)
    logger.addHandler(logging.StreamHandler())
    if args.verbose:
        logger.setLevel(logging.DEBUG)
    if args.stopwords:
        stopwords = args.stopwords.read().split()
    else:
        stopwords = None
    normalize = bool(args.classical)

    if args.opinion or args.corpus:
        # args.opinion takes precedence over args.corpus when both are given.
        if args.opinion:
            corpus = corpusutil.create(args.opinion)
        else:
            # NOTE(review): unpickling executes arbitrary code; only load
            # corpus files that come from a trusted source.
            corpus = cPickle.load(args.corpus)
        logger.debug("Number of documents in corpus: %d ", len(corpus))
        datacreator = corpusutil.GenerateVectors(
            corpus=corpus,
            mindfpercent=args.mindfpercent,
            maxdfpercent=args.maxdfpercent,
            minfrequency=args.minfrequency,
            verbose=args.verbose,
            usebigrams=args.usebigrams,
            normalize=normalize,
            tf=args.tf,
            stopwords=stopwords)
        result = datacreator.create()
        docids = result['docids']
        featuredict = result['featuredict']
    else:
        index = cPickle.load(args.indexstuff[0])
        featuredict = cPickle.load(args.indexstuff[1])
        docids = cPickle.load(args.indexstuff[2])
        # NOTE(review): `ndocs_content` is never assigned in this function;
        # unless it is a module-level global this call raises NameError.
        datacreator = corpusutil.GenerateVectors(
            index=index,
            featuredict=featuredict,
            docids=docids,
            ndocs_content=ndocs_content,
            normalize=normalize,
            tf=args.tf)
        result = datacreator.create()

    X = result['data']
    # Arguments shared by both ways of constructing the clusterer.
    spectral_kwargs = dict(X=X,
                           usecosine=args.usecosine,
                           sigma=args.sigma,
                           n=args.n,
                           delta=args.delta,
                           randomcentroids=args.randomcentroids,
                           classical=args.classical,
                           verbose=args.verbose)
    if args.k is None:
        # No cluster count requested: hand over the fixed bounds instead.
        # NOTE(review): presumably the clusterer picks k within
        # [MIN_K, MAX_K] itself; confirm against SpectralClusterer.
        MIN_K = 2
        MAX_K = 50
        SAMPLE_SIZE_PERCENT = 100
        spectral_kwargs.update(MIN_K=MIN_K, MAX_K=MAX_K,
                               SAMPLE_SIZE_PERCENT=SAMPLE_SIZE_PERCENT)
    else:
        spectral_kwargs['k'] = args.k
    spectral = SpectralClusterer(**spectral_kwargs)
    clusters = spectral.run()

    result = corpusutil.getcentroids(X, clusters, normalize)
    centroids = result['centroids']
    centroiddict = result['centroiddict']
    logger.info(" %d Clusters Generated ", len(clusters))

    # NOTE(review): `corpus` is only bound in the opinion/corpus branch; when
    # the vectors come from args.indexstuff this reference raises
    # UnboundLocalError.
    vis_output = corpusutil.genconceptclouds(centroids=centroids,
                                             centroiddict=centroiddict,
                                             featuredict=featuredict,
                                             corpus=corpus,
                                             clusters=clusters,
                                             docids=docids,
                                             sessionid=sessionid)
    # `with` closes the report even if the write fails part-way.
    concept_path = "Spectral-concept_clouds_" + str(sessionid) + '.html'
    with open(concept_path, 'w') as report:
        report.write(vis_output)

    vis_output = corpusutil.genfeatureclouds(centroids.todense(),
                                             centroiddict,
                                             featuredict, sessionid)
    feature_path = "Spectral-feature_clusters_" + str(sessionid) + '.html'
    with open(feature_path, 'w') as report:
        report.write(vis_output)