Example #1
import cPickle
import logging

import corpusutil  # project-local helper module; gen_args() is assumed to come from the same project


def main():
    parser = gen_args()
    args = parser.parse_args()
    sessionid = args.sessionid
    logger = logging.getLogger(__name__)
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.DEBUG)
    # Build the corpus either from a raw opinion file or from a pickled corpus.
    if args.opinion:
        corpus = corpusutil.create(args.opinion)
    else:
        corpus = cPickle.load(args.corpus)
    logger.debug("Number of documents in corpus: %d", len(corpus))
    logger.debug("Going to create subsets")
    subset = corpusutil.generate_subset(corpus, doctype=args.doctype,
                                        product=args.product,
                                        version=args.version,
                                        platform=args.platform,
                                        locale=args.locale,
                                        manufacturer=args.manufacturer,
                                        device=args.device)
    logger.debug("Number of documents in subset: %d", len(subset))
    with open('subset-' + sessionid + '.pck', 'w') as out:
        cPickle.dump(subset, out)
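
gen_args() is defined elsewhere in the project and not shown on this page. For orientation, here is a minimal sketch of what it might look like for Example #1, built with argparse; every flag name below is inferred from the args.* accesses above and should be treated as an assumption, not the project's actual parser:

import argparse

def gen_args():
    # Hypothetical reconstruction: flag names inferred from the args.* accesses in main().
    parser = argparse.ArgumentParser(description='Generate a corpus subset.')
    parser.add_argument('--sessionid', required=True)
    parser.add_argument('--opinion', type=argparse.FileType('r'),
                        help='raw opinion file to build a corpus from')
    parser.add_argument('--corpus', type=argparse.FileType('r'),
                        help='pickled corpus (used when --opinion is absent)')
    for attr in ('doctype', 'product', 'version', 'platform',
                 'locale', 'manufacturer', 'device'):
        parser.add_argument('--' + attr, default=None)
    return parser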
Example #2
import math

import cPickle
import logging

import numpy as np
import scipy.io as spio
import scipy.sparse as ssp
from sparsesvd import sparsesvd

import corpusutil  # project-local helper module; gen_args() is assumed to come from the same project


def main():
    parser = gen_args()
    args = parser.parse_args()
    sessionid = args.sessionid
    logger = logging.getLogger(__name__)
    logger.addHandler(logging.StreamHandler())
    if args.verbose:
        logger.setLevel(logging.DEBUG)
    normalize = bool(args.classical)
    if args.opinion or args.corpus:
        if args.opinion:
            corpus = corpusutil.create(args.opinion)
        else:
            corpus = cPickle.load(args.corpus)
        logger.debug("Number of documents in corpus: %d", len(corpus))
        stopwords = args.stopwords.read().split() if args.stopwords else None
        datacreator = corpusutil.GenerateVectors(corpus=corpus,
                                                 mindfpercent=args.mindfpercent,
                                                 maxdfpercent=args.maxdfpercent,
                                                 minfrequency=args.minfrequency,
                                                 verbose=args.verbose,
                                                 usebigrams=args.usebigrams,
                                                 normalize=normalize,
                                                 tf=args.tf,
                                                 stopwords=stopwords)
        result = datacreator.create()
        docids = result['docids']
        featuredict = result['featuredict']
    else:
        index = cPickle.load(args.indexstuff[0])
        featuredict = cPickle.load(args.indexstuff[1])
        docids = cPickle.load(args.indexstuff[2])
        # NOTE: ndocs_content (and corpus, used further down) are undefined in
        # this branch of the original code; they presumably have to be loaded
        # alongside the index.
        datacreator = corpusutil.GenerateVectors(index=index,
                                                 featuredict=featuredict,
                                                 docids=docids,
                                                 ndocs_content=ndocs_content,
                                                 normalize=normalize,
                                                 tf=args.tf)
        result = datacreator.create()
    data = result['data']
    p = data.shape[0]
    n = data.shape[1]
    logger.debug("Vectors are of dimensions: (%d,%d)", p, n)
    if args.saveint:
        cPickle.dump(docids, open("tfidfvectors_key_" + sessionid + '.pck', 'w'))
        spio.mmwrite(open("tfidfvectors_" + sessionid + ".mtx", 'w'),
                     data, comment="CSC Matrix", field='real')
    # DEFAULT_RANK chosen because it works well in practice.
    DEFAULT_RANK = 250
    r = args.r
    maxr = min(p, n)
    logger.debug("Data cannot have rank greater than: %d", maxr)
    if maxr >= DEFAULT_RANK:
        if DEFAULT_RANK > r or r > maxr:
            r = DEFAULT_RANK
    else:
        r = int(maxr / 2)
    logger.debug("Going to generate rank %d approximation", r)
    ut, s, vt = sparsesvd(data, r)
    red_data = ssp.csc_matrix(np.dot(ut.T, np.dot(np.diag(s), vt)))
    logger.debug("Generated rank %d approximation", r)
    if normalize:
        logger.debug("Normalizing columns of reduced rank matrix...")
        # Scale each column to unit length via a diagonal matrix of inverse norms.
        invnorms = np.zeros(n)
        normsii = np.arange(0, n, 1)
        normsjj = np.arange(0, n, 1)
        for col in range(n):
            invnorms[col] = math.sqrt((red_data[:, col].T * red_data[:, col]).todense())
            if invnorms[col] != 0:
                invnorms[col] = 1 / invnorms[col]
        diag = ssp.coo_matrix((invnorms, (normsii, normsjj)), shape=(n, n)).tocsc()
        red_data = red_data * diag
    logger.debug("Doing KMeans on reduced rank matrix...")
    kmeans = corpusutil.KMeans(data=red_data, k=args.k, n=args.n,
                               delta=args.delta,
                               randomcentroids=args.randomcentroids,
                               verbose=args.verbose,
                               classical=args.classical)
    result = kmeans.run()
    clusters = result['clusters']
    centroids = result['centroids']
    centroiddict = result['centroiddict']
    if args.saveint:
        cPickle.dump(clusters, open("redrank_clusters_" + sessionid + '.pck', 'w'))
        spio.mmwrite(open("redrank_centroids_" + sessionid + '.mtx', 'w'),
                     centroids, comment="CSC Matrix", field='real')
    logger.info("%d Clusters Generated", len(clusters))
    result = corpusutil.getcentroids(data, clusters)
    originalmat_centroids = result['centroids']
    originalmat_centroiddict = result['centroiddict']
    if args.saveint:
        spio.mmwrite(open("originalmat_centroids_" + sessionid + '.mtx', 'w'),
                     originalmat_centroids, comment="CSC Matrix", field='real')
    vis_output = corpusutil.genconceptclouds(centroids=centroids,
                                             centroiddict=centroiddict,
                                             featuredict=featuredict,
                                             corpus=corpus,
                                             clusters=clusters,
                                             docids=docids,
                                             sessionid=sessionid)
    svdkmeansvis = open("svdkmeans-concept_clouds_" + str(sessionid) + '.html', 'w')
    svdkmeansvis.write(vis_output)
    svdkmeansvis.close()
    vis_output = corpusutil.genfeatureclouds(originalmat_centroids.todense(),
                                             originalmat_centroiddict,
                                             featuredict, sessionid)
    svdkmeansvis = open("svdkmeans-feature_clusters_" + str(sessionid) + '.html', 'w')
    svdkmeansvis.write(vis_output)
    svdkmeansvis.close()
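
The numeric core of Example #2 is a truncated SVD of the sparse term-document matrix followed by a rank-r reconstruction, and it can be tried in isolation. A minimal self-contained sketch using the same sparsesvd call as above; the random matrix and the rank are illustrative only:

import numpy as np
import scipy.sparse as ssp
from sparsesvd import sparsesvd

# Small random sparse matrix standing in for the TF-IDF data (features x docs).
rng = np.random.RandomState(0)
data = ssp.csc_matrix(rng.rand(20, 30))

r = 5  # target rank, kept well below min(data.shape)
ut, s, vt = sparsesvd(data, r)  # ut: (r, 20), s: (r,), vt: (r, 30)
red_data = ssp.csc_matrix(np.dot(ut.T, np.dot(np.diag(s), vt)))

# The rank-r reconstruction is the best rank-r approximation in Frobenius norm.
err = np.linalg.norm(data.todense() - red_data.todense(), 'fro')
print "rank-%d approximation error: %f" % (r, err)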
Example #3
import cPickle
import logging

import corpusutil  # project-local; gen_args() and SpectralClusterer are assumed to come from the same project


def main():
    parser = gen_args()
    args = parser.parse_args()
    sessionid = args.sessionid
    logger = logging.getLogger(__name__)
    logger.addHandler(logging.StreamHandler())
    if args.verbose:
        logger.setLevel(logging.DEBUG)
    if args.stopwords:
        stopwords = args.stopwords.read().split()
    else:
        stopwords = None
    normalize = bool(args.classical)
    if args.opinion or args.corpus:
        if args.opinion:
            corpus = corpusutil.create(args.opinion)
        else:
            corpus = cPickle.load(args.corpus)
        logger.debug("Number of documents in corpus: %d", len(corpus))
        datacreator = corpusutil.GenerateVectors(corpus=corpus,
                                                 mindfpercent=args.mindfpercent,
                                                 maxdfpercent=args.maxdfpercent,
                                                 minfrequency=args.minfrequency,
                                                 verbose=args.verbose,
                                                 usebigrams=args.usebigrams,
                                                 normalize=normalize,
                                                 tf=args.tf,
                                                 stopwords=stopwords)
        result = datacreator.create()
        docids = result['docids']
        featuredict = result['featuredict']
    else:
        index = cPickle.load(args.indexstuff[0])
        featuredict = cPickle.load(args.indexstuff[1])
        docids = cPickle.load(args.indexstuff[2])
        # NOTE: ndocs_content (and corpus, used further down) are undefined in
        # this branch of the original code.
        datacreator = corpusutil.GenerateVectors(index=index,
                                                 featuredict=featuredict,
                                                 docids=docids,
                                                 ndocs_content=ndocs_content,
                                                 normalize=normalize,
                                                 tf=args.tf)
        result = datacreator.create()
    X = result['data']
    if args.k is None:
        # No k supplied: let the clusterer search for one in [MIN_K, MAX_K].
        MIN_K = 2
        MAX_K = 50
        SAMPLE_SIZE_PERCENT = 100
        spectral = SpectralClusterer(X=X, usecosine=args.usecosine,
                                     sigma=args.sigma, n=args.n,
                                     delta=args.delta, MIN_K=MIN_K, MAX_K=MAX_K,
                                     SAMPLE_SIZE_PERCENT=SAMPLE_SIZE_PERCENT,
                                     randomcentroids=args.randomcentroids,
                                     classical=args.classical,
                                     verbose=args.verbose)
    else:
        spectral = SpectralClusterer(X=X, usecosine=args.usecosine,
                                     sigma=args.sigma, k=args.k, n=args.n,
                                     delta=args.delta,
                                     randomcentroids=args.randomcentroids,
                                     classical=args.classical,
                                     verbose=args.verbose)
    clusters = spectral.run()
    result = corpusutil.getcentroids(X, clusters, normalize)
    centroids = result['centroids']
    centroiddict = result['centroiddict']
    logger.info("%d Clusters Generated", len(clusters))
    vis_output = corpusutil.genconceptclouds(centroids=centroids,
                                             centroiddict=centroiddict,
                                             featuredict=featuredict,
                                             corpus=corpus,
                                             clusters=clusters,
                                             docids=docids,
                                             sessionid=sessionid)
    kmeansvis = open("Spectral-concept_clouds_" + str(sessionid) + '.html', 'w')
    kmeansvis.write(vis_output)
    kmeansvis.close()
    vis_output = corpusutil.genfeatureclouds(centroids.todense(), centroiddict,
                                             featuredict, sessionid)
    kmeansvis = open("Spectral-feature_clusters_" + str(sessionid) + '.html', 'w')
    kmeansvis.write(vis_output)
    kmeansvis.close()
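
SpectralClusterer itself is project-local and not shown here. For orientation, this is a sketch of the standard normalized spectral-embedding steps such a clusterer typically performs; the function name and the Gaussian affinity below are illustrative, not the project's actual implementation:

import numpy as np

def spectral_embed(X, k, sigma=1.0):
    # X: (features x docs) dense array; columns are documents.
    sq = np.sum(X ** 2, axis=0)
    # Gaussian affinity between documents (usecosine above suggests a cosine
    # affinity is another option in the real code).
    d2 = sq[:, None] + sq[None, :] - 2 * np.dot(X.T, X)
    A = np.exp(-d2 / (2 * sigma ** 2))
    np.fill_diagonal(A, 0)
    # Symmetrically normalized affinity: D^{-1/2} A D^{-1/2}.
    dinv = 1.0 / np.sqrt(A.sum(axis=1))
    An = A * dinv[:, None] * dinv[None, :]
    # Top-k eigenvectors, rows renormalized, form the spectral embedding.
    w, v = np.linalg.eigh(An)
    Z = v[:, -k:]
    Z = Z / np.sqrt((Z ** 2).sum(axis=1))[:, None]
    return Z

The rows of Z would then be clustered with k-means, exactly as the other examples on this page do.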
Example #4
import cPickle
import logging

import scipy.io as spio

import corpusutil  # project-local helper module; gen_args() is assumed to come from the same project


def main():
    parser = gen_args()
    args = parser.parse_args()
    sessionid = args.sessionid
    logger = logging.getLogger(__name__)
    logger.addHandler(logging.StreamHandler())
    if args.verbose:
        logger.setLevel(logging.DEBUG)
    if args.stopwords:
        stopwords = args.stopwords.read().split()
    else:
        stopwords = None
    if args.opinion:
        corpus = corpusutil.create(args.opinion)
    else:
        corpus = cPickle.load(args.corpus)
    spectralcc = corpusutil.SpectralCoClusterer(
        corpus=corpus,
        mindfpercent=args.mindfpercent,
        maxdfpercent=args.maxdfpercent,
        minfrequency=args.minfrequency,
        verbose=args.verbose,
        usebigrams=args.usebigrams,
        tf=args.tf,
        stopwords=stopwords,
        k=args.k,
        n=args.n,
        delta=args.delta,
        randomcentroids=args.randomcentroids,
        sessionid=sessionid,
        classical=args.classical,
    )

    result = spectralcc.run()
    fclouds = result["fclouds"]
    docclouds = result["docclouds"]
    A = result["A"]
    An = result["An"]
    Z = result["Z"]
    spio.mmwrite(open("A_" + sessionid + ".mtx", "w"), A,
                 comment="CSC Matrix", field="real")
    if args.saveint:
        spio.mmwrite(open("An_" + sessionid + ".mtx", "w"), An,
                     comment="CSC Matrix", field="real")
        spio.mmwrite(open("Z_" + sessionid + ".mtx", "w"), Z,
                     comment="CSC Matrix", field="real")
    kmeansvis = open("spectral-concept_clouds_" + str(sessionid) + ".html", "w")
    kmeansvis.write(docclouds)
    kmeansvis.close()
    kmeansvis = open("spectral-features_clusters_" + str(sessionid) + ".txt", "w")
    kmeansvis.write(fclouds)
    kmeansvis.close()
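
The .mtx files written above are plain MatrixMarket files, so they can be inspected or reloaded independently of this script; for reference, scipy reads them back directly (the sessionid in the filename is a placeholder):

import scipy.io as spio

# "mysession" stands in for whatever sessionid the run used.
A = spio.mmread("A_mysession.mtx").tocsc()
print A.shape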
Example #5
import cPickle
import logging

import scipy.io as spio

import corpusutil  # project-local helper module; gen_args() is assumed to come from the same project


def main():
    parser = gen_args()
    args = parser.parse_args()
    sessionid = args.sessionid
    logger = logging.getLogger(__name__)
    logger.addHandler(logging.StreamHandler())
    if args.verbose:
        logger.setLevel(logging.DEBUG)
    normalize = bool(args.classical)
    if args.stopwords is None:
        stopwords = None
    else:
        stopwords = args.stopwords.read().split()
    if args.opinion or args.corpus:
        if args.opinion:
            corpus = corpusutil.create(args.opinion)
        else:
            corpus = cPickle.load(args.corpus)
        logger.debug("Number of documents in corpus: %d", len(corpus))
        datacreator = corpusutil.GenerateVectors(corpus=corpus,
                                                 mindfpercent=args.mindfpercent,
                                                 maxdfpercent=args.maxdfpercent,
                                                 minfrequency=args.minfrequency,
                                                 verbose=args.verbose,
                                                 usebigrams=args.usebigrams,
                                                 normalize=normalize,
                                                 tf=args.tf,
                                                 stopwords=stopwords)
        result = datacreator.create()
        docids = result['docids']
        featuredict = result['featuredict']
    else:
        index = cPickle.load(args.indexstuff[0])
        featuredict = cPickle.load(args.indexstuff[1])
        docids = cPickle.load(args.indexstuff[2])
        # NOTE: ndocs_content (and corpus, used further down) are undefined in
        # this branch of the original code; they presumably have to be loaded
        # alongside the index.
        datacreator = corpusutil.GenerateVectors(index=index,
                                                 featuredict=featuredict,
                                                 docids=docids,
                                                 ndocs_content=ndocs_content,
                                                 normalize=normalize,
                                                 tf=args.tf)
        result = datacreator.create()

    data = result['data']
    if args.k is None:
        # No k supplied: estimate it from a sample of the data.
        SAMPLE_SIZE_PERCENT = 50
        MIN_K = 2
        MAX_K = 50
        logger.debug('k not set, finding k using sample size: %f',
                     SAMPLE_SIZE_PERCENT)
        k = corpusutil.find_no_clusters(X=data,
                                        samplesize=SAMPLE_SIZE_PERCENT,
                                        mink=MIN_K, maxk=MAX_K,
                                        verbose=args.verbose,
                                        classical=args.classical)
    else:
        k = args.k
    if args.saveint:
        cPickle.dump(docids, open("data_key_" + sessionid + '.pck', 'w'))
        spio.mmwrite(open("data_" + sessionid + ".mtx", 'w'),
                     data, comment="CSC Matrix", field='real')
    kmeans = corpusutil.KMeans(data=data, k=k, n=args.n,
                               delta=args.delta,
                               randomcentroids=args.randomcentroids,
                               verbose=args.verbose,
                               classical=args.classical)
    result = kmeans.run()
    clusters = result['clusters']
    centroids = result['centroids']
    centroiddict = result['centroiddict']
    if args.saveint:
        cPickle.dump(clusters, open("data_clusters_" + sessionid + '.pck', 'w'))
        spio.mmwrite(open("data_centroids_" + sessionid + '.mtx', 'w'),
                     centroids, comment="CSC Matrix", field='real')
    logger.info("%d Clusters Generated", len(clusters))
    vis_output = corpusutil.genconceptclouds(centroids=centroids,
                                             centroiddict=centroiddict,
                                             featuredict=featuredict,
                                             corpus=corpus,
                                             clusters=clusters,
                                             docids=docids,
                                             sessionid=sessionid)
    kmeansvis = open("kmeans-concept_clouds_" + str(sessionid) + '.html', 'w')
    kmeansvis.write(vis_output)
    kmeansvis.close()
    vis_output = corpusutil.genfeatureclouds(centroids.todense(), centroiddict,
                                             featuredict, sessionid)
    kmeansvis = open("kmeans-feature_clusters_" + str(sessionid) + '.html', 'w')
    kmeansvis.write(vis_output)
    kmeansvis.close()