Example #1
0
def main_cluster(rootpath, folderpath, vectorizer, numclusters):
    """Run the clustering pipeline on the ``final`` subfolder of *folderpath*.

    The steps run in a fixed order: get the k-means model, build the
    doc-to-label and label-to-doc mappings with that model, then store the
    tweets for the clusters. Progress messages and the time spent getting
    the k-means model are printed to stdout.

    Parameters
    ----------
    rootpath
        Passed to both ``Utility.PreprocessData`` and
        ``Clustering.GetCluster``.
    folderpath : str or path-like
        Folder whose ``final`` subfolder is handed to every pipeline step.
    vectorizer
        Forwarded unchanged to ``Clustering.GetCluster``.
        NOTE(review): presumably a vectorizer name such as 'mean' or
        'tfidf' -- confirm against ``Clustering.GetCluster``.
    numclusters
        Forwarded unchanged to ``getKmeans``; presumably the number of
        clusters (int) -- confirm.

    Returns
    -------
    None

    """
    final_dir = os.path.join(folderpath, 'final')
    preprocessor = Utility.PreprocessData(rootpath)
    clusterer = Clustering.GetCluster(vectorizer, rootpath)

    # get the kmeans model
    print("Getting the k-means model...")
    # perf_counter() is monotonic, so the measured duration cannot be
    # skewed by system clock adjustments the way time.time() deltas can.
    start = time.perf_counter()
    km = clusterer.getKmeans(final_dir, numclusters)
    elapsed = time.perf_counter() - start
    print("---------- K-means: {} seconds ----------".format(elapsed))

    # get the doc2Label
    print("Getting doc to label...")
    clusterer.getDoc2Label(final_dir, km)

    # get Label2Doc
    print("Getting label to doc...")
    clusterer.getLabel2Doc(final_dir, km)

    # get tweets.pkl for each clusters
    print("Storing tweets for clusters...")
    preprocessor.storeTweets4Clusters(final_dir)
Example #2
0
    def getCluster(self, vectorizer, numclusters):
        """Run the clustering pipeline on ``<self.folderpath>/final``.

        The steps run in a fixed order: get the k-means model, build the
        doc-to-label and label-to-doc mappings with that model, then store
        the tweets for the clusters. Progress messages and the time spent
        getting the k-means model are printed to stdout.

        Reads ``self.folderpath`` (parent of the ``final`` folder) and
        ``self.rootpath`` (passed to ``Utility.PreprocessData`` and
        ``Clustering.GetCluster``).

        Parameters
        ----------
        vectorizer : str
            the vectorizer used in addressing word2vec
            options: 'mean', 'tfidf'
        numclusters : int
            the number of clusters

        Returns
        -------
        None

        """
        final_dir = os.path.join(self.folderpath, 'final')
        preprocessor = Utility.PreprocessData(self.rootpath)
        clusterer = Clustering.GetCluster(vectorizer, self.rootpath)

        # get the kmeans model
        print("Getting the k-means model...")
        # perf_counter() is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments the way time.time() deltas can.
        start = time.perf_counter()
        km = clusterer.getKmeans(final_dir, numclusters)
        elapsed = time.perf_counter() - start
        print("---------- K-means: {} seconds ----------".format(elapsed))

        # get the doc2Label
        print("Getting doc to label...")
        clusterer.getDoc2Label(final_dir, km)

        # get Label2Doc
        print("Getting label to doc...")
        clusterer.getLabel2Doc(final_dir, km)

        # get tweets.pkl for each clusters
        print("Storing tweets for clusters...")
        preprocessor.storeTweets4Clusters(final_dir)