Example #1
    def run(self):
        res = Response()
        params = self.params

        max_thresh = cast_int(params['max_thresh'])
        n = cast_int(params['n'])
        data = pd.DataFrame.from_dict(json.loads(params['data']))
        viz_df = pd.DataFrame.from_dict(json.loads(params['viz_df']))
        cluster_method = params['clusteringMethod']
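        # The client round-trips the linkage matrix as a flat comma-separated
        # string; rebuild the SciPy-style array of n - 1 merge steps, each row
        # holding [cluster_a, cluster_b, distance, new_cluster_size].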
        if cluster_method == "hac":
            linkage_matrix = np.array([
                float(x) for x in params['linkage_matrix'].split(',')
            ]).reshape(n - 1, 4)
        else:
            linkage_matrix = None
        height = cast_int(params['threshold']) if cluster_method == "hac" else None
        k = cast_int(params['threshold']) if cluster_method == "kmeans" else None
        min_cluster_size = cast_int(params['minClusterSize'])
        topics_per_cluster = cast_int(params['topicsPerCluster'])

        # Recluster
        self.status = 'Reclustering...'
        data, cluster_df = topex.recluster(
            data,
            viz_df,
            linkage_matrix=linkage_matrix,
            cluster_method=cluster_method,
            height=height,
            k=k,
            min_cluster_size=min_cluster_size,
            topics_per_cluster=topics_per_cluster,
            show_chart=False)
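        # Propagate the new cluster assignments and validity flags to the
        # visualization frame so the redrawn plot matches the reclustering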
        viz_df['cluster'] = data.cluster
        viz_df['valid'] = data.valid

        # Return
        res.viz_df = viz_df.to_json()
        # Only return the needed subset of data columns
        res.data = data[[
            'id', 'text', 'tokens', 'phrase', 'vec', 'cluster', 'valid'
        ]].to_json()
        res.linkage_matrix = ([list(row) for row in linkage_matrix]
                              if linkage_matrix is not None else [])
        res.main_cluster_topics = list(cluster_df.topics)
        res.count = len(data)
        res.max_thresh = max_thresh
        res.thresh = height if cluster_method == "hac" else k

        self.result = dict(res)
        self.status = 'Complete'
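Both examples rely on a few helpers from the surrounding app that are not shown
in these listings: cast_int, str_valid, and the Response container. Their real
definitions aren't included here, but a minimal sketch consistent with how they
are used above (cast_int tolerating missing or malformed params, str_valid
rejecting empty values, and Response acting like a dict with attribute access so
that dict(res) at the end captures every assigned field) might look like this:

from typing import Any, Optional

def cast_int(value: Any) -> Optional[int]:
    # Request params arrive as strings; return None rather than raising
    # when the value is missing or not an integer.
    try:
        return int(value)
    except (TypeError, ValueError):
        return None

def str_valid(value: Any) -> bool:
    # Treat absent, empty, and literal "null" values as invalid (assumption
    # about what the front end sends for unset fields).
    return isinstance(value, str) and value.strip() not in ('', 'null')

class Response(dict):
    # Dict with attribute-style access: res.count = 5 stores res['count'],
    # so dict(res) serializes everything assigned in run().
    def __init__(self):
        super().__init__()
        self.msg = ''  # warnings are appended with res.msg += ...
    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError as exc:
            raise AttributeError(key) from exc
    def __setattr__(self, key, value):
        self[key] = value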
Example #2
    def run(self):
        res = Response()
        params = self.params
        files = self.files

        # Process input from request
        self.status = 'Loading files'
        names = []
        docs = []
        for file in files:
            fileob = files[file]
            print(f"File: {fileob}")
            if fileob.content_type == 'application/json':
                scriptArgs = json.loads(fileob.stream.read())
            else:
                fileText = fileob.read().decode()
                docs.append(fileText)
                names.append(fileob.filename)
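        # Flatten line breaks so each document is a single continuous string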
        docs = [doc.replace('\n', ' ').replace('\r', ' ') for doc in docs]
        df = pd.DataFrame(dict(doc_name=names, text=docs))

        self.status = 'Parsing params'
        stopwords = ([s.strip() for s in params['stopwords'].split('\n')]
                     if str_valid(params['stopwords']) else None)
        window_size = cast_int(params['windowSize'])
        vectorization_method = params['wordVectorType'] if str_valid(
            params['wordVectorType']) else 'svd'
        dimensions = cast_int(params['dimensions'])
        tfidf_corpus = params['tfidfCorpus'] if str_valid(
            params['tfidfCorpus']) else 'both'
        include_sentiment = params['include_sentiment'] != 'false'
        custom_stopwords_only = params['custom_stopwords_only'] != 'false'

        clustering_method = params['clusteringMethod']
        cluster_dist_metric = params['cluster_dist_metric'] if str_valid(
            params['cluster_dist_metric']) else 'euclidean'
        height = cast_int(params['threshold']) if clustering_method == "hac" else None
        k = cast_int(params['threshold']) if clustering_method == "kmeans" else None

        visualization_method = params['visualizationMethod'] if str_valid(
            params['visualizationMethod']) else 'umap'
        viz_dist_metric = params['viz_dist_metric'] if str_valid(
            params['viz_dist_metric']) else 'cosine'
        umap_neighbors = cast_int(params['umap_neighbors'])

        if str_valid(params['expansionCorpus']):
            # str.rstrip("<newdoc>") strips characters, not a suffix, and would
            # corrupt the last document; drop a single trailing delimiter instead
            expansionCorpus = params['expansionCorpus']
            if expansionCorpus.endswith("<newdoc>"):
                expansionCorpus = expansionCorpus[:-len("<newdoc>")]
            expansion_docs = expansionCorpus.split("<newdoc>") if expansionCorpus else []
            expansion_names = [
                f"expansion_{i}" for i in range(len(expansion_docs))
            ]
            expansion_df = pd.DataFrame(
                dict(doc_name=expansion_names, text=expansion_docs))
        else:
            expansion_df = None
            # With no expansion corpus, TF-IDF can only be built from the
            # clustering documents themselves
            tfidf_corpus = 'clustering'

        # Cluster the sentences in a dataframe
        self.status = 'Importing data'
        data, doc_df = topex.import_data(
            df,
            save_results=False,
            file_name=None,
            stop_words_list=stopwords,
            custom_stopwords_only=custom_stopwords_only)
        self.status = 'Creating TF-IDF'
        tfidf, dictionary = topex.create_tfidf(tfidf_corpus,
                                               doc_df,
                                               expansion_df=expansion_df)

        # Guard against requesting more dimensions than the TF-IDF matrix offers
        if dimensions is None or dimensions >= tfidf.shape[1]:
            new_dim = 2 if vectorization_method == 'umap' else min(200, tfidf.shape[1] - 1)
            res.msg += f"Dimensions changed from {dimensions} to {new_dim}.\n"
            dimensions = new_dim

        self.status = 'Getting phrases'
        data = topex.get_phrases(data,
                                 dictionary.token2id,
                                 tfidf,
                                 tfidf_corpus=tfidf_corpus,
                                 window_size=window_size,
                                 include_sentiment=include_sentiment)
        self.status = 'Vectorizing phrases'
        data = topex.get_vectors(vectorization_method,
                                 data,
                                 dictionary=dictionary,
                                 tfidf=tfidf,
                                 dimensions=dimensions,
                                 umap_neighbors=umap_neighbors)

        if clustering_method == 'kmeans' and k is not None and k > len(data):
            res.msg += f"k exceeds number of sentences. Changed from {k} to {len(data)}.\n"
            k = len(data)

        self.status = 'Clustering sentences'
        data, linkage_matrix, max_thresh, thresh = topex.assign_clusters(
            data,
            method=clustering_method,
            k=k,
            height=height,
            dist_metric=cluster_dist_metric)
        self.status = 'Visualizing sentences'
        viz_df = topex.visualize_clustering(data,
                                            method=visualization_method,
                                            dist_metric=viz_dist_metric,
                                            show_chart=False,
                                            return_data=True,
                                            umap_neighbors=umap_neighbors)
        viz_df['valid'] = True
        data['valid'] = True  # Show all points on the first run
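        # Extract representative topic words for each cluster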
        cluster_df = topex.get_cluster_topics(data, doc_df)

        res.viz_df = viz_df.to_json()
        # Only return the needed subset of data columns
        res.data = data[[
            'id', 'text', 'tokens', 'phrase', 'vec', 'cluster', 'valid'
        ]].to_json()
        res.linkage_matrix = ([list(row) for row in linkage_matrix]
                              if linkage_matrix is not None else [])
        res.main_cluster_topics = list(cluster_df.topics)
        res.count = len(data)
        res.max_thresh = max_thresh
        res.thresh = thresh
        self.result = dict(res)
        self.status = 'Complete'
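Both run() methods read self.params and self.files and report progress through
self.status and self.result, which suggests they live on a background task that
a web endpoint polls. The actual wrapper class is not part of these listings; a
hypothetical minimal version matching the attributes used above could be:

import threading

class Task(threading.Thread):
    # Hypothetical wrapper: params and files come from the HTTP request,
    # while status and result are polled by the client as run() progresses.
    def __init__(self, params, files=None):
        super().__init__(daemon=True)
        self.params = params
        self.files = files if files is not None else {}
        self.status = 'Pending'
        self.result = None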