Ejemplo n.º 1
0
 def display_topics(model, feature_names, no_top_words):
     """Write every topic's highest-weighted feature names to the debug log.

     For each row of ``model.components_`` a ``Topic <index>:`` line is
     logged, followed by one line holding the ``no_top_words`` entries of
     ``feature_names`` with the largest values in that row, joined by single
     spaces in descending order of weight.
     """
     for topic_idx, weights in enumerate(model.components_):
         logger = du.getLogger()
         logger.debug("Topic %d:" % (topic_idx))
         # argsort() is ascending; the reversed tail slice walks the
         # largest ``no_top_words`` positions from biggest to smallest.
         top_positions = weights.argsort()[:-no_top_words - 1:-1]
         logger.debug(" ".join(feature_names[pos] for pos in top_positions))
Ejemplo n.º 2
0
def subsetByCategory(X, y, targetIn):
    """Return the rows of ``X`` whose label in ``y`` equals ``targetIn``.

    Args:
        X: Two-dimensional matrix that supports ``X[rows, :]`` indexing and
            exposes ``shape`` (presumably a scipy sparse matrix or numpy
            array -- confirm against callers).
        y: Iterable of labels, positionally aligned with the rows of ``X``.
        targetIn: Label value to select.

    Returns:
        The sub-matrix of ``X`` containing only the matching rows, in their
        original order. The result is also passed to ``dataloader.infoX``.
    """
    logger = du.getLogger()
    # Log the row count through a real placeholder. The previous call passed
    # the bound method ``X.__len__`` as a logging argument to a message that
    # had no format specifier, so the record could not be formatted.
    logger.debug("Original Set\n%s", X.shape[0])

    # dtype=int keeps the index array integral even when nothing matches;
    # an empty list would otherwise become a float array and be rejected as
    # an index.
    rows = np.asarray(
        [idx for idx, label in enumerate(y) if label == targetIn], dtype=int)
    out = X[rows, :]

    logger.debug("\n\nReturning SubSet\n%s", out.shape[0])
    dataloader.infoX(out)
    return out
Ejemplo n.º 3
0
def saveListAsExcel(list,
                    outputDirectory,
                    fileName,
                    outputColumnNames,
                    targeName=""):
    """Write rows to a time-stamped ``.xlsx`` workbook in ``outputDirectory``.

    The file is named ``<timestamp><targeName>_<fileName>.xlsx`` and the data
    is stored in a sheet called ``_Sheet1``.

    Args:
        list: Row data accepted by ``pd.DataFrame`` (the parameter name
            shadows the builtin but is kept for keyword callers).
        outputDirectory: Directory the workbook is written into.
        fileName: Base name used in the workbook file name.
        outputColumnNames: Column labels for the rows.
        targeName: Optional text inserted between the timestamp and
            ``fileName``.
    """
    outputDataframe = pd.DataFrame(list, columns=outputColumnNames)

    # Build the path exactly once: calling du.timeStamped() separately for
    # the writer and for the log message could yield two different names.
    outputPath = (outputDirectory + "/" + du.timeStamped() + targeName + "_" +
                  fileName + '.xlsx')
    du.getLogger().debug("Saving to " + outputPath)

    # The context manager saves and closes the workbook even if to_excel
    # raises; it replaces the explicit writer.save(), which newer pandas
    # releases no longer provide.
    with ExcelWriter(outputPath) as writer:
        outputDataframe.to_excel(writer, sheet_name='_Sheet1')
Ejemplo n.º 4
0
def infoX(X):
    """Log basic structural facts about matrix ``X`` at INFO level.

    Reads ``format``, ``dtype``, ``indices``, ``ndim`` and ``shape`` from
    ``X`` and slices its first column, so ``X`` must provide all of these
    (presumably a scipy CSR/CSC sparse matrix -- confirm against callers).
    """
    logger = datautils.getLogger()
    logger.info("X.format" + str(X.format))
    logger.info("X.dtype" + str(X.dtype))
    logger.info("len(X.indices)" + str(len(X.indices)))
    logger.info("X.ndim" + str(X.ndim))
    # Report the row count. The previous line logged str(X.__len__), which
    # is the repr of the bound method rather than any size.
    logger.info("X.shape[0]" + str(X.shape[0]))
    logger.info("X[:, 0].shape" + str(X[:, 0].shape))
def convertToGensimCorporaAndDictionary(X, columnMap):
    """Convert a document-term count matrix into a gensim corpus.

    Every row of ``X`` becomes one document. A stored entry ``(i, j)`` with
    value ``v`` contributes the term ``datautils.getTermByIdx(columnMap,
    j + 1)`` repeated ``int(v)`` times to document ``i``.

    Args:
        X: Matrix accepted by ``scipy.sparse.coo_matrix``; values are
            presumably term counts -- confirm against callers.
        columnMap: Lookup structure handed to ``datautils.getTermByIdx``,
            which is queried with the column index plus one.

    Returns:
        A ``(common_corpus, dct)`` tuple: the bag-of-words form of every
        document and the ``gensim.corpora.Dictionary`` built from them.
    """
    datautils.getLogger().info("\n convertToGensimCorporaAndDictionary \n\n")

    cx = scipy.sparse.coo_matrix(X)

    # One slot per matrix row, addressed by row index. This keeps the final
    # row (previously dropped because it was never appended), keeps rows
    # with no stored entries as empty documents so corpus positions match
    # row numbers, and does not depend on the entries being row-sorted.
    corpora = [[] for _ in range(cx.shape[0])]
    for i, j, v in zip(cx.row, cx.col, cx.data):
        count = int(v)
        if count <= 0:
            # Matches the original range(0, int(v)) loop, which added nothing.
            continue
        term = datautils.getTermByIdx(columnMap, j + 1)
        corpora[i].extend([term] * count)

    dct = gensim.corpora.Dictionary()
    dct.add_documents(corpora)
    common_corpus = [dct.doc2bow(text) for text in corpora]
    return (common_corpus, dct)
Ejemplo n.º 6
0
def kmeansBySubset(X,
                   y,
                   columnMap,
                   targetMap,
                   outputDirectory,
                   dataFile,
                   clusterCount=20,
                   maxIterations=300,
                   init="k-means++",
                   n_init="10",
                   precompute_distances='auto',
                   algorithm='auto',
                   verbose=1,
                   n_jobs=1,
                   thresholdForReporting=0.05):
    """Cluster each target's rows of ``X`` separately and report the results.

    For every distinct value in ``y`` the matching rows are extracted,
    described, plotted, clustered with ``doKmeans``, filtered through
    ``filterAndReportResults``, saved as an Excel workbook and finally
    plotted with their centroids.

    Args:
        X: Feature matrix, one row per sample.
        y: Target label per row of ``X``.
        columnMap: Column-index-to-term lookup passed to the report step.
        targetMap: Lookup used by ``du.getTargetByIdx`` to name a target.
        outputDirectory: Directory that receives the Excel reports.
        dataFile: Name used for the report file and passed to the report
            step.
        clusterCount, maxIterations, init, n_init, precompute_distances,
            algorithm, verbose, n_jobs: Forwarded positionally to
            ``doKmeans``.
        thresholdForReporting: Threshold forwarded to
            ``filterAndReportResults``.
    """
    # The default is the string "10". Purely numeric strings are converted
    # to int before forwarding; other strings (e.g. 'auto') pass through
    # unchanged.
    if isinstance(n_init, str) and n_init.isdigit():
        n_init = int(n_init)

    for target in np.unique(y):
        targetName = du.getTargetByIdx(targetMap, target)
        du.getLogger().debug("\n\nSubset for " + str(targetName))
        subset = dataprocessing.subsetByCategory(X, y, target)
        dataloader.infoX(subset)

        kmeansplots.sparsityPlot(subset, targetName)

        kmeansForSubset = doKmeans(subset,
                                   clusterCount,
                                   maxIterations,
                                   init,
                                   n_init,
                                   precompute_distances,
                                   algorithm,
                                   verbose,
                                   n_jobs)

        # Pass the value returned by doKmeans, not the raw subset: the
        # report step reads cluster_centers_, inertia_ and labels_ from its
        # first argument.
        # NOTE(review): the third parameter of filterAndReportResults is
        # named ``target`` and fills the 'Target' column; dataFile is kept
        # here as in the original -- confirm whether targetName was meant.
        reportTuple = filterAndReportResults(kmeansForSubset, columnMap,
                                             dataFile, thresholdForReporting)
        dataprocessing.saveListAsExcel(reportTuple[0], outputDirectory,
                                       dataFile, reportTuple[1])

        kmeansplots.plotClusterCentroids(subset, kmeansForSubset, targetName)
Ejemplo n.º 7
0
def filterAndReportResultsLDA(model, cmap, n_top_words=10):
    """Collect the top-weighted terms of every topic in ``model``.

    Args:
        model: Object exposing ``components_``, iterated as one weight
            vector per topic.
        cmap: Lookup structure handed to ``du.getTermByIdx``, which is
            queried with the component index plus one.
        n_top_words: Number of highest-weighted terms kept per topic.

    Returns:
        A two-element list ``[rows, columnNames]`` where each row is
        ``[topic, term, weight]`` and ``columnNames`` is
        ``["topic", "term", "lda_weight"]``.
    """
    # All logging goes through du.getLogger(), matching the rest of this
    # function; the summary loop previously used du.log() instead.
    logger = du.getLogger()

    listOfWordsByTopic = []

    for topic, comp in enumerate(model.components_):
        logger.debug("topic " + str(topic))
        logger.debug("comp " + str(comp))

        # Indices of the n_top_words largest weights, largest first.
        word_idx = np.argsort(comp)[::-1][:n_top_words]
        logger.debug(str(topic) + "word_idx" + str(word_idx))

        listOfWordsByTopic.extend(
            [topic, du.getTermByIdx(cmap, (i + 1)), comp[i]]
            for i in word_idx)

    for topic, term, value in listOfWordsByTopic:
        logger.debug("topic " + str(topic) + " term " + str(term) +
                     " value " + str(value))

    outputColumnNames = ["topic", "term", "lda_weight"]

    return ([listOfWordsByTopic, outputColumnNames])
Ejemplo n.º 8
0
def filterAndReportResults(kmeans, columnMap, target, thresholdForReporting=0.001):
    """List every cluster-centre weight above a threshold with its term.

    Args:
        kmeans: Fitted clustering model exposing ``cluster_centers_``,
            ``inertia_``, ``init``, ``labels_``, ``max_iter``, ``tol``,
            ``n_init`` and ``get_params()``.
        columnMap: Lookup structure handed to ``du.getTermByIdx``, which is
            queried with a 1-based column index.
        target: Value written to the 'Target' column of every row.
        thresholdForReporting: Only weights strictly greater than this value
            are reported.

    Returns:
        A two-element list ``[rows, columnNames]`` where each row is
        ``[target, clusterIdx, weight, term]`` (``clusterIdx`` is 1-based)
        and ``columnNames`` is ``['Target', 'Cluster', 'Weight', 'Term']``.
    """
    logger = du.getLogger()

    logger.debug(kmeans.cluster_centers_)
    outputColumnNames = ['Target', 'Cluster', 'Weight', 'Term']

    # Rows of the analysis result; the caller turns them into a spreadsheet.
    listOfRows = []

    logger.debug(" K Means Parameter Values:")
    logger.debug(" inertia: " + str(kmeans.inertia_))
    logger.debug(" init: " + str(kmeans.init))
    logger.debug(" labels_: " + str(kmeans.labels_))
    logger.debug(" max_iter: " + str(kmeans.max_iter))
    # Call get_params() so the parameter values are logged; the previous
    # code stringified the bound method itself.
    logger.debug(" params: " + str(kmeans.get_params()))
    logger.debug(" tol: " + str(kmeans.tol))
    logger.debug(" n_init: " + str(kmeans.n_init))

    # Cluster and column numbers are both reported 1-based.
    for clusterIdx, row in enumerate(kmeans.cluster_centers_, start=1):
        logger.debug("\n------\nCluster\n\n")
        for colIdx, weight in enumerate(row, start=1):
            if weight > thresholdForReporting:
                term = du.getTermByIdx(columnMap, colIdx)
                logger.debug(str(weight) + " col: " + str(term))
                listOfRows.append([target, clusterIdx, weight, term])

    return ([listOfRows, outputColumnNames])
Ejemplo n.º 9
0
def print_topics(model, feature_names, n_top_words=10):
    """Log each topic's strongest features together with their weights.

    For every row of ``model.components_`` a ``Topic <index>:`` line is
    logged, followed by a list of ``(feature_name, weight)`` tuples for the
    ``n_top_words`` largest weights in descending order, each weight rounded
    to two decimals.
    """
    for idx, weights in enumerate(model.components_):
        logger = du.getLogger()
        logger.debug("Topic %d:" % (idx))
        # Reversed tail slice of the ascending argsort: biggest weights first.
        ranked = weights.argsort()[:-n_top_words - 1:-1]
        pairs = []
        for position in ranked:
            pairs.append((feature_names[position],
                          round(weights[position], 2)))
        logger.debug(pairs)
Ejemplo n.º 10
0
def reportResults1(best_lda_model, X, y, columnMap, n_top_words=10):
    """Log a summary of a fitted LDA model applied to ``X``.

    Logs the topic and document names, the document-topic table with each
    document's dominant topic, the number of documents per dominant topic,
    the topic-keyword table, and the top terms of every topic both without
    and with weights.

    Args:
        best_lda_model: Fitted model exposing ``transform``,
            ``n_components`` and ``components_``.
        X: Matrix passed to ``best_lda_model.transform``.
        y: Sequence whose length sets the number of document labels
            (assumed equal to the number of rows of ``X`` -- confirm).
        columnMap: Mapping whose ``'Term'`` entry supplies one term per
            model feature.
        n_top_words: Number of terms per topic in the weighted listing.
    """
    import numpy as np
    import pandas as pd

    logger = du.getLogger()

    lda_output = best_lda_model.transform(X)
    topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
    docnames = ["Doc" + str(i) for i in range(len(y))]
    logger.debug(topicnames)
    logger.debug(docnames)

    df_document_topic = pd.DataFrame(np.round(lda_output, 2),
                                     columns=topicnames,
                                     index=docnames)
    df_document_topic['dominant_topic'] = np.argmax(df_document_topic.values,
                                                    axis=1)
    # The tables below used to be bare notebook-style expressions whose
    # values were thrown away; they are now written to the debug log.
    logger.debug("Document-topic table:\n%s", df_document_topic.head(15))

    df_topic_distribution = df_document_topic['dominant_topic'].value_counts(
    ).reset_index(name="Num Documents")
    df_topic_distribution.columns = ['Topic Num', 'Num Documents']
    logger.debug("Topic distribution:\n%s", df_topic_distribution)

    df_topic_keywords = pd.DataFrame(best_lda_model.components_)
    df_topic_keywords.columns = columnMap['Term']
    df_topic_keywords.index = topicnames
    logger.debug("Topic-keyword table:\n%s", df_topic_keywords.head())

    def display_topics(model, feature_names, no_top_words):
        """Log the top ``no_top_words`` terms of each topic, without weights."""
        for topic_idx, topic in enumerate(model.components_):
            logger.debug("Topic %d:" % (topic_idx))
            logger.debug(" ".join([
                feature_names[i]
                for i in topic.argsort()[:-no_top_words - 1:-1]
            ]))

    logger.debug("LDA Model:")
    display_topics(best_lda_model, columnMap['Term'], 20)
    logger.debug("=" * 40)

    # Topics with words and weights as tuples
    # https://nlpforhackers.io/topic-modeling/
    logger.debug("LDA Model:")
    print_topics(best_lda_model, columnMap['Term'], n_top_words)
    logger.debug("=" * 40)