def similarityWithUSE(messages, countries):
    """Embed ``messages`` and report the pairwise inner products.

    The messages are fed through the module-level ``embed`` callable inside
    a fresh TensorFlow session. The inner product of every pair of resulting
    embeddings is computed, one row of that matrix is printed next to each
    entry of ``countries``, and the matrix is passed to ``plots.heatmap``.

    Args:
        messages: Values fed to the string placeholder.
        countries: Labels printed alongside the matrix rows and passed to
            ``plots.heatmap``; presumably one label per message, in the
            same order -- TODO confirm against the caller.

    Returns:
        None. Results are printed and plotted only.
    """
    text_input = tf.placeholder(tf.string, shape=(None))
    encoded_text = embed(text_input)
    with tf.Session() as sess:
        # Both initializers are run before the embeddings are evaluated.
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        vectors = sess.run(encoded_text, feed_dict={text_input: messages})

        # Entry [i, j] is the inner product of embedding i and embedding j.
        similarity = np.inner(vectors, vectors)
        print("correlations")

        for position, country in enumerate(countries):
            print(country, similarity[position])
        plots.heatmap(countries, countries, similarity)
Ejemplo n.º 2
0
def plot():
    """Plot the top-left 100x100 corner of the stored similarity matrix.

    Loads ``similarity.npy`` from the current working directory, fills every
    zero cell from its transposed counterpart when that counterpart is
    non-zero, draws the result with ``plots.heatmap`` using a colormap built
    from ``COLOR_MAP``, saves the figure as ``test1.svg`` and shows it.

    Returns:
        None.
    """
    import matplotlib.pyplot as plt

    similarity_matrix = np.load("similarity.npy")

    # Copy a value across the diagonal wherever only one of the two mirrored
    # cells is filled in. A cell is overwritten only when it is zero and its
    # mirror is not, so the two cells of a pair never both change and one
    # vectorised assignment gives the same result as visiting cells one by
    # one. The right-hand side is materialised before the assignment, so
    # reading through the transposed view is safe.
    mirrored = similarity_matrix.T
    fill_from_mirror = (similarity_matrix == 0) & (mirrored != 0)
    similarity_matrix[fill_from_mirror] = mirrored[fill_from_mirror]

    # Only the first 100 rows and columns are plotted.
    similarity_matrix = similarity_matrix[:100, :100]
    row_labels = list(range(similarity_matrix.shape[0]))
    column_labels = list(range(similarity_matrix.shape[1]))
    cmap = ListedColormap(COLOR_MAP)
    plots.heatmap(similarity_matrix, row_labels, column_labels, plt, cmap=cmap)
    plt.gcf().set_size_inches(100, 100)
    # NOTE(review): no labelled artist is created in this function; unless
    # plots.heatmap adds one, this call only produces an empty legend.
    plt.legend()
    plt.savefig("test1.svg")
    plt.show()
Ejemplo n.º 3
0
def print_cluster_data(centers, labels, true_labels):
    """Show a heatmap of cluster membership counts per true class.

    Cell ``[i, j]`` of the plotted matrix is the number of sample positions
    whose entry in ``labels`` equals ``i`` and whose entry in ``true_labels``
    equals the ``j``-th unique true label.

    Args:
        centers: Cluster centers; only ``len(centers)`` is used, to decide
            how many cluster indices (``0 .. len(centers) - 1``) to count.
        labels: Cluster index assigned to each sample; compared
            element-wise with ``==``, so presumably a NumPy array --
            TODO confirm against the caller.
        true_labels: Reference class of each sample, same remark as for
            ``labels``. Values must be formattable with ``%i``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Computed once; the class order fixes the column order of the matrix.
    classes = np.unique(true_labels)
    n_centers = len(centers)

    center_labels = ['Center %i' % k for k in range(n_centers)]
    class_labels = ['Class %i' % cls for cls in classes]

    # Sample positions per cluster and per class, looked up once each
    # instead of once per matrix cell.
    members_by_center = [np.where(labels == k)[0] for k in range(n_centers)]
    members_by_class = [np.where(true_labels == cls)[0] for cls in classes]

    centermap = np.zeros((n_centers, len(classes)))
    for i, in_center in enumerate(members_by_center):
        for j, in_class in enumerate(members_by_class):
            centermap[i, j] = len(np.intersect1d(in_center, in_class))

    # Plot the heatmap
    fig, ax = plt.subplots()
    im = plots.heatmap(centermap, center_labels, class_labels, ax=ax)
    plots.annotate_heatmap(im, valfmt="{x: .0f}")

    fig.tight_layout()
    plt.show()
Ejemplo n.º 4
0
                        save=True,
                        savepath='.\\png\\plots\\histogram\\' + datasetname +
                        '.png',
                        close=True)

        plots.boxplot(df,
                      save=True,
                      savepath='.\\png\\plots\\boxplot\\' + datasetname +
                      '.png',
                      close=True)

        plots.scattermatrix(df,
                            save=True,
                            savepath='.\\png\\plots\\scattermatrix\\' +
                            datasetname + '.png',
                            close=True)

        plots.heatmap(df,
                      save=True,
                      savepath='.\\png\\plots\\heatmap\\' + datasetname +
                      '.png',
                      close=True)

        plots.probplot(df,
                       save=True,
                       savepath='.\\png\\plots\\probplot\\' + datasetname +
                       '.png',
                       close=True)

    plt.show()
Ejemplo n.º 5
0
def _print_banner(title):
    """Print ``title`` indented by four spaces between two dashed rules."""
    rule = (
        '----------------------------------------------------------------------'
    )
    print(rule)
    print('{0}{1}'.format('    ', title))
    print(rule)


def eda(filepath: str,
        features=None,
        targets=None,
        removeOutliers: bool = False,
        datasetname: str = ''):
    """Run a printed and plotted exploratory analysis of a pickled DataFrame.

    Steps, in order:

    1. Unpickle the DataFrame stored at ``filepath``.
    2. Print its shape, column names, first and last 10 rows and
       ``describe()`` summary.
    3. Standardize the columns returned by ``dfutl.numericColumns``
       (subtract the mean, divide by the standard deviation).
    4. Count rows having any numeric z-score outside ``[-3, 3]``; when
       ``removeOutliers`` is true, drop them and repeat until a pass finds
       none.
    5. Save box plot, histogram, scatter matrix and heatmap through the
       ``plots`` module.

    Args:
        filepath: Path of the pickle file to load.
        features: Feature column names. When ``None`` they default to every
            column not named in ``targets``. The value is not used further
            by this function.
        targets: Target column name(s): ``None``, a single string, or an
            iterable of names.
        removeOutliers: When true, outlier rows are removed iteratively;
            when false, a single detection pass is only reported.
        datasetname: When non-empty, plots are saved under
            ``.\\png\\<datasetname>\\eda\\`` (created if missing); otherwise
            under ``.\\png\\``.

    Returns:
        The DataFrame with its numeric columns standardized and, if
        ``removeOutliers`` is true, the outlier rows removed.
    """
    # load the data
    # NOTE(security): unpickling can execute arbitrary code; only call this
    # with files from a trusted source.
    with open(filepath, 'rb') as infile:
        df = pk.load(infile)

    # process inputs
    if features is None:
        # A missing target means nothing is excluded, and a lone string is
        # one column name rather than a sequence of characters.
        if targets is None:
            targetcols = set()
        elif isinstance(targets, str):
            targetcols = {targets}
        else:
            targetcols = set(targets)
        features = list(set(df.columns) - targetcols)

    # examine the data
    _print_banner('Shape of dataset:')
    print('{0}Number of Rows: {1}'.format('    ', df.shape[0]))
    print('{0}Number of Columns: {1}'.format('    ', df.shape[1]))
    print('', end='\n\n\n')

    _print_banner('Column names:')
    for col in df.columns:
        print('{0}{1}'.format('    ', col))
    print('', end='\n\n\n')

    _print_banner('First 10 rows:')
    print(df.head(10))
    print('', end='\n\n\n')

    _print_banner('Last 10 rows:')
    print(df.tail(10))
    print('', end='\n\n\n')

    _print_banner('Statistical Summary:')
    print(df.describe())
    print('', end='\n\n\n')

    # ----------------------------------------------------------------------
    # infer data types of the input DataFrame
    # ----------------------------------------------------------------------
    colNumeric = dfutl.numericColumns(df)

    # ----------------------------------------------------------------------
    # mean centering and scaling: standardize or normalize
    # ----------------------------------------------------------------------
    dfNumeric = df.loc[:, colNumeric]
    df.loc[:, colNumeric] = (dfNumeric - dfNumeric.mean()) / dfNumeric.std()
    dfNumeric = df.loc[:, colNumeric]

    # ----------------------------------------------------------------------
    # outlier detection
    # ----------------------------------------------------------------------
    # use z-score filtering
    # samples that are more than 3 standard deviations away from mean are to be discarded
    _print_banner('Outlier Detection:')
    numouttotal = 0
    passNum = 0
    while True:
        # determine the number of outliers using zscore
        zscores = stats.zscore(dfNumeric)
        # Written as "not (below or above)" so that NaN z-scores, for which
        # both comparisons are false, are not counted as outliers.
        inrange = np.logical_not(np.logical_or(zscores < -3, zscores > 3))
        # A row is kept only if every numeric column is in range.
        keeprows = np.all(inrange, axis=1)
        numout = len(keeprows) - len(keeprows[keeprows])

        print('{0}Pass {1} detected {2} outliers'.format(
            '    ', passNum, numout))

        # Stop after one reporting pass, or once a pass finds nothing.
        if not removeOutliers or numout == 0:
            break

        # remove outliers and continue; z-scores are recomputed on the
        # remaining rows in the next pass
        df = df.loc[keeprows, :]
        dfNumeric = df.loc[:, colNumeric]

        numouttotal = numouttotal + numout
        passNum = passNum + 1
    if removeOutliers:
        print('{0}Total number of outliers: {1}'.format('    ', numouttotal))
    print('', end='\n\n\n')

    # ----------------------------------------------------------------------
    # visualization
    # ----------------------------------------------------------------------
    plt.close('all')

    save = True
    if len(datasetname) > 0:
        savepath = '.\\png\\{0}\\eda\\'.format(datasetname)
        # exist_ok avoids the check-then-create race of isdir + makedirs.
        os.makedirs(savepath, exist_ok=True)
    else:
        savepath = '.\\png\\'

    plots.boxplot(dfNumeric, save=save, savepath=savepath)
    plots.histogram(df, tightLayout=True, save=save, savepath=savepath)
    plots.scattermatrix(dfNumeric, save=save, savepath=savepath)
    plots.heatmap(dfNumeric, correlation=0.5, save=save, savepath=savepath)

    plt.close('all')

    return df
Ejemplo n.º 6
0
# Pairplot of the stock returns. The second positional argument is presumably
# the name the figure is saved under -- TODO confirm against plots.pairplot.
plots.pairplot(stock_returns, 'stock_returns')


# Distplot of MS Return in 2015
plots.distplot(returns_2015['MS Return'], bins=100, name='return_2015')


# Distplot of C Return in 2008
# NOTE(review): this name carries a '.png' suffix while the call above passes
# a bare name -- confirm which form plots.distplot expects.
plots.distplot(returns_C_2008, bins=100, name='returns_C_2008.png')


# Lineplot for stock values, restricted to the given tickers
plots.multi_lineplot(stock_close_reset, tickers=tickers)


# Moving Averages for the BAC close prices
plots.moving_average(stock_close_bac, key='BAC')


# Heatmap and Clustermap with Close Price correlation
# (both calls receive the same correlation object)
plots.heatmap(close_price_corr)
plots.clustermap(close_price_corr)


# Create a candlestick plot for Bank of America
plots.candlestick(bac_2015, 'bac_2015')


# Create a candlestick plot with Bollinger Band
plots.candlestick_boll(bac_2015, 'bac_2015_boll')
Ejemplo n.º 7
0
    plots.stemleaf(df
        ,title = 'Stem and Leaf'
        ,save = True
        ,savepath = '.\\visual\\iris_stemleaf.txt')

    plots.histogram(df
        ,save = True
        ,savepath = '.\\visual\\iris_histogram.png'
        ,close = True)

    plots.boxplot(df
        ,save = True
        ,savepath = '.\\visual\\iris_boxplot.png'
        ,close = True)

    plots.scattermatrix(df
        ,save = True
        ,savepath = '.\\visual\\iris_scattermatrix.png'
        ,close = True)

    plots.heatmap(df
        ,save = True
        ,savepath = '.\\visual\\iris_heatmap.png'
        ,close = True)

    plots.probplot(df
        ,save = True
        ,savepath = '.\\visual\\iris_probplot.png'
        ,close = True)