Example #1
    def plot_links_heatmap(self):
        '''
        Plot a heatmap of packet counts per link (link load).
        '''
        links = [[0 for i in gl_mote_range] for m in gl_mote_range]

        for pkt in self.packets:
            for idx, hop in enumerate(pkt.hop_info):
                src = hop['addr']
                if idx == (len(pkt.hop_info) - 1):
                    dst = 1
                else:
                    dst = pkt.hop_info[idx + 1]['addr']
                links[src - 1][dst - 1] += 1

        plt.figure()
        heatmap(data=links,
                xticklabels=[i for i in gl_mote_range],
                yticklabels=[i for i in gl_mote_range])

        if gl_save:
            plt.savefig(gl_image_path +
                        re.findall(r"(.+?)\.log",
                                   self.filename.split('/')[-1])[0] +
                        '_link_load.png',
                        format='png',
                        bbox='tight')
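For orientation, a minimal self-contained sketch of the same counting pattern; mote_range and routes below are made-up stand-ins for the gl_mote_range global and the pkt.hop_info records used above.

import matplotlib.pyplot as plt
import seaborn as sns

mote_range = range(1, 5)              # stand-in for gl_mote_range
routes = [[2, 3], [4, 3], [4, 2, 3]]  # per-packet hop addresses (stand-in data)

links = [[0 for _ in mote_range] for _ in mote_range]
for route in routes:
    # the final hop is counted toward the sink (addr 1), as in the method above
    for src, dst in zip(route, route[1:] + [1]):
        links[src - 1][dst - 1] += 1

sns.heatmap(links, xticklabels=list(mote_range), yticklabels=list(mote_range))
plt.show()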
Example #2
    def plot_sig_bins(self, binwise_gc, savename='sigbins.png', show=False):
        fig = plt.figure(figsize=(12, 4))
        gs = gridspec.GridSpec(10, 15)

        ax1 = plt.subplot(gs[:4, :-1])
        f = lambda x, y: x if (y in self.sigBinIndices[0]) else 0
        y = list(map(f, self.chip.binnedNormSignal[0], self.binSpace))
        ax1.plot(self.binSpace, y, lw=0.5, c='darkorange')
        ax1.yaxis.set_major_locator(MaxNLocator(5))
        ax1.xaxis.set_major_locator(MaxNLocator(15))

        ax2 = plt.subplot(gs[4:8, :-1])
        f = lambda x, y: x if (y in self.sigBinIndices[1]) else 0
        y = list(map(f, self.chip.binnedNormSignal[1], self.binSpace))
        ax2.yaxis.set_major_locator(MaxNLocator(5))
        ax2.plot(self.binSpace, y, lw=0.5, c='darkseagreen')  # 'darksage' was removed from matplotlib
        ax2.xaxis.set_major_locator(MaxNLocator(15))

        ax1.set_ylim((0, max(ax1.get_ylim()[1], ax2.get_ylim()[1])))
        ax2.set_ylim((0, ax1.get_ylim()[1]))
        ax1.set_ylabel("ChIP RPM\n(+) strand")
        ax2.set_ylabel("ChIP RPM\n(-) strand")

        ax3 = plt.subplot(gs[6:, -1:])
        ax4 = plt.subplot(gs[8:, :-1])
        sns.heatmap([binwise_gc, binwise_gc], cmap='seismic', ax=ax4, cbar_ax=ax3,
                    xticklabels=False, yticklabels=False)
        ax4.set_xlabel('GC percentage')
        plt.tight_layout()
        plt.savefig(savename, dpi=200)
        if show is True:
            plt.show()
        return True
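The GridSpec slicing used above, in isolation: one figure carved into two wide strips plus a narrow colorbar column. The panel proportions here are illustrative only.

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

fig = plt.figure(figsize=(6, 3))
gs = gridspec.GridSpec(10, 15)
top = plt.subplot(gs[:4, :-1])      # upper strip
middle = plt.subplot(gs[4:8, :-1])  # middle strip
cbar = plt.subplot(gs[6:, -1:])     # narrow right-hand column
plt.show()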
Example #3
def calculate_phi(list1, list2):
    delta_x = 0.1
    bins = np.arange(0, 1 + delta_x, delta_x)
    hist = np.histogram2d(list1, list2, bins=bins, density=True)  # the 'normed' flag is gone from recent NumPy

    hist_x, x = np.histogram(list1, bins=hist[1], density=True)
    hist_y, y = np.histogram(list2, bins=hist[2], density=True)
    reverse_h = []
    for i in range(len(hist[0])):
        new = [
            hist[0][i][j] / hist_x[j] / hist_y[i]
            for j in range(len(hist[0][i]))
        ]
        #new = hist[0][i]
        reverse_h = [new] + reverse_h
    reverse_h = np.array(reverse_h)

    fig = plt.figure(figsize=(6, 6))
    ax = fig.add_axes([0.3, 0.3, 0.5, 0.5])
    sns.heatmap(reverse_h,
                cmap="bwr",
                vmin=0.0,
                vmax=2.2,
                center=1.1,
                cbar=True,
                xticklabels=x[:-1],
                yticklabels=sorted(y[:-1], reverse=True))
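A hypothetical call, assuming numpy, matplotlib and seaborn are imported as np, plt and sns in the module: with independent uniform samples the joint-to-marginal ratio should hover near 1, the white midpoint of the 'bwr' scale.

import numpy as np
import matplotlib.pyplot as plt

calculate_phi(np.random.rand(1000), np.random.rand(1000))
plt.show()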
Example #4
import matplotlib.pyplot as plt
import seaborn as sns


def render(variable1, variable2, name):
    plt.subplot(1, 2, 1)
    plt.title(name + ' values')
    sns.heatmap(variable1)
    plt.subplot(1, 2, 2)
    plt.title(name + ' errors')
    sns.heatmap(variable2)
    plt.show()
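A quick usage sketch with synthetic data; the arrays and the 'temperature' label are invented for illustration.

import numpy as np

values = np.random.rand(10, 10)
errors = np.abs(np.random.randn(10, 10)) * 0.1
render(values, errors, 'temperature')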
Example #5
def graph_correl_features(dataset, df):
    """
    generates the graph of correlated features (heatmap matrix)

    :param dataset: dataset object
    :param df: data (as a dataframe)
    :return: None
    """
    try:
        # convert categorical to numerical
        for col in dataset.cat_cols:
            encoder = LabelEncoder()
            df[col] = encoder.fit_transform(df[col].map(str))

        # create correlation matrix with pandas
        corr = df.corr()

        # display heatmap
        for dark, theme in [(True, 'dark_background'),
                            (False, 'seaborn-whitegrid')]:
            with plt.style.context(theme, after_reset=True):
                if dataset.n_cols > 50:
                    plt.figure(figsize=(10, 10))
                elif dataset.n_cols > 20:
                    plt.figure(figsize=(8, 8))
                elif dataset.n_cols > 10:
                    plt.figure(figsize=(7, 7))
                else:
                    plt.figure(figsize=(6, 6))
                sns.heatmap(corr,
                            mask=np.zeros_like(corr, dtype=bool),  # np.bool was removed in NumPy 1.24
                            cmap=sns.diverging_palette(220, 10, as_cmap=True),
                            square=True)

                plt.title('correlation map of the features')
                plt.xticks(rotation=90)
                plt.yticks(rotation=0)
                plt.savefig(get_dataset_folder(dataset.dataset_id) +
                            '/graphs/_correl.png',
                            transparent=TRANSPARENT)
                __save_fig(dataset.dataset_id, '_correl', dark)
    except Exception:
        log.error('error in graph_correl_features with dataset_id %s' %
                  dataset.dataset_id)
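The core steps above (label-encode the categoricals, compute df.corr(), draw the heatmap) as a stand-alone sketch without the dataset/theme plumbing; the toy columns are made up.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

df = pd.DataFrame({'color': ['red', 'blue', 'red', 'green'],
                   'size': [1, 2, 2, 3],
                   'price': [10.0, 12.5, 11.0, 20.0]})
df['color'] = LabelEncoder().fit_transform(df['color'].map(str))

corr = df.corr()
sns.heatmap(corr, cmap=sns.diverging_palette(220, 10, as_cmap=True), square=True)
plt.title('correlation map of the features')
plt.show()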
Example #6
def plot_activity_matrix(df,
                         cmap,
                         normalized=False,
                         annotate=True,
                         out_path='',
                         title=''):
    """
    Plot activity matrix showing area of land transitioning between land-use types
    :param df:
    :param cmap:
    :param normalized:
    :param annotate:
    :param out_path:
    :param title:
    :return:
    """
    logger.info('Plot activity matrix')
    sns.set(font_scale=0.8)

    formatter = tkr.ScalarFormatter(useMathText=True)
    # normalized scale is from 0 - 100, does not need scientific scale
    if not normalized:
        formatter.set_scientific(True)
        formatter.set_powerlimits((-2, 2))

    df = df * 100.0 if normalized else df * 1.0
    vmin = math.ceil(np.nanmin(df))
    vmax = math.ceil(np.nanmax(df))  # maximum value on colorbar
    ax = sns.heatmap(df,
                     cbar_kws={'format': formatter},
                     cmap=cmap,
                     linewidths=.5,
                     linecolor='lightgray',
                     annot=annotate,
                     fmt='.2g',
                     annot_kws={'size': 6},
                     vmin=vmin,
                     vmax=vmax)
    # for annotation of heat map cells, use: annot=True, fmt='g', annot_kws={'size': 6}
    # ax.invert_yaxis()
    ax.set_ylabel('FROM')
    ax.set_xlabel('TO')

    ax.set_title(title)
    locs, labels = plt.xticks()
    plt.setp(labels, rotation=0)
    locs, labels = plt.yticks()
    plt.setp(labels, rotation=0)

    plt.savefig(out_path, dpi=constants.DPI)
    plt.close()

    # revert matplotlib params
    sns.reset_orig()
    set_matplotlib_params()
    get_colors(palette='tableau')
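The colorbar-formatter trick from the example above in isolation: cbar_kws={'format': formatter} forces scientific notation on the heatmap's colorbar. The data here is synthetic.

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as tkr

formatter = tkr.ScalarFormatter(useMathText=True)
formatter.set_scientific(True)
formatter.set_powerlimits((-2, 2))

sns.heatmap(np.random.rand(5, 5) * 1e6, cbar_kws={'format': formatter})
plt.show()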
Example #7
def graph_classification_categorical(dataset_id, df, col, target):
    """
    display a heatmap of col in x axis and target in y axis

    :param dataset_id: id of the dataset
    :param df: dataframe, with col and target values
    :param col: name of column
    :param target: name of target column
    :return:
    """
    try:
        for dark, theme in [(True, 'dark_background'),
                            (False, 'seaborn-whitegrid')]:
            with plt.style.context(theme, after_reset=True):
                df['count'] = 1
                plt.figure(figsize=(8, 7))
                # convert col and target in numerical
                encoder = LabelEncoder()
                x = encoder.fit_transform(df[col].values)
                x_labels = encoder.inverse_transform(list(range(max(x) + 1)))
                y = encoder.fit_transform(df[target].values)
                y_labels = encoder.inverse_transform(list(range(max(y) + 1)))
                data = pd.pivot_table(df[[col, target, 'count']],
                                      values='count',
                                      index=target,
                                      columns=col,
                                      aggfunc=np.sum)
                sns.heatmap(data=data,
                            cmap=sns.diverging_palette(220, 10, as_cmap=True),
                            square=True)
                plt.xticks([x + 0.5 for x in list(range(max(x) + 1))],
                           x_labels,
                           rotation=90)
                plt.yticks([x + 0.5 for x in list(range(max(y) + 1))],
                           y_labels,
                           rotation=0)
                __save_fig(dataset_id, '_col_' + col, dark)
    except Exception:
        log.error('error in classification_categorical with dataset_id %s' %
                  dataset_id)
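The pivot_table-to-heatmap core of the example above on toy data, leaving out the encoding and tick relabeling.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.DataFrame({'col': ['a', 'b', 'a', 'b', 'a'],
                   'target': ['x', 'x', 'y', 'y', 'x'],
                   'count': 1})
data = pd.pivot_table(df, values='count', index='target', columns='col',
                      aggfunc='sum')
sns.heatmap(data, square=True)
plt.show()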
Example #9
def heatmap_overlay(data,
                    overlay_image=None,
                    cmap='jet',
                    cbar=False,
                    show_axis=False,
                    alpha=0.5,
                    **kwargs):
    fig, ax = tfplot.subplots(figsize=(5, 4) if cbar else (4, 4))
    fig.subplots_adjust(0, 0, 1, 1)  # use tight layout (no margins)
    ax.axis('off')

    if overlay_image is None:
        alpha = 1.0
    sns.heatmap(data, ax=ax, alpha=alpha, cmap=cmap, cbar=cbar, **kwargs)

    if overlay_image is not None:
        h, w = data.shape
        ax.imshow(overlay_image, extent=[0, w, 0, h])  # extent order: left, right, bottom, top

    if show_axis:
        ax.axis('on')
        fig.subplots_adjust(left=0.1, bottom=0.1, right=0.95, top=0.95)
    return fig
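A hedged usage sketch; it assumes the tfplot (tensorflow-plot) package the function builds its figure with is installed, and uses random arrays in place of real data.

import numpy as np

fig = heatmap_overlay(np.random.rand(16, 16),
                      overlay_image=np.random.rand(16, 16, 3))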
Example #11
def plot_correlations(data, save=False, savepath="", **kwds):
    """Calculate pairwise correlation between features.

    Extra arguments are passed on to DataFrame.corr()
    """

    import seaborn as sns  # seaborn.apionly was removed in seaborn 0.9

    plt.clf()

    # simply call df.corr() to get a table of
    # correlation values if you do not need
    # the fancy plotting
    corrmat = data.corr()
    labels = list(corrmat.columns.values)

    opts = {'cmap': 'YlGnBu_r', 'vmin': -1, 'vmax': 1}

    ax1 = sns.heatmap(corrmat, linewidths=.5, **opts)
    ax1.set_title('Correlations')

    for ax in (ax1, ):
        ax.set_xticks(np.arange(len(labels)) + .5, minor=False)
        ax.set_yticks(np.arange(len(labels)) + .5, minor=False)
        ax.set_xticklabels(labels, minor=False, rotation=70, ha='right')
        ax.set_yticklabels(labels, minor=False, rotation='horizontal')

    plt.tight_layout()

    if save:
        filename = 'correlations-' + datetime.datetime.now().strftime(
            '%Y-%m-%d_%H-%M-%S') + '.pdf'
        fig_path = os.path.join(savepath, filename)
        figure_handler(save, fig_path)
    else:
        figure_handler(save)
Example #12
def corr_heatmap(x, mask_half=True, cmap='RdYlGn_r', vmin=-1, vmax=1,
                 linewidths=0.5, square=True, figsize=(10,10), **kwargs):
    """Wrapper around seaborn.heatmap for visualizing correlation matrix.

    Parameters
    ==========
    x : DataFrame
        Underlying data (not a correlation matrix)
    mask_half : bool, default True
        If True, mask (whiteout) the upper right triangle of the matrix
    All other parameters passed to seaborn.heatmap:
    https://seaborn.pydata.org/generated/seaborn.heatmap.html

    Example
    =======
    %matplotlib inline

    # Generate some correlated data
    k = 10
    size = 400
    mu = np.random.randint(0, 10, k).astype(float)
    r = np.random.ranf(k ** 2).reshape((k, k)) * 5
    df = pd.DataFrame(np.random.multivariate_normal(mu, r, size=size))

    corr_heatmap(df)
    """

    if mask_half:
        mask = np.zeros_like(x.corr().values)
        mask[np.triu_indices_from(mask)] = True
    else:
        mask = None

    with sns.axes_style('white'):
        return sns.heatmap(x.corr(), cmap=cmap, vmin=vmin, vmax=vmax,
                    linewidths=linewidths, square=square, mask=mask, **kwargs)
Example #13
            for l in range(1, a + 1):
                listaIndex.append(i + l)
        break
    else:
        continue
Means = Means.loc[listaIndex]  # .ix was removed from pandas; .loc is the label-based equivalent
Beh = Beh.loc[listaIndex]
Means.index = range(-a, a)
Means2 = Means2.loc[listaIndex]
Means2.index = range(-a, a)
'''
Heat map for selected neurons that change in a significant way
'''
fig, ax = plt.subplots(figsize=(10, 20))
ax.vlines([49 + 0.5], 0, 1, transform=ax.get_xaxis_transform(), colors='k')
sns.heatmap(Means.transpose(), annot=False, xticklabels=1)
'''
Correlation between mean1, mean32, mean13, mean4 and all other neurons.
Keep only neurons that have a high positive correlation.
Select and graph only neurons that appear more than once.
'''
listMean = ('Mean(13)', 'Mean(32)', 'Mean(4)')
listCorr = []
for el in listMean:
    for column in Means:
        if column != el:
            p = Means[el].corr(Means[column])
            tupla = (p.round(1), column)
            listCorr.append(tupla)
highpositivecorr = []
highpositiveMean = []
Example #14
            # training dataset
            X_train = X.loc[lambda df: df['age'] > train_cutoff]
            y_train = y.loc[X_train.index]
            # holdout dataset
            X_holdout = X.loc[lambda df: df['age'] <= holdout_cutoff]
            y_holdout = y.loc[X_holdout.index]

            # fit/predict
            clf = LogisticRegression().fit(X_train, y_train)
            y_pred = clf.predict_proba(X_holdout)[:, 1]
            auc = roc_auc_score(y_holdout, y_pred)
            # cache auc
            auc_df.at[train_cutoff, holdout_cutoff] = auc

        plt.figure(figsize=(6, 3))
        sns.heatmap(auc_df.fillna(0).loc[auc_df.index[::-1]], )
        plt.ylabel('Training set > age')
        plt.xlabel('Testing set < age')
        plt.title('AUCs of Models Trained on\nVarious dataset splits')
        plt.savefig(output_dir /
                    '2018-08-11__aucs-of-age-based-dataset-splits.png')

    # ### Generate AUC scores for vanilla vs. hardweighted models
    def cross_val_scores_weighted(model,
                                  X,
                                  y,
                                  weights,
                                  cv=5,
                                  metrics=[sklearn.metrics.roc_auc_score]):
        kf = KFold(n_splits=cv)
        kf.get_n_splits(X)
Example #15
                  linestyles="-.",
                  color=colors[1],
                  linewidths=2)
    _ = ax.set_title("Posterior predictive distribution of AUC measurement \n \
        Classifier: {0} ".format(name))
    _ = ax.legend(loc="upper left")

    # Create an empty dataframe
    ab_dist_df = pd.DataFrame(index=range(m1),
                              columns=range(m2),
                              dtype=float)  # np.float was removed in NumPy 1.24

    # TODO!!!!!!!!
    def prob_score_higher(s_a, s_b):
        return (s_a - s_b)

    # populate each cell in dataframe with persona_less_personb()
    for a, b in itertools.product(range(m1), range(m2)):
        ab_dist_df.iat[a, b] = prob_score_higher(pos_score[a], neg_score[b])  # .ix was removed from pandas

    ax = plt.subplot(l, 2, i)
    i += 1
    cmap = plt.get_cmap("Spectral")
    _ = sns.heatmap(ab_dist_df, square=True, cmap=cmap, linecolor='white')
    _ = ax.set_title("Positive samples score higher than negative samples \n \
        Classifier: {0} ".format(name))
    _ = ax.set_ylabel("positive samples")
    _ = ax.set_xlabel("negative samples")

figure.subplots_adjust(left=.04, right=.96, bottom=.05, top=.95)
plt.show()
Example #16
def test_edge_imputation():
    constraints = {'edge_count': (1000, 1100)}
    accuracy_at_k = [0] * 5

    confusion_matrix = [[0 for i in xrange(5)] for j in xrange(5)]
    samples = 100
    index = [
        'Watts Strogatz', 'Geometric', 'Erdos Renyi', 'Barabasi Albert',
        'Planted Partition Model'
    ]
    constraints_enforced = False
    rgs = [
        structural_identities.watts_strogatz_generator,
        structural_identities.geometric_generator,
        structural_identities.erdos_renyi_generator,
        structural_identities.barabasi_albert_generator,
        structural_identities.planted_partition_generator
    ]

    for uni, rg in enumerate(rgs):
        title = index[uni]
        actual = uni
        created_graphs = []
        for i in xrange(samples):
            G = structural_identities.constrained_generation(rg, constraints)

            degree_sequence = [1] * G.number_of_nodes()

            new_G = random_graphs.configuration_model(degree_sequence)
            new_G = impute_edge_algorithm(new_G, G)
            created_graphs.append(new_G)

            cluster, types = predict_structure(new_G, 2, constraints_enforced)

            predicted = cluster.index(min(cluster))
            print title, types[predicted]

            confusion_matrix[actual][predicted] += 1

            array = np.array(cluster)
            order = array.argsort()
            ranks = order.argsort().tolist()

            k = -1
            for i in xrange(len(cluster)):  # 5 types of rg
                if title == types[ranks.index(i)]:
                    k = i
                    break

            j = len(cluster) - 1
            while j >= k:
                accuracy_at_k[j] += 1
                j -= 1

        # HERE we plot distros
        observed_metrics, dic = structural_identities.analyze_structural_identity_graphs(
            created_graphs, uni)
        predict_metrics, dic = structural_identities.analyze_structural_identity(
            rg, samples, uni)  # constraints=None):
        structural_identities.graph_created_distributions(
            uni, observed_metrics, predict_metrics, dic)

    small_index = ['WS', 'Geo', 'ER', 'BA', 'PPM']

    plt.figure(10)

    for i in xrange(len(accuracy_at_k)):
        accuracy_at_k[i] /= (samples * 1.0 * len(rgs))

    if constraints_enforced:
        plt.plot([i for i in xrange(1, 6)],
                 accuracy_at_k,
                 marker='o',
                 color='red')
    else:
        plt.plot([i for i in xrange(1, 6)], accuracy_at_k, marker='o')

    plt.xlabel('k (top k labels)')
    plt.ylim((0, 1.1))
    plt.ylabel('Accuracy @ k')
    plt.title('Prediction Accuracy for Uniformly Sampled Random Graphs')

    plt.show()

    sns.set()
    ax = plt.axes()
    sns.heatmap(confusion_matrix,
                ax=ax,
                cmap="YlGnBu",
                yticklabels=index,
                xticklabels=small_index)
    ax.set_title('Confusion Matrix for Uniformly Sampled Random Graphs')
    plt.tight_layout()
    plt.show()
Example #17
def perform_edge_imputation():
    accuracy_at_removed = []
    remove_probability = [0.1 * i for i in xrange(10)]

    constraints = {'edge_count': (1000, 1100)}

    samples = 2
    index = [
        'Watts Strogatz', 'Geometric', 'Erdos Renyi', 'Barabasi Albert',
        'Planted Partition Model'
    ]
    constraints_enforced = False
    rgs = [
        structural_identities.watts_strogatz_generator,
        structural_identities.geometric_generator,
        structural_identities.erdos_renyi_generator,
        structural_identities.barabasi_albert_generator,
        structural_identities.planted_partition_generator
    ]

    for p in remove_probability:
        correct = 0.0
        accuracy_at_k = [0] * 5
        confusion_matrix = [[0 for i in xrange(5)] for j in xrange(5)]

        for uni, rg in enumerate(rgs):
            title = index[uni]
            actual = uni
            for i in xrange(samples):
                G = structural_identities.constrained_generation(
                    rg, constraints)

                new_G = deepcopy(G)
                new_G = remove_edges(new_G, p)
                new_G = impute_edge_algorithm(new_G, G)

                cluster, types = predict_structure(new_G, 1,
                                                   constraints_enforced)

                predicted = cluster.index(min(cluster))
                print title, types[predicted]
                if actual == predicted:
                    correct += 1

                confusion_matrix[actual][predicted] += 1

                array = np.array(cluster)
                order = array.argsort()
                ranks = order.argsort().tolist()

                k = -1
                for i in xrange(len(cluster)):  # 5 types of rg
                    if title == types[ranks.index(i)]:
                        k = i
                        break

                j = len(cluster) - 1
                while j >= k:
                    accuracy_at_k[j] += 1
                    j -= 1
        small_index = ['WS', 'Geo', 'ER', 'BA', 'PPM']

        plt.figure(10)

        sns.set()
        ax = plt.axes()
        sns.heatmap(confusion_matrix,
                    ax=ax,
                    cmap="YlGnBu",
                    yticklabels=index,
                    xticklabels=small_index)
        ax.set_title('Confusion Matrix for Edge Imputed Graphs (' +
                     str((p) * 100) + ' percent removed)')
        plt.tight_layout()
        plt.savefig(
            '/Users/Brennan/Desktop/Networks/networks-project/pictures/CM_' +
            str((p) * 100) + '_removed.png')
        plt.close()

        sns.reset_defaults()
        imp.reload(mpl)
        imp.reload(plt)
        imp.reload(sns)
        # import matplotlib as mpl
        # import matplotlib.pyplot as plt

        for i in xrange(len(accuracy_at_k)):
            accuracy_at_k[i] /= (samples * 1.0 * len(rgs))

        if constraints_enforced:
            plt.plot([i for i in xrange(1, 6)],
                     accuracy_at_k,
                     marker='o',
                     color='red')
        else:
            plt.plot([i for i in xrange(1, 6)], accuracy_at_k, marker='o')

        plt.xlabel('k (top k labels)')
        plt.ylim((0, 1.1))
        plt.ylabel('Accuracy @ k')
        plt.title('Prediction Accuracy for Edge Imputed Graphs (' +
                  str((p) * 100) + ' percent removed)')
        plt.savefig(
            '/Users/Brennan/Desktop/Networks/networks-project/pictures/PA_' +
            str((p) * 100) + '_removed.png')
        plt.tight_layout()
        plt.close()

        accuracy_at_removed.append(correct / (len(rgs) * samples))

    plt.plot(remove_probability, accuracy_at_removed, marker='o')
    plt.xlabel('Percent of Edges Removed')
    plt.ylim((0, 1.1))
    plt.ylabel('Accuracy @ 1')
    plt.title('Prediction Accuracy for Graph Recovery (Edge Imputation)')
    plt.savefig(
        '/Users/Brennan/Desktop/Networks/networks-project/pictures/graph_imputation_forall_p.png'
    )
    plt.clf()
Example #18
    plt.savefig(outfile)

    d = pd.DataFrame(df, columns=feature_list)
    # Compute the correlation matrix
    corr = d.corr()
    # Generate a mask for the upper triangle
    mask = np.zeros_like(corr, dtype=bool)  # np.bool was removed in NumPy 1.24
    mask[np.triu_indices_from(mask)] = True

    fig, ax = plt.subplots()
    sns.heatmap(corr,
                mask=mask,
                cmap='RdBu_r',
                center=0,
                square=True,
                xticklabels=feature_labels,
                yticklabels=feature_labels,
                linewidths=.5,
                cbar_kws={'label': 'Covariance'},
                annot=True,
                ax=ax)
    outfile = args.outdir + '/feature_covariance.png'
    plt.savefig(outfile)

    print('=' * 30)
    print(name)
    # test_predictions = pipeline.predict(X_test_std)
    # test_acc = accuracy_score(y_test, test_predictions)
    # print('Test accuracy: {:.4%}'.format(test_acc))
    # train_predictions = pipeline.predict(X_train_std)
    # train_acc = accuracy_score(y_train, train_predictions)
Example #19
             height=4,
             diag_kind='kde',
             plot_kws=dict(color=colors[8]),
             diag_kws=dict(shade=True, alpha=.7, color=colors[0]))
plt.show()
# -

# Visualization of the correlation matrix. At position $(i,j)$ is the (linear) correlation coefficient between features $i$ and $j$. Values lie in $[-1,1]$: $1$ is perfect correlation, $-1$ perfect inverse correlation, $0$ no correlation

cm = np.corrcoef(df[cols].values.T)
plt.figure(figsize=(12, 4))
hm = sns.heatmap(cm,
                 cbar=True,
                 annot=True,
                 square=True,
                 fmt='.2f',
                 annot_kws={'size': 10},
                 yticklabels=cols,
                 xticklabels=cols,
                 cmap=cmap)
plt.tight_layout()
plt.show()

# ### Regression of MEDV on a single feature

print("Feature utilizzabili: {0}".format(', '.join(map(str, df.columns[:-1]))))

mi = mutual_info_regression(df[df.columns[:-1]], df[df.columns[-1]])
dmi = pd.DataFrame(mi, index=df.columns[:-1],
                   columns=['mi']).sort_values(by='mi', ascending=False)
dmi.head(20)
Example #20
        serieB = serieB.append(serieMT.loc[i,
                                           -len(serieMT.transpose()) // 2:-1])
        serieA = serieA.append(serieMT.loc[i, 0:len(serieMT.transpose()) // 2])  # slice half the columns, not halve the values
    return (serieB, serieA)


meanTupla = DataFrameAandB(MeansS)
meanBefore = meanTupla[0]
meanAfter = meanTupla[1]
'''
Function for normalization
'''


def Normalization(dfA, dfB):
    for index, row in dfB.iterrows():
        m = dfB.loc[index, :].mean()
        dfA.loc[index, :] = dfA.loc[index, :] / m
        dfB.loc[index, :] = dfB.loc[index, :] / m
    return (dfA, dfB)


DataA, DataB = Normalization(meanAfter, meanBefore)
Data = DataB.join(DataA)
'''
 HM
'''
fig, ax = plt.subplots(figsize=(20, 5))

sns.heatmap(Data, annot=False, xticklabels=1, vmin=-15, vmax=15, cmap="BuPu")
Example #21
corr = df.corr()
print(corr)
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=bool)  # np.bool was removed in NumPy 1.24
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
fig, ax = plt.subplots()

# Draw the heatmap with the mask and correct aspect ratio
vmax = np.abs(corr.values[~mask]).max()
sns.heatmap(corr,
            mask=mask,
            cmap=plt.cm.PuOr,
            vmin=-vmax,
            vmax=vmax,
            square=True,
            linecolor="lightgray",
            linewidths=1,
            ax=ax)
for i in range(len(corr)):
    ax.text(i + 0.5,
            len(corr) - (i + 0.5),
            corr.columns[i],
            ha="center",
            va="center",
            rotation=45)
    for j in range(i + 1, len(corr)):
        s = "{:.3f}".format(corr.values[i, j])
        ax.text(j + 0.5, len(corr) - (i + 0.5), s, ha="center", va="center")
ax.axis("off")
Example #22
    for index, row in dfB.iterrows():
        m = dfB.loc[index, :].mean()
        dfA.loc[index, :] = dfA.loc[index, :] / m
        dfB.loc[index, :] = dfB.loc[index, :] / m
    return (dfA, dfB)


DataA, DataB = Normalization(meanAfter, meanBefore)
Data = DataB.join(DataA)
'''
Kmeans and HM
'''
km = KMeans(n_clusters=2, init='k-means++', n_init=20)
km.fit(Data)
x = km.fit_predict(Data)
Data['Cluster'] = x
Data = Data.sort_values(by=['Cluster'])
df2 = Data.drop('Cluster', axis=1)
fig, ax = plt.subplots(figsize=(20, 5))
#ax.vlines([20],0,1, transform=ax.get_xaxis_transform(), colors='k')
df2 = df2.drop(['Mean(34)', 'Mean(20)', 'Mean(11)', 'Mean(2)', 'Mean(10)',
                'Mean(29)', 'Mean(32)', 'Mean(36)', 'Mean(17)'], axis=0)
sns.heatmap(df2, annot=False, xticklabels=1, vmin=-10, vmax=10, cmap="BuPu")
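The KMeans-then-sort idea in isolation: cluster the rows, order them by cluster label, and heatmap the reordered matrix so cluster structure shows up as horizontal bands. The data here is synthetic.

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

data = pd.DataFrame(np.vstack([np.random.randn(10, 6) + 3,
                               np.random.randn(10, 6) - 3]))
data['Cluster'] = KMeans(n_clusters=2, n_init=10).fit_predict(data)
data = data.sort_values(by=['Cluster'])
sns.heatmap(data.drop('Cluster', axis=1), cmap='BuPu')
plt.show()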
Example #23
def run_predict_structure(generator=None, title=None):
    constraints = {'edge_count': (1000, 1100)}

    accuracy_at_k = [0] * 5

    if generator is not None and title is not None:
        samples = 100
        for sample in xrange(samples):
            G = structural_identities.constrained_generation(
                generator, constraints)
            cluster, types = predict_structure(G, trials=20)

            print sample, types[cluster.index(min(cluster))]

            array = np.array(cluster)
            order = array.argsort()
            ranks = order.argsort().tolist()

            k = -1
            for i in xrange(len(cluster)):  # 5 types of rg
                if title == types[ranks.index(i)]:
                    k = i
                    break

            j = len(cluster) - 1
            while j >= k:
                accuracy_at_k[j] += 1
                j -= 1

        plt.figure(1)

        for i in xrange(len(accuracy_at_k)):
            accuracy_at_k[i] /= (samples * 1.0)

        plt.plot([i for i in xrange(1, 6)], accuracy_at_k, marker='o')
        plt.xlabel('k (top k labels)')
        plt.ylim((0, 1.1))
        plt.ylabel('Accuracy @ k')
        plt.title('Prediction Accuracy for ' + title + ' Random Graphs')

        plt.show()

    # Uniformly sample across rg
    elif generator is None:
        confusion_matrix = [[0 for i in xrange(5)] for j in xrange(5)]
        samples = 100
        index = [
            'Watts Strogatz', 'Geometric', 'Erdos Renyi', 'Barabasi Albert',
            'Planted Partition Model'
        ]
        constraints_enforced = True
        rgs = [
            structural_identities.watts_strogatz_generator,
            structural_identities.geometric_generator,
            structural_identities.erdos_renyi_generator,
            structural_identities.barabasi_albert_generator,
            structural_identities.planted_partition_generator
        ]

        for j, rg in enumerate(rgs):
            title = index[j]
            actual = j
            for i in xrange(samples):
                G = structural_identities.constrained_generation(
                    rg, constraints)

                cluster, types = predict_structure(G, 5, constraints_enforced)

                predicted = cluster.index(min(cluster))
                print title, types[predicted]

                confusion_matrix[actual][predicted] += 1

                array = np.array(cluster)
                order = array.argsort()
                ranks = order.argsort().tolist()

                k = -1
                for i in xrange(len(cluster)):  # 5 types of rg
                    if title == types[ranks.index(i)]:
                        k = i
                        break

                j = len(cluster) - 1
                while j >= k:
                    accuracy_at_k[j] += 1
                    j -= 1

        small_index = ['WS', 'Geo', 'ER', 'BA', 'PPM']

        for i in xrange(len(accuracy_at_k)):
            accuracy_at_k[i] /= (samples * 1.0 * len(rgs))

        print accuracy_at_k

        if constraints_enforced:
            plt.plot([i for i in xrange(1, 6)],
                     accuracy_at_k,
                     marker='o',
                     color='red')
        else:
            plt.plot([i for i in xrange(1, 6)], accuracy_at_k, marker='o')
        plt.xlabel('k (top k labels)')
        plt.ylim((0, 1.1))
        plt.ylabel('Accuracy @ k')
        plt.title('Prediction Accuracy for Uniformly Sampled Random Graphs')

        plt.show()

        sns.set()
        ax = plt.axes()
        sns.heatmap(confusion_matrix,
                    ax=ax,
                    cmap="YlGnBu",
                    yticklabels=index,
                    xticklabels=small_index)
        ax.set_title('Confusion Matrix for Uniformly Sampled Random Graphs')
        plt.tight_layout()
        plt.show()
Example #24
Data = Data.drop(['Mean(8)', 'Mean(28)', 'Mean(2)'], axis=0)
'''
Kmeans and HM
'''
km = KMeans(n_clusters=3, init='k-means++', n_init=20)
km.fit(Data)
x = km.fit_predict(Data)
Data['Cluster'] = x
Data = Data.sort_values(by=['Cluster'])
df2 = Data.drop('Cluster', axis=1)
fig, ax = plt.subplots(figsize=(20, 10))
#ax.vlines([50],0,1, transform=ax.get_xaxis_transform(), colors='k')
ax.hlines([30], 0, 1, transform=ax.get_yaxis_transform(), colors='r')
#ax.hlines([34],0,1, transform=ax.get_yaxis_transform(), colors='k')

#df2 = df2.drop ('Mean(11)',0)
#df2 = df2.drop ('Mean(2)', 0)
#df2 = df2.drop ('Mean(10)',0)
#df2 = df2.drop ('Mean(29)',0)
#df2 = df2.drop ('Mean(32)',0)
#df2 = df2.drop ('Mean(36)',0)
#df2 = df2.drop ('Mean(17)',0)
sns.heatmap(
    df2,
    annot=False,
    xticklabels=1,
    yticklabels=1,
    cmap="BuPu",
)
Example #25
# normalize confusion matrices
normalizedAvgCM = np.zeros((numClasses, numClasses))
for i in range(len(confusionMatrices)):
    cm = confusionMatrices[i]
    # keepdims=True makes this a per-row normalization (each row sums to 1)
    normalizedAvgCM += cm / cm.astype(float).sum(axis=1, keepdims=True)

normalizedAvgCM = normalizedAvgCM / nfold
# plot one time prediction confusion matrix
df_cm = pd.DataFrame(normalizedAvgCM, index=classNames, columns=classNames)
plt.figure(figsize=(9.6, 4.1))  # 5.7
sns.set(font_scale=1.4)  # for label size
ax = sns.heatmap(
    df_cm,
    cbar_kws={'ticks': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]},
    vmin=0,
    vmax=1.0,
    annot=True,
    annot_kws={"size": 18},
    fmt='2.2f',
    cmap="Blues")  # font size
# work around the matplotlib 3.1.0 bug that crops the top and bottom heatmap rows
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
ax.set_yticklabels(classNames, rotation=0, fontsize="16", va="center")
ax.set_xticklabels(classNames, rotation=0, fontsize="16", ha="center")
plt.tight_layout()
plt.savefig('FIGURES/normCM_cnn_10fold_4_classes_old.pdf')
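The row-normalization step above in isolation: keepdims=True makes the division per-row, so each row of the normalized matrix sums to 1. The counts are made up.

import numpy as np

cm = np.array([[50, 2, 3],
               [4, 40, 6],
               [1, 2, 47]])
norm_cm = cm / cm.astype(float).sum(axis=1, keepdims=True)
print(norm_cm.sum(axis=1))  # -> [1. 1. 1.]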

with plt.style.context("default"):
    plt.figure(figsize=(10, 5))
    for iplt in range(2):
Example #26
d = d.drop(['Mean(34)', 'Mean(35)', 'Mean(23)', 'Mean(1)'], axis=0)
'''
Kmeans and HM
'''
km = KMeans(n_clusters=6, init='k-means++', n_init=100)
km.fit(d)
x = km.fit_predict(d)
d['Cluster'] = x
d = d.sort_values(by=['Cluster'])

t = d.drop('Cluster', axis=1)

fig, ax = plt.subplots(figsize=(25, 10))

ax.vlines([a + 0.5], 0, 1, transform=ax.get_xaxis_transform(), colors='k')
#ax.hlines([3],1,0, transform=ax.get_yaxis_transform(), colors='k')
#ax.hlines([10],1,0, transform=ax.get_yaxis_transform(), colors='k')
#ax.hlines([12],1,0, transform=ax.get_yaxis_transform(), colors='k')
#ax.hlines([21],1,0, transform=ax.get_yaxis_transform(), colors='k')
#ax.hlines([25],1,0, transform=ax.get_yaxis_transform(), colors='k')
##ax.hlines([22],1,0, transform=ax.get_yaxis_transform(), colors='k')
##ax.hlines([15],1,0, transform=ax.get_yaxis_transform(), colors='k')
##ax.hlines([20],1,0, transform=ax.get_yaxis_transform(), colors='k')
##ax.hlines([21],1,0, transform=ax.get_yaxis_transform(), colors='k')
##ax.hlines([24],1,0, transform=ax.get_yaxis_transform(), colors='k')

sns.heatmap(t, annot=False, xticklabels=1, yticklabels=1, cmap="YlGnBu")
Example #27
# fare and Survived
fig.add_subplot(3, 3, 9)
sns.violinplot(x="Survived", y="Fare", alpha=.7, data=df, saturation=.7)

plt.show()

# In[425]:

cm = df.drop(['PassengerId'], axis=1).corr()
mask = np.zeros_like(cm, dtype=bool)  # np.bool was removed in NumPy 1.24
mask[np.triu_indices_from(mask)] = True
plt.figure(figsize=(16, 8))
hm = sns.heatmap(cm,
                 mask=mask,
                 annot=True,
                 fmt='.2f',
                 cmap=sns.diverging_palette(220, 10, as_cmap=True),
                 cbar_kws={"shrink": .5})
plt.tight_layout()
plt.show()

# ### Let us evaluate some conditional probabilities of surviving

# In[353]:


def cond_prob(feature):
    r = df[df[feature].notnull()][feature].unique()
    p = []
    for val in r:
        joint = df[(df[feature] == val) & (df['Survived'] == 1)].shape[0]
Example #28
def occurrence_count(keywords):

    #Remove duplicates
    final_count_df = pd.read_csv('final_count.csv')
    final_count_df = final_count_df.drop_duplicates(subset=['tweetID'], keep='first')  # drop_duplicates returns a copy
    del final_count_df['i']
    ocurrences_df = final_count_df.copy()

    # final_count_df.to_csv('final_count2.csv', index=False)
    for index, row in ocurrences_df.iterrows():
        ocurrences_df.loc[index, "date"] = datetime.fromtimestamp(
            float(row['date']) / 1000.0)

    del ocurrences_df['tweetID']
    del ocurrences_df['lat']
    del ocurrences_df['lon']
    del ocurrences_df['userID']
    ocurrences_df.index = pd.DatetimeIndex(ocurrences_df['date'].values)
    dfrs = ocurrences_df.resample('30min').apply(sum)
    dfrs.fillna(0, inplace=True)
    print(dfrs.head())
    # fig = plt.figure()
    # ax = fig.add_subplot(111, projection='3d')
    # x = list(dfrs.index.values)
    # y = list(dfrs.columns.values)
    # z = dfrs.values
    weeks = [g for n, g in dfrs.groupby(pd.Grouper(freq='W'))]  # pd.TimeGrouper was removed in pandas 1.0
    # pd.DataFrame.index.
    # plt.figure()

    # print(len(weeks))
    w = 0
    for week_data in weeks:
        # print(week_data.std())
        high_var_df = week_data.loc[:, week_data.std() > .1]
        high_var_df['match'] = 0
        # print(high_var_df.head())
        if w == 0:
            # for index, row in high_var_df.iterrows():
            # row['match'] = False
            pass
        elif w == 1:
            high_var_df.loc[high_var_df.index[329:334], 'match'] = 1  # avoid chained assignment
            # for index, row in high_var_df.iterrows():
            #     row['match'] = 1 if row['madrid'] > 2 and row['barcelona'] > 2 else 0
            # row['match'] = 1 if row['barcelona'] > 5 else 0
            #331->335

            # high_var_df.loc[index] = row
        elif w == 2:
            high_var_df.loc[high_var_df.index[136:140], 'match'] = 1  # avoid chained assignment
            # for index, row in high_var_df.iterrows():
            #     row['match'] = 1 if row['barcelona'] > 2 and row['villarreal'] > 2 else 0
            #     # row['match'] = 1 if row['barcelona'] > 5 else 0
            #     #138->141
            #     high_var_df.loc[index] = row
        high_var_df.to_csv(str(high_var_df.index[0]) + '.csv')
        plt.figure()
        for name, series in high_var_df.items():  # iteritems was removed in pandas 2.0
            plt.plot(series, high_var_df.index)
        plt.figure()
        ax1 = sns.heatmap(high_var_df)
        plt.setp(ax1.xaxis.get_majorticklabels(), rotation=90)
        plt.tight_layout()
        plt.figure()
        ax2 = sns.heatmap(high_var_df.corr())
        plt.setp(ax2.xaxis.get_majorticklabels(), rotation=90)
        plt.tight_layout()
        # print(x, y, z)
        # input('ENTER')
        model_train(high_var_df, w, 'linear_regression')
        w += 1

    plt.pause(0.001)
    input('ENTER')
    plt.close()
    dfrs.to_csv('occurrences.csv', index=True)
plt.ylabel("frequency")


# In[15]:


data_mean.corr()
# The most correlated feature pairs are texture:radius and perimeter:fractal dimension. Correlated features are undesirable because they can be redundant and slow down the program; they can also increase bias. Based on this, several highly correlated features may be removed. For this project, I will keep only the mean values.


# In[50]:


# Create a heat map of the correlations between the mean values. Red means highly correlated and blue is uncorrelated.
plt.figure(figsize=(10, 10))
sns.heatmap(data_mean.corr(), annot=True, square=True, cmap='coolwarm')


# In[53]:


# This makes a new dataframe that removes the column indicating whether the tumor is malignant or benign.
y = data.type
data_p=data.drop(columns="type") #create a new data array
data_p.head()


# In[57]:


data_pair = data_mean.drop(columns=["ID"])