def get_results(X, resultspath, dataset_prefix):
    """Collect held-out error statistics for every saved matrix completion.

    Scans ``resultspath/dataset_prefix`` for ``completed.*`` files, loads each
    completed matrix (``.npy`` or ``.csv``) together with its companion
    ``mask.*.npy`` file from the same folder, and scores the completion
    against ``X`` with ``calc_unobserved_rmse`` and ``calc_unobserved_r2``.
    Files with any other extension are ignored.

    Args:
        X: reference matrix the completions are scored against; only its
            ``shape`` is used directly here, the rest is up to the scorers.
        resultspath: directory that holds the per-dataset result folders.
        dataset_prefix: sub-folder of ``resultspath`` to scan.

    Returns:
        A ``pandas.DataFrame`` with two rows per completed file (statistic
        ``'r^2'`` first, then ``'rmse'``) and the columns ``value``,
        ``observed fraction``, ``statistic``, ``X_hat`` and ``mask``.

    Raises:
        ValueError: if the number following ``obs-`` in a file name cannot be
            parsed as the observed fraction.
    """
    df = {
        'value': [],
        'observed fraction': [],
        'statistic': [],
        'X_hat': [],
        'mask': []
    }
    pattern = os.path.join(resultspath, dataset_prefix, 'completed.*')
    # glob returns files in arbitrary order; sort so the rows are reproducible
    for fp in sorted(glob.glob(pattern)):
        folder, fname = os.path.split(fp)
        stem, ext = os.path.splitext(fname)
        if ext == '.npy':
            X_hat = np.load(fp)
        elif ext == '.csv':
            X_hat = pd.read_csv(fp, index_col=0).values
            # sometimes an extra multiindex to remove
            if X_hat.shape[1] == X.shape[1] + 1:
                X_hat = X_hat[:, 1:].astype('float')
        else:
            # 'completed.*' can match unrelated files; without this guard
            # X_hat would be unbound (or stale from the previous iteration)
            continue

        # Rewrite only the file name: a folder that happens to contain
        # 'completed' must not be renamed when locating the mask.
        mask_name = stem.replace('completed', 'mask', 1) + '.npy'
        mask = np.load(os.path.join(folder, mask_name))

        # The observed fraction follows 'obs-'; drop any trailing
        # dot-separated suffixes until a plain decimal number is left.
        p = stem.split('obs-')[-1]
        while p.count('.') > 1:
            p = p[:p.rfind('.')]
        try:
            p = float(p)
        except ValueError as err:
            raise ValueError(
                'cannot parse observed fraction from %r' % fp) from err

        rmse = calc_unobserved_rmse(X, X_hat, mask)
        r2 = calc_unobserved_r2(X, X_hat, mask)
        for statistic, value in (('r^2', r2), ('rmse', rmse)):
            df['observed fraction'].append(p)
            df['statistic'].append(statistic)
            df['value'].append(value)
            df['X_hat'].append(X_hat)
            df['mask'].append(mask)
    return pd.DataFrame(df)
def plot_scatter(X, X_hat, mask, filename, data_transform, value_name):
    """Save a regression scatter of true vs. predicted held-out values.

    Only entries where ``mask`` is zero and ``X`` is not NaN are plotted. The
    title reports the share of non-NaN entries, the mean of ``mask`` and the
    RMSE / r^2 returned by ``calc_unobserved_rmse`` / ``calc_unobserved_r2``.

    Args:
        X: DataFrame of true values (NaN marks entries that are not available).
        X_hat: DataFrame of predicted values; indexed positionally like ``X``.
        mask: array that is non-zero for observed entries.
        filename: path the figure is written to.
        data_transform: transform label used in the axis titles.
        value_name: value label used in the axis titles.
    """
    truth = X.values
    predicted = X_hat.values
    available = np.invert(np.isnan(truth))

    rmse = calc_unobserved_rmse(X, predicted, mask)
    r2 = calc_unobserved_r2(X, predicted, mask)

    # positions that were held out but do have a true value to compare with
    held_out = np.where((1 - mask) * available)
    sns.regplot(x=truth[held_out],
                y=predicted[held_out],
                x_jitter=.1,
                scatter_kws={'alpha': 0.25})

    axis_unit = (data_transform, value_name)
    plt.xlabel('True titer (%s %s)' % axis_unit)
    plt.ylabel('Predicted titer (%s %s)' % axis_unit)

    pct_available = 100 * np.average(available)
    pct_observed = 100 * np.average(mask)
    summary = (
        '%.1f%% of entries available; %.1f%% observed; RMSE=%.3f; r^2=%.1f%%' %
        (pct_available, pct_observed, rmse, 100 * r2))
    plt.title(summary, fontsize=8)

    plt.savefig(filename)
    plt.close()
def plot_heatmap(X, X_hat, mask, filename, data_transform, value_name):
    """Save a clustered heatmap of the original and the completed matrix.

    Rows and columns are ordered by average-linkage clustering computed from
    the row / column correlation matrices of ``X``. The reordered ``X`` is
    stacked on top of the reordered ``X_hat`` and drawn with ``mask`` passed
    as the heatmap mask, so cells where ``mask`` is true are hidden. The
    title reports availability, the mean of ``mask``, RMSE, r^2, the matrix
    size and the number of singular values of the mean-centred ``X_hat``
    needed to exceed 95% of the squared-singular-value total.

    Args:
        X: DataFrame of true values (NaN marks entries that are not available).
        X_hat: DataFrame of completed values with the same labels as ``X``.
        mask: array shaped like ``X``, in the original row/column order of
            ``X``; non-zero for observed entries.
        filename: path the figure is written to.
        data_transform: transform label used for the axis name.
        value_name: value label used for the axis name.
    """
    available = np.invert(np.isnan(X.values))
    rmse = calc_unobserved_rmse(X, X_hat.values, mask)
    r2 = calc_unobserved_r2(X, X_hat.values, mask)

    # Only the singular values are needed, so skip computing U and V^T.
    centred = X_hat.values - X_hat.values.mean()
    s = np.linalg.svd(centred, compute_uv=False)
    energy = s**2
    reached = np.flatnonzero(np.cumsum(energy) > energy.sum() * 0.95)
    # A constant matrix has zero energy and never crosses the threshold;
    # report rank 0 instead of failing with an IndexError.
    approx_rank = int(reached[0]) + 1 if reached.size else 0

    correlations = np.asarray(X.corr())
    correlations[np.isnan(correlations)] = 0
    col_linkage = linkage(distance.pdist(correlations), method='average')
    col_order = leaves_list(col_linkage)
    correlations = np.asarray(X.T.corr())
    correlations[np.isnan(correlations)] = 0
    row_linkage = linkage(distance.pdist(correlations), method='average')
    row_order = leaves_list(row_linkage)

    X_reorder = X.reindex(X.index[row_order])[X.columns[col_order]]
    Xhat_reorder = X_hat.reindex(X.index[row_order])[X.columns[col_order]]
    df = pd.concat([X_reorder, Xhat_reorder], keys=['original', 'inferred'])
    try:
        df = df.rename_axis(
            ['Unobserved',
             '%s %s' % (data_transform, value_name)])
    except Exception:
        # Axis names are cosmetic, so plotting continues without them.
        # NOTE(review): presumably guards against pandas versions that
        # reject a list of names here - confirm. Narrowed from a bare
        # except so KeyboardInterrupt / SystemExit still propagate.
        pass

    # The heatmap mask is applied by position, so it must follow the same
    # row/column permutation as the data before being stacked for both halves.
    mask_reorder = np.asarray(mask)[np.ix_(row_order, col_order)]
    df_mask = np.vstack([mask_reorder, mask_reorder])

    fig, ax = plt.subplots(figsize=(8, 12))
    _ = plt.title(
        '%.1f%% of entries available; %.1f%% observed; RMSE=%.3f; r^2=%.1f%%\nsize: %d x %d; approx. rank: %d'
        % (np.average(available) * 100, np.average(mask) * 100, rmse, r2 * 100,
           X.shape[0], X.shape[1], approx_rank),
        fontsize=10)
    ax = sns.heatmap(df, mask=df_mask, ax=ax, cmap=sns.cm.rocket_r)
    # separator between the original (top) and inferred (bottom) halves
    _ = ax.axhline(X.shape[0], color='blue')
    _ = plt.tight_layout()
    bottom, top = ax.get_ylim()
    ax.set_ylim(
        bottom + 0.5, top - 0.5
    )  # sorry, this may cut off the bottom row...some sort of matplotlib bug
    plt.savefig(filename)
    plt.close()
    elif args.data_transform == 'log10':

        def transform(x):
            return np.log10(x)

    X = transform(X)
    if args.specific_mask:
        mask, virus, table_number = get_specific_mask(X, dataset_index_ranges,
                                                      args.virus_table_path,
                                                      int(args.job_id) - 1)
        virus = virus.replace('/', '-')
        print('masking virus %s in table %s' % (virus, table_number))
    else:
        mask = get_mask(X, args.obs_frac)
    X_hat = complete_matrix(X, mask, offset=True)
    rmse = calc_unobserved_rmse(X, X_hat, mask)
    r2 = calc_unobserved_r2(X, X_hat, mask)
    print('RMSE: %.4f' % rmse)
    print('r^2: %.4f' % r2)
    if not os.path.exists(args.savepath):
        os.makedirs(args.savepath)
    if args.job_id is not None:
        if args.specific_mask:
            np.save(
                '%s/completed.%s.%s.npy' %
                (args.savepath, virus, table_number), X_hat)
            np.save('%s/mask.%s.%s.npy' % (args.savepath, virus, table_number),
                    mask)
        else:
            np.save(
                '%s/completed.job-%s.obs-%.4f.npy' %
Esempio n. 5
0
        def transform(x):
            return -np.log10(x)
    elif args.data_transform == 'log10':

        def transform(x):
            return np.log10(x)

    X1 = transform(X1)
    X2 = transform(X2)
    print(X1.shape)
    X1, X2, mask = get_common_subset(X1, X2)
    X_hat = pd.DataFrame(complete_matrix(X1, mask, offset=True),
                         index=X1.index,
                         columns=X1.columns)
    rmse = calc_unobserved_rmse(X2, X_hat.values, mask)
    r2 = calc_unobserved_r2(X2, X_hat.values, mask)
    rmse_obs = calc_observed_rmse(X2, X1.values, mask)
    r2_obs = calc_observed_r2(X2, X1.values, mask)
    rmse_obs_hat = calc_observed_rmse(X2, X_hat.values, mask)
    r2_obs_hat = calc_observed_r2(X2, X_hat.values, mask)
    print(X1.shape)
    print(rmse, r2)
    print(rmse_obs, r2_obs)
    print(rmse_obs_hat, r2_obs_hat)
    savepath_full = '%s/observation_end_%s.prediction_start_%s' % (
        args.savepath, args.max_year, args.min_year)
    if not os.path.exists(savepath_full):
        os.makedirs(savepath_full)
    if args.save_data:
        _ = X1.to_csv(path_or_buf='%s/data.observed.csv' % (savepath_full))