Ejemplo n.º 1
0
def make_plots(pics_lt, dirpath, filename):
    """Save Q-Q plots and density histograms for a stored mixture model.

    The model is read from ``dirpath/filename`` (JSON). The file is expected
    to hold a list whose last element is the model name and whose preceding
    elements are parallel per-component lists (distribution names, params,
    fixed params, weights); each params entry must have a ``'scale'`` key,
    which is used to order the components.

    For each tail cutoff (5, 2, 1 and 0 percent trimmed from both ends of the
    sorted non-zero observations) two figures are written:
    ``out/{feature}/qq_{cutoff}_{name}.png`` and
    ``out/{feature}/hist_{cutoff}_{name}.png``.

    Args:
        pics_lt: Table supporting ``.loc`` and column lookup by feature name
            (presumably a pandas DataFrame).
        dirpath: Path made of exactly two '/'-separated parts; the second
            part is taken as the feature (column) name.
        filename: Name of the model JSON file inside ``dirpath``.

    Raises:
        ValueError: If ``dirpath`` does not split into exactly two parts.
    """
    # Load model, sorting components by scale and instantiating distributions from names
    with open(f'{dirpath}/{filename}') as file:
        model = json.load(file)
    model_params, name = model[:-1], model[-1]
    components = sorted(zip(*model_params), key=lambda comp: comp[1]['scale'])
    dist_names, params, params_fix, weights = [
        list(x) for x in zip(*components)
    ]
    dists = [dists_dict[dist_name] for dist_name in dist_names]
    mixmod = MixtureModel(dists, params, params_fix, weights, name)

    # Calculate quantiles
    _, feature = dirpath.split('/')
    q_obs = pics_lt.loc[pics_lt[feature] != 0, feature].sort_values()
    n_obs = len(q_obs)
    q_dist = [mixmod.ppf((i - 0.5) / n_obs) for i in range(1, n_obs + 1)]

    for cutoff in [5, 2, 1, 0]:
        if cutoff == 0:
            sliver = slice(None)  # Slices do not behave nicely with zeroes
        else:
            # Ceil ensures the cutoff fraction is removed
            idx = ceil(cutoff * n_obs / 100)
            sliver = slice(idx, -idx)

        # Positional slice of the sorted observations, shared by both plots
        obs_trim = q_obs.iloc[sliver]
        if obs_trim.empty:
            # Too few observations survive this trim; there is nothing to
            # plot and the histogram range would be undefined.
            continue

        # Plot quantiles
        fig, ax = plt.subplots()
        ax.scatter(q_dist[sliver], obs_trim, s=10, edgecolors='none')
        ax.plot(ax.get_xlim(), ax.get_xlim(), color='k',
                linewidth=1)  # Plot y=x line
        ax.set_xlabel('Theoretical Quantiles')
        ax.set_ylabel('Observed Quantiles')
        ax.set_title(f'{feature}: {name} with {cutoff}% Trimmed Tails')
        fig.savefig(f'out/{feature}/qq_{cutoff}_{name}.png')
        plt.close(fig)

        # Plot histograms
        fig, ax = plt.subplots()
        ax.hist(obs_trim,
                bins=50,
                density=True,
                color='white',
                linewidth=1,
                edgecolor='black')
        x = linspace(obs_trim.min(), obs_trim.max(), 1000)
        for i, dist in enumerate(mixmod.dists):
            ax.plot(x, mixmod.pdf_comp(x, comp=i), label=dist.name)
        ax.plot(x, mixmod.pdf(x), label='total')
        ax.set_xlabel(feature)
        ax.set_ylabel('Density')
        ax.set_title(f'{feature}: {name} with {cutoff}% Trimmed Tails')
        ax.legend()
        fig.savefig(f'out/{feature}/hist_{cutoff}_{name}.png')
        plt.close(fig)
Ejemplo n.º 2
0
def fit_model(rates, feature, model):
    """Fit a mixture model to a feature from random starts and save the best.

    Non-zero values of ``rates[feature]`` are fitted repeatedly with randomly
    drawn initial parameters until ``num_init`` fits with a finite
    log-likelihood have been collected. The fit with the largest
    log-likelihood is written to ``out/{feature}/model_{name}.json`` and the
    per-run metadata (iterations, log-likelihood, convergence flag) to
    ``out/{feature}/meta_{name}.tsv``.

    Args:
        rates: Table supporting ``.loc`` and column lookup by feature name
            (presumably a pandas DataFrame).
        feature: Column of ``rates`` to fit; also names the output folder.
        model: ``(name, dists)`` pair, where ``dists`` is the sequence of
            component distributions handed to ``MixtureModel``.
    """
    # Filter data and unpack variables
    data = rates.loc[rates[feature] != 0, feature].sort_values()
    name, dists = model

    # Make output directories for feature models
    cur_dir = f'out/{feature}/'
    os.makedirs(cur_dir, exist_ok=True)  # Recursive folder creation

    # Get maxes for generation of random initials
    rand_maxes = []
    for dist in dists:
        cfe = cfes[dist.name]  # Get closed-form estimator
        sample_size = max(1, int(len(data) * random()))
        sample_params = pd.DataFrame(
            [cfe(np.random.choice(data, sample_size)) for _ in range(500)])
        rand_max = (sample_params.mean() +
                    num_std * sample_params.std()).to_dict()
        rand_maxes.append(rand_max)

    results = []
    excepts = []  # For numerical exceptions (nan, inf)
    while len(results) < num_init:
        mixmod = MixtureModel(dists,
                              name=name,
                              params=get_rand_params(rand_maxes))
        try:
            n_iter, ll = mixmod.fit(data, max_iter=1000)
        except RuntimeError:
            # A failed fit yields no (n_iter, ll) pair. Discard this
            # initialization and draw a new one rather than recording
            # undefined values or the values of an earlier run.
            continue

        # Store output based on ll value
        if np.isfinite(ll):
            results.append((n_iter, ll, mixmod))
        else:
            excepts.append((n_iter, ll, mixmod))

    # Store model with largest log-likelihood
    _, _, mixmod_max = max(results, key=lambda run: run[1])
    best_model = ([dist.name for dist in mixmod_max.dists], mixmod_max.params,
                  mixmod_max.params_fix, mixmod_max.weights, name)
    with open(cur_dir + f'model_{name}.json', 'w') as file:
        json.dump(best_model, file)

    # Store EM metadata (finite-likelihood runs first, then numerical failures)
    runs = results + excepts
    ns, lls, cons = zip(*[(n_iter, ll, mixmod.converged)
                          for n_iter, ll, mixmod in runs])
    df = pd.DataFrame.from_dict({'n_iter': ns, 'll': lls, 'converged': cons})
    df.to_csv(cur_dir + f'meta_{name}.tsv', sep='\t', na_rep='nan')
Ejemplo n.º 3
0
# For every feature folder under out/, draw one heatmap row per stored model
# showing the values returned by MixtureModel.posterior on the sorted
# non-zero observations of that feature.
for feature in os.listdir('out'):
    model_paths = [
        x for x in os.listdir('out/' + feature) if x.endswith('.json')
    ]
    if not model_paths:
        # plt.subplots rejects zero rows; nothing to draw for this feature
        continue

    # squeeze=False always returns a 2-D array of axes; taking the single
    # column gives a 1-D array even when there is only one model.
    fig, axs = plt.subplots(len(model_paths),
                            1,
                            figsize=(6, 6),
                            squeeze=False)
    axs = axs[:, 0]
    fig.subplots_adjust(top=0.875, bottom=0.075)
    fig.suptitle(
        f'{feature}:\nPosterior Probabilities of Mixture Model Components',
        y=0.95,
        size=10)

    # The observations depend only on the feature, so select them once
    data = rates.loc[rates[feature] != 0, feature].sort_values()

    for i, model_path in enumerate(model_paths):
        # Load model
        with open('/'.join(['out', feature, model_path])) as file:
            dist_names, params, params_fix, weights, name = json.load(file)
        dists = [dists_dict[dist_name] for dist_name in dist_names]
        mixmod = MixtureModel(dists, params, params_fix, weights, name)

        # Create heatmap
        expts = mixmod.posterior(data)

        # Plot heatmap
        ax = axs[i]
        ax.imshow(expts,
                  vmin=0,
                  vmax=1,
                  aspect='auto',
                  extent=[0, len(data), 0, len(mixmod.dists)])
        ax.set_aspect(0.035 * len(data))
        ax.set_title(mixmod.name, size=8)
        ax.tick_params(labelsize=7.5)
Ejemplo n.º 4
0
# Make output directory (no error if it already exists)
os.makedirs('out/', exist_ok=True)

# Per-feature results, keyed by feature name:
#   rates  - mean of squared retained values within each block_id group
#   counts - number of observations removed as extremes
#   fracs  - removed count divided by the number of non-zero observations
rates = {}
counts = {}
fracs = {}
for feature in pics_lt:
    # Load model, sorting components by scale and instantiating distributions from names
    with open(f'../mixture_pic/out/{feature}/model_{model_paths[feature]}.json') as file:
        model = json.load(file)
    model_params, name = model[:-1], model[-1]
    components = sorted(zip(*model_params), key=lambda y: y[1]['scale'])
    dist_names, params, params_fix, weights = [list(x) for x in zip(*components)]
    dists = [dists_dict[dist_name] for dist_name in dist_names]
    mixmod = MixtureModel(dists, params, params_fix, weights, name)

    # Remove extremes: keep observations whose last posterior row is below
    # thresh (presumably that row belongs to the largest-scale component,
    # since components are sorted by scale - confirm against MixtureModel).
    raw = pics_lt[feature]
    idx = np.flatnonzero(mixmod.posterior(raw)[-1] < thresh)
    # idx holds positions, not index labels, so select positionally
    clean = raw.iloc[idx]
    rates[feature] = (clean ** 2).groupby('block_id').mean()

    # Count extremes
    counts[feature] = len(raw) - len(clean)
    fracs[feature] = counts[feature] / (len(raw) - (raw == 0).sum())  # Fraction of non-zero contrasts

    # Plot histograms of contrast counts in each block
    y = clean.groupby('block_id').count().value_counts()
    plt.bar(y.index, y)
    plt.title(f'{feature}: Contrast Counts in Blocks')