Example #1
0
 def __init__(self, opt):
     """Set up the 'Hamming-Sim' sampler from the options in ``opt``.

     Copies the sampling temperatures and substitution options, loads the
     word similarity matrix and checks that it is square with one row per
     vocabulary entry before storing it as ``self.words_distribs``.
     """
     self.seq_per_img = opt.seq_per_img
     self.vocab_size = opt.vocab_size
     # The sentence-level temperature and the name prefix depend on the
     # sampler in use: "q" by default, "r" when the reward is stratified.
     if not opt.stratify_reward:
         self.tau = opt.tau_sent_q
         self.prefix = 'qhamm_sim'
     else:
         self.tau = opt.tau_sent
         self.prefix = 'rhamm_sim'
     # Substitution options.
     self.limited = opt.limited_vocab_sub
     self.tau_word = opt.tau_word
     # Load the similarity matrix; when rarity promotion is on, the weighted
     # rarity matrix is subtracted from it in place.
     sim = pl(opt.similarity_matrix)
     if opt.promote_rarity:
         rarity = pl(opt.rarity_matrix)
         sim -= self.tau_word * opt.rare_tfidf * rarity
     sim = sim.astype(np.float32)
     rows, cols = sim.shape
     print('Sim matrix:', rows, 'x', cols, ' V=', opt.vocab_size)
     assert rows == cols and rows == opt.vocab_size, 'Similarity matrix has incompatible shape'
     self.words_distribs = sim
     self.version = 'Hamming-Sim (Vpool=%d, tau=%.2f)' % (self.limited,
                                                          self.tau)
Example #2
0
 def __init__(self, opt):
     """Configure the object from ``opt`` and load its similarity matrix.

     The matrix read from ``opt.similarity_matrix`` is optionally shifted
     by -1 (when co-occurrences are not used) and by a weighted rarity
     matrix, cast to float32, checked to be square with one row per
     vocabulary entry and moved to the GPU as ``self.Sim_Matrix``.

     :param opt: options namespace; must provide the logger and every
         option read below.
     :raises AssertionError: if the matrix is not
         ``vocab_size x vocab_size``.
     """
     super().__init__()
     self.logger = opt.logger
     self.seq_per_img = opt.seq_per_img
     self.margin_sim = opt.margin_sim
     self.normalize_batch = opt.normalize_batch
     self.use_cooc = opt.use_cooc
     self.penalize_confidence = opt.penalize_confidence  #FIXME
     if self.margin_sim:
         # Logger.warn is a deprecated alias; use warning() instead.
         self.logger.warning('Clipping similarities below %.2f' %
                             self.margin_sim)
     self.limited = opt.limited_vocab_sim
     self.alpha = opt.alpha_word
     self.tau_word = opt.tau_word
     # Load the similarity matrix:
     M = pl(opt.similarity_matrix)
     if not self.use_cooc:  # deprecated
         M = M - 1  # = -D_ij
     if opt.promote_rarity:
         # promote_rarity acts both as the switch and as the weight.
         IDF = pl(opt.rarity_matrix)
         M -= self.tau_word * opt.promote_rarity * IDF
     M = M.astype(np.float32)
     n, d = M.shape
     print('Sim matrix:', n, 'x', d, ' V=', opt.vocab_size)
     assert n == d and n == opt.vocab_size, 'Similarity matrix has incompatible shape'
     self.vocab_size = opt.vocab_size
     M = Variable(torch.from_numpy(M)).cuda()
     self.Sim_Matrix = M
Example #3
0
def validate_dirichlet_sample(Ks = [2,5,10,20,50,100],N=1000):
    """Plot sampled against predicted values of ``dirichlet_sample`` entropy.

    For every ``K`` in ``Ks`` the mean of ``h_np(dirichlet_sample(K, alpha))``
    over ``N`` draws is scattered against ``alpha``, and the curve
    ``expected_entropy(K, alpha=alpha)`` is drawn on the same log-scaled
    x axis.

    :param Ks: values of K to evaluate; the default list is only iterated,
        never modified.
    :param N: number of draws averaged per (K, alpha) pair.
    """
    # NOTE(review): assumes pl(f, xs) returns the x/y sequences that are
    # unpacked into the plot call -- confirm against its definition.
    alphas = [10**i for i in interpolate(-5,0,100)]
    for K in Ks:
        # print() with a single argument and range() behave the same on
        # Python 2 and 3; the print statement and xrange are Python 2 only.
        print(K)
        plt.scatter(*pl(lambda a:mean(h_np(dirichlet_sample(K,a)) for _ in range(N)),alphas))
        plt.plot(*pl(lambda alpha:expected_entropy(K,alpha=alpha),alphas),label="%s pred" % K)
    plt.xlabel("alpha")
    plt.ylabel("Entropy (bits)")
    plt.semilogx()
    plt.legend()
def mu_approx_fig(filename=None):
    """Plot ``mu_from`` ("Exact") and ``approx_mu`` ("Approx") against copy number.

    Both curves are evaluated for sigma = 1 and L = 10 on 100 copy numbers
    between 1 and 10**5 and drawn on a log-scaled x axis.

    :param filename: passed on to ``maybesave`` once the figure is drawn.
    """
    # NOTE(review): G is not defined in this function; presumably a
    # module-level constant -- confirm.
    sigma = 1
    L = 10
    copy_range = np.linspace(1,10**5,100)
    plt.plot(*pl(lambda copy_num:mu_from(G,sigma,L,copy_num=copy_num),copy_range),label="Exact")
    plt.plot(*pl(lambda copy_num:approx_mu(G,sigma,L,copy_num=copy_num),copy_range),label="Approx")
    plt.xlabel("Copy number")
    # Raw string: "\m" is an invalid escape sequence in a normal literal.
    plt.ylabel(r"$\mu$")
    plt.semilogx()
    # 'ul' is not a valid matplotlib location string; spell it out.
    plt.legend(loc='upper left')
    plt.title("Exact vs. Approximate Chemical Potential")
    maybesave(filename)
Example #5
0
def main():
    """Plot four statistics of a Gaussian sample as functions of mu and show them.

    Draws ``count`` values with ``gaussians`` and plots ``mean_occ``, a
    logistic prediction, ``sd_occ`` and ``entropy`` over a grid of mu
    values, plus two dashed reference lines.
    """
    count = 1000
    ep_mean = 0
    ep_sd = 1
    samples = gaussians(ep_mean, ep_sd, count)
    mu_grid = interpolate(-100, 10, 100)
    # Each entry is (function of mu, legend label); drawn in this order.
    curves = [
        (lambda mu: mean_occ(samples, mu), "Mean occ"),
        (lambda mu: count/(1+exp(-0.75*mu)), "predicted occ"),
        (lambda mu: sd_occ(samples, mu), "Sd occ"),
        (lambda mu: entropy(samples, mu), "Entropy (bits)"),
    ]
    for curve, name in curves:
        plt.plot(*pl(curve, mu_grid), label=name)
    # Vertical reference at mu = ep_mean, horizontal one at half the count.
    plt.plot([ep_mean, ep_mean], [0, count], linestyle='--')
    half = count/2
    plt.plot([mu_grid[0], mu_grid[-1]], [half, half], linestyle='--')
    plt.xlabel("mu")
    plt.legend()
    plt.show()
def mu_approx_fig(filename=None):
    """Plot ``mu_from`` ("Exact") and ``approx_mu`` ("Approx") against copy number.

    Both curves are evaluated for sigma = 1 and L = 10 on 100 copy numbers
    between 1 and 10**5 and drawn on a log-scaled x axis.

    :param filename: passed on to ``maybesave`` once the figure is drawn.
    """
    # NOTE(review): G is not defined in this function; presumably a
    # module-level constant -- confirm.
    sigma = 1
    L = 10
    copy_range = np.linspace(1, 10**5, 100)
    plt.plot(*pl(lambda copy_num: mu_from(G, sigma, L, copy_num=copy_num),
                 copy_range),
             label="Exact")
    plt.plot(*pl(lambda copy_num: approx_mu(G, sigma, L, copy_num=copy_num),
                 copy_range),
             label="Approx")
    plt.xlabel("Copy number")
    # Raw string: "\m" is an invalid escape sequence in a normal literal.
    plt.ylabel(r"$\mu$")
    plt.semilogx()
    # 'ul' is not a valid matplotlib location string; spell it out.
    plt.legend(loc='upper left')
    plt.title("Exact vs. Approximate Chemical Potential")
    maybesave(filename)
Example #7
0
def mu_summary_stat_experiment():
    """Can we correlate copy number with a summary statistic?

    Draws G Gaussian values, prints the inverse of their variance, and
    plots ``mean_occ`` over a grid of mu values together with ``G * fd``
    evaluated at the sample mean and a constant line at G / 2.
    """
    ep_mu = -2
    ep_sigma = 5
    G = 100
    eps = [random.gauss(ep_mu,ep_sigma) for _ in range(G)]
    mus = interpolate(-10,10,1000)
    eta = mean(eps)
    gamma = 1.0/variance(eps)
    # print() with one argument behaves the same on Python 2 and 3.
    print(gamma)
    plt.plot(*pl(lambda mu:mean_occ(eps,mu),mus))
    plt.plot(*pl(lambda mu:G*fd(eta,mu,beta=gamma),mus))
    plt.plot(*pl(lambda x:G/2.0,mus))
Example #8
0
def L_vs_sigma_plot(filename=None, with_bio=False):
    """Plot ``crit_L`` against sigma with a reference threshold line.

    Draws ``crit_L`` ("Binding Transition") for sigma between 0.1 and 10,
    a dashed horizontal line at ``log(G, 2) / 2`` ("Info Theory Threshold")
    and, when ``with_bio`` is set, a scatter of motif length against the
    mean ``sd`` of each motif's ``make_pssm`` output.

    :param filename: passed on to ``maybesave`` once the figure is drawn.
    :param with_bio: also scatter the motifs returned by
        ``extract_motif_object_from_tfdf``.
    """
    # NOTE(review): G is not defined in this function; presumably a
    # module-level constant -- confirm.
    if with_bio:
        tfdf = extract_motif_object_from_tfdf()
        motifs = [getattr(tfdf, tf) for tf in tfdf.tfs]
        Ls = [len(motif[0]) for motif in motifs]
        # A list is passed to mean() because a bare map() is a one-shot
        # iterator on Python 3. Only Ls and sigmas are plotted, so the other
        # per-motif statistics that used to be computed here were dropped.
        sigmas = [
            mean([sd(entry) for entry in make_pssm(motif)])
            for motif in motifs
        ]
    min_sigma = 0.1
    max_sigma = 10
    plt.xlim(0, max_sigma)
    plt.ylim(0, 60)
    plt.plot(*pl(crit_L, np.linspace(min_sigma, max_sigma, 1000)),
             label="Binding Transition")
    threshold = log(G, 2) / 2
    plt.plot([min_sigma, max_sigma],
             [threshold, threshold],
             linestyle='--',
             label="Info Theory Threshold")
    if with_bio:
        plt.scatter(sigmas, Ls, label="Biological Motifs")
    plt.xlabel("sigma")
    plt.ylabel("L")
    plt.legend()
    maybesave(filename)
Example #9
0
def plot_h_vs_ic(L,sigmas=interpolate(0.1,10,100),max_h=None,M=None,xfunc=lambda ps:2*L):
    """Scatter ``ic(ps, M)`` against ``2*L - h_np(ps)`` for sampled distributions.

    :param L: length; used for the sample size ``4**L``, the axis extent
        ``2*L`` and the figure title.
    :param sigmas: one sample is drawn per value when ``max_h`` is None.
        The default is evaluated once, when the function is defined.
    :param max_h: if given, samples come from ``sample(L)`` instead and
        are kept only while ``h(ps) < max_h``.
    :param M: matrix passed to ``ic``; built with
        ``marginalization_matrix(L)`` when omitted.
    :param xfunc: unused; kept for backward compatibility.
    """
    if max_h is None:
        print("generating samples")
        pss = [simplexify_sample(4**L,sigma=sigma)
               for sigma in tqdm(sigmas)]
    else:
        # NOTE(review): `trials` and `sample` are not defined in this
        # function; presumably module-level names -- confirm.
        pss = []
        while len(pss) < trials:
            ps = sample(L)
            if h(ps) < max_h:
                pss.append(ps)
                print(len(pss))
    print("computing M")
    if M is None:
        M = marginalization_matrix(L)
    # Lists rather than map(): on Python 3 map() returns an iterator,
    # which cannot be scattered.
    icq_s = [ic(ps,M) for ps in tqdm(pss)]
    print("computing entropy")
    icp_s = [2*L - h_np(ps) for ps in tqdm(pss)]
    plt.scatter(icp_s,icq_s)
    # Diagonal y = x over [0, 2L], then the line L*x + 2*(L - L**2).
    plt.plot([0,2*L],[0,2*L])
    plt.plot(*pl(lambda icp:L*icp+2*(L-L**2),[2*(L-1),2*L]),color='b')
    plt.xlabel("Distribution IC")
    plt.ylabel("PSFM IC")
    plt.title("Distribution vs. Columnwise IC, Length=%s" % L)
Example #10
0
def recover_infos(opt):
    """Resolve checkpoint paths on ``opt`` and reload the saved training infos.

    Two resume modes are supported:

    * restart: ``opt.restart`` is set and ``model.pth`` exists under
      ``opt.modelname``; paths point into that folder;
    * start from a folder: ``opt.start_from`` names a folder (its "-best"
      files when ``opt.start_from_best`` is set); the saved options must
      agree with ``opt`` on the fields listed in ``need_be_same``.

    When neither applies nothing is loaded and counters start at zero.

    :param opt: options namespace, updated in place with the resolved
        ``start_from``, ``infos_start_from`` and ``optimizer_start_from``.
    :return: ``(iteration, epoch, opt, infos, history)``.
    :raises AssertionError: if the saved and current options disagree on
        one of the checked fields.
    """
    infos = {}
    # Restart training (useful with oar idempotant)
    if opt.restart and osp.exists(osp.join(opt.modelname, 'model.pth')):
        opt.start_from_best = 0
        opt.logger.warning('Picking up where we left')
        opt.start_from = osp.join(opt.modelname, 'model.pth')
        opt.infos_start_from = osp.join(opt.modelname, 'infos.pkl')
        opt.optimizer_start_from = osp.join(opt.modelname, 'optimizer.pth')
        infos = pl(opt.infos_start_from)

    elif opt.start_from is not None:
        # open old infos and check if models are compatible
        # start_from of the config file is a folder name
        # warning() replaces the deprecated warn() alias, matching the
        # restart branch above.
        opt.logger.warning('Starting from %s' % opt.start_from)
        if opt.start_from_best:
            flag = '-best'
            opt.logger.warning('Starting from the best saved model')
        else:
            flag = ''
        opt.infos_start_from = osp.join(opt.start_from, 'infos%s.pkl' % flag)
        opt.optimizer_start_from = osp.join(opt.start_from,
                                            'optimizer%s.pth' % flag)
        # Rebound last: the folder name is still needed for the two
        # paths above.
        opt.start_from = osp.join(opt.start_from, 'model%s.pth' % flag)

        infos = pl(opt.infos_start_from)
        saved_model_opt = infos['opt']
        need_be_same = [
            "model", "rnn_size_src", "rnn_size_trg", "num_layers_src",
            "num_layers_trg"
        ]
        for checkme in need_be_same:
            assert vars(saved_model_opt)[checkme] == vars(opt)[checkme],\
                    "Command line argument and saved model disagree on '%s' " % checkme

    # Recover iteration index
    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    history = {
        'val_perf': infos.get('val_result_history', {}),
        'loss': infos.get('loss_history', {}),
        'lr': infos.get('lr_history', {}),
        'ss_prob': infos.get('ss_prob_history', {}),
        'scores_stats': infos.get('scores_stats', {}),
    }

    return iteration, epoch, opt, infos, history
Example #11
0
 def __init__(self, opt):
     """Configure the object from ``opt`` and load its similarity matrix.

     The matrix read from ``opt.similarity_matrix`` is handled in one of
     two ways depending on its type:

     * a ``numpy.ndarray`` (``self.dense`` is True) is optionally shifted
       by -1 and by the weighted rarity matrix, cast to float32 and moved
       to the GPU;
     * anything else is converted with ``sparse_torch`` and moved to the
       GPU; the rarity matrix, if requested, is kept separately as
       ``self.IDF``.

     :param opt: options namespace; must provide the logger and every
         option read below.
     :raises AssertionError: if the matrix is not
         ``vocab_size x vocab_size``.
     """
     super().__init__()
     self.logger = opt.logger
     self.seq_per_img = opt.seq_per_img
     self.margin_sim = opt.margin_sim
     self.normalize_batch = opt.normalize_batch
     self.use_cooc = opt.use_cooc
     self.penalize_confidence = opt.penalize_confidence  #FIXME
     if self.margin_sim:
         # Logger.warn is a deprecated alias; use warning() instead.
         self.logger.warning('Clipping similarities below %.2f' %
                             self.margin_sim)
     self.limited = opt.limited_vocab_sim
     self.alpha = opt.alpha_word
     self.tau_word = opt.tau_word
     # Load the similarity matrix:
     M = pl(opt.similarity_matrix)
     self.dense = isinstance(M, np.ndarray)
     self.rare = opt.promote_rarity
     if self.dense:
         if not self.use_cooc:
             M = M - 1  # = -D_ij
         if opt.promote_rarity:
             # promote_rarity acts both as the switch and as the weight.
             IDF = pl(opt.rarity_matrix)
             M -= self.tau_word * opt.promote_rarity * IDF
             del IDF
         M = M.astype(np.float32)
         self.Sim_Matrix = Variable(torch.from_numpy(M)).cuda()
     else:
         if opt.promote_rarity:
             IDF = pl(opt.rarity_matrix)
             self.IDF = sparse_torch(IDF).cuda()
             del IDF
         self.Sim_Matrix = sparse_torch(M).cuda()
     # Drop the host-side copy before going on.
     del M
     n, d = self.Sim_Matrix.size()
     self.logger.info('Sim matrix: (%dx%d) & Vocab:%d' %
                      (n, d, opt.vocab_size))
     assert n == d and n == opt.vocab_size, 'Similarity matrix has incompatible shape'
     self.vocab_size = opt.vocab_size
Example #12
0
def mean_squared_error(x, y, w):
    """
    :param x: input sequence Nx1
    :param y: output sequence Nx1
    :param w: model parameters (M+1)x1
    :return: mean squared error between the outputs y
    and the outputs obtained from the polynomial with parameters w for inputs x
    """
    # Difference between the observed outputs and the model outputs.
    residual = y - pl(x, w)
    # Squared 2-norm of the residual, averaged over the number of inputs.
    squared_norm = np.linalg.norm(residual, 2)**2
    return squared_norm / len(x)
def make_sigma_infty_asymptote_figure():
    """Plot the mean of ``occ2`` against length for sigma = 100 and save it.

    Adds a dashed vertical line at 11.12 ("Predicted Critical Length") and
    a dashed horizontal line at 0.5, then hands the file name to
    ``maybesave``.
    """
    sigma = 100
    lengths = range(1, 20)

    def mean_occupancy(L):
        # Average over 100 calls to occ2 at this length.
        return mean(occ2(sigma, L, G=5 * 10**6) for i in range(100))

    plt.plot(*pl(mean_occupancy, lengths), label='Occupancy')
    plt.ylabel("Occupancy")
    plt.xlabel("Length")
    critical_length = 11.12
    plt.plot([critical_length, critical_length], [0, 1],
             linestyle='--',
             label='Predicted Critical Length')
    half_line = [0.5] * len(lengths)
    plt.plot(lengths, half_line, linestyle='--', label="occ = 1/2")
    plt.legend(loc='upper left')
    plt.title("Mean Occupancy for sigma = 100")
    maybesave("sigma_infty_asymptote.png")
def length_vs_sigma(obj):
    """Scatter motif length against sigma for every entry of ``obj.tfs``.

    Sigma is the mean ``sd`` over the output of ``make_pssm(motif)``. The
    Pearson and Spearman results for (length, sigma) are printed, and the
    curve ``length_from_sigma`` is drawn over sigma in [0, 100].

    :param obj: object exposing ``tfs`` (attribute names) and one motif
        attribute per name.
    :return: ``(lens, sigmas)``, the two lists that were plotted.
    """
    def get_sigma(motif):
        # A list is passed to mean() because a bare map() is a one-shot
        # iterator on Python 3.
        return mean([sd(entry) for entry in make_pssm(motif)])

    lens = []
    sigmas = []
    for tf in obj.tfs:
        motif = getattr(obj,tf)
        lens.append(len(motif[0]))
        sigmas.append(get_sigma(motif))
    # print() with one argument behaves the same on Python 2 and 3.
    print(pearsonr(lens,sigmas))
    print(spearmanr(lens,sigmas))
    plt.scatter(sigmas,lens)
    plt.plot(*pl(length_from_sigma,np.linspace(0,100,1000)))
    plt.xlabel("Sigma")
    plt.ylabel("Length")
    return lens,sigmas
Example #15
0
 def __init__(self, opt):
     """Set up the 'Hamming-Unigram' sampler from the options in ``opt``.

     Copies the sampling temperatures and substitution options and loads
     the unigram distribution from a fixed file under ``data/coco``.
     """
     self.seq_per_img = opt.seq_per_img
     self.vocab_size = opt.vocab_size
     # The sentence-level temperature and the name prefix depend on the
     # sampler in use: "q" by default, "r" when the reward is stratified.
     if not opt.stratify_reward:
         self.tau = opt.tau_sent_q
         self.prefix = 'qhamm_sim'
     else:
         self.tau = opt.tau_sent
         self.prefix = 'rhamm_sim'
     # Substitution options.
     self.limited = opt.limited_vocab_sub
     self.tau_word = opt.tau_word
     # Only the first element of the loaded object is kept.
     # NOTE(review): the attribute name is misspelled ("disrtib") but is
     # part of this object's interface, so it is left unchanged here.
     unigram_table = pl('data/coco/unigram_coco.distrib')
     self.unigram_disrtib = unigram_table[0]  # FIXME save as 1D
     self.version = 'Hamming-Unigram (Vpool=%d, tau=%.2f)' % (self.limited,
                                                              self.tau)
def entropy_drift_analysis(sigma=2, color='b', color_p='g'):
    """why is convergence so difficult to obtain for, say, sigma = 2?  Explore selection/mutation balance.

    Builds 2560 mutants of a "ringer" motif (10 for each of 0..255
    applications of ``mutate_motif``), records each mutant's Hamming
    distance to the ringer and its ``log_fitness``, and estimates its
    expected log fitness after one accept/reject mutation step over 100
    trials. Three stacked panels are drawn:

    1. fitness and expected next fitness against distance;
    2. the relative change ``(f - fp) / f`` against distance;
    3. the change ``fp - f`` against fitness with a linear fit.

    :param sigma: passed to ``sample_matrix``.
    :param color: colour of the current-fitness points.
    :param color_p: colour of the expected-next-fitness points.
    """
    # NOTE(review): G is not defined in this function; presumably a
    # module-level constant -- confirm.
    n = 16
    L = 16
    matrix = sample_matrix(L, sigma)
    ringer = ringer_motif(matrix, n)
    mutants = [
        iterate(mutate_motif, ringer, i) for i in trange(256)
        for j in range(10)
    ]
    dists = [
        motif_hamming_distance(ringer, mutant) for mutant in tqdm(mutants)
    ]
    fs = [log_fitness(matrix, mutant, G) for mutant in tqdm(mutants)]
    fps = []
    trials = 100
    for mutant in tqdm(mutants):
        nexts = []
        f = log_fitness(matrix, mutant, G)
        for i in range(trials):
            mutant_p = mutate_motif(mutant)
            fp = log_fitness(matrix, mutant_p, G)
            # Accept the mutant when log(u) < fp - f for a uniform u,
            # otherwise the fitness stays at f.
            if log(random.random()) < fp - f:
                nexts.append(fp)
            else:
                nexts.append(f)
        fps.append(mean(nexts))
    plt.subplot(3, 1, 1)
    plt.scatter(dists, fs, color=color, marker='.')
    plt.scatter(dists, fps, color=color_p, marker='.')
    plt.subplot(3, 1, 2)
    plt.scatter(dists, [(f - fp) / f for (f, fp) in zip(fs, fps)],
                color=color,
                marker='.')
    plt.plot([0, len(fs)], [0, 0], linestyle='--', color='black')
    plt.subplot(3, 1, 3)
    diffs = [fp - f for f, fp in zip(fs, fps)]
    plt.scatter(fs, diffs, marker='.', color=color)
    interpolant = poly1d(polyfit(fs, diffs, 1))
    plt.plot(*pl(interpolant, [min(fs), max(fs)]))
    plt.plot([min(fs), max(fs)], [0, 0], linestyle='--', color='black')
    # Range over both series; the lower bound previously used fs twice
    # and so ignored fps.
    minx, maxx = min(fs + fps), max(fs + fps)
Example #17
0
def recover_ens_infos(opt):
    """Resolve the checkpoint paths of an ensemble and reload its infos.

    * restart: when ``opt.restart`` is set and ``model_0.pth`` exists
      under ``opt.ensemblename``, the member files are collected from that
      folder and ``infos.pkl`` is loaded;
    * otherwise (``cnn_start_from`` not yet set on ``opt``) the paths are
      built under ``save/<name>/`` for every entry of ``opt.model`` and
      each member's infos file is copied into the ensemble folder.

    :param opt: options namespace, updated in place with ``start_from``,
        ``infos_start_from`` and ``cnn_start_from`` (lists, one entry per
        member) and, on restart, ``optimizer_start_from``.
    :return: ``(iteration, epoch, opt, infos, history)``.
    """
    infos = {}
    # Restart training (useful with oar idempotant)
    if opt.restart and osp.exists(osp.join(
            opt.ensemblename, 'model_0.pth')):  # Fix the saving names
        opt.logger.warning('Picking up where we left')
        # glob returns matches in arbitrary order. Sorting the three lists
        # keeps them deterministic and index-aligned with each other, the
        # same way the lists built in the branch below are aligned.
        opt.start_from = sorted(glob.glob(opt.ensemblename + '/model_*.pth'))
        opt.logger.debug('Loading saved models: %s' % str(opt.start_from))
        opt.optimizer_start_from = opt.ensemblename + '/optimizer.pth'
        opt.cnn_start_from = sorted(
            glob.glob(opt.ensemblename + '/model-cnn_*.pth'))
        opt.infos_start_from = sorted(
            glob.glob(opt.ensemblename + '/infos_*.pkl'))
        infos = pl(osp.join(opt.ensemblename, 'infos.pkl'))
    if 'cnn_start_from' not in vars(opt):
        opt.start_from = []
        opt.infos_start_from = []
        opt.cnn_start_from = []
        # Start from the top:
        if opt.start_from_best:
            # add best flag:
            flag = '-best'
        else:
            flag = ''
        opt.logger.debug('Starting from %s' % str(opt.model))
        for e, m in enumerate(opt.model):
            # Only the first element of each entry is used as the name.
            m = m[0]
            opt.start_from.append('save/%s/model%s.pth' % (m, flag))
            opt.infos_start_from.append("save/%s/infos%s.pkl" % (m, flag))
            opt.cnn_start_from.append('save/%s/model-cnn%s.pth' % (m, flag))
            copy2(opt.infos_start_from[-1],
                  osp.join(opt.ensemblename, 'infos_%d.pkl' % e))

    # Recover iteration index
    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    history = {
        'val_perf': infos.get('val_result_history', {}),
        'loss': infos.get('loss_history', {}),
        'lr': infos.get('lr_history', {}),
        'ss_prob': infos.get('ss_prob_history', {}),
    }
    return iteration, epoch, opt, infos, history
def make_sigma_0_figure(sigma=0.1, fname="sigma_0.png"):
    """Plot the mean of ``occ2`` against length for a small sigma and save it.

    The predicted critical length ``log(G) / (sigma * (1 - sigma / 2))``
    is printed and marked with a dashed vertical line; lengths run from 1
    up to twice that value.

    :param sigma: value passed to ``occ2`` and to the critical-length
        formula.
    :param fname: file name handed to ``maybesave``.
    """
    G = 5 * 10**6

    def critical_L(sigma):
        return log(G) / (sigma * (1 - sigma / 2.0))

    Lstar = critical_L(sigma)
    # A single formatted string prints identically on Python 2 and 3.
    print("Lstar: %s" % Lstar)
    Ls = range(1, int(2 * Lstar))
    # Reuse G so the simulation and the prediction cannot drift apart.
    plt.plot(*pl(
        lambda L: mean(occ2(sigma, L, G=G) for i in range(100)), Ls),
             label='Occupancy')
    plt.ylabel("Occupancy")
    plt.xlabel("Length")
    plt.plot([Lstar, Lstar], [0, 1],
             linestyle='--',
             label='Predicted Critical Length')
    plt.plot(Ls, [0.5] * len(Ls), linestyle='--', label="occ = 1/2")
    plt.legend(loc='upper left')
    plt.title("Mean Occupancy for sigma = %s" % sigma)
    maybesave(fname)
Example #19
0
                        help='path to dump the _rarity_ matrix')
    parser.add_argument('--create_rare_matrix',
                        action='store_true',
                        help='create the rarity matrix for WORSxIDF')

    args = parser.parse_args()
    # define additional params:
    args.save_embed_matrix = "data/%s/%s.embed" % (args.data, args.embedding)
    args.save_embed_dict = "data/%s/%s.dict" % (args.data, args.embedding)
    args.data_info = 'data/%s/%s_trg.infos' % (args.data, args.trg_lang)
    args.data_stats = 'data/%s/vocab.%s.stats' % (args.data, args.trg_lang)
    args.save_sim = 'data/%s/%s.sim' % (args.data, args.embedding)
    args.save_rarity = 'data/%s/promote_rare.matrix' % (args.data)

    if len(args.embed_dict):
        E = pl(args.embed_dict)
    else:
        E = build_embed_dict(args.embed_txt)
        if len(args.save_embed_dict):
            # save for any eventual ulterior usage
            pd(E, args.save_embed_dict)

    ixtow = pl(args.data_info)['itow']
    print("Preparing Glove embeddings matrix")
    embeddings = prepare_embeddings_dict(ixtow,
                                         E,
                                         output=args.save_embed_matrix)
    print("Preparing similarities matrix")
    sim = get_pairwise_distances(embeddings)
    print('Saiving the similarity matrix into ', args.save_sim)
    pd(sim.astype(np.float32), args.save_sim)
def make_ecoli_sigma_L_plot():
    """Compare observed motif lengths with ``critical_lamb`` predictions.

    For every entry of ``Escherichia_coli.tfs`` the site length, the
    length plus ``log2`` of the number of sites, and sigma (mean ``sd``
    over the ``make_pssm`` output) are collected. Six panels are drawn in
    one row:

    * 1-2: length (raw, adjusted) against sigma, annotated per entry,
      with the ``critical_lamb`` and ``critical_lamb_actual`` curves;
    * 3-6: observed length (raw, adjusted) against the length predicted
      by ``critical_lamb`` and by ``critical_lamb_actual``, with the
      diagonal and the Pearson result printed for each.

    :return: ``(Ls, sigmas)``, the raw lengths and the sigmas.
    """
    Ls = []
    Ls_adj = []
    sigmas = []
    labels = []
    for tf in Escherichia_coli.tfs:
        sites = getattr(Escherichia_coli, tf)
        L = len(sites[0])
        Ls.append(L)
        Ls_adj.append(L + log2(len(sites)))
        # A list is passed to mean() because a bare map() is a one-shot
        # iterator on Python 3.
        sigmas.append(mean([sd(entry) for entry in make_pssm(sites)]))
        labels.append(tf)
    sigma_space = np.linspace(0.1, 3, 10)
    # Materialised as a list because it is plotted in two panels.
    crit_lambs_actual = [
        critical_lamb_actual(sigma, G=4.5 * 10**6, trials=100)
        for sigma in tqdm(sigma_space)
    ]
    for panel, lengths in ((1, Ls), (2, Ls_adj)):
        plt.subplot(1, 6, panel)
        plt.scatter(sigmas, lengths)
        for length, sigma, label in zip(lengths, sigmas, labels):
            plt.annotate(label, xy=(sigma, length))
        plt.plot(
            *pl(lambda sigma: critical_lamb(sigma, G=5 * 10**6), sigma_space))
        plt.plot(
            *pl(lambda sigma: critical_lamb(sigma, G=4.5 * 10**6), sigma_space))
        plt.plot(sigma_space, crit_lambs_actual)
    preds = [critical_lamb(sigma, G=4.5 * 10**6) for sigma in tqdm(sigmas)]
    preds_actual = [
        critical_lamb_actual(sigma, G=4.5 * 10**6, trials=100)
        for sigma in tqdm(sigmas)
    ]
    # (panel index, predicted lengths, observed lengths, title)
    comparisons = (
        (3, preds, Ls, "Preds vs Ls"),
        (4, preds, Ls_adj, "Preds vs Ls_adj"),
        (5, preds_actual, Ls, "Preds_actual vs Ls"),
        (6, preds_actual, Ls_adj, "Preds_actual vs Ls_adj"),
    )
    for panel, predicted, observed, title in comparisons:
        plt.subplot(1, 6, panel)
        plt.scatter(predicted, observed)
        plt.xlabel("Predicted Length")
        plt.ylabel("Observed Length")
        plt.plot([0, 30], [0, 30])
        plt.title(title)
        # A single formatted string prints identically on Python 2 and 3.
        print("%s %s" % (title, pearsonr(predicted, observed)))
    return Ls, sigmas
Example #21
0
    # # save for any eventual ulterior usage
    # pd(Glove, args.save_glove_dict)

    ixtow = json.load(open(args.coco_json, "r"))['ix_to_word']
    # print("Preparing Glove embeddings matrix")
    # coco_gloves = prepare_embeddings_dict(ixtow,
    # Glove,
    # output='data/coco/glove_w15d512_coco_cocotalk.embed')
    # print("Preparing similarities matrix")
    # sim = get_pairwise_distances(coco_gloves)
    # print('Saiving the similarity matrix into ', args.save_sim)
    # pd(sim, args.save_sim)

    # Rarity matrix:
    print(ixtow['1'], ixtow['2'], ixtow['9487'])
    stats = pl(args.coco_stats)
    counts = stats['counts']
    total_sentences = sum(list(stats['lengths'].values()))
    total_unk = sum([counts[w] for w in stats['bad words']])
    freq = np.array([total_sentences] +
                    [counts[ixtow[str(i)]]
                     for i in range(1, len(ixtow))] +  # UNK is not referenced
                    [total_unk])
    print('Frequencies:', freq.shape, 'min:', np.min(freq), 'max:',
          np.max(freq), "eos:", freq[0], "unk:", freq[-1])
    F = freq.reshape(1, -1)
    F1 = np.dot(np.transpose(1 / F), F)
    F2 = np.dot(np.transpose(F), 1 / F)
    FF = np.minimum(F1, F2)
    print('FF:', FF.shape, 'min:', np.min(FF), 'max:', np.max(FF))
    pd(FF.astype(np.float32), args.save_rarity)
Example #22
0
def test():
    """Plot ``expect`` and ``diff_expect`` of the identity function and show them.

    Both are evaluated for 1000 values of ``lamb`` between -10 and 10.
    """
    def identity(x):
        return x

    def expectation(lamb):
        return expect(identity, lamb)

    def diff_expectation(lamb):
        return diff_expect(identity, lamb)

    plt.plot(*pl(expectation, np.linspace(-10, 10, 1000)))
    plt.plot(*pl(diff_expectation, np.linspace(-10, 10, 1000)))
    plt.show()