def __init__(self, opt):
    self.seq_per_img = opt.seq_per_img
    self.vocab_size = opt.vocab_size
    if opt.stratify_reward:
        # sampler = r
        self.tau = opt.tau_sent
        self.prefix = 'rhamm_sim'
    else:
        # sampler = q
        self.tau = opt.tau_sent_q
        self.prefix = 'qhamm_sim'
    # substitution options
    self.limited = opt.limited_vocab_sub
    self.tau_word = opt.tau_word
    # Load the similarity matrix:
    M = pl(opt.similarity_matrix)
    if opt.promote_rarity:
        IDF = pl(opt.rarity_matrix)
        M -= self.tau_word * opt.rare_tfidf * IDF
    M = M.astype(np.float32)
    n, d = M.shape
    print('Sim matrix:', n, 'x', d, ' V=', opt.vocab_size)
    assert n == d and n == opt.vocab_size, 'Similarity matrix has incompatible shape'
    self.words_distribs = M
    self.version = 'Hamming-Sim (Vpool=%d, tau=%.2f)' % (self.limited, self.tau)
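Neither pl nor pd is defined in this section; from their call sites (pl(opt.similarity_matrix), pd(sim, args.save_sim)) they read as pickle load/dump shorthands. A minimal sketch under that assumption follows. Note that the plotting snippets further down reuse the name pl for an unrelated (f, xs) -> (xs, [f(x) for x in xs]) helper, sketched after validate_dirichlet_sample.

import pickle

def pl(path):
    """Assumed helper: load a pickled object from `path`."""
    with open(path, 'rb') as f:
        return pickle.load(f)

def pd(obj, path):
    """Assumed helper: dump `obj` to `path` as a pickle."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)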
def __init__(self, opt):
    super().__init__()
    self.logger = opt.logger
    self.seq_per_img = opt.seq_per_img
    self.margin_sim = opt.margin_sim
    self.normalize_batch = opt.normalize_batch
    self.use_cooc = opt.use_cooc
    self.penalize_confidence = opt.penalize_confidence  # FIXME
    if self.margin_sim:
        self.logger.warn('Clipping similarities below %.2f' % self.margin_sim)
    self.limited = opt.limited_vocab_sim
    self.alpha = opt.alpha_word
    self.tau_word = opt.tau_word
    # Load the similarity matrix:
    M = pl(opt.similarity_matrix)
    if not self.use_cooc:  # deprecated
        M = M - 1  # = -D_ij
    if opt.promote_rarity:
        IDF = pl(opt.rarity_matrix)
        M -= self.tau_word * opt.promote_rarity * IDF
    M = M.astype(np.float32)
    n, d = M.shape
    print('Sim matrix:', n, 'x', d, ' V=', opt.vocab_size)
    assert n == d and n == opt.vocab_size, 'Similarity matrix has incompatible shape'
    self.vocab_size = opt.vocab_size
    M = Variable(torch.from_numpy(M)).cuda()
    self.Sim_Matrix = M
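How Sim_Matrix and tau_word combine downstream is not shown here. A minimal sketch of one plausible reading, not the repo's exact criterion: each gold token's row of the similarity matrix is turned into a smoothed target distribution r(w' | w) proportional to exp(sim(w, w') / tau_word). The real loss presumably also applies the margin_sim clipping, limited_vocab_sim pooling, and alpha_word mixing weight configured above.

import torch

def smoothed_targets(sim_matrix, tokens, tau_word):
    """Sketch: sim_matrix is (V, V), tokens is a (N,) LongTensor of gold ids.
    Returns (N, V) smoothed target distributions over the vocabulary."""
    rows = sim_matrix[tokens]                  # (N, V): similarities to each gold word
    return torch.softmax(rows / tau_word, dim=-1)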
def validate_dirichlet_sample(Ks=[2, 5, 10, 20, 50, 100], N=1000):
    alphas = [10**i for i in interpolate(-5, 0, 100)]
    for K in Ks:
        print(K)
        plt.scatter(*pl(lambda a: mean([h_np(dirichlet_sample(K, a)) for i in range(N)]), alphas))
        plt.plot(*pl(lambda alpha: expected_entropy(K, alpha=alpha), alphas), label="%s pred" % K)
    plt.xlabel("alpha")
    plt.ylabel("Entropy (bits)")
    plt.semilogx()
    plt.legend()
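Two helpers here are assumed rather than shown. In these plotting snippets pl is evidently a "plot list" utility, (f, xs) -> (xs, [f(x) for x in xs]), unlike the pickle-loading pl in the training code above. And expected_entropy has a known closed form for a symmetric Dirichlet: E[H] = psi(K*alpha + 1) - psi(alpha + 1) nats. A sketch under those assumptions:

from math import log
from scipy.special import digamma

def pl(f, xs):
    """Assumed plotting helper: return (xs, [f(x) for x in xs]) so that
    plt.plot(*pl(f, xs)) draws f over xs."""
    return xs, [f(x) for x in xs]

def expected_entropy(K, alpha):
    """Expected Shannon entropy (in bits) of a draw from a symmetric
    Dirichlet(alpha, ..., alpha) over K categories:
    E[H] = psi(K*alpha + 1) - psi(alpha + 1) nats, converted to bits."""
    return (digamma(K * alpha + 1) - digamma(alpha + 1)) / log(2)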
def main():
    G = 1000
    mu_ep = 0
    sigma_ep = 1
    eps = gaussians(mu_ep, sigma_ep, G)
    mus = interpolate(-100, 10, 100)
    plt.plot(*pl(lambda mu: mean_occ(eps, mu), mus), label="Mean occ")
    plt.plot(*pl(lambda mu: G / (1 + exp(-0.75 * mu)), mus), label="predicted occ")
    plt.plot(*pl(lambda mu: sd_occ(eps, mu), mus), label="Sd occ")
    plt.plot(*pl(lambda mu: entropy(eps, mu), mus), label="Entropy (bits)")
    plt.plot([mu_ep, mu_ep], [0, G], linestyle='--')
    plt.plot([mus[0], mus[-1]], [G / 2, G / 2], linestyle='--')
    plt.xlabel("mu")
    plt.legend()
    plt.show()
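The occupancy helpers are not shown; from how they are plotted they read as standard Fermi-Dirac statistics at beta = 1 over the site energies eps, with independent sites. A sketch, assuming those definitions:

from math import exp, sqrt, log

def fd(ep, mu, beta=1.0):
    """Fermi-Dirac occupancy of a site with energy ep at chemical potential mu."""
    return 1.0 / (1.0 + exp(beta * (ep - mu)))

def mean_occ(eps, mu):
    """Expected total occupancy over all sites."""
    return sum(fd(ep, mu) for ep in eps)

def sd_occ(eps, mu):
    """Standard deviation of total occupancy, assuming independent sites."""
    return sqrt(sum(fd(ep, mu) * (1 - fd(ep, mu)) for ep in eps))

def entropy(eps, mu):
    """Total occupancy entropy in bits (sum of per-site binary entropies)."""
    def h2(p):
        return 0.0 if p in (0.0, 1.0) else -(p * log(p, 2) + (1 - p) * log(1 - p, 2))
    return sum(h2(fd(ep, mu)) for ep in eps)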
def mu_approx_fig(filename=None):
    sigma = 1
    L = 10
    copy_range = np.linspace(1, 10**5, 100)
    plt.plot(*pl(lambda copy_num: mu_from(G, sigma, L, copy_num=copy_num), copy_range), label="Exact")
    plt.plot(*pl(lambda copy_num: approx_mu(G, sigma, L, copy_num=copy_num), copy_range), label="Approx")
    plt.xlabel("Copy number")
    plt.ylabel(r"$\mu$")
    plt.semilogx()
    plt.legend(loc='upper left')  # 'ul' is not a valid matplotlib legend location
    plt.title("Exact vs. Approximate Chemical Potential")
    maybesave(filename)
def mu_summary_stat_experiment():
    """Can we correlate copy number with a summary statistic?"""
    trials = 100
    ep_mu = -2
    ep_sigma = 5
    G = 100
    ts = []
    copies = []
    eps = [random.gauss(ep_mu, ep_sigma) for i in range(G)]
    mus = interpolate(-10, 10, 1000)
    eta = mean(eps)
    gamma = 1.0 / variance(eps)
    print(gamma)
    plt.plot(*pl(lambda mu: mean_occ(eps, mu), mus))
    plt.plot(*pl(lambda mu: G * fd(eta, mu, beta=gamma), mus))
    plt.plot(*pl(lambda x: G / 2.0, mus))
def L_vs_sigma_plot(filename=None, with_bio=False):
    if with_bio:
        tfdf = extract_motif_object_from_tfdf()
        motifs = [getattr(tfdf, tf) for tf in tfdf.tfs]
        Ls = [len(motif[0]) for motif in motifs]
        cs = [len(motif) for motif in motifs]
        ics = [motif_ic(motif) for motif in motifs]
        ic_density = [ic / L for ic, L in zip(ics, Ls)]
        sigmas = [mean([sd(col) for col in make_pssm(motif)]) for motif in motifs]
        ginis = [motif_gini(motif, correct=False) for motif in motifs]
        mi_density = [total_motif_mi(motif) / choose(L, 2)
                      for motif, L in zip(motifs, Ls)]
    min_sigma = 0.1
    max_sigma = 10
    plt.xlim(0, max_sigma)
    plt.ylim(0, 60)
    plt.plot(*pl(crit_L, np.linspace(min_sigma, max_sigma, 1000)),
             label="Binding Transition")
    plt.plot([min_sigma, max_sigma], [log(G, 2) / 2, log(G, 2) / 2],
             linestyle='--', label="Info Theory Threshold")
    # plt.plot(*pl(lambda sigma: log(G) / sigma, np.linspace(min_sigma, max_sigma, 1000)),
    #          linestyle='--', label="Zero Discrimination Asymptote")
    if with_bio:
        plt.scatter(sigmas, Ls, label="Biological Motifs")
    plt.xlabel("sigma")
    plt.ylabel("L")
    plt.legend()
    maybesave(filename)
def plot_h_vs_ic(L, sigmas=interpolate(0.1, 10, 100), max_h=None, M=None,
                 xfunc=lambda ps: 2 * L):
    if max_h is None:
        print("generating samples")
        pss = [simplexify_sample(4**L, sigma=sigma) for sigma in tqdm(sigmas)]
    else:
        pss = []
        while len(pss) < trials:  # NB: `trials` is assumed to be a module-level constant
            ps = sample(L)
            if h(ps) < max_h:
                pss.append(ps)
                print(len(pss))
    print("computing M")
    if M is None:
        M = marginalization_matrix(L)
    icq_s = [ic(ps, M) for ps in tqdm(pss)]
    print("computing entropy")
    icp_s = [2 * L - h_np(ps) for ps in tqdm(pss)]
    # print("computing total mi")
    # mis = [total_mi(ps, M) for ps in tqdm(pss)]
    # print("computing columnwise entropies")
    # hqs = [psfm_entropy(ps, M) for ps in tqdm(pss)]
    # plt.scatter(hs, hqs)
    plt.scatter(icp_s, icq_s)
    # plt.plot([0, 2 * L], [2 * L, 0])
    # plt.plot([0, 2 * L], [0, 2 * L])
    # plt.plot([0, 2], [0, 4])
    # plt.plot([0, 2], [0, 2 * L])
    # print(pearsonr(ics, hs))
    # print(spearmanr(ics, hs))
    plt.plot([0, 2 * L], [0, 2 * L])
    plt.plot(*pl(lambda icp: L * icp + 2 * (L - L**2), [2 * (L - 1), 2 * L]), color='b')
    plt.xlabel("Distribution IC")
    plt.ylabel("PSFM IC")
    plt.title("Distribution vs. Columnwise IC, Length=%s" % L)
def recover_infos(opt):
    infos = {}
    # Restart training (useful with OAR idempotent jobs)
    if opt.restart and osp.exists(osp.join(opt.modelname, 'model.pth')):
        opt.start_from_best = 0
        opt.logger.warning('Picking up where we left')
        opt.start_from = osp.join(opt.modelname, 'model.pth')
        opt.infos_start_from = osp.join(opt.modelname, 'infos.pkl')
        opt.optimizer_start_from = osp.join(opt.modelname, 'optimizer.pth')
        infos = pl(opt.infos_start_from)
    elif opt.start_from is not None:
        # open old infos and check if models are compatible
        # start_from of the config file is a folder name
        opt.logger.warn('Starting from %s' % opt.start_from)
        if opt.start_from_best:
            flag = '-best'
            opt.logger.warn('Starting from the best saved model')
        else:
            flag = ''
        opt.infos_start_from = osp.join(opt.start_from, 'infos%s.pkl' % flag)
        opt.optimizer_start_from = osp.join(opt.start_from, 'optimizer%s.pth' % flag)
        opt.start_from = osp.join(opt.start_from, 'model%s.pth' % flag)
        infos = pl(opt.infos_start_from)
        saved_model_opt = infos['opt']
        need_be_same = ["model", "rnn_size_src", "rnn_size_trg",
                        "num_layers_src", "num_layers_trg"]
        for checkme in need_be_same:
            assert vars(saved_model_opt)[checkme] == vars(opt)[checkme], \
                "Command line argument and saved model disagree on '%s'" % checkme
    # Recover iteration index
    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    history = {}
    history['val_perf'] = infos.get('val_result_history', {})
    val_losses = []
    history['loss'] = infos.get('loss_history', {})
    history['lr'] = infos.get('lr_history', {})
    history['ss_prob'] = infos.get('ss_prob_history', {})
    return iteration, epoch, opt, infos, history
def __init__(self, opt):
    super().__init__()
    self.logger = opt.logger
    self.seq_per_img = opt.seq_per_img
    self.margin_sim = opt.margin_sim
    self.normalize_batch = opt.normalize_batch
    self.use_cooc = opt.use_cooc
    self.penalize_confidence = opt.penalize_confidence  # FIXME
    if self.margin_sim:
        self.logger.warn('Clipping similarities below %.2f' % self.margin_sim)
    self.limited = opt.limited_vocab_sim
    self.alpha = opt.alpha_word
    self.tau_word = opt.tau_word
    # Load the similarity matrix:
    M = pl(opt.similarity_matrix)
    self.dense = isinstance(M, np.ndarray)
    self.rare = opt.promote_rarity
    if self.dense:
        if not self.use_cooc:
            M = M - 1  # = -D_ij
        if opt.promote_rarity:
            IDF = pl(opt.rarity_matrix)
            M -= self.tau_word * opt.promote_rarity * IDF
            del IDF
        M = M.astype(np.float32)
        M = Variable(torch.from_numpy(M)).cuda()
        self.Sim_Matrix = M
        n, d = self.Sim_Matrix.size()
    else:
        if opt.promote_rarity:
            IDF = pl(opt.rarity_matrix)
            self.IDF = sparse_torch(IDF).cuda()
            del IDF
        self.Sim_Matrix = sparse_torch(M).cuda()
        n, d = self.Sim_Matrix.size()
    del M
    self.logger.info('Sim matrix: (%dx%d) & Vocab:%d' % (n, d, opt.vocab_size))
    assert n == d and n == opt.vocab_size, 'Similarity matrix has incompatible shape'
    self.vocab_size = opt.vocab_size
def mean_squared_error(x, y, w):
    """
    :param x: input sequence, N x 1
    :param y: output sequence, N x 1
    :param w: model parameters, (M+1) x 1
    :return: mean squared error between the outputs y and the outputs of the
        polynomial with parameters w evaluated at the inputs x
    """
    err = np.linalg.norm(y - pl(x, w), 2)**2 / len(x)
    return err
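Here pl(x, w) is a third, unrelated helper: from the shapes in the docstring it evidently evaluates a degree-M polynomial with coefficient vector w at each input in x. A minimal sketch of that assumed helper, with a usage example:

import numpy as np

def pl(x, w):
    """Assumed helper: evaluate w[0] + w[1]*x + ... + w[M]*x**M for each input,
    via the N x (M+1) design matrix."""
    design = np.concatenate([x**m for m in range(len(w))], axis=1)  # N x (M+1)
    return design @ w  # N x 1

# Example: y = 1 + 2x is fit exactly by w = (1, 2), so the error is ~0.
x = np.array([[0.0], [1.0], [2.0]])
y = np.array([[1.0], [3.0], [5.0]])
w = np.array([[1.0], [2.0]])
print(mean_squared_error(x, y, w))  # ~0.0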
def make_sigma_infty_asymptote_figure():
    Ls = range(1, 20)
    sigma = 100
    plt.plot(*pl(lambda L: mean([occ2(sigma, L, G=5 * 10**6) for i in range(100)]), Ls),
             label='Occupancy')
    plt.ylabel("Occupancy")
    plt.xlabel("Length")
    plt.plot([11.12, 11.12], [0, 1], linestyle='--', label='Predicted Critical Length')
    plt.plot(Ls, [0.5] * len(Ls), linestyle='--', label="occ = 1/2")
    plt.legend(loc='upper left')
    plt.title("Mean Occupancy for sigma = 100")
    maybesave("sigma_infty_asymptote.png")
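The hard-coded 11.12 appears to be $\log_4 G$ for $G = 5 \times 10^6$: in the $\sigma \to \infty$ limit only exact matches bind, the expected number of competing exact matches in a random background is $G\,(1/4)^L$, and mean occupancy crosses 1/2 roughly where that count falls to one:

$$L^{*} = \log_4 G = \frac{\ln(5 \times 10^6)}{\ln 4} \approx 11.12.$$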
def length_vs_sigma(obj):
    lens = []
    sigmas = []

    def get_sigma(motif):
        pssm = make_pssm(motif)
        return mean([sd(col) for col in pssm])

    for tf in obj.tfs:
        motif = getattr(obj, tf)
        lens.append(len(motif[0]))
        sigmas.append(get_sigma(motif))
    print(pearsonr(lens, sigmas))
    print(spearmanr(lens, sigmas))
    plt.scatter(sigmas, lens)
    plt.plot(*pl(length_from_sigma, np.linspace(0, 100, 1000)))
    plt.xlabel("Sigma")
    plt.ylabel("Length")
    return lens, sigmas
def __init__(self, opt):
    self.seq_per_img = opt.seq_per_img
    self.vocab_size = opt.vocab_size
    if opt.stratify_reward:
        # sampler = r
        self.tau = opt.tau_sent
        self.prefix = 'rhamm_sim'
    else:
        # sampler = q
        self.tau = opt.tau_sent_q
        self.prefix = 'qhamm_sim'
    # substitution options
    self.limited = opt.limited_vocab_sub
    self.tau_word = opt.tau_word
    self.unigram_distrib = pl('data/coco/unigram_coco.distrib')[0]  # FIXME save as 1D
    self.version = 'Hamming-Unigram (Vpool=%d, tau=%.2f)' % (self.limited, self.tau)
def entropy_drift_analysis(sigma=2, color='b', color_p='g'):
    """Why is convergence so difficult to obtain for, say, sigma = 2?
    Explore selection/mutation balance."""
    n = 16
    L = 16
    matrix = sample_matrix(L, sigma)
    ringer = ringer_motif(matrix, n)
    mutants = [iterate(mutate_motif, ringer, i)
               for i in trange(256) for j in range(10)]
    dists = [motif_hamming_distance(ringer, mutant) for mutant in tqdm(mutants)]
    fs = [log_fitness(matrix, mutant, G) for mutant in tqdm(mutants)]
    fps = []
    trials = 100
    for mutant in tqdm(mutants):
        nexts = []
        f = log_fitness(matrix, mutant, G)
        for i in range(trials):
            mutant_p = mutate_motif(mutant)
            fp = log_fitness(matrix, mutant_p, G)
            # Metropolis acceptance in log space
            if log(random.random()) < fp - f:
                nexts.append(fp)
            else:
                nexts.append(f)
        fps.append(mean(nexts))
    plt.subplot(3, 1, 1)
    plt.scatter(dists, fs, color=color, marker='.')
    plt.scatter(dists, fps, color=color_p, marker='.')
    # plt.semilogy()
    plt.subplot(3, 1, 2)
    plt.scatter(dists, [(f - fp) / f for (f, fp) in zip(fs, fps)],
                color=color, marker='.')
    plt.plot([0, len(fs)], [0, 0], linestyle='--', color='black')
    plt.subplot(3, 1, 3)
    diffs = [fp - f for f, fp in zip(fs, fps)]
    plt.scatter(fs, diffs, marker='.', color=color)
    interpolant = poly1d(polyfit(fs, diffs, 1))
    plt.plot(*pl(interpolant, [min(fs), max(fs)]))
    plt.plot([min(fs), max(fs)], [0, 0], linestyle='--', color='black')
    minx, maxx = min(fs + fps), max(fs + fps)  # was min(fs + fs): likely a typo
def recover_ens_infos(opt):
    infos = {}
    # Restart training (useful with OAR idempotent jobs)
    if opt.restart and osp.exists(osp.join(opt.ensemblename, 'model_0.pth')):
        # Fix the saving names
        opt.logger.warning('Picking up where we left')
        opt.start_from = glob.glob(opt.ensemblename + '/model_*.pth')
        opt.logger.debug('Loading saved models: %s' % str(opt.start_from))
        opt.optimizer_start_from = opt.ensemblename + '/optimizer.pth'
        opt.cnn_start_from = glob.glob(opt.ensemblename + '/model-cnn_*.pth')
        opt.infos_start_from = glob.glob(opt.ensemblename + '/infos_*.pkl')
        infos = pl(osp.join(opt.ensemblename, 'infos.pkl'))
    if 'cnn_start_from' not in vars(opt):
        opt.start_from = []
        opt.infos_start_from = []
        opt.cnn_start_from = []
        # Start from the top:
        if opt.start_from_best:
            # add best flag:
            flag = '-best'
        else:
            flag = ''
        opt.logger.debug('Starting from %s' % str(opt.model))
        for e, m in enumerate(opt.model):
            m = m[0]
            opt.start_from.append('save/%s/model%s.pth' % (m, flag))
            opt.infos_start_from.append("save/%s/infos%s.pkl" % (m, flag))
            opt.cnn_start_from.append('save/%s/model-cnn%s.pth' % (m, flag))
            copy2(opt.infos_start_from[-1],
                  osp.join(opt.ensemblename, 'infos_%d.pkl' % e))
    # Recover iteration index
    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    history = {}
    history['val_perf'] = infos.get('val_result_history', {})
    val_losses = []
    history['loss'] = infos.get('loss_history', {})
    history['lr'] = infos.get('lr_history', {})
    history['ss_prob'] = infos.get('ss_prob_history', {})
    return iteration, epoch, opt, infos, history
def make_sigma_0_figure(sigma=0.1, fname="sigma_0.png"):
    G = 5 * 10**6

    def critical_L(sigma):
        return log(G) / (sigma * (1 - sigma / 2.0))

    Lstar = critical_L(sigma)
    print("Lstar:", Lstar)
    Ls = range(1, int(2 * Lstar))
    plt.plot(*pl(lambda L: mean([occ2(sigma, L, G=5 * 10**6) for i in range(100)]), Ls),
             label='Occupancy')
    plt.ylabel("Occupancy")
    plt.xlabel("Length")
    plt.plot([Lstar, Lstar], [0, 1], linestyle='--', label='Predicted Critical Length')
    plt.plot(Ls, [0.5] * len(Ls), linestyle='--', label="occ = 1/2")
    plt.legend(loc='upper left')
    plt.title("Mean Occupancy for sigma = %s" % sigma)
    maybesave(fname)
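For reference, the closed form implemented in critical_L above (restated, not derived here) is

$$L^{*} = \frac{\ln G}{\sigma\,(1 - \sigma/2)},$$

the length at which the figure's predicted mean occupancy crosses 1/2. It is only meaningful for small $\sigma$, since the denominator changes sign at $\sigma = 2$; that is presumably why the $\sigma = 100$ figure above uses the separate $\log_4 G$ asymptote instead.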
                    help='path to dump the _rarity_ matrix')
parser.add_argument('--create_rare_matrix', action='store_true',
                    help='create the rarity matrix for WORDSxIDF')
args = parser.parse_args()

# define additional params:
args.save_embed_matrix = "data/%s/%s.embed" % (args.data, args.embedding)
args.save_embed_dict = "data/%s/%s.dict" % (args.data, args.embedding)
args.data_info = 'data/%s/%s_trg.infos' % (args.data, args.trg_lang)
args.data_stats = 'data/%s/vocab.%s.stats' % (args.data, args.trg_lang)
args.save_sim = 'data/%s/%s.sim' % (args.data, args.embedding)
args.save_rarity = 'data/%s/promote_rare.matrix' % (args.data)

if len(args.embed_dict):
    E = pl(args.embed_dict)
else:
    E = build_embed_dict(args.embed_txt)
    if len(args.save_embed_dict):
        # save for any later use
        pd(E, args.save_embed_dict)
ixtow = pl(args.data_info)['itow']
print("Preparing Glove embeddings matrix")
embeddings = prepare_embeddings_dict(ixtow, E, output=args.save_embed_matrix)
print("Preparing similarities matrix")
sim = get_pairwise_distances(embeddings)
print('Saving the similarity matrix into', args.save_sim)
pd(sim.astype(np.float32), args.save_sim)
def make_ecoli_sigma_L_plot():
    Ls = []
    Ls_adj = []
    ns = []
    sigmas = []
    labels = []
    motif_ics = []
    motif_ics_per_base = []
    for tf in Escherichia_coli.tfs:
        sites = getattr(Escherichia_coli, tf)
        L = len(sites[0])
        n = len(sites)
        ns.append(n)
        L_adj = len(sites[0]) + log2(n)
        sigma = mean([sd(col) for col in make_pssm(sites)])
        Ls.append(L)
        Ls_adj.append(L_adj)
        motif_ics.append(motif_ic(sites))
        motif_ics_per_base.append(motif_ic(sites) / float(L))
        sigmas.append(sigma)
        labels.append(tf)
    sigma_space = np.linspace(0.1, 3, 10)
    crit_lambs_actual = [critical_lamb_actual(sigma, G=4.5 * 10**6, trials=100)
                         for sigma in tqdm(sigma_space)]
    plt.subplot(1, 6, 1)
    plt.scatter(sigmas, Ls)
    for L, sigma, label in zip(Ls, sigmas, labels):
        plt.annotate(label, xy=(sigma, L))
    plt.plot(*pl(lambda sigma: critical_lamb(sigma, G=5 * 10**6), sigma_space))
    plt.plot(*pl(lambda sigma: critical_lamb(sigma, G=4.5 * 10**6), sigma_space))
    plt.plot(sigma_space, crit_lambs_actual)
    plt.subplot(1, 6, 2)
    plt.scatter(sigmas, Ls_adj)
    for L_adj, sigma, label in zip(Ls_adj, sigmas, labels):
        plt.annotate(label, xy=(sigma, L_adj))
    plt.plot(*pl(lambda sigma: critical_lamb(sigma, G=5 * 10**6), sigma_space))
    plt.plot(*pl(lambda sigma: critical_lamb(sigma, G=4.5 * 10**6), sigma_space))
    plt.plot(sigma_space, crit_lambs_actual)
    preds = [critical_lamb(sigma, G=4.5 * 10**6) for sigma in tqdm(sigmas)]
    preds_actual = [critical_lamb_actual(sigma, G=4.5 * 10**6, trials=100)
                    for sigma in tqdm(sigmas)]
    plt.subplot(1, 6, 3)
    plt.scatter(preds, Ls)
    plt.xlabel("Predicted Length")
    plt.ylabel("Observed Length")
    plt.title("Preds vs Ls")
    print("Preds vs Ls", pearsonr(preds, Ls))
    plt.plot([0, 30], [0, 30])
    plt.subplot(1, 6, 4)
    plt.scatter(preds, Ls_adj)
    plt.xlabel("Predicted Length")
    plt.ylabel("Observed Length")
    plt.plot([0, 30], [0, 30])
    plt.title("Preds vs Ls_adj")
    print("Preds vs Ls_adj", pearsonr(preds, Ls_adj))
    plt.subplot(1, 6, 5)
    plt.scatter(preds_actual, Ls)
    plt.xlabel("Predicted Length")
    plt.ylabel("Observed Length")
    plt.plot([0, 30], [0, 30])
    plt.title("Preds_actual vs Ls")
    print("Preds_actual vs Ls", pearsonr(preds_actual, Ls))
    plt.subplot(1, 6, 6)
    plt.scatter(preds_actual, Ls_adj)
    plt.xlabel("Predicted Length")
    plt.ylabel("Observed Length")
    plt.plot([0, 30], [0, 30])
    plt.title("Preds_actual vs Ls_adj")
    print("Preds_actual vs Ls_adj", pearsonr(preds_actual, Ls_adj))
    return Ls, sigmas
# # save for any later use
# pd(Glove, args.save_glove_dict)
ixtow = json.load(open(args.coco_json, "r"))['ix_to_word']
# print("Preparing Glove embeddings matrix")
# coco_gloves = prepare_embeddings_dict(ixtow,
#                                       Glove,
#                                       output='data/coco/glove_w15d512_coco_cocotalk.embed')
# print("Preparing similarities matrix")
# sim = get_pairwise_distances(coco_gloves)
# print('Saving the similarity matrix into', args.save_sim)
# pd(sim, args.save_sim)

# Rarity matrix:
print(ixtow['1'], ixtow['2'], ixtow['9487'])
stats = pl(args.coco_stats)
counts = stats['counts']
total_sentences = sum(list(stats['lengths'].values()))
total_unk = sum([counts[w] for w in stats['bad words']])
freq = np.array([total_sentences] +
                [counts[ixtow[str(i)]] for i in range(1, len(ixtow))] +
                [total_unk])  # UNK is not referenced in ixtow
print('Frequencies:', freq.shape,
      'min:', np.min(freq), 'max:', np.max(freq),
      "eos:", freq[0], "unk:", freq[-1])
F = freq.reshape(1, -1)
F1 = np.dot(np.transpose(1 / F), F)  # F1[i, j] = f_j / f_i
F2 = np.dot(np.transpose(F), 1 / F)  # F2[i, j] = f_i / f_j
FF = np.minimum(F1, F2)              # FF[i, j] = min(f_i/f_j, f_j/f_i) <= 1
print('FF:', FF.shape, 'min:', np.min(FF), 'max:', np.max(FF))
pd(FF.astype(np.float32), args.save_rarity)
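To make the rarity-matrix construction concrete, a small worked example with hypothetical frequencies: FF[i, j] = min(f_i/f_j, f_j/f_i), so pairs of words with very different frequencies get small values and the diagonal is 1.

import numpy as np

f = np.array([[100.0, 10.0, 1.0]])   # hypothetical word frequencies (1 x V)
F1 = np.dot(np.transpose(1 / f), f)  # F1[i, j] = f_j / f_i
F2 = np.dot(np.transpose(f), 1 / f)  # F2[i, j] = f_i / f_j
FF = np.minimum(F1, F2)
print(FF)
# [[1.    0.1   0.01]
#  [0.1   1.    0.1 ]
#  [0.01  0.1   1.  ]]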
def test():
    f = lambda x: x
    plt.plot(*pl(lambda lamb: expect(f, lamb), np.linspace(-10, 10, 1000)))
    plt.plot(*pl(lambda lamb: diff_expect(f, lamb), np.linspace(-10, 10, 1000)))
    plt.show()