def train_zero_saia_longlist(word_list=noun_list):
    # Train all three model families (linwac, transfer/linmap, logwac) on
    # the random zero-shot splits, using the long noun vocabulary for the
    # linear models; logwac keeps the default noun_list, as in the original
    # setup (the word_list parameter is retained from the original
    # signature but is not used below).
    with open('../indata/saia_zeroshot_nounslong_splits.json', 'r') as f:
        ttsplit = json.load(f)
    w2v = linwac.load_w2v()
    X = np.load('../indata/saiapr.npz')
    Xsaia = X['arr_0']
    print "Xsaia", Xsaia.shape
    W = np.load('../indata/saiapr_wmat.npz')
    Wsaia = W['arr_0']
    print "Wsaia", Wsaia.shape

    long_noun_list = [l.strip() for l in open('noun_list_long.txt').readlines()]
    long_noun_ind = [msim.word2ind[n] for n in long_noun_list]

    for x, spl in enumerate(ttsplit):
        print "SPLIT", x
        if x > 0:  # split 0 is skipped, apparently handled in an earlier run
            Xsaia_train = Xsaia[spl['train']]
            Xsaia_test = Xsaia[spl['test']]
            # select the training columns of the word-by-object matrix
            Wsaia_t = Wsaia.transpose()
            Wsaia_t_train = Wsaia_t[spl['train']]
            Wsaia_train = Wsaia_t_train.transpose()

            print "Train linwac"
            linwac.train_all_nouns(Wsaia_train, Xsaia_train, w2v,
                                   ssim="500n_zeroshot_split" + str(x),
                                   word_list=long_noun_list)
            print "Train transfer"
            linmap.train_mappings(msim.w2v_vecs, long_noun_ind,
                                  Wsaia_train, Xsaia_train,
                                  split="500n_zeroshot_split" + str(x))
            print "Train logwac"
            logwac.train_saia_nosamp(Xsaia_train, Wsaia_train,
                                     word_list=noun_list,
                                     ssim="nouns_zeroshot_split" + str(x))
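
# Note on the transpose-index-transpose idiom above (and in the functions
# below): the word matrices are stored with words as rows and objects as
# columns, while the split indices enumerate objects, so selecting the
# training portion means slicing along the column axis. A minimal
# standalone sketch of that step (slice_train_columns is illustrative
# only and is not used by the original code):
def slice_train_columns(W, train_idx):
    """Return W restricted to the object columns listed in train_idx.

    Equivalent to the fancy-indexing one-liner W[:, train_idx]."""
    return W.transpose()[train_idx].transpose()
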
def train_zero_mixed_plural_saia_models():
    with open('../indata/saia_zeroshot_mixedpluralsplit.json', 'r') as f:
        ttsplit = json.load(f)
    w2v = linwac.load_w2v()
    X = np.load('../indata/saiapr.npz')
    Xsaia = X['arr_0']
    print "Xsaia", Xsaia.shape
    W = np.load('../indata/saiapr_wmat.npz')
    Wsaia = W['arr_0']
    print "Wsaia", Wsaia.shape

    Xsaia_train = Xsaia[ttsplit['train']]
    Xsaia_test = Xsaia[ttsplit['test']]
    Wsaia_t = Wsaia.transpose()
    Wsaia_t_train = Wsaia_t[ttsplit['train']]
    Wsaia_train = Wsaia_t_train.transpose()

    print "Plurals", ttsplit['nouns']
    print "Singulars", ttsplit['singulars']

    print "Train linwac"
    this_wordlist = ttsplit['nouns'] + ttsplit['singulars']
    word_ind = [msim.word2ind[n] for n in this_wordlist]
    print "Wordlist", len(this_wordlist)
    linwac.train_all_nouns(Wsaia_train, Xsaia_train, w2v,
                           ssim="_zeroshot_mixedpluralsplit",
                           word_list=this_wordlist)
    print "Train transfer"
    linmap.train_mappings(msim.w2v_vecs, word_ind, Wsaia_train, Xsaia_train,
                          split="_zeroshot_mixedpluralsplit")
    print "Train logwac"
    logwac.train_saia_nosamp(Xsaia_train, Wsaia_train,
                             word_list=this_wordlist,
                             ssim="nouns_zeroshot_mixedpluralsplit")
def train_zero_hypern_saia_models():
    with open('../indata/saia_zeroshot_hypernsplit.json', 'r') as f:
        ttsplit = json.load(f)
    w2v = linwac.load_w2v()
    X = np.load('../indata/saiapr.npz')
    Xsaia = X['arr_0']
    print "Xsaia", Xsaia.shape
    W = np.load('../indata/saiapr_wmat.npz')
    Wsaia = W['arr_0']
    print "Wsaia", Wsaia.shape

    Xsaia_train = Xsaia[ttsplit['train']]
    Xsaia_test = Xsaia[ttsplit['test']]
    Wsaia_t = Wsaia.transpose()
    Wsaia_t_train = Wsaia_t[ttsplit['train']]
    Wsaia_train = Wsaia_t_train.transpose()

    print ttsplit['nouns']

    print "Train linwac"
    this_wordlist = noun_list + [n for n in ttsplit['nouns'] if n not in noun_list]
    print "Wordlist", len(this_wordlist)
    linwac.train_all_nouns(Wsaia_train, Xsaia_train, w2v,
                           ssim="_zeroshot_hypernsplit",
                           word_list=this_wordlist)
    print "Train transfer"
    linmap.train_mappings(msim.w2v_vecs, noun_ind, Wsaia_train, Xsaia_train,
                          split="_zeroshot_hypernsplit")
    print "Train logwac"
    logwac.train_saia_nosamp(Xsaia_train, Wsaia_train, word_list=noun_list,
                             ssim="nouns_zeroshot_hypernsplit")
def train_zero_saia_models(word_list=noun_list):
    with open('../indata/saia_zeroshot_nounsplits.json', 'r') as f:
        ttsplit = json.load(f)
    w2v = linwac.load_w2v()
    X = np.load('../indata/saiapr.npz')
    Xsaia = X['arr_0']
    print "Xsaia", Xsaia.shape
    W = np.load('../indata/saiapr_wmat.npz')
    Wsaia = W['arr_0']
    print "Wsaia", Wsaia.shape

    for x, spl in enumerate(ttsplit):
        print "SPLIT", x
        if x > 0:  # split 0 is skipped, apparently handled in an earlier run
            Xsaia_train = Xsaia[spl['train']]
            Xsaia_test = Xsaia[spl['test']]
            Wsaia_t = Wsaia.transpose()
            Wsaia_t_train = Wsaia_t[spl['train']]
            Wsaia_train = Wsaia_t_train.transpose()

            print "Train linwac"
            linwac.train_all_nouns(Wsaia_train, Xsaia_train, w2v,
                                   ssim="_zeroshot_split" + str(x))
            print "Train transfer"
            linmap.train_mappings(msim.w2v_vecs, noun_ind,
                                  Wsaia_train, Xsaia_train,
                                  split="_zeroshot_split" + str(x))
            print "Train logwac"
            logwac.train_saia_nosamp(Xsaia_train, Wsaia_train,
                                     word_list=noun_list,
                                     ssim="nouns_zeroshot_split" + str(x))
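
# The ssim/split tags passed to the training calls above determine the
# model pickle filenames that make_results_randomsplits() below loads,
# e.g. ../linmodels/linmap_nouns_zeroshot_split<x>.pklz and
# ../logmodels/logwac_saia_nouns_zeroshot_split<x>_nosamp.pklz (assuming
# the training routines compose their output paths from these tags; the
# actual writers live in linwac, linmap and logwac).
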
def make_results_randomsplits():
    with gzip.open('../indata/saia_zeroshot_nounsplits_testsets.pklz', 'r') as f:
        testsets = pickle.load(f)
    with open('../indata/saia_zeroshot_nounsplits.json', 'r') as f:
        ttsplit = json.load(f)
    print "testsets", len(testsets)
    w2v = linwac.load_w2v()
    nouns_w2v_mat = np.array([msim.w2v_vecs[msim.word2ind[n]] for n in NOUNS])
    print "W2v matrix", nouns_w2v_mat.shape

    mapmodels = ['../linmodels/linmap_nouns_zeroshot_split' + str(x) + '.pklz'
                 for x in range(10)]
    linmodels = ['../linmodels/linwac_nouns_w2v_zeroshot_split' + str(x) + '.pklz'
                 for x in range(10)]
    logmodels = ['../logmodels/logwac_saia_nouns_zeroshot_split' + str(x) + '_nosamp.pklz'
                 for x in range(10)]

    print "*****Eval mapmodels"
    res1 = eval_map_models(testsets[:10], ttsplit[:10], mapmodels, nouns_w2v_mat, NOUNS)
    print "*****Eval logmodels"
    res2 = eval_log_models(testsets[:10], ttsplit[:10], logmodels, nouns_w2v_mat, NOUNS)
    print "*****Eval linmodels"
    res3 = eval_lin_models(testsets[:10], ttsplit[:10], linmodels, nouns_w2v_mat, NOUNS)

    results = []
    for (x, y, a, b, c, d) in res1 + res2 + res3:
        # report scores as percentages with two decimal places
        results.append((x, y, "%.2f" % (a * 100), "%.2f" % (b * 100),
                        "%.2f" % (c * 100), "%.2f" % (d * 100)))
    df = pd.DataFrame(results, columns=['testset', 'model', '@1', '@2', '@5', '@10'])
    print df.to_latex(index=False)
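
# For reference: the @k columns above presumably report accuracy@k, i.e.
# the fraction of test items whose gold noun appears among a model's k
# top-ranked candidate nouns. A minimal sketch of that metric (acc_at_k
# is illustrative only; the actual scoring happens in eval_map_models,
# eval_log_models and eval_lin_models, defined elsewhere):
def acc_at_k(ranked_lists, gold_labels, k):
    """ranked_lists[i]: candidate nouns for test item i, best first;
    gold_labels[i]: the reference noun for item i."""
    hits = sum(1 for ranked, gold in zip(ranked_lists, gold_labels)
               if gold in ranked[:k])
    return hits / float(len(gold_labels))
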
def train_standard_plural_saia_models():
    with open('../indata/saia_standard_pluralsplit.json', 'r') as f:
        ttsplit = json.load(f)
    print "Nouns", len(ttsplit['nouns'])
    w2v = linwac.load_w2v()
    # unlike the zero-shot functions above, this one starts from the
    # pre-assembled SAIA training matrices
    Xsaia_t, Wsaia_t = linwac.load_saia_train()
    Xsaia_train = Xsaia_t[ttsplit['train']]
    Wsaia_tt = Wsaia_t.transpose()
    print Wsaia_tt.shape
    Wsaia_t_train = Wsaia_tt[ttsplit['train']]
    Wsaia_train = Wsaia_t_train.transpose()

    print "Train linwac"
    this_wordlist = ttsplit['nouns']
    word_ind = [msim.word2ind[n] for n in this_wordlist]
    print "Wordlist", len(this_wordlist)
    linwac.train_all_nouns(Wsaia_train, Xsaia_train, w2v,
                           ssim="_standard_pluralsplit",
                           word_list=this_wordlist)
    print "Train transfer"
    linmap.train_mappings(msim.w2v_vecs, word_ind, Wsaia_train, Xsaia_train,
                          split="_standard_pluralsplit")
    print "Train logwac"
    logwac.train_saia_nosamp(Xsaia_train, Wsaia_train,
                             word_list=this_wordlist,
                             ssim="nouns_standard_pluralsplit")
def train_zero_refcoco_models():
    long_noun_list = [l.strip() for l in open('noun_list_long.txt').readlines()]
    long_noun_ind = [msim.word2ind[n] for n in long_noun_list]
    with open('../indata/refcoco_zeroshot_nounsplits.json', 'r') as f:
        ttsplit = json.load(f)
    w2v = linwac.load_w2v()
    X = np.load('../indata/mscoco.npz')
    Xcoco = X['arr_0']
    print "Xcoco", Xcoco.shape
    W = np.load('../indata/mscoco_refcoco_wmat.npz')
    Wcoco = W['arr_0']
    print "Wcoco", Wcoco.shape

    # count matrix columns (regions) with at least one nonzero entry
    total = 0
    for rx in range(Wcoco.shape[1]):
        if len(np.nonzero(Wcoco[:, rx])[0]):
            total += 1
    print "Total", total

    with gzip.open('../indata/refcoco_refdf.pklz', 'r') as f:
        refdf = pickle.load(f)
    print "Regionids", len(set(refdf['region_id']))

    for x, spl in enumerate(ttsplit):
        print "SPLIT", x
        if x == 2:  # only split 2 is (re)trained here
            Xcoco_train = Xcoco[spl['train']]
            Xcoco_test = Xcoco[spl['test']]
            print "Xcoco", Xcoco_train.shape
            Wcoco_t = Wcoco.transpose()
            Wcoco_t_train = Wcoco_t[spl['train']]
            Wcoco_train = Wcoco_t_train.transpose()
            print "Wcoco", Wcoco_train.shape

            total = 0
            for rx in range(Wcoco_train.shape[1]):
                if len(np.nonzero(Wcoco_train[:, rx])[0]):
                    total += 1
            print "Total", total

            print "Train linwac"
            linwac.train_vocab_abssample(Wcoco_train, Xcoco_train, w2v,
                                         nsamp=Xcoco_train.shape[0],
                                         scorp="rcoco",
                                         ssim="w2v_zeroshot_split" + str(x),
                                         word_list=long_noun_list)
            # linwac.train_all_nouns(Wcoco_train, Xcoco_train, w2v,
            #                        ssim="refcoco_w2v_zeroshot_split" + str(x),
            #                        word_list=long_noun_list)
            print "Train transfer"
            linmap.train_mappings(msim.w2v_vecs, long_noun_ind,
                                  Wcoco_train, Xcoco_train,
                                  split="_refcoco_zeroshot_split" + str(x))
            print "Train logwac"
            # NB: corrected to use the RefCOCO matrices (the original
            # referenced the undefined Xsaia_train/Wsaia_train); the
            # refcoco-specific ssim tag is an assumption, made to avoid
            # clobbering the SAIA models of the same split index.
            logwac.train_saia_nosamp(Xcoco_train, Wcoco_train,
                                     word_list=noun_list,
                                     ssim="nouns_refcoco_zeroshot_split" + str(x))
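
# Hypothetical driver for the routines above; the original module does
# not define an entry point, and only the function names are taken from
# the code as-is:
#
# if __name__ == '__main__':
#     train_zero_saia_models()                # random zero-shot splits
#     train_zero_saia_longlist()              # long vocabulary (noun_list_long.txt)
#     train_zero_mixed_plural_saia_models()
#     train_zero_hypern_saia_models()
#     train_standard_plural_saia_models()
#     train_zero_refcoco_models()
#     make_results_randomsplits()             # prints the LaTeX results table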