def train_zero_saia_longlist(word_list=noun_list):

    with open('../indata/saia_zeroshot_nounslong_splits.json', 'r') as f:
        ttsplit = json.load(f)

    w2v = linwac.load_w2v()

    X = np.load('../indata/saiapr.npz')
    Xsaia = X['arr_0']
    print "Xsaia", Xsaia.shape

    W = np.load('../indata/saiapr_wmat.npz')
    Wsaia = W['arr_0']
    print "Wsaia", Wsaia.shape

    long_noun_list = [
        l.strip() for l in open('noun_list_long.txt').readlines()
    ]
    long_noun_ind = [msim.word2ind[n] for n in long_noun_list]

    for x, spl in enumerate(ttsplit):
        print "SPLIT", x

        if x > 0:

            Xsaia_train = Xsaia[spl['train']]
            Xsaia_test = Xsaia[spl['test']]

            Wsaia_t = Wsaia.transpose()
            Wsaia_t.shape

            Wsaia_t_train = Wsaia_t[spl['train']]
            Wsaia_train = Wsaia_t_train.transpose()

            print "Train linwac"
            linwac.train_all_nouns(Wsaia_train,
                                   Xsaia_train,
                                   w2v,
                                   ssim="500n_zeroshot_split" + str(x),
                                   word_list=long_noun_list)

            print "Train transfer"
            linmap.train_mappings(msim.w2v_vecs,
                                  long_noun_ind,
                                  Wsaia_train,
                                  Xsaia_train,
                                  split="500n_zeroshot_split" + str(x))

            print "Train logwac"
            logwac.train_saia_nosamp(Xsaia_train,
                                     Wsaia_train,
                                     word_list=noun_list,
                                     ssim="nouns_zeroshot_split" + str(x))
def train_zero_mixed_plural_saia_models():

    with open('../indata/saia_zeroshot_mixedpluralsplit.json', 'r') as f:
        ttsplit = json.load(f)

    w2v = linwac.load_w2v()

    X = np.load('../indata/saiapr.npz')
    Xsaia = X['arr_0']
    print "Xsaia", Xsaia.shape

    W = np.load('../indata/saiapr_wmat.npz')
    Wsaia = W['arr_0']
    print "Wsaia", Wsaia.shape

    Xsaia_train = Xsaia[ttsplit['train']]
    Xsaia_test = Xsaia[ttsplit['test']]

    Wsaia_t = Wsaia.transpose()
    Wsaia_t.shape
    Wsaia_t_train = Wsaia_t[ttsplit['train']]
    Wsaia_train = Wsaia_t_train.transpose()

    print "Plurals", ttsplit['nouns']
    print "Singulars", ttsplit['singulars']

    print "Train linwac"
    this_wordlist = ttsplit['nouns'] + ttsplit['singulars']
    word_ind = [msim.word2ind[n] for n in this_wordlist]

    print "Wordlist", len(this_wordlist)
    linwac.train_all_nouns(Wsaia_train,
                           Xsaia_train,
                           w2v,
                           ssim="_zeroshot_mixedpluralsplit",
                           word_list=this_wordlist)

    print "Train transfer"
    linmap.train_mappings(msim.w2v_vecs,
                          word_ind,
                          Wsaia_train,
                          Xsaia_train,
                          split="_zeroshot_mixedpluralsplit")

    print "Train logwac"
    logwac.train_saia_nosamp(Xsaia_train,
                             Wsaia_train,
                             word_list=this_wordlist,
                             ssim="nouns_zeroshot_mixedpluralsplit")
def train_zero_hypern_saia_models():

    with open('../indata/saia_zeroshot_hypernsplit.json', 'r') as f:
        ttsplit = json.load(f)

    w2v = linwac.load_w2v()

    X = np.load('../indata/saiapr.npz')
    Xsaia = X['arr_0']
    print "Xsaia", Xsaia.shape

    W = np.load('../indata/saiapr_wmat.npz')
    Wsaia = W['arr_0']
    print "Wsaia", Wsaia.shape

    Xsaia_train = Xsaia[ttsplit['train']]
    Xsaia_test = Xsaia[ttsplit['test']]

    Wsaia_t = Wsaia.transpose()
    Wsaia_t.shape
    Wsaia_t_train = Wsaia_t[ttsplit['train']]
    Wsaia_train = Wsaia_t_train.transpose()

    print ttsplit['nouns']

    print "Train linwac"
    this_wordlist = noun_list + [
        n for n in ttsplit['nouns'] if not n in noun_list
    ]
    print "Wordlist", len(this_wordlist)
    linwac.train_all_nouns(Wsaia_train,
                           Xsaia_train,
                           w2v,
                           ssim="_zeroshot_hypernsplit",
                           word_list=this_wordlist)

    print "Train transfer"
    linmap.train_mappings(msim.w2v_vecs,
                          noun_ind,
                          Wsaia_train,
                          Xsaia_train,
                          split="_zeroshot_hypernsplit")

    print "Train logwac"
    logwac.train_saia_nosamp(Xsaia_train,
                             Wsaia_train,
                             word_list=noun_list,
                             ssim="nouns_zeroshot_hypernsplit")
def train_zero_saia_models(word_list=noun_list):

    with open('../indata/saia_zeroshot_nounsplits.json', 'r') as f:
        ttsplit = json.load(f)

    w2v = linwac.load_w2v()

    X = np.load('../indata/saiapr.npz')
    Xsaia = X['arr_0']
    print "Xsaia", Xsaia.shape

    W = np.load('../indata/saiapr_wmat.npz')
    Wsaia = W['arr_0']
    print "Wsaia", Wsaia.shape

    for x, spl in enumerate(ttsplit):
        print "SPLIT", x

        if x > 0:

            Xsaia_train = Xsaia[spl['train']]
            Xsaia_test = Xsaia[spl['test']]

            Wsaia_t = Wsaia.transpose()
            Wsaia_t.shape

            Wsaia_t_train = Wsaia_t[spl['train']]
            Wsaia_train = Wsaia_t_train.transpose()

            print "Train linwac"
            linwac.train_all_nouns(Wsaia_train,
                                   Xsaia_train,
                                   w2v,
                                   ssim="_zeroshot_split" + str(x))

            print "Train transfer"
            linmap.train_mappings(msim.w2v_vecs,
                                  noun_ind,
                                  Wsaia_train,
                                  Xsaia_train,
                                  split="_zeroshot_split" + str(x))

            print "Train logwac"
            logwac.train_saia_nosamp(Xsaia_train,
                                     Wsaia_train,
                                     word_list=noun_list,
                                     ssim="nouns_zeroshot_split" + str(x))
# Exemple #5
def make_results_randomsplits():

    with gzip.open('../indata/saia_zeroshot_nounsplits_testsets.pklz',
                   'r') as f:
        testsets = pickle.load(f)
    with open('../indata/saia_zeroshot_nounsplits.json', 'r') as f:
        ttsplit = json.load(f)

    print "testsets", len(testsets)

    w2v = linwac.load_w2v()
    nouns_w2v_mat = np.array([msim.w2v_vecs[msim.word2ind[n]] for n in NOUNS])
    print "W2v matrix", nouns_w2v_mat.shape

    mapmodels = [
        '../linmodels/linmap_nouns_zeroshot_split' + str(x) + '.pklz'
        for x in range(10)
    ]
    linmodels = [
        '../linmodels/linwac_nouns_w2v_zeroshot_split' + str(x) + '.pklz'
        for x in range(10)
    ]
    logmodels = [
        '../logmodels/logwac_saia_nouns_zeroshot_split' + str(x) +
        '_nosamp.pklz' for x in range(10)
    ]

    print "*****Eval mapmodels"
    res1 = eval_map_models(testsets[:10], ttsplit[:10], mapmodels,
                           nouns_w2v_mat, NOUNS)
    print "*****Eval logmodels"
    res2 = eval_log_models(testsets[:10], ttsplit[:10], logmodels,
                           nouns_w2v_mat, NOUNS)
    print "*****Eval linmodels"
    res3 = eval_lin_models(testsets[:10], ttsplit[:10], linmodels,
                           nouns_w2v_mat, NOUNS)

    results = []
    for (x, y, a, b, c, d) in res1 + res2 + res3:
        results.append((x, y, "%.2f" % (a * 100), "%.2f" % (b * 100),
                        "%.2f" % (c * 100), "%.2f" % (d * 100)))

    df = pd.DataFrame(results,
                      columns=['testset', 'model', '@1', '@2', '@5', '@10'])

    print df.to_latex(index=False)
def train_standard_plural_saia_models():

    with open('../indata/saia_standard_pluralsplit.json', 'r') as f:
        ttsplit = json.load(f)

    print "Nouns", len(ttsplit['nouns'])

    w2v = linwac.load_w2v()

    Xsaia_t, Wsaia_t = linwac.load_saia_train()

    Xsaia_train = Xsaia_t[ttsplit['train']]

    Wsaia_tt = Wsaia_t.transpose()
    print Wsaia_tt.shape
    Wsaia_t_train = Wsaia_tt[ttsplit['train']]
    Wsaia_train = Wsaia_t_train.transpose()

    print "Train linwac"
    this_wordlist = ttsplit['nouns']
    word_ind = [msim.word2ind[n] for n in this_wordlist]

    print "Wordlist", len(this_wordlist)
    linwac.train_all_nouns(Wsaia_train,
                           Xsaia_train,
                           w2v,
                           ssim="_standard_pluralsplit",
                           word_list=this_wordlist)

    print "Train transfer"
    linmap.train_mappings(msim.w2v_vecs,
                          word_ind,
                          Wsaia_train,
                          Xsaia_train,
                          split="_standard_pluralsplit")

    print "Train logwac"
    logwac.train_saia_nosamp(Xsaia_train,
                             Wsaia_train,
                             word_list=this_wordlist,
                             ssim="nouns_standard_pluralsplit")
def train_zero_refcoco_models():

    long_noun_list = [
        l.strip() for l in open('noun_list_long.txt').readlines()
    ]
    long_noun_ind = [msim.word2ind[n] for n in long_noun_list]

    with open('../indata/refcoco_zeroshot_nounsplits.json', 'r') as f:
        ttsplit = json.load(f)

    w2v = linwac.load_w2v()

    X = np.load('../indata/mscoco.npz')
    Xcoco = X['arr_0']
    print "Xcoco", Xcoco.shape

    W = np.load('../indata/mscoco_refcoco_wmat.npz')
    Wcoco = W['arr_0']
    print "Wcoco", Wcoco.shape

    total = 0
    for rx in range(Wcoco.shape[1]):
        if len(np.nonzero(Wcoco[:, rx])[0]):
            total += 1
    print "Total", total

    with gzip.open('../indata/refcoco_refdf.pklz', 'r') as f:
        refdf = pickle.load(f)

    print "Regionids", len(set(refdf['region_id']))

    for x, spl in enumerate(ttsplit):
        print "SPLIT", x

        if x == 2:

            Xcoco_train = Xcoco[spl['train']]
            Xcoco_test = Xcoco[spl['test']]

            print "Xcoco", Xcoco_train.shape

            Wcoco_t = Wcoco.transpose()
            Wcoco_t.shape
            Wcoco_t_train = Wcoco_t[spl['train']]
            Wcoco_train = Wcoco_t_train.transpose()

            print "Wcoco", Wcoco_train.shape

            total = 0
            for rx in range(Wcoco_train.shape[1]):
                if len(np.nonzero(Wcoco_train[:, rx])[0]):
                    total += 1
            print "Total", total

            print "Train linwac"
            linwac.train_vocab_abssample(Wcoco_train,Xcoco_train,w2v,\
                nsamp=Xcoco_train.shape[0],scorp="rcoco",ssim="w2v_zeroshot_split"+str(x),word_list=long_noun_list)
            #linwac.train_all_nouns(Wcoco_train,Xcoco_train,w2v,ssim="refcoco_w2v_zeroshot_split"+str(x),word_list=long_noun_list)

            print "Train transfer"
            linmap.train_mappings(msim.w2v_vecs,
                                  long_noun_ind,
                                  Wcoco_train,
                                  Xcoco_train,
                                  split="_refcoco_zeroshot_split" + str(x))

            print "Train logwac"
            logwac.train_saia_nosamp(Xsaia_train,
                                     Wsaia_train,
                                     word_list=noun_list,
                                     ssim="nouns_zeroshot_split" + str(x))