Exemple #1
0
with_lsa = False    

# load vector spaces    
if with_lsa:
    tl.nyt_sim = tl.Sim('_vsm_data/nyt_words.txt', '_vsm_data/nyt_word_vectors.txt')
    tl.wiki_sim = tl.Sim('_vsm_data/wikipedia_words.txt', '_vsm_data/wikipedia_word_vectors.txt')

# create training instances
train_dir = "../../data/STS2012-train"

for data in "MSRpar", "MSRvid", "SMTeuroparl":
    out_fname = "_npz_data/_STS2012.train.{}.npz".format(data)
    sys.stderr.write("creating {}\n".format(out_fname))
    tl.generate_features("{}/STS.input.{}.txt".format(train_dir, data),
                         "{}/STS.gs.{}.txt".format(train_dir, data),
                         outf=out_fname, 
                         out_format="numpy", 
                         with_lsa=with_lsa)
    
# create test instances
test_dir =  "../../data/STS2012-test"  

for data in "MSRpar", "MSRvid", "SMTeuroparl", "surprise.OnWN", "surprise.SMTnews":
    out_fname = "_npz_data/_STS2012.test.{}.npz".format(data)
    sys.stderr.write("creating {}\n".format(out_fname))
    tl.generate_features("{}/STS.input.{}.txt".format(test_dir, data),
                         "{}/STS.gs.{}.txt".format(test_dir, data),
                         outf=out_fname, 
                         out_format="numpy", 
                         with_lsa=with_lsa)
#!/usr/bin/python

"""
make features for part of hlines
"""

# NB I ran a variant of this on the Translate server

import sys

sys.path.append("./lib/python")

import takelab.simpfeats as tl


# load word counts for IC weighting
tl.wweight = tl.load_wweight_table("../wordfreq/_wordfreq_hlines.txt")
tl.minwweight = min(tl.wweight.values())

for in_fname in sys.argv[1:]:
    out_fname = in_fname + ".npz"
    sys.stderr.write("creating {}\n".format(out_fname))
    tl.generate_features(in_fname,
                         outf=out_fname, 
                         out_format="numpy", 
                         with_lsa=False)
    
#!/usr/bin/python

"""
make features for part of hlines
"""

# NB I ran a variant of this on the Translate server

import sys

sys.path.append("./lib/python")

import takelab.simpfeats as tl


# load word counts for IC weighting
tl.wweight = tl.load_wweight_table("../wordfreq/_wordfreq_hlines.txt")
tl.minwweight = min(tl.wweight.values())

for in_fname in sys.argv[1:]:
    out_fname = in_fname + ".npz"
    sys.stderr.write("creating {}\n".format(out_fname))
    tl.generate_features(in_fname, outf=out_fname, out_format="numpy", with_lsa=False)