#!/usr/bin/python
"""
Generate Takelab features for part of the hlines data.

For every input filename given on the command line, writes a
corresponding "<input>.npz" feature file in numpy format (LSA
features disabled).
"""
# NB I ran a variant of this on the Translate server

import sys

sys.path.append("./lib/python")

import takelab.simpfeats as tl

# Word counts for information-content weighting; the minimum weight is
# kept as the fallback for out-of-vocabulary words.
tl.wweight = tl.load_wweight_table("../wordfreq/_wordfreq_hlines.txt")
tl.minwweight = min(tl.wweight.values())

for input_fname in sys.argv[1:]:
    npz_fname = input_fname + ".npz"
    sys.stderr.write("creating {}\n".format(npz_fname))
    tl.generate_features(
        input_fname,
        outf=npz_fname,
        out_format="numpy",
        with_lsa=False,
    )
#!/usr/bin/env python """ make features for STS training and test data for use with NTNU system """ import sys from os.path import join, exists from os import makedirs import takelab.simpfeats as tl from sts import sts12, sts13 # load word counts for IC weighting tl.wweight = tl.load_wweight_table("../_data/wordfreq/wordfreq-STS.txt") tl.minwweight = min(tl.wweight.values()) # load vector spaces tl.nyt_sim = tl.Sim('../_data/lsa-matrices/nyt-words-sts.txt', '../_data/lsa-matrices/nyt-matrix-sts.txt') tl.wiki_sim = tl.Sim('../_data/lsa-matrices/wiki-words-sts.txt', '../_data/lsa-matrices/wiki-matrix-sts.txt') with_lsa = True dest_dir = "../out/STS2012-train" for data_id in sts12.train_ids:
#!/usr/bin/env python """ make features for STS12 training and test data """ import sys import takelab.simpfeats as tl # load word counts for IC weighting tl.wweight = tl.load_wweight_table("../wordfreq/_wordfreq-STS2012.txt") tl.minwweight = min(tl.wweight.values()) with_lsa = False # load vector spaces if with_lsa: tl.nyt_sim = tl.Sim('_vsm_data/nyt_words.txt', '_vsm_data/nyt_word_vectors.txt') tl.wiki_sim = tl.Sim('_vsm_data/wikipedia_words.txt', '_vsm_data/wikipedia_word_vectors.txt') # create training instances train_dir = "../../data/STS2012-train" for data in "MSRpar", "MSRvid", "SMTeuroparl": out_fname = "_npz_data/_STS2012.train.{}.npz".format(data) sys.stderr.write("creating {}\n".format(out_fname)) tl.generate_features("{}/STS.input.{}.txt".format(train_dir, data), "{}/STS.gs.{}.txt".format(train_dir, data), outf=out_fname,
make Takelab's features for STS training and test data """
# NOTE(review): the opening docstring delimiter (and possibly a shebang)
# lies before the visible source — the line above is the tail of the
# module docstring.
import os
from os.path import join, exists
from os import makedirs
import sts
import takelab.simpfeats as tl

# requires Takelab LSA models
TL_DATA_DIR = "_data"

with_lsa = True

# load word counts for IC weighting; the minimum weight serves as the
# fallback for words missing from the table
tl.wweight = tl.load_wweight_table(os.path.join(TL_DATA_DIR, "wordfreq/wordfreq-STS.txt"))
tl.minwweight = min(tl.wweight.values())

if with_lsa:
    # load vector spaces (LSA word/matrix file pairs for NYT and Wikipedia)
    tl.nyt_sim = tl.Sim(
        os.path.join(TL_DATA_DIR, "lsa-matrices/nyt-words.txt"),
        os.path.join(TL_DATA_DIR, "lsa-matrices/nyt-matrix.txt"),
    )
    tl.wiki_sim = tl.Sim(
        os.path.join(TL_DATA_DIR, "lsa-matrices/wiki-words.txt"),
        os.path.join(TL_DATA_DIR, "lsa-matrices/wiki-matrix.txt"),
    )

# NOTE(review): the body of this function is truncated in this view —
# it continues past the visible source
def make_feats(ids2fnames, dest_dir, with_lsa=True):