def filter_by_voc(hh_fpath, voc_fpath, output_fpath, both_in_voc=False):
    """Filter hyponym-hypernym pairs by a vocabulary and write the kept rows as TSV.

    Args:
        hh_fpath: path to a TSV file with 'hyponym', 'hypernym', 'freq' columns.
        voc_fpath: path to the vocabulary file consumed by ``load_voc``.
        output_fpath: path of the filtered TSV output (with header).
        both_in_voc: if True keep a row only when BOTH words are in the
            vocabulary; otherwise keep it when at least one of them is.
    """
    with codecs.open(output_fpath, "w", "utf-8") as out:
        print("hyponym\thypernym\tfreq", file=out)
        voc = load_voc(voc_fpath, preprocess=True, sep='\t', use_pickle=True, silent=False)
        hh_df = read_csv(hh_fpath, encoding='utf-8', delimiter="\t",
                         error_bad_lines=False, low_memory=False)

        for i, row in hh_df.iterrows():
            try:
                if i % 100000 == 0: print(i)  # progress indicator for large inputs
                if both_in_voc:
                    keep = row.hyponym in voc and row.hypernym in voc
                else:
                    keep = row.hyponym in voc or row.hypernym in voc
                if keep:
                    print("%s\t%s\t%d" % (row.hyponym, row.hypernym, row.freq), file=out)
            except Exception:
                # Best-effort: log malformed rows and continue instead of aborting.
                # (Was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit.)
                print("Bad row:", row)
                print(format_exc())

    print("Output:", output_fpath)
def filter_by_voc(hh_fpath, voc_fpath, output_fpath, both_in_voc=False):
    """Filter hyponym-hypernym pairs by a vocabulary and write the kept rows as TSV.

    Args:
        hh_fpath: path to a TSV file with 'hyponym', 'hypernym', 'freq' columns.
        voc_fpath: path to the vocabulary file consumed by ``load_voc``.
        output_fpath: path of the filtered TSV output (with header).
        both_in_voc: if True keep a row only when BOTH words are in the
            vocabulary; otherwise keep it when at least one of them is.
    """
    # NOTE(review): modernized from Python 2 `print >> out` statements to the
    # Python 3 print function, matching the py3 variant of this function.
    with codecs.open(output_fpath, "w", "utf-8") as out:
        print("hyponym\thypernym\tfreq", file=out)
        voc = load_voc(voc_fpath, preprocess=True, sep='\t', use_pickle=True, silent=False)
        hh_df = read_csv(hh_fpath, encoding='utf-8', delimiter="\t",
                         error_bad_lines=False, low_memory=False)

        for i, row in hh_df.iterrows():
            try:
                if i % 100000 == 0: print(i)  # progress indicator for large inputs
                if both_in_voc:
                    keep = row.hyponym in voc and row.hypernym in voc
                else:
                    keep = row.hyponym in voc or row.hypernym in voc
                if keep:
                    print("%s\t%s\t%d" % (row.hyponym, row.hypernym, row.freq), file=out)
            except Exception:
                # Best-effort: log malformed rows and continue instead of aborting.
                print("Bad row:", row)
                print(format_exc())

    print("Output:", output_fpath)
def filter_ddt_by_voc(ddt_fpath, voc_fpath, ddt_filtered_fpath):
    """Keep only the DDT (sense inventory) lines whose first field is in a vocabulary.

    Args:
        ddt_fpath: path to a gzipped, tab-separated DDT file; the word is field 0.
        voc_fpath: path to the vocabulary file consumed by ``load_voc``.
        ddt_filtered_fpath: output path for the filtered DDT lines.
    """
    voc = load_voc(voc_fpath)

    with codecs.open(ddt_filtered_fpath, "w", "utf-8") as out:
        num = 0
        found_voc = set()
        # "rt" + encoding decodes the gzipped stream as UTF-8 text.  The original
        # called gzip.open(ddt_fpath, "rb", "utf-8"), passing "utf-8" as the third
        # positional argument, which is `compresslevel`, not an encoding — and it
        # never closed the handle.  The `with` block fixes both.
        with gzip.open(ddt_fpath, "rt", encoding="utf-8") as ddt:
            for i, line in enumerate(ddt):
                if i % 100000 == 0: print(i, num)  # progress indicator
                f = line.split("\t")
                if len(f) < 1: continue  # defensive; str.split always yields >= 1 field
                if f[0] in voc:
                    num += 1
                    found_voc.add(f[0])
                    out.write(line)

    print("Input processed:", i)
    print("Words found:", len(found_voc), "of", len(voc))
    print("Senses written:", num)
    print("Filtered by vocabulary DDT:", ddt_filtered_fpath)
import argparse
from os.path import splitext
from os.path import join
from jnt.common import exists
from subprocess import Popen, PIPE
import os
from os.path import splitext  # NOTE(review): duplicate of the import above
from jnt.morph import get_stoplist
from jnt.patterns import re_number

# Paths to local AdaGram resources — machine-specific absolute paths;
# presumably meant to be overridden per installation — TODO confirm.
ADAGRAM_VOC = "/Users/alex/tmp/adagram/HugeModel-voc.csv"
DEFAULT_MAPPING = "/Users/alex/work/joint/src/data/best-matching-out.csv"
DYLD_LIBRARY = "/Users/alex/tmp/adagram/AdaGram.jl/lib/"
ADAGRAM_SCRIPTS_DIR = "/Users/alex/work/joint/src/jnt/adagram/"

# Module-level resources loaded once at import time.
# NOTE(review): `load_voc` is not imported in this chunk — presumably
# `from jnt.common import load_voc` appears elsewhere in the file; confirm.
_adagram_voc = load_voc(ADAGRAM_VOC, silent=True)
_stoplist = get_stoplist()

def filter_voc(text):
    """Keep only space-separated tokens of *text* present in the AdaGram
    vocabulary, lowercased.

    NOTE(review): membership is tested on the original-case token but the
    lowercased form is emitted — verify this asymmetry is intentional.
    """
    text_adagram = [w.lower() for w in text.split(" ") if w in _adagram_voc]
    return " ".join(text_adagram)

# Markers used to delimit the target word inside a context string.
TARGET_BEG = "((("
TARGET_END = ")))"

# NOTE(review): definition truncated in this chunk — the body continues
# beyond the visible source; left verbatim rather than guessed at.
def filter_context(context, target, remove_target, context_size): context = [ w for w in context.split(" ")
from jnt.common import load_voc
import codecs
from jnt.matching.synset_fetchers import BabelNet, BABELNET_KEYS
from jnt.common import take

# Cap on the number of bag-of-words items written per sense.
MAX_WORDS = 999

# Machine-specific input/output paths.
voc_fpath = "/Users/alex/work/joint/src/data/ambigous-words-mine.csv"
output_fpath = voc_fpath + "-babelnet.csv"
babelnet_dir = "/Users/alex/tmp/matching/babelnet-eval/"
adagram_voc_fpath = "/Users/alex/tmp/adagram/HugeModel-voc.csv"

babelnet = BabelNet(babelnet_keys=BABELNET_KEYS, babelnet_fpath=babelnet_dir,
                    freq_fpath="", divide_by_freq=False)
adagram_voc = load_voc(adagram_voc_fpath)
voc = load_voc(voc_fpath)

# For every ambiguous word, dump each BabelNet sense as
# "word<TAB>sense_id<TAB>bow-words", keeping only bag-of-words entries that
# are in the AdaGram vocabulary (and are not the word itself), ranked by weight.
with codecs.open(output_fpath, "w", "utf-8") as out:
    for word in voc:
        senses = babelnet.get_senses(word)
        for sense_id, bow in senses:
            bow_words = [w for w in sorted(bow, key=bow.get, reverse=True)
                         if w in adagram_voc and w != word]
            out.write("%s\t%s\t%s\n" % (word, sense_id, ' '.join(take(MAX_WORDS, bow_words))))

# NOTE(review): modernized from the Python 2 `print "Output:", ...` statement.
print("Output:", output_fpath)
from os.path import splitext
from os.path import join
from jnt.common import exists
from subprocess import Popen, PIPE
import os
from os.path import splitext  # NOTE(review): duplicate of the import above
from jnt.morph import get_stoplist
from jnt.patterns import re_number

# Paths to local AdaGram resources — machine-specific absolute paths;
# presumably meant to be overridden per installation — TODO confirm.
ADAGRAM_VOC = "/Users/alex/tmp/adagram/HugeModel-voc.csv"
DEFAULT_MAPPING = "/Users/alex/work/joint/src/data/best-matching-out.csv"
DYLD_LIBRARY = "/Users/alex/tmp/adagram/AdaGram.jl/lib/"
ADAGRAM_SCRIPTS_DIR = "/Users/alex/work/joint/src/jnt/adagram/"

# Module-level resources loaded once at import time.
# NOTE(review): `load_voc` is not imported in this chunk — presumably
# `from jnt.common import load_voc` appears elsewhere in the file; confirm.
_adagram_voc = load_voc(ADAGRAM_VOC, silent=True)
_stoplist = get_stoplist()

def filter_voc(text):
    """Keep only space-separated tokens of *text* present in the AdaGram
    vocabulary, lowercased.

    NOTE(review): membership is tested on the original-case token but the
    lowercased form is emitted — verify this asymmetry is intentional.
    """
    text_adagram = [w.lower() for w in text.split(" ") if w in _adagram_voc]
    return " ".join(text_adagram)

# Markers used to delimit the target word inside a context string.
TARGET_BEG = "((("
TARGET_END = ")))"

# Drops empty tokens, stopwords and numbers from the context.
# NOTE(review): definition truncated in this chunk — the body continues
# beyond the visible source; left verbatim rather than guessed at.
def filter_context(context, target, remove_target, context_size):
    context = [w for w in context.split(" ") if w.strip() != "" and w not in _stoplist and not re_number.match(w)]
    if remove_target: