Example #1
0
def filter_by_voc(hh_fpath, voc_fpath, output_fpath, both_in_voc=False):
    """Filter a hyponym-hypernym TSV by a vocabulary.

    Reads ``hh_fpath`` (TSV with columns hyponym, hypernym, freq) and writes
    to ``output_fpath`` only the rows whose words occur in the vocabulary
    loaded from ``voc_fpath``.

    Args:
        hh_fpath: path to the input hyponym-hypernym TSV file.
        voc_fpath: path to the vocabulary file, loaded via ``load_voc``.
        output_fpath: path of the filtered TSV to write (UTF-8).
        both_in_voc: if True, keep a row only when BOTH hyponym and hypernym
            are in the vocabulary; if False, one match is enough.
    """
    with codecs.open(output_fpath, "w", "utf-8") as out:
        print("hyponym\thypernym\tfreq", file=out)
        voc = load_voc(voc_fpath,
                       preprocess=True,
                       sep='\t',
                       use_pickle=True,
                       silent=False)

        hh_df = read_csv(hh_fpath,
                         encoding='utf-8',
                         delimiter="\t",
                         error_bad_lines=False,
                         low_memory=False)

        for i, row in hh_df.iterrows():
            try:
                if i % 100000 == 0: print(i)  # progress indicator
                # Single membership decision instead of two duplicated
                # print branches: AND when both_in_voc, OR otherwise.
                if both_in_voc:
                    keep = row.hyponym in voc and row.hypernym in voc
                else:
                    keep = row.hyponym in voc or row.hypernym in voc
                if keep:
                    print("%s\t%s\t%d" %
                          (row.hyponym, row.hypernym, row.freq),
                          file=out)
            except Exception:
                # Narrowed from a bare `except:` so KeyboardInterrupt and
                # SystemExit are no longer swallowed; malformed rows are
                # logged and skipped (best-effort behavior preserved).
                print("Bad row:", row)
                print(format_exc())

        print("Output:", output_fpath)
Example #2
0
def filter_by_voc(hh_fpath, voc_fpath, output_fpath, both_in_voc=False):
    # Python 2 variant of filter_by_voc (print statements, not functions).
    # Writes rows of the hyponym-hypernym TSV at hh_fpath to output_fpath,
    # keeping a row when its words occur in the vocabulary from voc_fpath:
    #   both_in_voc=True  -> hyponym AND hypernym must both be in voc
    #   both_in_voc=False -> at least one of the two must be in voc
    with codecs.open(output_fpath, "w", "utf-8") as out:
        print >> out, "hyponym\thypernym\tfreq"  # TSV header
        voc = load_voc(voc_fpath, preprocess=True, sep='\t', use_pickle=True, silent=False)

        hh_df = read_csv(hh_fpath, encoding='utf-8', delimiter="\t", error_bad_lines=False, low_memory=False)

        for i, row in hh_df.iterrows():
            try:
                if i % 100000 == 0: print i  # progress indicator
                if both_in_voc:
                    if row.hyponym in voc and row.hypernym in voc: print >> out, "%s\t%s\t%d" % (row.hyponym, row.hypernym, row.freq)
                else:
                    if row.hyponym in voc or row.hypernym in voc: print >> out, "%s\t%s\t%d" % (row.hyponym, row.hypernym, row.freq)
            except:
                # NOTE(review): bare except also swallows KeyboardInterrupt /
                # SystemExit; consider narrowing to Exception.
                print "Bad row:", row
                print format_exc()

        print "Output:", output_fpath
def filter_ddt_by_voc(ddt_fpath, voc_fpath, ddt_filtered_fpath):
    # Filter a gzipped DDT file (tab-separated, word in the first field),
    # writing only the lines whose first field occurs in the vocabulary
    # loaded from voc_fpath. Python 2 code (print statements).

    # ddt_fpath = "/Users/alex/tmp/matching/ddt-adagram-ukwac+wacky.csv.gz"
    # voc_fpath = "/Users/alex/work/joint/src/data/ambigous-words.csv"
    # ddt_filtered_fpath = ddt_fpath + ".voc"

    voc = load_voc(voc_fpath)

    with codecs.open(ddt_filtered_fpath, "w", "utf-8") as out:
        num = 0           # number of sense lines written
        found_voc = set() # distinct vocabulary words actually seen
        # NOTE(review): gzip.open's third positional argument is
        # compresslevel, not an encoding — "utf-8" here is almost certainly
        # a mistake (harmless for reading, but misleading); confirm.
        for i, line in enumerate(gzip.open(ddt_fpath, "rb", "utf-8")):
            if i % 100000 == 0: print i, num  # progress indicator
            f = line.split("\t")
            # NOTE(review): str.split always returns >= 1 element, so this
            # guard can never fire; likely meant len(f) < 2.
            if len(f) < 1: continue
            if f[0] in voc:
                num += 1
                found_voc.add(f[0])
                out.write(line)

    # NOTE(review): `i` is undefined here if the input file is empty.
    print "Input processed:", i
    print "Words found:", len(found_voc), "of", len(voc)
    print "Senses written:", num
    print "Filtered by vocabulary DDT:", ddt_filtered_fpath
Example #4
0
def filter_ddt_by_voc(ddt_fpath, voc_fpath, ddt_filtered_fpath):
    # Duplicate of filter_ddt_by_voc: filters a gzipped DDT file, keeping
    # only lines whose first tab-separated field occurs in the vocabulary
    # loaded from voc_fpath. Python 2 code (print statements).

    # ddt_fpath = "/Users/alex/tmp/matching/ddt-adagram-ukwac+wacky.csv.gz"
    # voc_fpath = "/Users/alex/work/joint/src/data/ambigous-words.csv"
    # ddt_filtered_fpath = ddt_fpath + ".voc"

    voc = load_voc(voc_fpath)

    with codecs.open(ddt_filtered_fpath, "w", "utf-8") as out:
        num = 0           # number of sense lines written
        found_voc = set() # distinct vocabulary words actually seen
        # NOTE(review): gzip.open's third positional argument is
        # compresslevel, not an encoding — "utf-8" here looks like a
        # mistake (harmless when reading, but misleading); confirm.
        for i, line in enumerate(gzip.open(ddt_fpath, "rb", "utf-8")):
            if i % 100000 == 0: print i, num  # progress indicator
            f = line.split("\t")
            # NOTE(review): str.split always returns >= 1 element, so this
            # guard never fires; probably intended len(f) < 2.
            if len(f) < 1: continue
            if f[0] in voc:
                num += 1
                found_voc.add(f[0])
                out.write(line)

    # NOTE(review): `i` is undefined here if the input file is empty.
    print "Input processed:", i
    print "Words found:", len(found_voc), "of", len(voc)
    print "Senses written:", num
    print "Filtered by vocabulary DDT:", ddt_filtered_fpath
Example #5
0
import argparse
from os.path import splitext
from os.path import join
from jnt.common import exists
from subprocess import Popen, PIPE
import os
from os.path import splitext
from jnt.morph import get_stoplist
from jnt.patterns import re_number

# Hard-coded, user-specific resource paths (would normally come from config).
ADAGRAM_VOC = "/Users/alex/tmp/adagram/HugeModel-voc.csv"
DEFAULT_MAPPING = "/Users/alex/work/joint/src/data/best-matching-out.csv"
DYLD_LIBRARY = "/Users/alex/tmp/adagram/AdaGram.jl/lib/"
ADAGRAM_SCRIPTS_DIR = "/Users/alex/work/joint/src/jnt/adagram/"

# Module-level side effects: the AdaGram vocabulary and the stoplist are
# loaded once at import time and shared by the functions below.
_adagram_voc = load_voc(ADAGRAM_VOC, silent=True)
_stoplist = get_stoplist()


def filter_voc(text):
    """Return *text* reduced to the space-separated tokens present in the
    AdaGram vocabulary, lower-cased and re-joined with single spaces."""
    kept = []
    for token in text.split(" "):
        # NOTE: membership is checked on the original token, while the
        # lower-cased form is what gets emitted.
        if token in _adagram_voc:
            kept.append(token.lower())
    return " ".join(kept)


# Markers used to delimit the target word inside a context string.
TARGET_BEG = "((("
TARGET_END = ")))"


def filter_context(context, target, remove_target, context_size):
    context = [
        w for w in context.split(" ")
Example #6
0
from jnt.common import load_voc
import codecs 
from jnt.matching.synset_fetchers import BabelNet, BABELNET_KEYS
from jnt.common import take

# Cap on the number of bag-of-words tokens written per sense.
MAX_WORDS = 999

# Hard-coded, user-specific input/output paths.
voc_fpath = "/Users/alex/work/joint/src/data/ambigous-words-mine.csv"
output_fpath = voc_fpath + "-babelnet.csv"
babelnet_dir = "/Users/alex/tmp/matching/babelnet-eval/"
adagram_voc_fpath = "/Users/alex/tmp/adagram/HugeModel-voc.csv"


# Module-level side effects: builds a BabelNet client and loads the AdaGram
# and target-word vocabularies at import time.
babelnet = BabelNet(babelnet_keys=BABELNET_KEYS, babelnet_fpath=babelnet_dir,
                    freq_fpath="", divide_by_freq=False)
adagram_voc = load_voc(adagram_voc_fpath)
voc = load_voc(voc_fpath)


# For every target word, fetch its BabelNet senses and write one TSV line
# per sense: word, sense id, and the sense's bag-of-words tokens that also
# occur in the AdaGram vocabulary (excluding the word itself), ordered by
# descending weight and capped at MAX_WORDS.
with codecs.open(output_fpath, "w", "utf-8") as out:
    for word in voc:
        senses = babelnet.get_senses(word)
        for sense_id, bow in senses:
            bow_words = []
            for w in sorted(bow, key=bow.get, reverse=True):
                if w in adagram_voc and w != word:
                    bow_words.append(w) 
            out.write("%s\t%s\t%s\n" % (word, sense_id, ' '.join(take(MAX_WORDS,bow_words))))
        
print "Output:", output_fpath  # Python 2 print statement
from os.path import splitext
from os.path import join
from jnt.common import exists
from subprocess import Popen, PIPE
import os
from os.path import splitext
from jnt.morph import get_stoplist
from jnt.patterns import re_number


# Hard-coded, user-specific resource paths (would normally come from config).
ADAGRAM_VOC = "/Users/alex/tmp/adagram/HugeModel-voc.csv"
DEFAULT_MAPPING = "/Users/alex/work/joint/src/data/best-matching-out.csv"
DYLD_LIBRARY = "/Users/alex/tmp/adagram/AdaGram.jl/lib/"
ADAGRAM_SCRIPTS_DIR = "/Users/alex/work/joint/src/jnt/adagram/"

# Module-level side effects: vocabulary and stoplist are loaded once at
# import time and shared by the functions below.
_adagram_voc = load_voc(ADAGRAM_VOC, silent=True)
_stoplist = get_stoplist()


def filter_voc(text):
    """Drop all space-separated tokens of *text* that are not in the AdaGram
    vocabulary; surviving tokens are lower-cased and joined with spaces."""
    surviving = [
        token.lower()
        for token in text.split(" ")
        # membership is tested on the original casing; output is lower-cased
        if token in _adagram_voc
    ]
    return " ".join(surviving)


# Markers used to delimit the target word inside a context string.
TARGET_BEG = "((("
TARGET_END = ")))"


def filter_context(context, target, remove_target, context_size):
    context = [w for w in context.split(" ") if w.strip() != "" and w not in _stoplist and not re_number.match(w)]
    if remove_target: