Example #1
0
def filter_by_voc(hh_fpath, voc_fpath, output_fpath, both_in_voc=False):
    """Filter a hyponym-hypernym TSV by a vocabulary.

    Reads ``hh_fpath`` (TSV with columns hyponym, hypernym, freq) and writes
    to ``output_fpath`` only the rows whose words occur in the vocabulary
    loaded from ``voc_fpath``.

    Args:
        hh_fpath: path to the input hyponym-hypernym TSV file.
        voc_fpath: path to the vocabulary file, loaded via ``load_voc``.
        output_fpath: path of the filtered TSV to write (UTF-8).
        both_in_voc: if True, keep a row only when BOTH hyponym and hypernym
            are in the vocabulary; if False, one match is enough.
    """
    with codecs.open(output_fpath, "w", "utf-8") as out:
        print("hyponym\thypernym\tfreq", file=out)
        voc = load_voc(voc_fpath,
                       preprocess=True,
                       sep='\t',
                       use_pickle=True,
                       silent=False)

        hh_df = read_csv(hh_fpath,
                         encoding='utf-8',
                         delimiter="\t",
                         error_bad_lines=False,
                         low_memory=False)

        for i, row in hh_df.iterrows():
            try:
                if i % 100000 == 0: print(i)  # progress indicator
                # Single membership decision instead of two duplicated
                # print branches: AND when both_in_voc, OR otherwise.
                if both_in_voc:
                    keep = row.hyponym in voc and row.hypernym in voc
                else:
                    keep = row.hyponym in voc or row.hypernym in voc
                if keep:
                    print("%s\t%s\t%d" %
                          (row.hyponym, row.hypernym, row.freq),
                          file=out)
            except Exception:
                # Narrowed from a bare `except:` so KeyboardInterrupt and
                # SystemExit are no longer swallowed; malformed rows are
                # logged and skipped (best-effort behavior preserved).
                print("Bad row:", row)
                print(format_exc())

        print("Output:", output_fpath)
Example #2
0
def filter_by_voc(hh_fpath, voc_fpath, output_fpath, both_in_voc=False):
    # Python 2 variant of filter_by_voc (print statements, not functions).
    # Writes rows of the hyponym-hypernym TSV at hh_fpath to output_fpath,
    # keeping a row when its words occur in the vocabulary from voc_fpath:
    #   both_in_voc=True  -> hyponym AND hypernym must both be in voc
    #   both_in_voc=False -> at least one of the two must be in voc
    with codecs.open(output_fpath, "w", "utf-8") as out:
        print >> out, "hyponym\thypernym\tfreq"  # TSV header
        voc = load_voc(voc_fpath, preprocess=True, sep='\t', use_pickle=True, silent=False)

        hh_df = read_csv(hh_fpath, encoding='utf-8', delimiter="\t", error_bad_lines=False, low_memory=False)

        for i, row in hh_df.iterrows():
            try:
                if i % 100000 == 0: print i  # progress indicator
                if both_in_voc:
                    if row.hyponym in voc and row.hypernym in voc: print >> out, "%s\t%s\t%d" % (row.hyponym, row.hypernym, row.freq)
                else:
                    if row.hyponym in voc or row.hypernym in voc: print >> out, "%s\t%s\t%d" % (row.hyponym, row.hypernym, row.freq)
            except:
                # NOTE(review): bare except also swallows KeyboardInterrupt /
                # SystemExit; consider narrowing to Exception.
                print "Bad row:", row
                print format_exc()

        print "Output:", output_fpath
def filter_ddt_by_voc(ddt_fpath, voc_fpath, ddt_filtered_fpath):
    # Filter a gzipped DDT file (tab-separated, word in the first field),
    # writing only the lines whose first field occurs in the vocabulary
    # loaded from voc_fpath. Python 2 code (print statements).

    # ddt_fpath = "/Users/alex/tmp/matching/ddt-adagram-ukwac+wacky.csv.gz"
    # voc_fpath = "/Users/alex/work/joint/src/data/ambigous-words.csv"
    # ddt_filtered_fpath = ddt_fpath + ".voc"

    voc = load_voc(voc_fpath)

    with codecs.open(ddt_filtered_fpath, "w", "utf-8") as out:
        num = 0           # number of sense lines written
        found_voc = set() # distinct vocabulary words actually seen
        # NOTE(review): gzip.open's third positional argument is
        # compresslevel, not an encoding — "utf-8" here is almost certainly
        # a mistake (harmless for reading, but misleading); confirm.
        for i, line in enumerate(gzip.open(ddt_fpath, "rb", "utf-8")):
            if i % 100000 == 0: print i, num  # progress indicator
            f = line.split("\t")
            # NOTE(review): str.split always returns >= 1 element, so this
            # guard can never fire; likely meant len(f) < 2.
            if len(f) < 1: continue
            if f[0] in voc:
                num += 1
                found_voc.add(f[0])
                out.write(line)

    # NOTE(review): `i` is undefined here if the input file is empty.
    print "Input processed:", i
    print "Words found:", len(found_voc), "of", len(voc)
    print "Senses written:", num
    print "Filtered by vocabulary DDT:", ddt_filtered_fpath
Example #4
0
def filter_ddt_by_voc(ddt_fpath, voc_fpath, ddt_filtered_fpath):
    # Duplicate of filter_ddt_by_voc: filters a gzipped DDT file, keeping
    # only lines whose first tab-separated field occurs in the vocabulary
    # loaded from voc_fpath. Python 2 code (print statements).

    # ddt_fpath = "/Users/alex/tmp/matching/ddt-adagram-ukwac+wacky.csv.gz"
    # voc_fpath = "/Users/alex/work/joint/src/data/ambigous-words.csv"
    # ddt_filtered_fpath = ddt_fpath + ".voc"

    voc = load_voc(voc_fpath)

    with codecs.open(ddt_filtered_fpath, "w", "utf-8") as out:
        num = 0           # number of sense lines written
        found_voc = set() # distinct vocabulary words actually seen
        # NOTE(review): gzip.open's third positional argument is
        # compresslevel, not an encoding — "utf-8" here looks like a
        # mistake (harmless when reading, but misleading); confirm.
        for i, line in enumerate(gzip.open(ddt_fpath, "rb", "utf-8")):
            if i % 100000 == 0: print i, num  # progress indicator
            f = line.split("\t")
            # NOTE(review): str.split always returns >= 1 element, so this
            # guard never fires; probably intended len(f) < 2.
            if len(f) < 1: continue
            if f[0] in voc:
                num += 1
                found_voc.add(f[0])
                out.write(line)

    # NOTE(review): `i` is undefined here if the input file is empty.
    print "Input processed:", i
    print "Words found:", len(found_voc), "of", len(voc)
    print "Senses written:", num
    print "Filtered by vocabulary DDT:", ddt_filtered_fpath
Example #5
0
import argparse
from os.path import splitext
from os.path import join
from jnt.common import exists
from subprocess import Popen, PIPE
import os
from os.path import splitext
from jnt.morph import get_stoplist
from jnt.patterns import re_number

# Hard-coded, user-specific resource paths (would normally come from config).
ADAGRAM_VOC = "/Users/alex/tmp/adagram/HugeModel-voc.csv"
DEFAULT_MAPPING = "/Users/alex/work/joint/src/data/best-matching-out.csv"
DYLD_LIBRARY = "/Users/alex/tmp/adagram/AdaGram.jl/lib/"
ADAGRAM_SCRIPTS_DIR = "/Users/alex/work/joint/src/jnt/adagram/"

# Module-level side effects: the AdaGram vocabulary and the stoplist are
# loaded once at import time and shared by the functions below.
_adagram_voc = load_voc(ADAGRAM_VOC, silent=True)
_stoplist = get_stoplist()


def filter_voc(text):
    """Return *text* reduced to the space-separated tokens present in the
    AdaGram vocabulary, lower-cased and re-joined with single spaces."""
    kept = []
    for token in text.split(" "):
        # NOTE: membership is checked on the original token, while the
        # lower-cased form is what gets emitted.
        if token in _adagram_voc:
            kept.append(token.lower())
    return " ".join(kept)


# Markers used to delimit the target word inside a context string.
TARGET_BEG = "((("
TARGET_END = ")))"


def filter_context(context, target, remove_target, context_size):
    context = [
        w for w in context.split(" ")
Example #6
0
from jnt.common import load_voc
import codecs 
from jnt.matching.synset_fetchers import BabelNet, BABELNET_KEYS
from jnt.common import take

# Cap on the number of bag-of-words tokens written per sense.
MAX_WORDS = 999

# Hard-coded, user-specific input/output paths.
voc_fpath = "/Users/alex/work/joint/src/data/ambigous-words-mine.csv"
output_fpath = voc_fpath + "-babelnet.csv"
babelnet_dir = "/Users/alex/tmp/matching/babelnet-eval/"
adagram_voc_fpath = "/Users/alex/tmp/adagram/HugeModel-voc.csv"


# Module-level side effects: builds a BabelNet client and loads the AdaGram
# and target-word vocabularies at import time.
babelnet = BabelNet(babelnet_keys=BABELNET_KEYS, babelnet_fpath=babelnet_dir,
                    freq_fpath="", divide_by_freq=False)
adagram_voc = load_voc(adagram_voc_fpath)
voc = load_voc(voc_fpath)


# For every target word, fetch its BabelNet senses and write one TSV line
# per sense: word, sense id, and the sense's bag-of-words tokens that also
# occur in the AdaGram vocabulary (excluding the word itself), ordered by
# descending weight and capped at MAX_WORDS.
with codecs.open(output_fpath, "w", "utf-8") as out:
    for word in voc:
        senses = babelnet.get_senses(word)
        for sense_id, bow in senses:
            bow_words = []
            for w in sorted(bow, key=bow.get, reverse=True):
                if w in adagram_voc and w != word:
                    bow_words.append(w) 
            out.write("%s\t%s\t%s\n" % (word, sense_id, ' '.join(take(MAX_WORDS,bow_words))))
        
print "Output:", output_fpath  # Python 2 print statement
from os.path import splitext
from os.path import join
from jnt.common import exists
from subprocess import Popen, PIPE
import os
from os.path import splitext
from jnt.morph import get_stoplist
from jnt.patterns import re_number


# Hard-coded, user-specific resource paths (would normally come from config).
ADAGRAM_VOC = "/Users/alex/tmp/adagram/HugeModel-voc.csv"
DEFAULT_MAPPING = "/Users/alex/work/joint/src/data/best-matching-out.csv"
DYLD_LIBRARY = "/Users/alex/tmp/adagram/AdaGram.jl/lib/"
ADAGRAM_SCRIPTS_DIR = "/Users/alex/work/joint/src/jnt/adagram/"

# Module-level side effects: vocabulary and stoplist are loaded once at
# import time and shared by the functions below.
_adagram_voc = load_voc(ADAGRAM_VOC, silent=True)
_stoplist = get_stoplist()


def filter_voc(text):
    """Drop all space-separated tokens of *text* that are not in the AdaGram
    vocabulary; surviving tokens are lower-cased and joined with spaces."""
    surviving = [
        token.lower()
        for token in text.split(" ")
        # membership is tested on the original casing; output is lower-cased
        if token in _adagram_voc
    ]
    return " ".join(surviving)


# Markers used to delimit the target word inside a context string.
TARGET_BEG = "((("
TARGET_END = ")))"


def filter_context(context, target, remove_target, context_size):
    context = [w for w in context.split(" ") if w.strip() != "" and w not in _stoplist and not re_number.match(w)]
    if remove_target: