def filter_by_voc(hh_fpath, voc_fpath, output_fpath, both_in_voc=False):
    """Filter hyponym-hypernym pairs by a vocabulary and write the kept rows as TSV.

    Args:
        hh_fpath: path to a TSV file with 'hyponym', 'hypernym', 'freq' columns.
        voc_fpath: path to the vocabulary file consumed by ``load_voc``.
        output_fpath: path of the filtered TSV output (with header).
        both_in_voc: if True keep a row only when BOTH words are in the
            vocabulary; otherwise keep it when at least one of them is.
    """
    with codecs.open(output_fpath, "w", "utf-8") as out:
        print("hyponym\thypernym\tfreq", file=out)
        voc = load_voc(voc_fpath, preprocess=True, sep='\t', use_pickle=True, silent=False)
        hh_df = read_csv(hh_fpath, encoding='utf-8', delimiter="\t",
                         error_bad_lines=False, low_memory=False)

        for i, row in hh_df.iterrows():
            try:
                if i % 100000 == 0: print(i)  # progress indicator for large inputs
                if both_in_voc:
                    keep = row.hyponym in voc and row.hypernym in voc
                else:
                    keep = row.hyponym in voc or row.hypernym in voc
                if keep:
                    print("%s\t%s\t%d" % (row.hyponym, row.hypernym, row.freq), file=out)
            except Exception:
                # Best-effort: log malformed rows and continue instead of aborting.
                # (Was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit.)
                print("Bad row:", row)
                print(format_exc())

    print("Output:", output_fpath)
def filter_by_voc(hh_fpath, voc_fpath, output_fpath, both_in_voc=False):
    """Filter hyponym-hypernym pairs by a vocabulary and write the kept rows as TSV.

    Args:
        hh_fpath: path to a TSV file with 'hyponym', 'hypernym', 'freq' columns.
        voc_fpath: path to the vocabulary file consumed by ``load_voc``.
        output_fpath: path of the filtered TSV output (with header).
        both_in_voc: if True keep a row only when BOTH words are in the
            vocabulary; otherwise keep it when at least one of them is.
    """
    # NOTE(review): modernized from Python 2 `print >> out` statements to the
    # Python 3 print function, matching the py3 variant of this function.
    with codecs.open(output_fpath, "w", "utf-8") as out:
        print("hyponym\thypernym\tfreq", file=out)
        voc = load_voc(voc_fpath, preprocess=True, sep='\t', use_pickle=True, silent=False)
        hh_df = read_csv(hh_fpath, encoding='utf-8', delimiter="\t",
                         error_bad_lines=False, low_memory=False)

        for i, row in hh_df.iterrows():
            try:
                if i % 100000 == 0: print(i)  # progress indicator for large inputs
                if both_in_voc:
                    keep = row.hyponym in voc and row.hypernym in voc
                else:
                    keep = row.hyponym in voc or row.hypernym in voc
                if keep:
                    print("%s\t%s\t%d" % (row.hyponym, row.hypernym, row.freq), file=out)
            except Exception:
                # Best-effort: log malformed rows and continue instead of aborting.
                print("Bad row:", row)
                print(format_exc())

    print("Output:", output_fpath)
def filter_ddt_by_voc(ddt_fpath, voc_fpath, ddt_filtered_fpath):
    """Keep only the DDT (sense inventory) lines whose first field is in a vocabulary.

    Args:
        ddt_fpath: path to a gzipped, tab-separated DDT file; the word is field 0.
        voc_fpath: path to the vocabulary file consumed by ``load_voc``.
        ddt_filtered_fpath: output path for the filtered DDT lines.
    """
    voc = load_voc(voc_fpath)

    with codecs.open(ddt_filtered_fpath, "w", "utf-8") as out:
        num = 0
        found_voc = set()
        # "rt" + encoding decodes the gzipped stream as UTF-8 text.  The original
        # called gzip.open(ddt_fpath, "rb", "utf-8"), passing "utf-8" as the third
        # positional argument, which is `compresslevel`, not an encoding — and it
        # never closed the handle.  The `with` block fixes both.
        with gzip.open(ddt_fpath, "rt", encoding="utf-8") as ddt:
            for i, line in enumerate(ddt):
                if i % 100000 == 0: print(i, num)  # progress indicator
                f = line.split("\t")
                if len(f) < 1: continue  # defensive; str.split always yields >= 1 field
                if f[0] in voc:
                    num += 1
                    found_voc.add(f[0])
                    out.write(line)

    print("Input processed:", i)
    print("Words found:", len(found_voc), "of", len(voc))
    print("Senses written:", num)
    print("Filtered by vocabulary DDT:", ddt_filtered_fpath)
import argparse
from os.path import splitext
from os.path import join
from jnt.common import exists
from subprocess import Popen, PIPE
import os
from os.path import splitext  # NOTE(review): duplicate of the import above
from jnt.morph import get_stoplist
from jnt.patterns import re_number

# Paths to local AdaGram resources — machine-specific absolute paths;
# presumably meant to be overridden per installation — TODO confirm.
ADAGRAM_VOC = "/Users/alex/tmp/adagram/HugeModel-voc.csv"
DEFAULT_MAPPING = "/Users/alex/work/joint/src/data/best-matching-out.csv"
DYLD_LIBRARY = "/Users/alex/tmp/adagram/AdaGram.jl/lib/"
ADAGRAM_SCRIPTS_DIR = "/Users/alex/work/joint/src/jnt/adagram/"

# Module-level resources loaded once at import time.
# NOTE(review): `load_voc` is not imported in this chunk — presumably
# `from jnt.common import load_voc` appears elsewhere in the file; confirm.
_adagram_voc = load_voc(ADAGRAM_VOC, silent=True)
_stoplist = get_stoplist()

def filter_voc(text):
    """Keep only space-separated tokens of *text* present in the AdaGram
    vocabulary, lowercased.

    NOTE(review): membership is tested on the original-case token but the
    lowercased form is emitted — verify this asymmetry is intentional.
    """
    text_adagram = [w.lower() for w in text.split(" ") if w in _adagram_voc]
    return " ".join(text_adagram)

# Markers used to delimit the target word inside a context string.
TARGET_BEG = "((("
TARGET_END = ")))"

# NOTE(review): definition truncated in this chunk — the body continues
# beyond the visible source; left verbatim rather than guessed at.
def filter_context(context, target, remove_target, context_size): context = [ w for w in context.split(" ")
from jnt.common import load_voc
import codecs
from jnt.matching.synset_fetchers import BabelNet, BABELNET_KEYS
from jnt.common import take

# Cap on the number of bag-of-words items written per sense.
MAX_WORDS = 999

# Machine-specific input/output paths.
voc_fpath = "/Users/alex/work/joint/src/data/ambigous-words-mine.csv"
output_fpath = voc_fpath + "-babelnet.csv"
babelnet_dir = "/Users/alex/tmp/matching/babelnet-eval/"
adagram_voc_fpath = "/Users/alex/tmp/adagram/HugeModel-voc.csv"

babelnet = BabelNet(babelnet_keys=BABELNET_KEYS, babelnet_fpath=babelnet_dir,
                    freq_fpath="", divide_by_freq=False)
adagram_voc = load_voc(adagram_voc_fpath)
voc = load_voc(voc_fpath)

# For every ambiguous word, dump each BabelNet sense as
# "word<TAB>sense_id<TAB>bow-words", keeping only bag-of-words entries that
# are in the AdaGram vocabulary (and are not the word itself), ranked by weight.
with codecs.open(output_fpath, "w", "utf-8") as out:
    for word in voc:
        senses = babelnet.get_senses(word)
        for sense_id, bow in senses:
            bow_words = [w for w in sorted(bow, key=bow.get, reverse=True)
                         if w in adagram_voc and w != word]
            out.write("%s\t%s\t%s\n" % (word, sense_id, ' '.join(take(MAX_WORDS, bow_words))))

# NOTE(review): modernized from the Python 2 `print "Output:", ...` statement.
print("Output:", output_fpath)
from os.path import splitext
from os.path import join
from jnt.common import exists
from subprocess import Popen, PIPE
import os
from os.path import splitext  # NOTE(review): duplicate of the import above
from jnt.morph import get_stoplist
from jnt.patterns import re_number

# Paths to local AdaGram resources — machine-specific absolute paths;
# presumably meant to be overridden per installation — TODO confirm.
ADAGRAM_VOC = "/Users/alex/tmp/adagram/HugeModel-voc.csv"
DEFAULT_MAPPING = "/Users/alex/work/joint/src/data/best-matching-out.csv"
DYLD_LIBRARY = "/Users/alex/tmp/adagram/AdaGram.jl/lib/"
ADAGRAM_SCRIPTS_DIR = "/Users/alex/work/joint/src/jnt/adagram/"

# Module-level resources loaded once at import time.
# NOTE(review): `load_voc` is not imported in this chunk — presumably
# `from jnt.common import load_voc` appears elsewhere in the file; confirm.
_adagram_voc = load_voc(ADAGRAM_VOC, silent=True)
_stoplist = get_stoplist()

def filter_voc(text):
    """Keep only space-separated tokens of *text* present in the AdaGram
    vocabulary, lowercased.

    NOTE(review): membership is tested on the original-case token but the
    lowercased form is emitted — verify this asymmetry is intentional.
    """
    text_adagram = [w.lower() for w in text.split(" ") if w in _adagram_voc]
    return " ".join(text_adagram)

# Markers used to delimit the target word inside a context string.
TARGET_BEG = "((("
TARGET_END = ")))"

# Drops empty tokens, stopwords and numbers from the context.
# NOTE(review): definition truncated in this chunk — the body continues
# beyond the visible source; left verbatim rather than guessed at.
def filter_context(context, target, remove_target, context_size):
    context = [w for w in context.split(" ") if w.strip() != "" and w not in _stoplist and not re_number.match(w)]
    if remove_target: