Beispiel #1
0
def main():
    """Build a dictionary from a bicorpus and print its entries to stdout.

    All settings come from ``parse_options(create_option_parser())``.  The
    corpus is read from ``input_file``, the ``gold`` n-gram pairs are removed
    from it, and a ``DictBuilder`` is run with the scorer named by the
    ``_scorer`` option.

    Each entry is printed as ``score<TAB>source<TAB>target``:

    * 2-tuple keys ``(src, tgt)`` produce a single line;
    * 3-tuple keys ``(src, tgt, _)`` produce one line for every combination
      of an element of ``src`` with an element of ``tgt`` (the third element
      is ignored).

    When ``rem`` is given, the corpus is created with ``backup=True`` and is
    written to that path after the entries were printed.
    """
    optparser = create_option_parser()
    (input_file, bound, _scorer, iters, srcstop, tgtstop, gold, rem,
     bound_multiplier, strdiff, ngrams, sets, sparse_bound, uniset_min,
     uniset_max) = parse_options(optparser)
    # The scoring function is looked up by name on DictBuilder.
    scorer = getattr(DictBuilder, _scorer)

    backup = rem is not None

    bc = BiCorpus(backup=backup, int_tokens=True)

    bc.set_stopwords(srcstop, tgtstop)

    # open() replaces the Python-2-only file() builtin; the context manager
    # closes the handle as soon as the corpus has been read.
    # NOTE(review): assumes read_from_file consumes the file eagerly.
    with open(input_file) as corpus_file:
        bc.read_from_file(corpus_file)

    bc.remove_ngram_pairs(gold)

    db = DictBuilder(bc, scorer, bound_multiplier, strdiff, ngrams, sets,
                     sparse_bound, uniset_min, uniset_max)

    db.build(bound, iters=iters)

    line_format = "{0}\t{1}\t{2}"
    for p in db._dict:
        if len(p) == 2:
            src, tgt = p
            # print(...) with one argument behaves the same on Python 2
            # (statement + parenthesised expression) and Python 3 (function).
            print(line_format.format(
                db._dict[p],
                " ".join(bc._src.ints_to_tokens(src)),
                " ".join(bc._tgt.ints_to_tokens(tgt))))
        elif len(p) == 3:
            src, tgt, _ = p
            # The score is the same for every combination; look it up once.
            score = db._dict[p]
            for src_tok in src:
                for tgt_tok in tgt:
                    print(line_format.format(
                        score,
                        " ".join(bc._src.ints_to_tokens(src_tok)),
                        " ".join(bc._tgt.ints_to_tokens(tgt_tok))))

    if rem is not None:
        # Close (and thereby flush) the output file deterministically.
        with open(rem, "w") as rem_file:
            bc.write(rem_file)
Beispiel #2
0
def main():
    """Run the dictionary builder from the command line and print the result.

    Options are obtained through ``parse_options(create_option_parser())``.
    A ``BiCorpus`` is filled from ``input_file`` (after the stopword lists
    were set), the ``gold`` n-gram pairs are removed, and ``DictBuilder``
    is built with ``bound`` and ``iters``.

    Output format, one line per printed pair: ``score<TAB>source<TAB>target``
    where source and target are space-joined tokens.  Keys of length 2 give
    one line; keys of length 3 give one line per (source element, target
    element) combination, the third key element being ignored.

    If ``rem`` is not ``None`` the corpus keeps a backup
    (``backup=True``) and is written to the file named ``rem`` at the end.
    """
    optparser = create_option_parser()
    (input_file, bound, _scorer, iters, srcstop, tgtstop, gold, rem,
     bound_multiplier, strdiff, ngrams, sets, sparse_bound, uniset_min,
     uniset_max) = parse_options(optparser)
    # Resolve the scorer option to the DictBuilder attribute of that name.
    scorer = getattr(DictBuilder, _scorer)

    backup = rem is not None

    bc = BiCorpus(backup=backup, int_tokens=True)

    bc.set_stopwords(srcstop, tgtstop)

    # Use open() in a with-block rather than the Python-2-only file()
    # builtin so the input handle does not stay open for the whole run.
    # NOTE(review): assumes read_from_file reads everything before returning.
    with open(input_file) as infile:
        bc.read_from_file(infile)

    bc.remove_ngram_pairs(gold)

    db = DictBuilder(bc, scorer, bound_multiplier, strdiff, ngrams, sets,
                     sparse_bound, uniset_min, uniset_max)

    db.build(bound, iters=iters)

    template = "{0}\t{1}\t{2}"
    for p in db._dict:
        if len(p) == 2:
            src, tgt = p
            # Single-argument print(...) is valid and equivalent on both
            # Python 2 and Python 3.
            print(template.format(
                db._dict[p],
                " ".join(bc._src.ints_to_tokens(src)),
                " ".join(bc._tgt.ints_to_tokens(tgt)),
            ))
        elif len(p) == 3:
            src, tgt, _ = p
            # Loop-invariant: every combination shares the key's score.
            score = db._dict[p]
            for src_tok in src:
                for tgt_tok in tgt:
                    print(template.format(
                        score,
                        " ".join(bc._src.ints_to_tokens(src_tok)),
                        " ".join(bc._tgt.ints_to_tokens(tgt_tok)),
                    ))

    if rem is not None:
        # The with-block guarantees the written data is flushed and the
        # handle closed even if bc.write raises.
        with open(rem, "w") as outfile:
            bc.write(outfile)
Beispiel #3
0
def bests_for_src(src):
    """Return ``__choose_bests`` applied to the cache entries of *src*.

    *src* is a source-side token.  For every target id stored under it in
    the co-occurrence cache, a ``(target_token, len(entry))`` pair is
    collected; the list of pairs is passed to ``__choose_bests``.
    """
    src_id = bc._src.tokens_to_ints([src])[0]
    candidates = []
    for tgt_id, entry in bc._coocc_cache._cache[src_id].items():
        tgt_token = bc._tgt.ints_to_tokens([tgt_id])[0]
        candidates.append((tgt_token, len(entry)))
    return __choose_bests(candidates)

def bests_for_tgt(tgt):
    """Return ``__choose_bests`` applied to the cache entries of *tgt*.

    *tgt* is a target-side token.  Every source id in the co-occurrence
    cache whose per-source mapping contains the target id contributes a
    ``(source_token, len(entry))`` pair; the list of pairs is passed to
    ``__choose_bests``.
    """
    tgt_id = bc._tgt.tokens_to_ints([tgt])[0]
    cache = bc._coocc_cache._cache
    candidates = []
    for src_id in cache:
        by_target = cache[src_id]
        # Membership test instead of scanning every key of the inner
        # mapping for equality with tgt_id.
        # NOTE(review): assumes the inner mapping is dict-like (unique keys,
        # key membership via `in`) -- confirm against the cache class.
        if tgt_id in by_target:
            src_token = bc._src.ints_to_tokens([src_id])[0]
            candidates.append((src_token, len(by_target[tgt_id])))
    return __choose_bests(candidates)

def src_occ(src):
    """Return the length of the source-side index entry of token *src*."""
    src_side = bc._src
    token_id = src_side.tokens_to_ints([src])[0]
    return len(src_side._index[token_id])

def tgt_occ(tgt):
    """Return the length of the target-side index entry of token *tgt*."""
    tgt_side = bc._tgt
    token_id = tgt_side.tokens_to_ints([tgt])[0]
    return len(tgt_side._index[token_id])

def build():
    """Build a dictionary with the ``wmi`` scorer and return it as a dict.

    A ``DictBuilder`` over the module-level ``bc`` is built with bound
    ``0.000005`` and three iterations.  The result maps
    ``(source_tokens, target_tokens)`` tuples of token tuples to the score
    stored for the corresponding integer pair.
    """
    db = DictBuilder(bc, DictBuilder.wmi)
    db.build(0.000005, iters=3)

    translated = {}
    for pair, score in db._dict._dict.items():
        src_tokens = tuple(bc._src.ints_to_tokens(pair[0]))
        tgt_tokens = tuple(bc._tgt.ints_to_tokens(pair[1]))
        translated[(src_tokens, tgt_tokens)] = score
    return translated

def read_stopwords(path):
    """Return the lines of the file at *path* as a set of strings.

    Trailing newlines are stripped from the file content before it is split
    on ``"\\n"``.  The file is closed before returning.
    """
    with open(path) as stopword_file:
        return set(stopword_file.read().rstrip("\n").split("\n"))


if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s : %(module)s - %(levelname)s - %(message)s")
    # Module-level corpus used by the helper functions of this script.
    bc = BiCorpus(backup=False, int_tokens=True)
    # Punctuation tokens are added to both stopword sets.
    punct = set([".", "!", "?", ",", "-", ":", "'", "...", "--", ";", "(",
                 ")", "\""])
    bc.set_stopwords(
        read_stopwords(
            "/home/zseder/Proj/langtools/Data/hungarian_stopwords.sziget")
        | punct,
        read_stopwords("/home/zseder/Proj/langtools/Data/english_stopwords")
        | punct)
    # open() in a with-block replaces the Python-2-only file() builtin and
    # closes the corpus file once it has been read.
    # NOTE(review): assumes read_from_file consumes the file eagerly.
    with open("/home/zseder/Data/HunglishBicorpus/bi/bi_only_stem") as corpus:
        bc.read_from_file(corpus)
    #bc.read_from_file(file("/home/zseder/Proj/Multidict/src/hundict/small_corpus"))

Beispiel #4
0
def tgt_occ(tgt):
    """Return how long the target-side index entry of token *tgt* is.

    The token is first converted to its integer id, which is then used as
    the key into the target side's ``_index``.
    """
    target = bc._tgt
    tgt_id = target.tokens_to_ints([tgt])[0]
    index_entry = target._index[tgt_id]
    return len(index_entry)


def build():
    """Run ``DictBuilder`` with the ``wmi`` scorer and return token pairs.

    The builder works on the module-level ``bc`` with bound ``0.000005`` and
    ``iters=3``.  Every ``(pair, score)`` item of the built dictionary is
    re-keyed by the tuple of source tokens and the tuple of target tokens
    obtained from ``pair[0]`` and ``pair[1]``.
    """
    db = DictBuilder(bc, DictBuilder.wmi)
    db.build(0.000005, iters=3)

    by_tokens = {}
    for pair, score in db._dict._dict.items():
        key = (tuple(bc._src.ints_to_tokens(pair[0])),
               tuple(bc._tgt.ints_to_tokens(pair[1])))
        by_tokens[key] = score
    return by_tokens


def read_stopwords(path):
    """Load a stopword list: one entry per line of the file at *path*.

    The content has its trailing newlines removed and is split on
    ``"\\n"``; the resulting strings are returned as a set.  The file handle
    is closed before the function returns.
    """
    with open(path) as handle:
        content = handle.read()
    return set(content.rstrip("\n").split("\n"))


if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s : %(module)s - %(levelname)s - %(message)s")
    # Global corpus object the helper functions in this script operate on.
    bc = BiCorpus(backup=False, int_tokens=True)
    # These punctuation tokens are treated as stopwords on both sides.
    punct = set(
        [".", "!", "?", ",", "-", ":", "'", "...", "--", ";", "(", ")", "\""])
    hungarian_stopwords = read_stopwords(
        "/home/zseder/Proj/langtools/Data/hungarian_stopwords.sziget")
    english_stopwords = read_stopwords(
        "/home/zseder/Proj/langtools/Data/english_stopwords")
    bc.set_stopwords(hungarian_stopwords | punct, english_stopwords | punct)
    # A with-block around open() replaces the Python-2-only file() builtin
    # and avoids leaving the corpus file open.
    # NOTE(review): assumes read_from_file reads everything before returning.
    with open("/home/zseder/Data/HunglishBicorpus/bi/bi_only_stem") as corpus:
        bc.read_from_file(corpus)
    #bc.read_from_file(file("/home/zseder/Proj/Multidict/src/hundict/small_corpus"))