def main():
    """Build a dictionary from a bicorpus file and print it to stdout.

    Options come from ``parse_options(create_option_parser())``. The corpus
    is read from ``input_file``, the ``gold`` pairs are removed from it and a
    ``DictBuilder`` is run on the result. Every entry of the built dictionary
    is printed as three tab-separated fields: the value stored for the entry,
    the space-joined source tokens and the space-joined target tokens.

    When the ``rem`` option is given, the corpus is created with
    ``backup=True`` and written to the ``rem`` path after building.
    """
    optparser = create_option_parser()
    (input_file, bound, _scorer, iters, srcstop, tgtstop, gold, rem,
     bound_multiplier, strdiff, ngrams, sets, sparse_bound, uniset_min,
     uniset_max) = parse_options(optparser)
    # The scorer option names an attribute of DictBuilder.
    scorer = getattr(DictBuilder, _scorer)

    backup = rem is not None
    bc = BiCorpus(backup=backup, int_tokens=True)
    bc.set_stopwords(srcstop, tgtstop)
    # Close the input handle deterministically instead of leaking it.
    # NOTE(review): assumes read_from_file consumes the handle before
    # returning -- confirm against BiCorpus.
    with open(input_file) as corpus_file:
        bc.read_from_file(corpus_file)
    bc.remove_ngram_pairs(gold)

    db = DictBuilder(bc, scorer, bound_multiplier, strdiff, ngrams, sets,
                     sparse_bound, uniset_min, uniset_max)
    db.build(bound, iters=iters)

    def _emit(value, src_ints, tgt_ints):
        # A single parenthesised argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print("{0}\t{1}\t{2}".format(
            value,
            " ".join(bc._src.ints_to_tokens(src_ints)),
            " ".join(bc._tgt.ints_to_tokens(tgt_ints))))

    for p in db._dict:
        if len(p) == 2:
            src, tgt = p
            _emit(db._dict[p], src, tgt)
        elif len(p) == 3:
            # Three-element keys hold collections on both sides; print every
            # source/target combination with the same value.
            src, tgt, _ = p
            for src_tok in src:
                for tgt_tok in tgt:
                    _emit(db._dict[p], src_tok, tgt_tok)

    if rem is not None:
        # Make sure the output is flushed and closed.
        with open(rem, "w") as rem_file:
            bc.write(rem_file)
def main():
    """Build a dictionary from a bicorpus file and print it to stdout.

    Options come from ``parse_options(create_option_parser())``. The corpus
    is read from ``input_file``, the ``gold`` pairs are removed from it and a
    ``DictBuilder`` is run on the result. Every entry of the built dictionary
    is printed as three tab-separated fields: the value stored for the entry,
    the space-joined source tokens and the space-joined target tokens.

    When the ``rem`` option is given, the corpus is created with
    ``backup=True`` and written to the ``rem`` path after building.
    """
    optparser = create_option_parser()
    (input_file, bound, _scorer, iters, srcstop, tgtstop, gold, rem,
     bound_multiplier, strdiff, ngrams, sets, sparse_bound, uniset_min,
     uniset_max) = parse_options(optparser)
    # The scorer option names an attribute of DictBuilder.
    scorer = getattr(DictBuilder, _scorer)

    backup = rem is not None
    bc = BiCorpus(backup=backup, int_tokens=True)
    bc.set_stopwords(srcstop, tgtstop)
    # Close the input handle deterministically instead of leaking it.
    # NOTE(review): assumes read_from_file consumes the handle before
    # returning -- confirm against BiCorpus.
    with open(input_file) as corpus_file:
        bc.read_from_file(corpus_file)
    bc.remove_ngram_pairs(gold)

    db = DictBuilder(bc, scorer, bound_multiplier, strdiff, ngrams, sets,
                     sparse_bound, uniset_min, uniset_max)
    db.build(bound, iters=iters)

    def _emit(value, src_ints, tgt_ints):
        # A single parenthesised argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print("{0}\t{1}\t{2}".format(
            value,
            " ".join(bc._src.ints_to_tokens(src_ints)),
            " ".join(bc._tgt.ints_to_tokens(tgt_ints))))

    for p in db._dict:
        if len(p) == 2:
            src, tgt = p
            _emit(db._dict[p], src, tgt)
        elif len(p) == 3:
            # Three-element keys hold collections on both sides; print every
            # source/target combination with the same value.
            src, tgt, _ = p
            for src_tok in src:
                for tgt_tok in tgt:
                    _emit(db._dict[p], src_tok, tgt_tok)

    if rem is not None:
        # Make sure the output is flushed and closed.
        with open(rem, "w") as rem_file:
            bc.write(rem_file)
def bests_for_src(src):
    """Return ``__choose_bests`` applied to the cache entries of ``src``.

    ``src`` is a single source token. It is mapped to its integer id and every
    entry stored under that id in ``bc._coocc_cache._cache`` is turned into a
    ``(target token, len(entry))`` pair.

    Relies on the module-level ``bc`` created in the ``__main__`` block.
    """
    src = bc._src.tokens_to_ints([src])[0]
    return __choose_bests(
        [(bc._tgt.ints_to_tokens([k])[0], len(v))
         for k, v in bc._coocc_cache._cache[src].items()])


def bests_for_tgt(tgt):
    """Return ``__choose_bests`` applied to the cache entries of ``tgt``.

    ``tgt`` is a single target token. The cache is keyed by source id first,
    so every source entry is inspected and those containing the target id
    contribute a ``(source token, len(entry))`` pair.

    Relies on the module-level ``bc`` created in the ``__main__`` block.
    """
    tgt = bc._tgt.tokens_to_ints([tgt])[0]
    cache = bc._coocc_cache._cache
    # A membership test replaces the scan over every inner key; a key can
    # match at most once, so the resulting pairs and their order are the same.
    # NOTE(review): assumes the inner containers are mappings -- confirm.
    return __choose_bests(
        [(bc._src.ints_to_tokens([src])[0], len(cache[src][tgt]))
         for src in cache if tgt in cache[src]])


def src_occ(src):
    """Return the length of the source-side index entry for token ``src``.

    Presumably this is the number of occurrences of the token; verify against
    the index implementation. Relies on the module-level ``bc``.
    """
    src = bc._src.tokens_to_ints([src])[0]
    return len(bc._src._index[src])


def tgt_occ(tgt):
    """Return the length of the target-side index entry for token ``tgt``.

    Presumably this is the number of occurrences of the token; verify against
    the index implementation. Relies on the module-level ``bc``.
    """
    tgt = bc._tgt.tokens_to_ints([tgt])[0]
    return len(bc._tgt._index[tgt])


def build(bound=0.000005, iters=3):
    """Run a ``DictBuilder`` with the ``wmi`` scorer on the global corpus.

    ``bound`` and ``iters`` are passed to ``DictBuilder.build``; their
    defaults are the values that used to be hard-coded here.

    Returns a dict mapping ``(source token tuple, target token tuple)`` to the
    value stored for that pair in the builder's dictionary.
    """
    db = DictBuilder(bc, DictBuilder.wmi)
    db.build(bound, iters=iters)
    return dict(
        ((tuple(bc._src.ints_to_tokens(pair[0])),
          tuple(bc._tgt.ints_to_tokens(pair[1]))), score)
        for pair, score in db._dict._dict.items())


def _read_stopwords(path):
    """Return the set of newline-separated entries in the file at ``path``."""
    # The context manager closes the handle instead of leaking it.
    with open(path) as stream:
        return set(stream.read().rstrip("\n").split("\n"))


if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s : %(module)s - %(levelname)s - %(message)s")
    # ``bc`` is the module-level corpus used by the helper functions above.
    bc = BiCorpus(backup=False, int_tokens=True)
    punct = set([".", "!", "?", ",", "-", ":", "'", "...", "--", ";", "(",
                 ")", "\""])
    # Punctuation marks are treated as stopwords on both sides.
    bc.set_stopwords(
        _read_stopwords(
            "/home/zseder/Proj/langtools/Data/hungarian_stopwords.sziget")
        | punct,
        _read_stopwords(
            "/home/zseder/Proj/langtools/Data/english_stopwords")
        | punct)
    with open("/home/zseder/Data/HunglishBicorpus/bi/bi_only_stem") as corpus_file:
        bc.read_from_file(corpus_file)
    # Alternative input kept from the original script:
    #bc.read_from_file(file("/home/zseder/Proj/Multidict/src/hundict/small_corpus"))
def tgt_occ(tgt):
    """Return the length of the target-side index entry for token ``tgt``.

    Presumably this is the number of occurrences of the token; verify against
    the index implementation. Relies on the module-level ``bc`` created in the
    ``__main__`` block.
    """
    tgt = bc._tgt.tokens_to_ints([tgt])[0]
    return len(bc._tgt._index[tgt])


def build(bound=0.000005, iters=3):
    """Run a ``DictBuilder`` with the ``wmi`` scorer on the global corpus.

    ``bound`` and ``iters`` are passed to ``DictBuilder.build``; their
    defaults are the values that used to be hard-coded here.

    Returns a dict mapping ``(source token tuple, target token tuple)`` to the
    value stored for that pair in the builder's dictionary.
    """
    db = DictBuilder(bc, DictBuilder.wmi)
    db.build(bound, iters=iters)
    return dict(
        ((tuple(bc._src.ints_to_tokens(pair[0])),
          tuple(bc._tgt.ints_to_tokens(pair[1]))), score)
        for pair, score in db._dict._dict.items())


def _read_stopwords(path):
    """Return the set of newline-separated entries in the file at ``path``."""
    # The context manager closes the handle instead of leaking it.
    with open(path) as stream:
        return set(stream.read().rstrip("\n").split("\n"))


if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s : %(module)s - %(levelname)s - %(message)s")
    # ``bc`` is the module-level corpus used by the helper functions above.
    bc = BiCorpus(backup=False, int_tokens=True)
    punct = set([".", "!", "?", ",", "-", ":", "'", "...", "--", ";", "(",
                 ")", "\""])
    # Punctuation marks are treated as stopwords on both sides.
    bc.set_stopwords(
        _read_stopwords(
            "/home/zseder/Proj/langtools/Data/hungarian_stopwords.sziget")
        | punct,
        _read_stopwords(
            "/home/zseder/Proj/langtools/Data/english_stopwords")
        | punct)
    with open("/home/zseder/Data/HunglishBicorpus/bi/bi_only_stem") as corpus_file:
        bc.read_from_file(corpus_file)
    # Alternative input kept from the original script:
    #bc.read_from_file(file("/home/zseder/Proj/Multidict/src/hundict/small_corpus"))