def main():
    """Run the key-extraction batch with options taken from the command line.

    Flags:
      --input PATH  input path passed to ExtractKeysBatch
                    (default "union-body.tkh").
      --quiet       set the module logger's level to ERROR.

    Raises:
      RuntimeError: if any argument remains after the flags are parsed.
    """
    argv = sys.argv[1:]
    get_flag = tkrzw_dict.GetCommandFlag
    source_path = get_flag(argv, "--input", 1)
    if not source_path:
        source_path = "union-body.tkh"
    quiet = get_flag(argv, "--quiet", 0)
    if quiet:
        logger.setLevel(logging.ERROR)
    # The leftover check relies on GetCommandFlag removing the flags it
    # recognizes from the list (presumed; the helper is defined elsewhere).
    if argv:
        raise RuntimeError("unknown arguments: {}".format(argv))
    batch = ExtractKeysBatch(source_path)
    batch.Run()
def main():
    """Run the count-division batch with options taken from the command line.

    Flags:
      --data_prefix PREFIX  prefix passed to DivideCountBatch
                            (default "result").
      --quiet               set the module logger's level to ERROR.

    Raises:
      RuntimeError: if any argument remains after the flags are parsed.
    """
    cli_args = sys.argv[1:]
    prefix = tkrzw_dict.GetCommandFlag(cli_args, "--data_prefix", 1) or "result"
    be_quiet = tkrzw_dict.GetCommandFlag(cli_args, "--quiet", 0)
    if be_quiet:
        logger.setLevel(logging.ERROR)
    # Whatever is still in the list was not consumed as a known flag.
    if cli_args:
        raise RuntimeError("unknown arguments: {}".format(cli_args))
    batch = DivideCountBatch(prefix)
    batch.Run()
def main():
    """Run the union EPUB generation batch with command-line options.

    Flags:
      --input PATH   input path passed to GenerateUnionEPUBBatch
                     (default "union-body.tkh").
      --output PATH  output path passed to GenerateUnionEPUBBatch
                     (default "union-dict-epub").

    Both paths always end up non-empty because a default is substituted
    whenever the flag is absent or empty, so no "path is required"
    validation is performed here.  Arguments that are not recognized
    flags are ignored.
    """
    args = sys.argv[1:]
    input_path = tkrzw_dict.GetCommandFlag(args, "--input", 1) or "union-body.tkh"
    output_path = tkrzw_dict.GetCommandFlag(args, "--output", 1) or "union-dict-epub"
    GenerateUnionEPUBBatch(input_path, output_path).Run()
def main():
    """Run the translation-indexing batch with command-line options.

    Flags:
      --input PATH   input path passed to IndexTranslationsBatch
                     (default "wordnet-body.tkh").
      --output PATH  output path passed to IndexTranslationsBatch
                     (default "wordnet-tran-index.tkh").
      --quiet        set the module logger's level to ERROR.

    Raises:
      RuntimeError: if any argument remains after the flags are parsed.
    """
    argv = sys.argv[1:]

    def option(name, fallback):
        # A missing or empty flag value falls back to the default.
        return tkrzw_dict.GetCommandFlag(argv, name, 1) or fallback

    source_path = option("--input", "wordnet-body.tkh")
    index_path = option("--output", "wordnet-tran-index.tkh")
    if tkrzw_dict.GetCommandFlag(argv, "--quiet", 0):
        logger.setLevel(logging.ERROR)
    if argv:
        raise RuntimeError("unknown arguments: {}".format(argv))
    batch = IndexTranslationsBatch(source_path, index_path)
    batch.Run()
def main():
    """Run the WordNet database build batch with command-line options.

    Flags:
      --dict DIR     dictionary directory passed to BuildWordNetDBBatch
                     (default "dict").
      --output PATH  output path (default "wordnet.tkh").
      --prob PATH    probability path (default "", i.e. empty string).
      --quiet        set the module logger's level to ERROR.

    Raises:
      RuntimeError: if any argument remains after the flags are parsed.
    """
    argv = sys.argv[1:]

    def option(name, fallback):
        # A missing or empty flag value falls back to the default.
        return tkrzw_dict.GetCommandFlag(argv, name, 1) or fallback

    dictionary_dir = option("--dict", "dict")
    db_path = option("--output", "wordnet.tkh")
    probability_path = option("--prob", "")
    if tkrzw_dict.GetCommandFlag(argv, "--quiet", 0):
        logger.setLevel(logging.ERROR)
    if argv:
        raise RuntimeError("unknown arguments: {}".format(argv))
    batch = BuildWordNetDBBatch(dictionary_dir, db_path, probability_path)
    batch.Run()
def main():
    """Run the union EPUB generation batch with probability thresholds.

    Flags:
      --input PATH            input path (default "union-body.tkh").
      --output PATH           output path (default "union-dict-epub").
      --min_prob FLOAT        passed as min_prob (default 0).
      --multi_min_prob FLOAT  passed as multi_min_prob (default 0.00002).

    Both paths always end up non-empty because a default is substituted
    whenever the flag is absent or empty, so no "path is required"
    validation is performed here.  Arguments that are not recognized
    flags are ignored.

    Raises:
      ValueError: if a threshold flag value cannot be converted to float.
    """
    args = sys.argv[1:]
    input_path = tkrzw_dict.GetCommandFlag(args, "--input", 1) or "union-body.tkh"
    output_path = tkrzw_dict.GetCommandFlag(args, "--output", 1) or "union-dict-epub"
    min_prob = float(tkrzw_dict.GetCommandFlag(args, "--min_prob", 1) or 0)
    multi_min_prob = float(
        tkrzw_dict.GetCommandFlag(args, "--multi_min_prob", 1) or 0.00002)
    GenerateUnionEPUBBatch(input_path, output_path, min_prob, multi_min_prob).Run()
def main():
    """Parse the command-line options and hand them to Run().

    Flags (defaults in parentheses):
      --rev_prob PATH          ("")
      --min_count INT          (10)
      --enough_ef FLOAT        (2.0)
      --enough_fe FLOAT        (2.0)
      --omit_latin             switch, passed through as returned
      --min_score FLOAT        (0.25)
      --min_score_large FLOAT  (0.35)
      --min_score_stop FLOAT   (0.3)
      --max_targets INT        (8)
      --tran_aux P1,P2,...     comma-separated list; when the flag is
                               absent the list is [""] (one empty string).

    Arguments that are not recognized flags are ignored.

    Raises:
      ValueError: if a numeric flag value cannot be converted.
    """
    argv = sys.argv[1:]

    def option(name, fallback):
        # A missing or empty flag value falls back to the default.
        return tkrzw_dict.GetCommandFlag(argv, name, 1) or fallback

    rev_prob_path = option("--rev_prob", "")
    min_count = int(option("--min_count", 10))
    enough_ef = float(option("--enough_ef", 2.0))
    enough_fe = float(option("--enough_fe", 2.0))
    omit_latin = tkrzw_dict.GetCommandFlag(argv, "--omit_latin", 0)
    min_score = float(option("--min_score", 0.25))
    min_score_large = float(option("--min_score_large", 0.35))
    min_score_stop = float(option("--min_score_stop", 0.3))
    max_targets = int(option("--max_targets", 8))
    tran_aux_paths = option("--tran_aux", "").split(",")
    Run(
        rev_prob_path,
        min_count,
        enough_ef,
        enough_fe,
        omit_latin,
        min_score,
        min_score_large,
        min_score_stop,
        max_targets,
        tran_aux_paths,
    )
def main():
    """Run the union translation-indexing batch with command-line options.

    Flags:
      --input PATH         input path (default "union-body.tkh").
      --output PATH        output path (default "union-tran-index.tkh").
      --supplement L1,...  comma-separated labels turned into a set
                           (default "xs").
      --tran_prob PATH     translation probability path (default "").
      --quiet              set the module logger's level to ERROR.

    Raises:
      RuntimeError: if any argument remains after the flags are parsed.
    """
    argv = sys.argv[1:]

    def option(name, fallback):
        # A missing or empty flag value falls back to the default.
        return tkrzw_dict.GetCommandFlag(argv, name, 1) or fallback

    source_path = option("--input", "union-body.tkh")
    index_path = option("--output", "union-tran-index.tkh")
    labels = set(option("--supplement", "xs").split(","))
    prob_path = option("--tran_prob", "")
    if tkrzw_dict.GetCommandFlag(argv, "--quiet", 0):
        logger.setLevel(logging.ERROR)
    if argv:
        raise RuntimeError("unknown arguments: {}".format(argv))
    batch = IndexTranslationsBatch(source_path, index_path, labels, prob_path)
    batch.Run()
def main():
    """Predict related words for the text given on the command line.

    Flags:
      --data_prefix PREFIX  prefix passed to RelatedWordsPredictor
                            (default "result").
      --language LANG       language passed to the predictor (default "en").

    All arguments left after the flags are joined with single spaces and
    used as the query text.  The first 16 features and the first 32
    related words returned by the predictor are printed to stdout.

    Raises:
      RuntimeError: if no query text is given.
    """
    argv = sys.argv[1:]
    prefix = tkrzw_dict.GetCommandFlag(argv, "--data_prefix", 1) or "result"
    lang = tkrzw_dict.GetCommandFlag(argv, "--language", 1) or "en"
    query = " ".join(argv)
    if not query:
        raise RuntimeError("words are not specified")
    predictor = tkrzw_related_word_predictor.RelatedWordsPredictor(prefix, lang)
    related, features = predictor.Predict(query)
    print("==== FEATURES ====")
    for word, score in features[:16]:
        print("{} = {:.2f}".format(word, score))
    print()
    print("==== RELATED WORDS ====")
    for word, score in related[:32]:
        print("{} = {:.4f}".format(word, score))
def main():
    """Parse XML from stdin with an XMLHandler configured from the flags.

    Flags:
      --sampling FLOAT  sampling ratio passed to XMLHandler (default 1.0).
      --max INT         maximum outputs passed to XMLHandler
                        (default sys.maxsize).
      --quiet           set the module logger's level to ERROR.

    Raises:
      RuntimeError: if any argument remains after the flags are parsed.
      ValueError: if the sampling ratio is <= 0 or > 1, if the maximum is
        negative, or if a numeric flag value cannot be converted.
    """
    argv = sys.argv[1:]
    get_flag = tkrzw_dict.GetCommandFlag
    sampling_ratio = float(get_flag(argv, "--sampling", 1) or 1.0)
    max_outputs = int(get_flag(argv, "--max", 1) or sys.maxsize)
    if get_flag(argv, "--quiet", 0):
        logger.setLevel(logging.ERROR)
    if argv:
        raise RuntimeError("unknown arguments: {}".format(argv))
    if sampling_ratio <= 0 or sampling_ratio > 1:
        raise ValueError("invalid sampling ratio")
    if max_outputs < 0:
        raise ValueError("invalid max outputs")
    logger.info("Process started")
    sax_parser = xml.sax.make_parser()
    sax_parser.setContentHandler(XMLHandler(sampling_ratio, max_outputs))
    try:
        sax_parser.parse(sys.stdin)
    except xml.sax.SAXException:
        # SAX errors are ignored on purpose.
        # NOTE(review): presumably the handler raises SAXException to stop
        # early once the output limit is hit -- confirm against XMLHandler.
        pass
    logger.info("Process done")
def main():
    """Parse XML from stdin and print the handler's count.

    Flags:
      --quiet  set the module logger's level to ERROR.

    After parsing, the value of XMLHandler.getCount() is printed to
    stdout and flushed.

    Raises:
      RuntimeError: if any argument remains after the flags are parsed.
    """
    argv = sys.argv[1:]
    quiet = tkrzw_dict.GetCommandFlag(argv, "--quiet", 0)
    if quiet:
        logger.setLevel(logging.ERROR)
    if argv:
        raise RuntimeError("unknown arguments: {}".format(argv))
    logger.info("Process started")
    counter = XMLHandler()
    sax_parser = xml.sax.make_parser()
    sax_parser.setContentHandler(counter)
    sax_parser.parse(sys.stdin)
    print(counter.getCount(), flush=True)
    logger.info("Process done")
def main():
    """Search the WordNet database for the text given on the command line.

    Flags:
      --data_prefix PREFIX  prefix passed to WordNetSearcher
                            (default "wordnet").
      --direction MODE      "auto" (default), "reverse", or anything else
                            for a forward search.
      --details             switch passed through to PrintResultWord.

    All arguments left after the flags are joined with single spaces and
    used as the query.  With "auto", a reverse search is used when
    tkrzw_dict.PredictLanguage() reports something other than "en".
    Each hit is printed with PrintResultWord; "No result." is printed
    when the search result is empty.

    Raises:
      RuntimeError: if no query text is given.
    """
    argv = sys.argv[1:]
    prefix = tkrzw_dict.GetCommandFlag(argv, "--data_prefix", 1) or "wordnet"
    mode = tkrzw_dict.GetCommandFlag(argv, "--direction", 1) or "auto"
    with_details = tkrzw_dict.GetCommandFlag(argv, "--details", 0)
    query = " ".join(argv)
    if not query:
        raise RuntimeError("words are not specified")
    if mode == "auto":
        use_reverse = tkrzw_dict.PredictLanguage(query) != "en"
    else:
        use_reverse = mode == "reverse"
    searcher = tkrzw_wordnet_searcher.WordNetSearcher(prefix)
    search = searcher.SearchReverse if use_reverse else searcher.SearchExact
    hits = search(query)
    if not hits:
        print("No result.")
        return
    for key, entry in hits:
        PrintResultWord(key, entry, with_details)
def main():
    """Parse the command-line options and hand them to Run().

    Flags (defaults in parentheses):
      --phrase_prob PATH       ("")
      --rev_prob PATH          ("")
      --tran_prob PATH         ("")
      --tran_aux VALUE         ("") passed on as a single string
      --yomi VALUE             ("") passed on as a single string
      --min_phrase_prob FLOAT  (0.000001)
      --min_tran_prob FLOAT    (0.1)

    Arguments that are not recognized flags are ignored.

    Raises:
      ValueError: if a numeric flag value cannot be converted to float.
    """
    argv = sys.argv[1:]

    def option(name, fallback):
        # A missing or empty flag value falls back to the default.
        return tkrzw_dict.GetCommandFlag(argv, name, 1) or fallback

    phrase_prob_path = option("--phrase_prob", "")
    rev_prob_path = option("--rev_prob", "")
    tran_prob_path = option("--tran_prob", "")
    tran_aux_paths = option("--tran_aux", "")
    yomi_paths = option("--yomi", "")
    min_phrase_prob = float(option("--min_phrase_prob", 0.000001))
    min_tran_prob = float(option("--min_tran_prob", 0.1))
    Run(
        phrase_prob_path,
        rev_prob_path,
        tran_prob_path,
        tran_aux_paths,
        yomi_paths,
        min_phrase_prob,
        min_tran_prob,
    )
def main():
    """Run the word-count batch with command-line options.

    Flags (defaults in parentheses):
      --data_prefix PREFIX  ("result-para")
      --keyword PATH        ("")
      --dict PATH           ("")
      --thes PATH           ("")
      --source_ngram INT    (3)
      --target_ngram INT    (3)
      --target_stem         switch, passed through as returned
      --quiet               set the module logger's level to ERROR.

    Raises:
      RuntimeError: if any argument remains after the flags are parsed.
      ValueError: if an n-gram flag value cannot be converted to int.
    """
    argv = sys.argv[1:]

    def option(name, fallback):
        # A missing or empty flag value falls back to the default.
        return tkrzw_dict.GetCommandFlag(argv, name, 1) or fallback

    prefix = option("--data_prefix", "result-para")
    keyword_path = option("--keyword", "")
    dict_path = option("--dict", "")
    thes_path = option("--thes", "")
    source_ngram = int(option("--source_ngram", "3"))
    target_ngram = int(option("--target_ngram", "3"))
    target_stem = tkrzw_dict.GetCommandFlag(argv, "--target_stem", 0)
    if tkrzw_dict.GetCommandFlag(argv, "--quiet", 0):
        logger.setLevel(logging.ERROR)
    if argv:
        raise RuntimeError("unknown arguments: {}".format(argv))
    batch = WordCountBatch(
        prefix,
        keyword_path,
        dict_path,
        thes_path,
        source_ngram,
        target_ngram,
        target_stem,
    )
    batch.Run()
def main():
    """Run the batch that appends Japanese WordNet data, using CLI options.

    Flags (defaults in parentheses):
      --input PATH         ("wordnet.tkh")
      --output PATH        ("wordnet-body.tkh")
      --wnjpn PATH         ("wnjpn-ok.tab")
      --phrase_prob PATH   ("")
      --rev_prob PATH      ("")
      --tran_prob PATH     ("")
      --tran_aux P1,...    comma-separated list
      --tran_subaux P1,... comma-separated list
      --quiet              set the module logger's level to ERROR.

    When a list flag is absent the resulting list is [""] (one empty
    string), which is passed on unchanged.

    Raises:
      RuntimeError: if any argument remains after the flags are parsed.
    """
    args = sys.argv[1:]
    # The default uses the ".tkh" extension like every other database path
    # handled here; a transposed ".thk" would not name the file that the
    # WordNet build step writes by default.
    input_path = tkrzw_dict.GetCommandFlag(args, "--input", 1) or "wordnet.tkh"
    output_path = tkrzw_dict.GetCommandFlag(args, "--output", 1) or "wordnet-body.tkh"
    wnjpn_path = tkrzw_dict.GetCommandFlag(args, "--wnjpn", 1) or "wnjpn-ok.tab"
    phrase_prob_path = tkrzw_dict.GetCommandFlag(args, "--phrase_prob", 1) or ""
    rev_prob_path = tkrzw_dict.GetCommandFlag(args, "--rev_prob", 1) or ""
    tran_prob_path = tkrzw_dict.GetCommandFlag(args, "--tran_prob", 1) or ""
    tran_aux_paths = (
        tkrzw_dict.GetCommandFlag(args, "--tran_aux", 1) or "").split(",")
    tran_subaux_paths = (
        tkrzw_dict.GetCommandFlag(args, "--tran_subaux", 1) or "").split(",")
    if tkrzw_dict.GetCommandFlag(args, "--quiet", 0):
        logger.setLevel(logging.ERROR)
    if args:
        raise RuntimeError("unknown arguments: {}".format(str(args)))
    AppendWordnetJPNBatch(
        input_path, output_path, wnjpn_path,
        phrase_prob_path, rev_prob_path, tran_prob_path,
        tran_aux_paths, tran_subaux_paths).Run()
def main():
    """Run the translation-extraction batch with command-line options.

    Flags (defaults in parentheses):
      --data_prefix PREFIX  ("result-para")
      --min_count INT       (2)
      --min_score FLOAT     (0.01)
      --enough_count INT    (10)
      --base_count INT      (2)
      --quiet               set the module logger's level to ERROR.

    Raises:
      RuntimeError: if any argument remains after the flags are parsed.
      ValueError: if a numeric flag value cannot be converted.
    """
    argv = sys.argv[1:]

    def option(name, fallback):
        # A missing or empty flag value falls back to the default.
        return tkrzw_dict.GetCommandFlag(argv, name, 1) or fallback

    prefix = option("--data_prefix", "result-para")
    min_count = int(option("--min_count", 2))
    min_score = float(option("--min_score", 0.01))
    enough_count = int(option("--enough_count", 10))
    base_count = int(option("--base_count", 2))
    if tkrzw_dict.GetCommandFlag(argv, "--quiet", 0):
        logger.setLevel(logging.ERROR)
    if argv:
        raise RuntimeError("unknown arguments: {}".format(argv))
    batch = ExtractTransBatch(
        prefix, min_count, min_score, enough_count, base_count)
    batch.Run()
def main():
    """Run the union vocabulary-book generation batch with CLI options.

    Flags (defaults in parentheses):
      --vocab PATH            required
      --body PATH             required
      --phrase PATH           required
      --output PATH           required
      --extra_items INT       (0)
      --section_clusters INT  (1)
      --child_min_prob FLOAT  (0)
      --title TEXT            ("連想英単語帳")

    Arguments that are not recognized flags are ignored.

    Raises:
      RuntimeError: if any of the four required paths is missing or empty;
        they are checked in the order vocab, body, phrase, output.
      ValueError: if a numeric flag value cannot be converted.
    """
    argv = sys.argv[1:]

    def option(name, fallback):
        # A missing or empty flag value falls back to the default.
        return tkrzw_dict.GetCommandFlag(argv, name, 1) or fallback

    vocab_path = option("--vocab", "")
    body_path = option("--body", "")
    phrase_path = option("--phrase", "")
    output_path = option("--output", "")
    num_extra_items = int(option("--extra_items", 0))
    num_section_clusters = int(option("--section_clusters", 1))
    child_min_prob = float(option("--child_min_prob", 0))
    title = option("--title", "連想英単語帳")
    required = (
        (vocab_path, "the vocab path is required"),
        (body_path, "the body path is required"),
        (phrase_path, "the phrase path is required"),
        (output_path, "the output path is required"),
    )
    for value, message in required:
        if not value:
            raise RuntimeError(message)
    batch = GenerateUnionVocabBatch(
        vocab_path,
        body_path,
        phrase_path,
        output_path,
        num_extra_items,
        num_section_clusters,
        child_min_prob,
        title,
    )
    batch.Run()
def main():
    """Run the clustering batch with command-line options.

    Flags (defaults in parentheses):
      --clusters INT          (500)
      --rounds INT            (100)
      --items INT             (10000)
      --item_features INT     (40)
      --cluster_features INT  (160)
      --quiet                 set the module logger's level to ERROR.

    Raises:
      RuntimeError: if any argument remains after the flags are parsed.
      ValueError: if a flag value cannot be converted to int.
    """
    argv = sys.argv[1:]

    def int_option(name, fallback):
        # A missing or empty flag value falls back to the default.
        return int(tkrzw_dict.GetCommandFlag(argv, name, 1) or fallback)

    num_clusters = int_option("--clusters", 500)
    num_rounds = int_option("--rounds", 100)
    num_items = int_option("--items", 10000)
    num_item_features = int_option("--item_features", 40)
    num_cluster_features = int_option("--cluster_features", 160)
    if tkrzw_dict.GetCommandFlag(argv, "--quiet", 0):
        logger.setLevel(logging.ERROR)
    if argv:
        raise RuntimeError("unknown arguments: {}".format(argv))
    batch = ClusterBatch(
        num_clusters,
        num_rounds,
        num_items,
        num_item_features,
        num_cluster_features,
    )
    batch.Run()
def main():
    """Feed each non-empty stdin line to ProcessTSV and log running totals.

    Flags (defaults in parentheses):
      --language LANG      ("en")
      --lower              switch, passed through as returned
      --stem               switch, passed through as returned
      --max_sentences INT  (1000000)
      --middle             switch, passed through as returned
      --readable           switch, passed through as returned
      --quiet              set the module logger's level to ERROR.

    Each stdin line is stripped; blank lines are skipped and not counted.
    When ProcessTSV returns a truthy value, its first two items are added
    to the sentence and word totals.  A progress message is logged every
    1000 input records and a summary at the end.

    Raises:
      RuntimeError: if any argument remains after the flags are parsed.
      ValueError: if --max_sentences cannot be converted to int.
    """
    argv = sys.argv[1:]
    read_flag = tkrzw_dict.GetCommandFlag
    language = read_flag(argv, "--language", 1) or "en"
    lowering = read_flag(argv, "--lower", 0)
    stemming = read_flag(argv, "--stem", 0)
    max_sentences = int(read_flag(argv, "--max_sentences", 1) or "1000000")
    with_middle = read_flag(argv, "--middle", 0)
    with_readable = read_flag(argv, "--readable", 0)
    if read_flag(argv, "--quiet", 0):
        logger.setLevel(logging.ERROR)
    if argv:
        raise RuntimeError("unknown arguments: {}".format(argv))
    started_format = (
        "Process started: language={}, lower={}, stem={}, max_sentences={}"
        ", middle={}, readable={}")
    logger.info(started_format.format(
        language, lowering, stemming, max_sentences, with_middle, with_readable))
    tokenizer = tkrzw_tokenizer.Tokenizer()
    count = 0
    num_records = num_sentences = num_words = 0
    # Lazily strip lines and drop the blank ones so that only real records
    # are numbered; stdin is still consumed one line at a time.
    stripped_lines = (raw.strip() for raw in sys.stdin)
    records = (text for text in stripped_lines if text)
    for count, record in enumerate(records, 1):
        stats = ProcessTSV(
            tokenizer, language, lowering, stemming,
            max_sentences, with_middle, with_readable, record)
        if stats:
            num_records += 1
            num_sentences += stats[0]
            num_words += stats[1]
        if count % 1000 == 0:
            logger.info(
                "Processing: {} input records, {} output records, {} sentences, {} words"
                .format(count, num_records, num_sentences, num_words))
    logger.info(
        "Process done: {} input records, {} output records, {} sentences, {} words"
        .format(count, num_records, num_sentences, num_words))
def main():
    """Run the classification batch with command-line options.

    Flags (defaults in parentheses):
      --feature PATH          required
      --cluster PATH          required
      --total_words FLOAT     (100000)
      --item_features INT     (32)
      --cluster_features INT  (128)
      --extra_items INT       (15)

    Arguments that are not recognized flags are ignored.

    Raises:
      RuntimeError: if the feature path or the cluster path is missing or
        empty (checked in that order).
      ValueError: if a numeric flag value cannot be converted.
    """
    argv = sys.argv[1:]

    def option(name, fallback):
        # A missing or empty flag value falls back to the default.
        return tkrzw_dict.GetCommandFlag(argv, name, 1) or fallback

    feature_path = option("--feature", "")
    cluster_path = option("--cluster", "")
    num_total_words = float(option("--total_words", 100000))
    num_item_features = int(option("--item_features", 32))
    num_cluster_features = int(option("--cluster_features", 128))
    num_extra_items = int(option("--extra_items", 15))
    required = (
        (feature_path, "the feature path is required"),
        (cluster_path, "the cluster path is required"),
    )
    for value, message in required:
        if not value:
            raise RuntimeError(message)
    batch = ClassifyBatch(
        feature_path,
        cluster_path,
        num_total_words,
        num_item_features,
        num_cluster_features,
        num_extra_items,
    )
    batch.Run()
def main():
    """Run the Japanese-English union EPUB generation batch with CLI options.

    Flags (defaults in parentheses):
      --input PATH          ("union-body.tkh")
      --output PATH         ("union-dict-jaen-kindle")
      --supplement L1,...   comma-separated labels turned into a set ("xs")
      --tran_prob PATH      ("")
      --phrase_prob PATH    ("")
      --rev_prob PATH       ("")
      --yomi_first P1,...   comma-separated list
      --yomi_second P1,...  comma-separated list
      --tran_aux P1,...     comma-separated list
      --conj_verb PATH      no default; passed on exactly as returned
      --conj_adj PATH       no default; passed on exactly as returned
      --title TEXT          ("Union Japanese-English Dictionary")

    When a list flag is absent the resulting list is [""] (one empty
    string), which is passed on unchanged.

    The input and output paths always end up non-empty because a default
    is substituted whenever the flag is absent or empty, so no "path is
    required" validation is performed here.  Arguments that are not
    recognized flags are ignored.
    """
    args = sys.argv[1:]
    input_path = tkrzw_dict.GetCommandFlag(args, "--input", 1) or "union-body.tkh"
    output_path = (
        tkrzw_dict.GetCommandFlag(args, "--output", 1) or "union-dict-jaen-kindle")
    supplement_labels = set(
        (tkrzw_dict.GetCommandFlag(args, "--supplement", 1) or "xs").split(","))
    tran_prob_path = tkrzw_dict.GetCommandFlag(args, "--tran_prob", 1) or ""
    phrase_prob_path = tkrzw_dict.GetCommandFlag(args, "--phrase_prob", 1) or ""
    rev_prob_path = tkrzw_dict.GetCommandFlag(args, "--rev_prob", 1) or ""
    yomi_first_paths = (
        tkrzw_dict.GetCommandFlag(args, "--yomi_first", 1) or "").split(",")
    yomi_second_paths = (
        tkrzw_dict.GetCommandFlag(args, "--yomi_second", 1) or "").split(",")
    tran_aux_paths = (
        tkrzw_dict.GetCommandFlag(args, "--tran_aux", 1) or "").split(",")
    conj_verb_path = tkrzw_dict.GetCommandFlag(args, "--conj_verb", 1)
    conj_adj_path = tkrzw_dict.GetCommandFlag(args, "--conj_adj", 1)
    title = (
        tkrzw_dict.GetCommandFlag(args, "--title", 1)
        or "Union Japanese-English Dictionary")
    GenerateUnionEPUBBatch(
        input_path, output_path, supplement_labels,
        tran_prob_path, phrase_prob_path, rev_prob_path,
        yomi_first_paths, yomi_second_paths, tran_aux_paths,
        conj_verb_path, conj_adj_path, title).Run()
def main():
    """Run the English-Japanese union EPUB generation batch with CLI options.

    Flags (defaults in parentheses):
      --input PATH              ("union-body.tkh")
      --output PATH             ("union-dict-kindle")
      --keyword PATH            ("")
      --best L1,...             label set ("xa")
      --vetted L1,...           label set ("wn")
      --preferable L1,...       label set ("xa,wn,ox,we")
      --trustable L1,...        label set ("xa")
      --supplement L1,...       label set ("xs")
      --title TEXT              ("Union English-Japanese Dictionary")
      --min_prob_normal FLOAT   (0.0000001)
      --min_prob_capital FLOAT  (0.000001)
      --min_prob_multi FLOAT    (0.000001)
      --sufficient_prob FLOAT   (0.00001)
      --shrink                  switch, passed through as returned

    Each threshold is read from the flag of the same name, so
    --min_prob_capital feeds min_prob_capital and --min_prob_multi feeds
    min_prob_multi.

    The input and output paths always end up non-empty because a default
    is substituted whenever the flag is absent or empty, so no "path is
    required" validation is performed here.  Arguments that are not
    recognized flags are ignored.

    Raises:
      ValueError: if a threshold flag value cannot be converted to float.
    """
    args = sys.argv[1:]
    input_path = tkrzw_dict.GetCommandFlag(args, "--input", 1) or "union-body.tkh"
    output_path = tkrzw_dict.GetCommandFlag(args, "--output", 1) or "union-dict-kindle"
    keyword_path = tkrzw_dict.GetCommandFlag(args, "--keyword", 1) or ""
    best_labels = set(
        (tkrzw_dict.GetCommandFlag(args, "--best", 1) or "xa").split(","))
    vetted_labels = set(
        (tkrzw_dict.GetCommandFlag(args, "--vetted", 1) or "wn").split(","))
    preferable_labels = set(
        (tkrzw_dict.GetCommandFlag(args, "--preferable", 1)
         or "xa,wn,ox,we").split(","))
    trustable_labels = set(
        (tkrzw_dict.GetCommandFlag(args, "--trustable", 1) or "xa").split(","))
    supplement_labels = set(
        (tkrzw_dict.GetCommandFlag(args, "--supplement", 1) or "xs").split(","))
    title = (
        tkrzw_dict.GetCommandFlag(args, "--title", 1)
        or "Union English-Japanese Dictionary")
    min_prob_normal = float(
        tkrzw_dict.GetCommandFlag(args, "--min_prob_normal", 1) or 0.0000001)
    # Keep flag names and variable names paired: crossing them would apply
    # the capitalized-word threshold to multi-word entries and vice versa.
    min_prob_capital = float(
        tkrzw_dict.GetCommandFlag(args, "--min_prob_capital", 1) or 0.000001)
    min_prob_multi = float(
        tkrzw_dict.GetCommandFlag(args, "--min_prob_multi", 1) or 0.000001)
    sufficient_prob = float(
        tkrzw_dict.GetCommandFlag(args, "--sufficient_prob", 1) or 0.00001)
    shrink = tkrzw_dict.GetCommandFlag(args, "--shrink", 0)
    GenerateUnionEPUBBatch(
        input_path, output_path, keyword_path,
        best_labels, vetted_labels, preferable_labels,
        trustable_labels, supplement_labels, title,
        min_prob_normal, min_prob_capital, min_prob_multi,
        sufficient_prob, shrink).Run()