def main():
    """Command-line entry point: run ExtractKeysBatch on the input path.

    Flags read from sys.argv:
      --input  input path (default "union-body.tkh")
      --quiet  set the logger level to ERROR

    Raises:
      RuntimeError: if the argument list is still non-empty after the
        flags above have been looked up.
    """
    argv = sys.argv[1:]
    get_flag = tkrzw_dict.GetCommandFlag
    input_path = get_flag(argv, "--input", 1)
    if not input_path:
        input_path = "union-body.tkh"
    quiet = get_flag(argv, "--quiet", 0)
    if quiet:
        logger.setLevel(logging.ERROR)
    if argv:
        raise RuntimeError("unknown arguments: {}".format(str(argv)))
    batch = ExtractKeysBatch(input_path)
    batch.Run()
Exemple #2
0
def main():
    """Command-line entry point: run DivideCountBatch on a data prefix.

    Flags read from sys.argv:
      --data_prefix  data prefix (default "result")
      --quiet        set the logger level to ERROR

    Raises:
      RuntimeError: if the argument list is still non-empty after the
        flags above have been looked up.
    """
    argv = sys.argv[1:]

    def flag(name, num_args):
        # Thin wrapper so each lookup only names the flag and its arity.
        return tkrzw_dict.GetCommandFlag(argv, name, num_args)

    data_prefix = flag("--data_prefix", 1) or "result"
    if flag("--quiet", 0):
        logger.setLevel(logging.ERROR)
    if argv:
        raise RuntimeError("unknown arguments: {}".format(str(argv)))
    batch = DivideCountBatch(data_prefix)
    batch.Run()
def main():
    """Command-line entry point: run GenerateUnionEPUBBatch.

    Flags read from sys.argv:
      --input   input path (default "union-body.tkh")
      --output  output path (default "union-dict-epub")

    Raises:
      RuntimeError: if either path ends up empty.
    """
    argv = sys.argv[1:]
    get_flag = tkrzw_dict.GetCommandFlag
    input_path = get_flag(argv, "--input", 1) or "union-body.tkh"
    output_path = get_flag(argv, "--output", 1) or "union-dict-epub"
    # Both paths fall back to non-empty defaults above, so with the current
    # defaults these guards cannot fire; they are kept as a safety net.
    required = (
        (input_path, "an input path is required"),
        (output_path, "an output path is required"),
    )
    for path, message in required:
        if not path:
            raise RuntimeError(message)
    GenerateUnionEPUBBatch(input_path, output_path).Run()
def main():
    """Command-line entry point: run IndexTranslationsBatch.

    Flags read from sys.argv:
      --input   input path (default "wordnet-body.tkh")
      --output  output path (default "wordnet-tran-index.tkh")
      --quiet   set the logger level to ERROR

    Raises:
      RuntimeError: if the argument list is still non-empty after the
        flags above have been looked up.
    """
    argv = sys.argv[1:]
    get_flag = tkrzw_dict.GetCommandFlag
    input_path = get_flag(argv, "--input", 1)
    if not input_path:
        input_path = "wordnet-body.tkh"
    output_path = get_flag(argv, "--output", 1)
    if not output_path:
        output_path = "wordnet-tran-index.tkh"
    if get_flag(argv, "--quiet", 0):
        logger.setLevel(logging.ERROR)
    if argv:
        raise RuntimeError("unknown arguments: {}".format(str(argv)))
    batch = IndexTranslationsBatch(input_path, output_path)
    batch.Run()
Exemple #5
0
def main():
    """Command-line entry point: run BuildWordNetDBBatch.

    Flags read from sys.argv:
      --dict    dictionary directory (default "dict")
      --output  output path (default "wordnet.tkh")
      --prob    probability path (default "")
      --quiet   set the logger level to ERROR

    Raises:
      RuntimeError: if the argument list is still non-empty after the
        flags above have been looked up.
    """
    argv = sys.argv[1:]

    def flag(name, num_args=1):
        # Thin wrapper so each lookup only names the flag and its arity.
        return tkrzw_dict.GetCommandFlag(argv, name, num_args)

    dict_dir = flag("--dict") or "dict"
    output_path = flag("--output") or "wordnet.tkh"
    prob_path = flag("--prob") or ""
    if flag("--quiet", 0):
        logger.setLevel(logging.ERROR)
    if argv:
        raise RuntimeError("unknown arguments: {}".format(str(argv)))
    batch = BuildWordNetDBBatch(dict_dir, output_path, prob_path)
    batch.Run()
Exemple #6
0
def main():
  """Command-line entry point: run GenerateUnionEPUBBatch with thresholds.

  Flags read from sys.argv:
    --input           input path (default "union-body.tkh")
    --output          output path (default "union-dict-epub")
    --min_prob        float (default 0)
    --multi_min_prob  float (default 0.00002)

  Raises:
    RuntimeError: if either path ends up empty.
    ValueError: from float() if a numeric flag value cannot be parsed.
  """
  argv = sys.argv[1:]
  get_flag = tkrzw_dict.GetCommandFlag
  input_path = get_flag(argv, "--input", 1) or "union-body.tkh"
  output_path = get_flag(argv, "--output", 1) or "union-dict-epub"
  min_prob_text = get_flag(argv, "--min_prob", 1)
  min_prob = float(min_prob_text or 0)
  multi_min_prob_text = get_flag(argv, "--multi_min_prob", 1)
  multi_min_prob = float(multi_min_prob_text or 0.00002)
  # Both paths fall back to non-empty defaults above, so with the current
  # defaults these guards cannot fire; they are kept as a safety net.
  required = (
    (input_path, "an input path is required"),
    (output_path, "an output path is required"),
  )
  for path, message in required:
    if not path:
      raise RuntimeError(message)
  batch = GenerateUnionEPUBBatch(input_path, output_path, min_prob, multi_min_prob)
  batch.Run()
Exemple #7
0
def main():
  """Command-line entry point: collect tuning flags and call Run.

  Flags read from sys.argv (with defaults):
    --rev_prob ("")            --min_count (10)
    --enough_ef (2.0)          --enough_fe (2.0)
    --omit_latin (switch)      --min_score (0.25)
    --min_score_large (0.35)   --min_score_stop (0.3)
    --max_targets (8)          --tran_aux ("" -> comma-separated list)

  Leftover arguments are not checked here.

  Raises:
    ValueError: from int()/float() if a numeric flag value cannot be parsed.
  """
  argv = sys.argv[1:]

  def flag(name, num_args=1):
    # Thin wrapper so each lookup only names the flag and its arity.
    return tkrzw_dict.GetCommandFlag(argv, name, num_args)

  rev_prob_path = flag("--rev_prob") or ""
  min_count = int(flag("--min_count") or 10)
  enough_ef = float(flag("--enough_ef") or 2.0)
  enough_fe = float(flag("--enough_fe") or 2.0)
  omit_latin = flag("--omit_latin", 0)
  min_score = float(flag("--min_score") or 0.25)
  min_score_large = float(flag("--min_score_large") or 0.35)
  min_score_stop = float(flag("--min_score_stop") or 0.3)
  max_targets = int(flag("--max_targets") or 8)
  # Splitting an empty spec yields [""], not an empty list.
  tran_aux_spec = flag("--tran_aux") or ""
  tran_aux_paths = tran_aux_spec.split(",")
  Run(
    rev_prob_path, min_count, enough_ef, enough_fe, omit_latin,
    min_score, min_score_large, min_score_stop, max_targets, tran_aux_paths,
  )
Exemple #8
0
def main():
    """Command-line entry point: run IndexTranslationsBatch with supplements.

    Flags read from sys.argv:
      --input       input path (default "union-body.tkh")
      --output      output path (default "union-tran-index.tkh")
      --supplement  comma-separated labels, stored as a set (default "xs")
      --tran_prob   path (default "")
      --quiet       set the logger level to ERROR

    Raises:
      RuntimeError: if the argument list is still non-empty after the
        flags above have been looked up.
    """
    argv = sys.argv[1:]
    get_flag = tkrzw_dict.GetCommandFlag
    input_path = get_flag(argv, "--input", 1) or "union-body.tkh"
    output_path = get_flag(argv, "--output", 1) or "union-tran-index.tkh"
    supplement_spec = get_flag(argv, "--supplement", 1) or "xs"
    supplement_labels = set(supplement_spec.split(","))
    tran_prob_path = get_flag(argv, "--tran_prob", 1) or ""
    if get_flag(argv, "--quiet", 0):
        logger.setLevel(logging.ERROR)
    if argv:
        raise RuntimeError("unknown arguments: {}".format(str(argv)))
    batch = IndexTranslationsBatch(
        input_path, output_path, supplement_labels, tran_prob_path)
    batch.Run()
Exemple #9
0
def main():
    """Command-line entry point: predict related words for the given text.

    Flags read from sys.argv:
      --data_prefix  data prefix (default "result")
      --language     language code (default "en")

    Whatever remains in the argument list is joined with spaces and used
    as the query text. Prints up to 16 features and up to 32 related words.

    Raises:
      RuntimeError: if the query text is empty.
    """
    argv = sys.argv[1:]
    get_flag = tkrzw_dict.GetCommandFlag
    data_prefix = get_flag(argv, "--data_prefix", 1) or "result"
    language = get_flag(argv, "--language", 1) or "en"
    text = " ".join(argv)
    if not text:
        raise RuntimeError("words are not specified")
    predictor = tkrzw_related_word_predictor.RelatedWordsPredictor(
        data_prefix, language)
    rel_words, features = predictor.Predict(text)
    top_features = features[:16]
    top_rel_words = rel_words[:32]
    print("==== FEATURES ====")
    for word, score in top_features:
        print("{} = {:.2f}".format(word, score))
    print()
    print("==== RELATED WORDS ====")
    for word, score in top_rel_words:
        print("{} = {:.4f}".format(word, score))
Exemple #10
0
def main():
  """Command-line entry point: parse XML from stdin through XMLHandler.

  Flags read from sys.argv:
    --sampling  sampling ratio, must satisfy 0 < ratio <= 1 (default 1.0)
    --max       maximum number of outputs, must be >= 0 (default sys.maxsize)
    --quiet     set the logger level to ERROR

  Raises:
    RuntimeError: if the argument list is still non-empty after the flags
      above have been looked up.
    ValueError: if a numeric flag cannot be parsed or is out of range.
  """
  args = sys.argv[1:]
  sampling_ratio = float(tkrzw_dict.GetCommandFlag(args, "--sampling", 1) or 1.0)
  max_outputs = int(tkrzw_dict.GetCommandFlag(args, "--max", 1) or sys.maxsize)
  if tkrzw_dict.GetCommandFlag(args, "--quiet", 0):
    logger.setLevel(logging.ERROR)
  if args:
    raise RuntimeError("unknown arguments: {}".format(str(args)))
  # The chained comparison also rejects NaN, which the pair of separate
  # "<= 0" / "> 1" tests would let through.
  if not 0 < sampling_ratio <= 1:
    raise ValueError("invalid sampling ratio")
  if max_outputs < 0:
    raise ValueError("invalid max outputs")
  logger.info("Process started")
  parser = xml.sax.make_parser()
  handler = XMLHandler(sampling_ratio, max_outputs)
  parser.setContentHandler(handler)
  try:
    parser.parse(sys.stdin)
  except xml.sax.SAXParseException as error:
    # Malformed or truncated input: keep the best-effort behavior (no crash)
    # but make the problem visible instead of silently swallowing it.
    logger.warning("XML parsing aborted: {}".format(error))
  except xml.sax.SAXException as error:
    # NOTE(review): presumably the handler raises SAXException to stop early
    # (e.g. once max_outputs is reached) -- confirm against XMLHandler.
    logger.info("XML parsing stopped: {}".format(error))
  logger.info("Process done")
Exemple #11
0
def main():
    """Command-line entry point: parse XML from stdin and print a count.

    Flags read from sys.argv:
      --quiet  set the logger level to ERROR

    After parsing, prints handler.getCount() to stdout (flushed).

    Raises:
      RuntimeError: if the argument list is still non-empty after the
        flag above has been looked up.
    """
    argv = sys.argv[1:]
    quiet = tkrzw_dict.GetCommandFlag(argv, "--quiet", 0)
    if quiet:
        logger.setLevel(logging.ERROR)
    if argv:
        raise RuntimeError("unknown arguments: {}".format(str(argv)))
    logger.info("Process started")
    sax_parser = xml.sax.make_parser()
    counter = XMLHandler()
    sax_parser.setContentHandler(counter)
    sax_parser.parse(sys.stdin)
    total = counter.getCount()
    print(total, flush=True)
    logger.info("Process done")
Exemple #12
0
def main():
    """Command-line entry point: look up words with WordNetSearcher.

    Flags read from sys.argv:
      --data_prefix  data prefix (default "wordnet")
      --direction    "auto", "reverse", or anything else for a forward
                     search (default "auto")
      --details      passed to PrintResultWord as show_details

    Whatever remains in the argument list is joined with spaces and used
    as the query. In "auto" mode the search is reversed when
    tkrzw_dict.PredictLanguage(text) is not "en".

    Raises:
      RuntimeError: if the query text is empty.
    """
    argv = sys.argv[1:]
    get_flag = tkrzw_dict.GetCommandFlag
    data_prefix = get_flag(argv, "--data_prefix", 1) or "wordnet"
    direction = get_flag(argv, "--direction", 1) or "auto"
    show_details = get_flag(argv, "--details", 0)
    text = " ".join(argv)
    if not text:
        raise RuntimeError("words are not specified")
    if direction == "auto":
        reverse = tkrzw_dict.PredictLanguage(text) != "en"
    else:
        reverse = direction == "reverse"
    searcher = tkrzw_wordnet_searcher.WordNetSearcher(data_prefix)
    search = searcher.SearchReverse if reverse else searcher.SearchExact
    result = search(text)
    if not result:
        print("No result.")
        return
    for key, entry in result:
        PrintResultWord(key, entry, show_details)
def main():
    """Command-line entry point: collect path and threshold flags, call Run.

    Flags read from sys.argv (with defaults):
      --phrase_prob ("")   --rev_prob ("")   --tran_prob ("")
      --tran_aux ("")      --yomi ("")
      --min_phrase_prob (.000001)   --min_tran_prob (0.1)

    --tran_aux and --yomi are passed to Run as plain strings; no splitting
    is done here. Leftover arguments are not checked here.

    Raises:
      ValueError: from float() if a numeric flag value cannot be parsed.
    """
    argv = sys.argv[1:]

    def flag(name, num_args=1):
        # Thin wrapper so each lookup only names the flag and its arity.
        return tkrzw_dict.GetCommandFlag(argv, name, num_args)

    phrase_prob_path = flag("--phrase_prob") or ""
    rev_prob_path = flag("--rev_prob") or ""
    tran_prob_path = flag("--tran_prob") or ""
    tran_aux_paths = flag("--tran_aux") or ""
    yomi_paths = flag("--yomi") or ""
    min_phrase_prob = float(flag("--min_phrase_prob") or .000001)
    min_tran_prob = float(flag("--min_tran_prob") or 0.1)
    Run(
        phrase_prob_path, rev_prob_path, tran_prob_path, tran_aux_paths,
        yomi_paths, min_phrase_prob, min_tran_prob,
    )
Exemple #14
0
def main():
    """Command-line entry point: run WordCountBatch.

    Flags read from sys.argv (with defaults):
      --data_prefix ("result-para")   --keyword ("")
      --dict ("")                     --thes ("")
      --source_ngram (3)              --target_ngram (3)
      --target_stem (switch)
      --quiet  set the logger level to ERROR

    Raises:
      RuntimeError: if the argument list is still non-empty after the
        flags above have been looked up.
      ValueError: from int() if an n-gram flag value cannot be parsed.
    """
    argv = sys.argv[1:]
    get_flag = tkrzw_dict.GetCommandFlag
    data_prefix = get_flag(argv, "--data_prefix", 1) or "result-para"
    keyword_path = get_flag(argv, "--keyword", 1) or ""
    dict_path = get_flag(argv, "--dict", 1) or ""
    thes_path = get_flag(argv, "--thes", 1) or ""
    source_ngram_text = get_flag(argv, "--source_ngram", 1) or "3"
    source_ngram = int(source_ngram_text)
    target_ngram_text = get_flag(argv, "--target_ngram", 1) or "3"
    target_ngram = int(target_ngram_text)
    target_stem = get_flag(argv, "--target_stem", 0)
    if get_flag(argv, "--quiet", 0):
        logger.setLevel(logging.ERROR)
    if argv:
        raise RuntimeError("unknown arguments: {}".format(str(argv)))
    batch = WordCountBatch(
        data_prefix, keyword_path, dict_path, thes_path,
        source_ngram, target_ngram, target_stem)
    batch.Run()
Exemple #15
0
def main():
    """Command-line entry point: run AppendWordnetJPNBatch.

    Flags read from sys.argv (with defaults):
      --input ("wordnet.tkh")       --output ("wordnet-body.tkh")
      --wnjpn ("wnjpn-ok.tab")      --phrase_prob ("")
      --rev_prob ("")               --tran_prob ("")
      --tran_aux ("" -> comma-separated list)
      --tran_subaux ("" -> comma-separated list)
      --quiet  set the logger level to ERROR

    Raises:
      RuntimeError: if the argument list is still non-empty after the
        flags above have been looked up.
    """
    args = sys.argv[1:]
    # The default used to be spelled "wordnet.thk"; every other database
    # default in these tools uses the ".tkh" extension, so the transposed
    # spelling pointed at a file that the other steps never produce.
    input_path = tkrzw_dict.GetCommandFlag(args, "--input", 1) or "wordnet.tkh"
    output_path = tkrzw_dict.GetCommandFlag(args, "--output",
                                            1) or "wordnet-body.tkh"
    wnjpn_path = tkrzw_dict.GetCommandFlag(args, "--wnjpn",
                                           1) or "wnjpn-ok.tab"
    phrase_prob_path = tkrzw_dict.GetCommandFlag(args, "--phrase_prob",
                                                 1) or ""
    rev_prob_path = tkrzw_dict.GetCommandFlag(args, "--rev_prob", 1) or ""
    tran_prob_path = tkrzw_dict.GetCommandFlag(args, "--tran_prob", 1) or ""
    # Splitting an empty spec yields [""], not an empty list.
    tran_aux_paths = (tkrzw_dict.GetCommandFlag(args, "--tran_aux", 1)
                      or "").split(",")
    tran_subaux_paths = (tkrzw_dict.GetCommandFlag(args, "--tran_subaux", 1)
                         or "").split(",")
    if tkrzw_dict.GetCommandFlag(args, "--quiet", 0):
        logger.setLevel(logging.ERROR)
    if args:
        raise RuntimeError("unknown arguments: {}".format(str(args)))
    AppendWordnetJPNBatch(input_path, output_path, wnjpn_path,
                          phrase_prob_path, rev_prob_path, tran_prob_path,
                          tran_aux_paths, tran_subaux_paths).Run()
Exemple #16
0
def main():
  """Command-line entry point: run ExtractTransBatch.

  Flags read from sys.argv (with defaults):
    --data_prefix ("result-para")   --min_count (2)
    --min_score (0.01)              --enough_count (10)
    --base_count (2)
    --quiet  set the logger level to ERROR

  Raises:
    RuntimeError: if the argument list is still non-empty after the flags
      above have been looked up.
    ValueError: from int()/float() if a numeric flag value cannot be parsed.
  """
  argv = sys.argv[1:]

  def flag(name, num_args=1):
    # Thin wrapper so each lookup only names the flag and its arity.
    return tkrzw_dict.GetCommandFlag(argv, name, num_args)

  data_prefix = flag("--data_prefix") or "result-para"
  min_count = int(flag("--min_count") or 2)
  min_score = float(flag("--min_score") or 0.01)
  enough_count = int(flag("--enough_count") or 10)
  base_count = int(flag("--base_count") or 2)
  if flag("--quiet", 0):
    logger.setLevel(logging.ERROR)
  if argv:
    raise RuntimeError("unknown arguments: {}".format(str(argv)))
  batch = ExtractTransBatch(
    data_prefix, min_count, min_score, enough_count, base_count)
  batch.Run()
def main():
  """Command-line entry point: run GenerateUnionVocabBatch.

  Flags read from sys.argv (with defaults):
    --vocab, --body, --phrase, --output  paths, all required (default "")
    --extra_items (0)        --section_clusters (1)
    --child_min_prob (0)     --title ("連想英単語帳")

  Leftover arguments are not checked here.

  Raises:
    RuntimeError: if any of the four required paths is empty.
    ValueError: from int()/float() if a numeric flag value cannot be parsed.
  """
  argv = sys.argv[1:]
  get_flag = tkrzw_dict.GetCommandFlag
  vocab_path = get_flag(argv, "--vocab", 1) or ""
  body_path = get_flag(argv, "--body", 1) or ""
  phrase_path = get_flag(argv, "--phrase", 1) or ""
  output_path = get_flag(argv, "--output", 1) or ""
  num_extra_items = int(get_flag(argv, "--extra_items", 1) or 0)
  num_section_clusters = int(get_flag(argv, "--section_clusters", 1) or 1)
  child_min_prob = float(get_flag(argv, "--child_min_prob", 1) or 0)
  title = get_flag(argv, "--title", 1) or "連想英単語帳"
  # Checked in this order so the first missing path decides the message.
  required = (
    (vocab_path, "the vocab path is required"),
    (body_path, "the body path is required"),
    (phrase_path, "the phrase path is required"),
    (output_path, "the output path is required"),
  )
  for path, message in required:
    if not path:
      raise RuntimeError(message)
  batch = GenerateUnionVocabBatch(
    vocab_path, body_path, phrase_path, output_path,
    num_extra_items, num_section_clusters, child_min_prob, title)
  batch.Run()
Exemple #18
0
def main():
  """Command-line entry point: run ClusterBatch.

  Flags read from sys.argv (with defaults):
    --clusters (500)        --rounds (100)       --items (10000)
    --item_features (40)    --cluster_features (160)
    --quiet  set the logger level to ERROR

  Raises:
    RuntimeError: if the argument list is still non-empty after the flags
      above have been looked up.
    ValueError: from int() if a numeric flag value cannot be parsed.
  """
  argv = sys.argv[1:]

  def int_flag(name, default):
    # Look up a single-value flag and convert it (or the default) to int.
    return int(tkrzw_dict.GetCommandFlag(argv, name, 1) or default)

  num_clusters = int_flag("--clusters", 500)
  num_rounds = int_flag("--rounds", 100)
  num_items = int_flag("--items", 10000)
  num_item_features = int_flag("--item_features", 40)
  num_cluster_features = int_flag("--cluster_features", 160)
  if tkrzw_dict.GetCommandFlag(argv, "--quiet", 0):
    logger.setLevel(logging.ERROR)
  if argv:
    raise RuntimeError("unknown arguments: {}".format(str(argv)))
  batch = ClusterBatch(
    num_clusters, num_rounds, num_items, num_item_features, num_cluster_features)
  batch.Run()
Exemple #19
0
def main():
    """Command-line entry point: feed stdin lines through ProcessTSV.

    Flags read from sys.argv (with defaults):
      --language ("en")       --lower (switch)     --stem (switch)
      --max_sentences (1000000)
      --middle (switch)       --readable (switch)
      --quiet  set the logger level to ERROR

    Each non-blank, stripped stdin line is passed to ProcessTSV. When the
    call returns a truthy value, its first two items are added to the
    sentence and word totals. Progress is logged every 1000 input records.

    Raises:
      RuntimeError: if the argument list is still non-empty after the
        flags above have been looked up.
      ValueError: from int() if --max_sentences cannot be parsed.
    """
    argv = sys.argv[1:]
    get_flag = tkrzw_dict.GetCommandFlag
    language = get_flag(argv, "--language", 1) or "en"
    lowering = get_flag(argv, "--lower", 0)
    stemming = get_flag(argv, "--stem", 0)
    max_sentences = int(get_flag(argv, "--max_sentences", 1) or "1000000")
    with_middle = get_flag(argv, "--middle", 0)
    with_readable = get_flag(argv, "--readable", 0)
    if get_flag(argv, "--quiet", 0):
        logger.setLevel(logging.ERROR)
    if argv:
        raise RuntimeError("unknown arguments: {}".format(str(argv)))
    start_message = (
        "Process started: language={}, lower={}, stem={}, max_sentences={}"
        ", middle={}, readable={}").format(
            language, lowering, stemming, max_sentences,
            with_middle, with_readable)
    logger.info(start_message)
    tokenizer = tkrzw_tokenizer.Tokenizer()
    num_inputs = 0
    num_records = num_sentences = num_words = 0
    for raw_line in sys.stdin:
        record = raw_line.strip()
        if not record:
            continue
        num_inputs += 1
        stats = ProcessTSV(
            tokenizer, language, lowering, stemming,
            max_sentences, with_middle, with_readable, record)
        if stats:
            num_records += 1
            num_sentences += stats[0]
            num_words += stats[1]
        if num_inputs % 1000 == 0:
            logger.info(
                "Processing: {} input records, {} output records, {} sentences, {} words"
                .format(num_inputs, num_records, num_sentences, num_words))
    logger.info(
        "Process done: {} input records, {} output records, {} sentences, {} words"
        .format(num_inputs, num_records, num_sentences, num_words))
def main():
    """Command-line entry point: run ClassifyBatch.

    Flags read from sys.argv (with defaults):
      --feature, --cluster  paths, both required (default "")
      --total_words (100000, as float)
      --item_features (32)   --cluster_features (128)   --extra_items (15)

    Leftover arguments are not checked here.

    Raises:
      RuntimeError: if the feature path or the cluster path is empty.
      ValueError: from int()/float() if a numeric flag cannot be parsed.
    """
    argv = sys.argv[1:]

    def flag(name):
        # All flags of this command take exactly one value.
        return tkrzw_dict.GetCommandFlag(argv, name, 1)

    feature_path = flag("--feature") or ""
    cluster_path = flag("--cluster") or ""
    num_total_words = float(flag("--total_words") or 100000)
    num_item_features = int(flag("--item_features") or 32)
    num_cluster_features = int(flag("--cluster_features") or 128)
    num_extra_items = int(flag("--extra_items") or 15)
    required = (
        (feature_path, "the feature path is required"),
        (cluster_path, "the cluster path is required"),
    )
    for path, message in required:
        if not path:
            raise RuntimeError(message)
    batch = ClassifyBatch(
        feature_path, cluster_path, num_total_words,
        num_item_features, num_cluster_features, num_extra_items)
    batch.Run()
def main():
    """Command-line entry point: run GenerateUnionEPUBBatch (ja-en Kindle).

    Flags read from sys.argv (with defaults):
      --input ("union-body.tkh")   --output ("union-dict-jaen-kindle")
      --supplement ("xs", comma-separated, stored as a set)
      --tran_prob ("")   --phrase_prob ("")   --rev_prob ("")
      --yomi_first, --yomi_second, --tran_aux  ("" -> comma-separated lists)
      --conj_verb, --conj_adj  (no default; passed on as returned)
      --title ("Union Japanese-English Dictionary")

    Leftover arguments are not checked here.

    Raises:
      RuntimeError: if either the input or the output path ends up empty.
    """
    argv = sys.argv[1:]

    def flag(name):
        # All flags of this command take exactly one value.
        return tkrzw_dict.GetCommandFlag(argv, name, 1)

    def path_list(name):
        # Splitting an empty spec yields [""], not an empty list.
        return (flag(name) or "").split(",")

    input_path = flag("--input") or "union-body.tkh"
    output_path = flag("--output") or "union-dict-jaen-kindle"
    supplement_labels = set((flag("--supplement") or "xs").split(","))
    tran_prob_path = flag("--tran_prob") or ""
    phrase_prob_path = flag("--phrase_prob") or ""
    rev_prob_path = flag("--rev_prob") or ""
    yomi_first_paths = path_list("--yomi_first")
    yomi_second_paths = path_list("--yomi_second")
    tran_aux_paths = path_list("--tran_aux")
    conj_verb_path = flag("--conj_verb")
    conj_adj_path = flag("--conj_adj")
    title = flag("--title") or "Union Japanese-English Dictionary"
    # Both paths fall back to non-empty defaults above, so with the current
    # defaults these guards cannot fire; they are kept as a safety net.
    required = (
        (input_path, "an input path is required"),
        (output_path, "an output path is required"),
    )
    for path, message in required:
        if not path:
            raise RuntimeError(message)
    batch = GenerateUnionEPUBBatch(
        input_path, output_path, supplement_labels,
        tran_prob_path, phrase_prob_path, rev_prob_path,
        yomi_first_paths, yomi_second_paths, tran_aux_paths,
        conj_verb_path, conj_adj_path, title)
    batch.Run()
Exemple #22
0
def main():
  """Command-line entry point: run GenerateUnionEPUBBatch (en-ja Kindle).

  Flags read from sys.argv (with defaults):
    --input ("union-body.tkh")   --output ("union-dict-kindle")
    --keyword ("")
    --best ("xa")   --vetted ("wn")   --preferable ("xa,wn,ox,we")
    --trustable ("xa")   --supplement ("xs")
        (label flags are comma-separated and stored as sets)
    --title ("Union English-Japanese Dictionary")
    --min_prob_normal (0.0000001)   --min_prob_capital (0.000001)
    --min_prob_multi (0.000001)     --sufficient_prob (0.00001)
    --shrink (switch)

  Leftover arguments are not checked here.

  Raises:
    RuntimeError: if either the input or the output path ends up empty.
    ValueError: from float() if a numeric flag value cannot be parsed.
  """
  args = sys.argv[1:]
  input_path = tkrzw_dict.GetCommandFlag(args, "--input", 1) or "union-body.tkh"
  output_path = tkrzw_dict.GetCommandFlag(args, "--output", 1) or "union-dict-kindle"
  keyword_path = tkrzw_dict.GetCommandFlag(args, "--keyword", 1) or ""
  best_labels = set((tkrzw_dict.GetCommandFlag(args, "--best", 1) or "xa").split(","))
  vetted_labels = set((tkrzw_dict.GetCommandFlag(args, "--vetted", 1) or "wn").split(","))
  preferable_labels = set((tkrzw_dict.GetCommandFlag(
    args, "--preferable", 1) or "xa,wn,ox,we").split(","))
  trustable_labels = set((tkrzw_dict.GetCommandFlag(
    args, "--trustable", 1) or "xa").split(","))
  supplement_labels = set((tkrzw_dict.GetCommandFlag(args, "--supplement", 1) or "xs").split(","))
  title = tkrzw_dict.GetCommandFlag(args, "--title", 1) or "Union English-Japanese Dictionary"
  min_prob_normal = float(tkrzw_dict.GetCommandFlag(args, "--min_prob_normal", 1) or 0.0000001)
  # Each threshold reads the flag that bears its own name. These two lookups
  # used to be crossed, so --min_prob_capital set the multi-word threshold
  # and vice versa. The defaults are equal, so default runs are unaffected.
  min_prob_capital = float(tkrzw_dict.GetCommandFlag(args, "--min_prob_capital", 1) or 0.000001)
  min_prob_multi = float(tkrzw_dict.GetCommandFlag(args, "--min_prob_multi", 1) or 0.000001)
  sufficient_prob = float(tkrzw_dict.GetCommandFlag(args, "--sufficient_prob", 1) or 0.00001)
  shrink = tkrzw_dict.GetCommandFlag(args, "--shrink", 0)
  # Both paths fall back to non-empty defaults above, so with the current
  # defaults these guards cannot fire; they are kept as a safety net.
  if not input_path:
    raise RuntimeError("an input path is required")
  if not output_path:
    raise RuntimeError("an output path is required")
  GenerateUnionEPUBBatch(
    input_path, output_path, keyword_path,
    best_labels, vetted_labels, preferable_labels, trustable_labels, supplement_labels,
    title, min_prob_normal, min_prob_capital, min_prob_multi, sufficient_prob, shrink).Run()