Example 1
 def __init__(self, input_path, output_path, supplement_labels,
              tran_prob_path):
     self.input_path = input_path
     self.output_path = output_path
     self.supplement_labels = supplement_labels
     self.tran_prob_path = tran_prob_path
     self.tokenizer = tkrzw_tokenizer.Tokenizer()
Example 2
 def Run(self):
     tokenizer = tkrzw_tokenizer.Tokenizer()
     start_time = time.time()
     logger.info(
         "Process started: input_path={}, output_path={}, wnjpn_path={}".
         format(self.input_path, self.output_path, self.wnjpn_path))
     wnjpn_trans = self.ReadTranslations()
     aux_trans, subaux_trans = self.ReadAuxTranslations()
     synset_index = self.ReadSynsetIndex()
     self.AppendTranslations(wnjpn_trans, aux_trans, subaux_trans,
                             synset_index)
     logger.info("Process done: elapsed_time={:.2f}s".format(time.time() -
                                                             start_time))
Example 3
 def Run(self):
   tokenizer = tkrzw_tokenizer.Tokenizer()
   start_time = time.time()
   logger.info("Process started: input_path={}, output_path={}, wnjpn_path={}".format(
                 self.input_path, self.output_path, self.wnjpn_path))
   wnjpn_trans = self.ReadTranslations()
   if self.feedback_path:
     feedback_trans = self.ReadFeedbackTranslations()
   else:
     feedback_trans = None
   aux_trans, subaux_trans, tran_thes = self.ReadAuxTranslations()
   synset_index = self.ReadSynsetIndex()
   tran_index = self.ReadTranIndex(synset_index)
   self.AppendTranslations(
     wnjpn_trans, feedback_trans, aux_trans, subaux_trans, tran_thes, synset_index, tran_index)
   logger.info("Process done: elapsed_time={:.2f}s".format(time.time() - start_time))
Example 4
 def __init__(self, input_path, output_path, supplement_labels,
              tran_prob_path, phrase_prob_path, rev_prob_path,
              yomi_first_paths, yomi_second_paths, tran_aux_paths,
              conj_verb_path, conj_adj_path, title):
     self.input_path = input_path
     self.output_path = output_path
     self.supplement_labels = supplement_labels
     self.tran_prob_path = tran_prob_path
     self.phrase_prob_path = phrase_prob_path
     self.rev_prob_path = rev_prob_path
     self.yomi_first_paths = yomi_first_paths
     self.yomi_second_paths = yomi_second_paths
     self.tran_aux_paths = tran_aux_paths
     self.conj_verb_path = conj_verb_path
     self.conj_adj_path = conj_adj_path
     self.title = title
     self.tokenizer = tkrzw_tokenizer.Tokenizer()
     self.num_words = 0
     self.num_items = 0
Example 5
def main():
    args = sys.argv[1:]
    language = tkrzw_dict.GetCommandFlag(args, "--language", 1) or "en"
    lowering = tkrzw_dict.GetCommandFlag(args, "--lower", 0)
    stemming = tkrzw_dict.GetCommandFlag(args, "--stem", 0)
    max_sentences = int(
        tkrzw_dict.GetCommandFlag(args, "--max_sentences", 1) or "1000000")
    with_middle = tkrzw_dict.GetCommandFlag(args, "--middle", 0)
    with_readable = tkrzw_dict.GetCommandFlag(args, "--readable", 0)
    if tkrzw_dict.GetCommandFlag(args, "--quiet", 0):
        logger.setLevel(logging.ERROR)
    if args:
        raise RuntimeError("unknown arguments: {}".format(str(args)))
    logger.info(
        ("Process started: language={}, lower={}, stem={}, max_sentences={}"
         ", middle={}, readable={}").format(language, lowering, stemming,
                                            max_sentences, with_middle,
                                            with_readable))
    tokenizer = tkrzw_tokenizer.Tokenizer()
    count = 0
    num_records, num_sentences, num_words = 0, 0, 0
    for line in sys.stdin:
        line = line.strip()
        if not line: continue
        count += 1
        stats = ProcessTSV(tokenizer, language, lowering, stemming,
                           max_sentences, with_middle, with_readable, line)
        if stats:
            num_records += 1
            num_sentences += stats[0]
            num_words += stats[1]
        if count % 1000 == 0:
            logger.info(
                "Processing: {} input records, {} output records, {} sentences, {} words"
                .format(count, num_records, num_sentences, num_words))
    logger.info(
        "Process done: {} input records, {} output records, {} sentences, {} words"
        .format(count, num_records, num_sentences, num_words))
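ProcessTSV itself is not shown in this example. The sketch below is only an assumption about its shape (the name ProcessTSVSketch and its body are hypothetical): it reuses the tokenizer.Tokenize(language, text, lowering, stemming) call pattern visible in the GetPhraseProb example further down, assuming the third and fourth arguments are the lowering and stemming flags, treats each TSV field as one sentence, and returns the (num_sentences, num_words) pair that main() accumulates.

import tkrzw_tokenizer

def ProcessTSVSketch(tokenizer, language, lowering, stemming, max_sentences, line):
  # Hypothetical stand-in for ProcessTSV: tokenize each TSV field as one
  # sentence and report the counts that main() adds up.
  num_sentences, num_words = 0, 0
  for sentence in line.split("\t")[:max_sentences]:
    tokens = tokenizer.Tokenize(language, sentence, lowering, stemming)
    if not tokens:
      continue
    num_sentences += 1
    num_words += len(tokens)
    print(" ".join(tokens))
  return (num_sentences, num_words) if num_sentences else None

tokenizer = tkrzw_tokenizer.Tokenizer()
print(ProcessTSVSketch(tokenizer, "en", True, False, 1000000, "Hello world.\tGood morning."))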
Example 6
 def __init__(self, input_path, output_path, keyword_path,
              best_labels, vetted_labels, preferable_labels, trustable_labels,
              supplement_labels, title,
              min_prob_normal, min_prob_capital, min_prob_multi, sufficient_prob, shrink):
   self.input_path = input_path
   self.output_path = output_path
   self.keyword_path = keyword_path
   self.best_labels = best_labels
   self.vetted_labels = vetted_labels
   self.preferable_labels = preferable_labels
   self.trustable_labels = trustable_labels
   self.supplement_labels = supplement_labels
   self.title = title
   self.min_prob_normal = min_prob_normal
   self.min_prob_capital = min_prob_capital
   self.min_prob_multi = min_prob_multi
   self.sufficient_prob = sufficient_prob
   self.shrink = shrink
   self.num_words = 0
   self.num_trans = 0
   self.num_items = 0
   self.num_aux_items = 0
   self.label_counters = collections.defaultdict(int)
   self.tokenizer = tkrzw_tokenizer.Tokenizer()
Example 7
 def AppendTranslations(self, wnjpn_trans, aux_trans, subaux_trans,
                        synset_index):
     start_time = time.time()
     logger.info(
         "Appending translations: input_path={}, output_path={}".format(
             self.input_path, self.output_path))
     input_dbm = tkrzw.DBM()
     input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
     phrase_prob_dbm = None
     if self.phrase_prob_path:
         phrase_prob_dbm = tkrzw.DBM()
         phrase_prob_dbm.Open(self.phrase_prob_path, False,
                              dbm="HashDBM").OrDie()
     rev_prob_dbm = None
     if self.rev_prob_path:
         rev_prob_dbm = tkrzw.DBM()
         rev_prob_dbm.Open(self.rev_prob_path, False, dbm="HashDBM").OrDie()
     tokenizer = tkrzw_tokenizer.Tokenizer()
     tran_prob_dbm = None
     if self.tran_prob_path:
         tran_prob_dbm = tkrzw.DBM()
         tran_prob_dbm.Open(self.tran_prob_path, False,
                            dbm="HashDBM").OrDie()
     output_dbm = tkrzw.DBM()
     num_buckets = input_dbm.Count() * 2
     output_dbm.Open(self.output_path,
                     True,
                     dbm="HashDBM",
                     truncate=True,
                     align_pow=0,
                     num_buckets=num_buckets).OrDie()
     num_words = 0
     num_orig_trans = 0
     num_match_trans = 0
     num_voted_trans = 0
     num_items = 0
     num_items_bare = 0
     num_items_rescued = 0
     it = input_dbm.MakeIterator()
     it.First()
     while True:
         record = it.GetStr()
         if not record: break
         key, serialized = record
         entry = json.loads(serialized)
         items = entry["item"]
         spell_ratios = {}
         for item in items:
             word = item["word"]
             phrase_prob = float(item.get("prob") or 0.0)
             spell_ratios[word] = phrase_prob + 0.00000001
         sum_prob = 0.0
         for word, prob in spell_ratios.items():
             sum_prob += prob
         for word, prob in list(spell_ratios.items()):
             spell_ratios[word] = prob / sum_prob
         for item in items:
             word = item["word"]
             pos = item["pos"]
             synset = item["synset"]
             links = item.get("link") or {}
             phrase_prob = float(item.get("prob") or 0.0)
             spell_ratio = spell_ratios[word]
             synonyms = item.get("synonym") or []
             hypernyms = item.get("hypernym") or []
             hyponyms = item.get("hyponym") or []
             similars = item.get("similar") or []
             derivatives = item.get("derivative") or []
             synonym_ids = links.get("synonym") or []
             hypernym_ids = links.get("hypernym") or []
             hyponym_ids = links.get("hyponym") or []
             similar_ids = links.get("similar") or []
             derivative_ids = links.get("derivative") or []
             item_tran_pairs = wnjpn_trans.get(synset) or []
             item_aux_trans = aux_trans.get(word) or []
             item_aux_trans.extend(subaux_trans.get(word) or [])
             self.NormalizeTranslationList(tokenizer, pos, item_aux_trans)
             item_trans = []
             hand_trans = set()
             for tran, src in item_tran_pairs:
                 if src == "mono":
                     hit = False
                     for item_aux_tran in item_aux_trans:
                         dist = tkrzw.Utility.EditDistanceLev(
                             tran, item_aux_tran)
                         dist_ratio = dist / max(len(tran),
                                                 len(item_aux_tran))
                          if dist_ratio < 0.3:
                             hit = True
                     if not hit:
                         continue
                 item_trans.append(tran)
                 if src == "hand":
                     hand_trans.add(tran)
             self.NormalizeTranslationList(tokenizer, pos, item_trans)
             num_items += 1
             bare = not item_trans
             if bare:
                 num_items_bare += 1
             num_orig_trans += len(item_trans)
             syno_tran_counts = collections.defaultdict(int)
             hyper_tran_counts = collections.defaultdict(int)
             hypo_tran_counts = collections.defaultdict(int)
             similar_tran_counts = collections.defaultdict(int)
             derivative_tran_counts = collections.defaultdict(int)
             aux_trans_set = set(item_aux_trans)
             checked_words = set()
             checked_ids = set([synset])
             voted_rel_words = set()
             voted_rel_records = set()
             for rel_words, rel_ids, tran_counts in (
                 (synonyms, synonym_ids, syno_tran_counts),
                 (hypernyms, hypernym_ids, hyper_tran_counts),
                 (hyponyms, hyponym_ids, hypo_tran_counts),
                 (similars, similar_ids, similar_tran_counts),
                 (derivatives, derivative_ids, derivative_tran_counts)):
                 for rel_word in rel_words:
                     is_similar = self.AreSimilarWords(rel_word, word)
                     rel_phrase_prob = 0.0
                     if phrase_prob_dbm:
                         rel_phrase_prob = self.GetPhraseProb(
                             phrase_prob_dbm, tokenizer, "en", rel_word)
                     mean_prob = (phrase_prob * rel_phrase_prob)**0.5
                     rel_aux_trans = []
                     if rel_word not in checked_words:
                         checked_words.add(rel_word)
                         tmp_aux_trans = aux_trans.get(rel_word)
                         if tmp_aux_trans:
                             rel_aux_trans.extend(tmp_aux_trans)
                     for rel_id in synset_index[rel_word]:
                         if rel_id not in rel_ids: continue
                         if rel_id not in checked_ids:
                             checked_ids.add(rel_id)
                             tmp_aux_trans = wnjpn_trans.get(rel_id)
                             if tmp_aux_trans:
                                 tmp_aux_trans = [
                                     x[0] for x in tmp_aux_trans
                                 ]
                                 rel_aux_trans.extend(tmp_aux_trans)
                     if rel_aux_trans:
                         self.NormalizeTranslationList(
                             tokenizer, pos, rel_aux_trans)
                         if not is_similar and mean_prob < 0.0005:
                             for item_aux_tran in item_aux_trans:
                                 if regex.fullmatch(r"[\p{Hiragana}]{,3}",
                                                    item_aux_tran):
                                     continue
                                 if item_aux_tran in rel_aux_trans:
                                     valid_pos = self.IsValidPosTran(
                                         tokenizer, pos, item_aux_tran)
                                     if valid_pos and item_aux_tran not in item_trans:
                                         item_trans.append(item_aux_tran)
                                         num_match_trans += 1
                         if mean_prob < 0.005:
                             voted_top = rel_word
                             for voted_rel_word in voted_rel_words:
                                 if self.AreSimilarWords(
                                         rel_word, voted_rel_word):
                                     voted_top = voted_rel_word
                                     break
                             voted_rel_words.add(rel_word)
                             for rel_aux_tran in set(rel_aux_trans):
                                 voted_record = (voted_top, rel_aux_tran)
                                 if voted_record in voted_rel_records:
                                     continue
                                 voted_rel_records.add(voted_record)
                                 tran_counts[rel_aux_tran] += 1
             if bare:
                 for deri_tran, count in derivative_tran_counts.items():
                      syno_tran_counts[deri_tran] += count
                 derivative_tran_counts.clear()
             for syno_tran, count in syno_tran_counts.items():
                 if regex.fullmatch(r"[\p{Hiragana}]{,3}", syno_tran):
                     continue
                 if syno_tran in hyper_tran_counts: count += 1
                 if syno_tran in hypo_tran_counts: count += 1
                 if syno_tran in similar_tran_counts: count += 1
                 if syno_tran in derivative_tran_counts: count += 1
                 if bare and syno_tran in aux_trans_set: count += 1
                 if count >= 3 and syno_tran not in item_trans:
                     valid_pos = self.IsValidPosTran(
                         tokenizer, pos, syno_tran)
                     if valid_pos and syno_tran not in item_trans:
                         item_trans.append(syno_tran)
                         num_voted_trans += 1
              item_score = 0.0
              tran_scores = None
              if item_trans:
                  if bare:
                      num_items_rescued += 1
                  if rev_prob_dbm or tran_prob_dbm:
                      item_trans, item_score, tran_scores = (
                          self.SortWordsByScore(word, item_trans, hand_trans,
                                                rev_prob_dbm, tokenizer,
                                                tran_prob_dbm))
                  item["translation"] = item_trans[:MAX_TRANSLATIONS_PER_WORD]
                  if tran_scores:
                      tran_score_map = {}
                      for tran, tran_score in tran_scores[:MAX_TRANSLATIONS_PER_WORD]:
                          tran_score_map[tran] = "{:.6f}".format(
                              tran_score).replace("0.", ".")
                      item["translation_score"] = tran_score_map
             item_score += spell_ratio * 0.5
             item["score"] = "{:.8f}".format(item_score).replace("0.", ".")
             if "link" in item:
                 del item["link"]
         if rev_prob_dbm:
             entry["item"] = sorted(
                 items,
                 key=lambda item: float(item.get("score") or 0.0),
                 reverse=True)
         serialized = json.dumps(entry,
                                 separators=(",", ":"),
                                 ensure_ascii=False)
         output_dbm.Set(key, serialized).OrDie()
         num_words += 1
         if num_words % 10000 == 0:
             logger.info("Saving words: words={}".format(num_words))
         it.Next()
     output_dbm.Close().OrDie()
     if tran_prob_dbm:
         tran_prob_dbm.Close().OrDie()
     if rev_prob_dbm:
         rev_prob_dbm.Close().OrDie()
     if phrase_prob_dbm:
         phrase_prob_dbm.Close().OrDie()
     input_dbm.Close().OrDie()
     logger.info(
         "Aappending translations done: words={}, elapsed_time={:.2f}s".
         format(num_words,
                time.time() - start_time))
     logger.info(
         "Stats: orig={}, match={}, voted={}, items={}, bare={}, rescued={}"
         .format(num_orig_trans, num_match_trans, num_voted_trans,
                 num_items, num_items_bare, num_items_rescued))
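The "mono" gate near the top of this function keeps a WordNet translation only when it is close, by length-normalized edit distance, to one of the auxiliary translations. The self-contained sketch below is not part of the original code; it isolates that check using the same tkrzw.Utility.EditDistanceLev call, with made-up sample strings.

import tkrzw

def IsCloseTranslation(tran, aux_trans, max_ratio=0.3):
  # Accept tran when its Levenshtein distance to any auxiliary translation,
  # divided by the longer of the two lengths, is below max_ratio.
  for aux_tran in aux_trans:
    if not tran or not aux_tran:
      continue
    dist = tkrzw.Utility.EditDistanceLev(tran, aux_tran)
    if dist / max(len(tran), len(aux_tran)) < max_ratio:
      return True
  return False

print(IsCloseTranslation("りんご", ["りんご酒"]))  # distance 1, ratio 0.25 -> True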
Example 8
def main():
  args = sys.argv[1:]
  if len(args) < 1:
    raise ValueError("invalid arguments")
  input_path = args[0]
  is_synset = False
  for arg in args[1:]:
    if arg == "--synset":
      is_synset = True
    else:
      raise ValueError("invalid arguments")
  tokenizer = tkrzw_tokenizer.Tokenizer()
  dbm = tkrzw.DBM()
  dbm.Open(input_path, False).OrDie()
  it = dbm.MakeIterator()
  it.First().OrDie()
  while True:
    record = it.GetStr()
    if not record: break
    key, data = record
    entries = json.loads(data)
    for entry in entries:
      word = entry["word"]
      if is_synset:
        for item in entry["item"]:
          pos = item["pos"]
          text = item["text"]
          syn_match = regex.search(r"\[synset\]: ([-0-9a-z]+)", text)
          tran_match = regex.search(r"\[translation\]: ([^\[]+)", text)
          if syn_match and tran_match:
            syn = syn_match.group(1)
            tran = tran_match.group(1)
            tran = regex.sub(r"\([^)]+\)", "", tran)
            norm_trans = []
            uniq_trans = set()
            for syn_tran in tran.split(","):
              norm_tran = tokenizer.NormalizeJaWordForPos(pos, syn_tran.strip())
              if norm_tran and norm_tran not in uniq_trans:
                norm_trans.append(norm_tran)
                uniq_trans.add(norm_tran)
            if norm_trans:
              print("{}:{}\t{}".format(word, syn, "\t".join(norm_trans)))
      else:
        poses = set()
        tran_poses = {}
        for item in entry["item"]:
          pos = item["pos"]
          text = item["text"]
          poses.add(pos)
          tran_match = regex.search(r"\[translation\]: ([^\[]+)", text)
          if tran_match:
            tran = tran_match.group(1)
            tran = regex.sub(r"\([^)]+\)", "", tran)
            for syn_tran in tran.split(","):
              syn_tran = syn_tran.strip()
              if syn_tran and syn_tran not in tran_poses:
                tran_poses[syn_tran] = pos
        only_pos = list(poses)[0] if len(poses) == 1 else None
        translations = entry.get("translation")
        if translations:
          norm_trans = []
          uniq_trans = set()
          for tran in translations:
            pos = only_pos
            if not pos:
              pos = tran_poses.get(tran)
            norm_tran = tokenizer.NormalizeJaWordForPos(pos, tran) if pos else tran
            if norm_tran and norm_tran not in uniq_trans:
              norm_trans.append(norm_tran)
              uniq_trans.add(norm_tran)
          if norm_trans:
            print("{}\t{}".format(word, "\t".join(norm_trans)))
    it.Next()
  dbm.Close().OrDie()
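For reference, the --synset branch above relies on two regular expressions to pull the synset ID and the translation list out of each item's text. The snippet below is a standalone illustration of that extraction on a made-up sample line; it is not part of the original script.

import regex

text = "[translation]: りんご, リンゴ (fruit) [synset]: 12345678-n"
syn_match = regex.search(r"\[synset\]: ([-0-9a-z]+)", text)
tran_match = regex.search(r"\[translation\]: ([^\[]+)", text)
if syn_match and tran_match:
  syn = syn_match.group(1)
  tran = regex.sub(r"\([^)]+\)", "", tran_match.group(1))
  trans = [t.strip() for t in tran.split(",") if t.strip()]
  print(syn, trans)  # 12345678-n ['りんご', 'リンゴ']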
Example 9
 def __init__(self, input_path, output_path, rev_prob_path):
   self.input_path = input_path
   self.rev_prob_path = rev_prob_path
   self.output_path = output_path
   self.tokenizer = tkrzw_tokenizer.Tokenizer()
Example 10
    for target, score, ef_prob, fe_prob in good_targets[:max_targets]:
      if rev_prob_dbm:
        prob = GetPhraseProb(rev_prob_dbm, "ja", target)
        if prob < MIN_PROB:
          continue
      #outputs.append("{}:{:.3f}:{:.3f}:{:.3f}".format(target, score, ef_prob, fe_prob))
      outputs.append(target)
    if outputs:
      print("{}\t{}".format(source, "\t".join(outputs)))
  if rev_prob_dbm:
    rev_prob_dbm.Close().OrDie()
  logger.info("Process done: elapsed_time={:.2f}s".format(
    time.time() - start_time))


tokenizer = tkrzw_tokenizer.Tokenizer()
def GetPhraseProb(prob_dbm, language, word):
  base_prob = 0.000000001
  tokens = tokenizer.Tokenize(language, word, False, True)
  if not tokens: return base_prob
  max_ngram = min(3, len(tokens))
  fallback_penalty = 1.0
  for ngram in range(max_ngram, 0, -1):
    if len(tokens) <= ngram:
      cur_phrase = " ".join(tokens)
      prob = float(prob_dbm.GetStr(cur_phrase) or 0.0)
      if prob:
        return max(prob, base_prob)
      fallback_penalty *= 0.1
    else:
      # The original example is cut off here; the rest of this branch is an
      # assumed reconstruction: look up every window of the current n-gram
      # size and, if all windows are known, average them and apply the
      # penalty accumulated from the longer n-grams that missed.
      probs = []
      for index in range(len(tokens) - ngram + 1):
        cur_phrase = " ".join(tokens[index:index + ngram])
        cur_prob = float(prob_dbm.GetStr(cur_phrase) or 0.0)
        if not cur_prob:
          probs = None
          break
        probs.append(cur_prob)
      if probs:
        return max(sum(probs) / len(probs) * fallback_penalty, base_prob)
      fallback_penalty *= 0.1
  return base_prob
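A usage sketch for GetPhraseProb, not part of the original example. It assumes an existing HashDBM file (the path below is hypothetical) whose keys are space-joined phrases and whose values are probability strings, which is what the prob_dbm.GetStr lookups above imply.

import tkrzw

prob_dbm = tkrzw.DBM()
prob_dbm.Open("phrase-prob.tkh", False, dbm="HashDBM").OrDie()
print(GetPhraseProb(prob_dbm, "en", "take off"))
prob_dbm.Close().OrDie()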
def Run(phrase_prob_path, rev_prob_path, tran_prob_path, tran_aux_paths,
        yomi_paths, min_phrase_prob, min_tran_prob):
    logger.info("Start the process")
    phrase_prob_dbm = None
    if phrase_prob_path:
        logger.info("Opening the phrase prob DB: " + phrase_prob_path)
        phrase_prob_dbm = tkrzw.DBM()
        phrase_prob_dbm.Open(phrase_prob_path, False, dbm="HashDBM").OrDie()
    rev_prob_dbm = None
    if rev_prob_path:
        logger.info("Opening the reverse prob DB: " + rev_prob_path)
        rev_prob_dbm = tkrzw.DBM()
        rev_prob_dbm.Open(rev_prob_path, False, dbm="HashDBM").OrDie()
    tran_prob_dbm = None
    if tran_prob_path:
        logger.info("Opening the tran prob DB: " + tran_prob_path)
        tran_prob_dbm = tkrzw.DBM()
        tran_prob_dbm.Open(tran_prob_path, False, dbm="HashDBM").OrDie()
    aux_trans = collections.defaultdict(list)
    for tran_aux_path in tran_aux_paths.split(","):
        tran_aux_path = tran_aux_path.strip()
        if tran_aux_path:
            logger.info("Reading the tran aux file: " + tran_aux_path)
            with open(tran_aux_path) as input_file:
                uniq_keys = set()
                for line in input_file:
                    fields = line.strip().split("\t")
                    if len(fields) < 2: continue
                    word = fields[0]
                    for tran in fields[1:]:
                        uniq_key = word + ":" + tran
                        if uniq_key in uniq_keys: continue
                        aux_trans[word].append(tran)
                        uniq_keys.add(uniq_key)
    yomis = set()
    for yomi_path in yomi_paths.split(","):
        yomi_path = yomi_path.strip()
        if yomi_path:
            logger.info("Reading the yomi file: " + yomi_path)
            with open(yomi_path) as input_file:
                for line in input_file:
                    fields = line.strip().split("\t")
                    if len(fields) < 1: continue
                    yomis.add(fields[0])
    logger.info("Processing the gross.")
    tokenizer = tkrzw_tokenizer.Tokenizer()
    word_dict = collections.defaultdict(list)
    alt_source = None
    alt_targets = None
    num_lines = 0
    for line in sys.stdin:
        num_lines += 1
        if num_lines % 10000 == 0:
            logger.info("Processing the gross: {} lines: {} items".format(
                num_lines, len(word_dict)))
        fields = line.strip().split("\t")
        if len(fields) != 3: continue
        word, pos, text = fields
        if pos == "alternative":
            alt_source = word
            alt_targets = set()
            for alt in regex.split(r"[,;]", text):
                if regex.fullmatch(r"[\p{Han}\p{Hiragana}\p{Katakana}ー]+",
                                   alt):
                    alt_targets.add(alt)
            continue
        text = regex.sub(r"\.$", "", text).strip()
        for tran in regex.split(r"[,;]", text):
            tran = tran.strip()
            if pos == "verb":
                tran = regex.sub(r"^to ", "", tran)
            if pos == "noun":
                tran = regex.sub(r"^(a|an|the) ", "", tran)
            tran = regex.sub("^[-~] ", "", tran)
            tran = regex.sub(" [-~]$", "", tran)
            if not regex.fullmatch(r"[-_\p{Latin}0-9'. ]+", tran): continue
            tokens = tran.split(" ")
            if len(tokens) < 1 or len(tokens) > 4: continue
            word_dict[tran].append((pos, word))
            if alt_source == word:
                for alt in alt_targets:
                    word_dict[tran].append((pos, alt))
    norm_word_dict = collections.defaultdict(list)
    for word, trans in word_dict.items():
        scored_trans, phrase_prob = ProcessWord(word, trans, tokenizer,
                                                phrase_prob_dbm, rev_prob_dbm,
                                                tran_prob_dbm, aux_trans,
                                                yomis, min_phrase_prob,
                                                min_tran_prob)
        if scored_trans:
            key = tkrzw_dict.NormalizeWord(word)
            norm_word_dict[key].append((word, scored_trans, phrase_prob))
    for key, entries in norm_word_dict.items():
        sum_phrase_prob = 0.0
        for word, scored_trans, phrase_prob in entries:
            sum_phrase_prob += phrase_prob
        for word, scored_trans, phrase_prob in entries:
            if sum_phrase_prob > 0:
                if key == word:
                    if phrase_prob / sum_phrase_prob < 0.6: continue
                else:
                    if phrase_prob / sum_phrase_prob < 0.8: continue
            PrintEntry(word, scored_trans)
    if tran_prob_dbm:
        tran_prob_dbm.Close().OrDie()
    if phrase_prob_dbm:
        phrase_prob_dbm.Close().OrDie()
    logger.info("Process done")
 def __init__(self, data_prefix, language):
     self.language = language
     self.tokenizer = tkrzw_tokenizer.Tokenizer()
     word_score_path = tkrzw_dict.GetCoocScorePath(data_prefix)
     self.word_score_dbm = tkrzw.DBM()
     self.word_score_dbm.Open(word_score_path, False, dbm="HashDBM").OrDie()