def Run(self):
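     # Scans the word database and prints one TSV line per word that has
     # inflections, parent/child relations, or synonyms parsed from the
     # "[synonym]:" annotations in the item texts.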
     start_time = time.time()
     logger.info("Process started: input_path={}".format(self.input_path))
     input_dbm = tkrzw.DBM()
     input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
     it = input_dbm.MakeIterator()
     it.First()
     num_entries = 0
     while True:
         record = it.GetStr()
         if not record: break
         key, serialized = record
         entry = json.loads(serialized)
         for word_entry in entry:
             word = word_entry["word"]
             infls = []
             for infl_name in inflection_names:
                 infl_value = word_entry.get(infl_name)
                 if infl_value:
                     for infl in infl_value.split(","):
                         infl = infl.strip()
                         if infl and infl != word and infl not in infls:
                             infls.append(infl)
             parents = word_entry.get("parent") or []
             children = word_entry.get("child") or []
             synonym_scores = collections.defaultdict(float)
             synonym_weight = 1.0
             for item in word_entry["item"]:
                 text = item["text"]
                 for part in text.split("[-]"):
                     part = part.strip()
                     match = regex.search(r"\[synonym\]: (.*)", part)
                     if match:
                         for synonym in match.group(1).split(","):
                             synonym = synonym.strip()
                             if synonym and synonym != word:
                                 synonym_scores[synonym] += synonym_weight
                                 synonym_weight *= 0.98
             synonym_scores = sorted(synonym_scores.items(),
                                     key=lambda x: x[1],
                                     reverse=True)
             synonyms = [x[0] for x in synonym_scores]
             if not infls and not parents and not children and not synonyms:
                 continue
             print("{}\t{}\t{}\t{}\t{}".format(word, ",".join(infls),
                                               ",".join(parents),
                                               ",".join(children),
                                               ",".join(synonyms)))
         num_entries += 1
         if num_entries % 10000 == 0:
             logger.info("Reading: entries={}".format(num_entries))
         it.Next()
     input_dbm.Close().OrDie()
     logger.info("Process done: elapsed_time={:.2f}s".format(time.time() -
                                                             start_time))
Example #2
 def DivideWordCount(self, word_count_path, word_prob_path):
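     # Converts raw word counts into probabilities: the record with the empty
     # key holds the sentence count, and each word count is divided by it and
     # written to the destination hash database.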
     start_time = time.time()
     logger.info(
         "Writing the word probability database: src={}, dest={}".format(
             word_count_path, word_prob_path))
     word_count_dbm = tkrzw.DBM()
     word_count_dbm.Open(word_count_path, False, dbm="SkipDBM").OrDie()
     word_prob_dbm = tkrzw.DBM()
     num_buckets = word_count_dbm.Count() * 2
     word_prob_dbm.Open(word_prob_path,
                        True,
                        dbm="HashDBM",
                        truncate=True,
                        num_buckets=num_buckets).OrDie()
     it = word_count_dbm.MakeIterator()
     it.First()
     record = it.GetStr()
     if not record or len(record[0]) != 0:
         raise RuntimeError("invalid first record")
     num_sentences = int(record[1])
     it.Next()
     num_records = 0
     while True:
         record = it.GetStr()
         if not record:
             break
         word = record[0]
         count = int(record[1])
         prob = count / num_sentences
         value = "{:.8f}".format(prob)
         value = regex.sub(r"^0\.", ".", value)
         word_prob_dbm.Set(word, value).OrDie()
         num_records += 1
         if num_records % 1000 == 0:
             logger.info(
                 "Dividing word counts: {} records".format(num_records))
         it.Next()
     word_prob_dbm.Close().OrDie()
     word_count_dbm.Close().OrDie()
     logger.info(
         "Writing the word probability database done: elapsed_time={:.2f}s".
         format(time.time() - start_time))
Example #3
 def Run(self):
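   # Reads the sorted phrase count database: records with an empty source hold
   # target totals, records with an empty target hold source totals, and each
   # group of pair counts sharing a source is passed to ProcessRecord.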
   start_time = time.time()
   logger.info("Process started: data_prefix={}".format(self.data_prefix))
   phrase_count_path = "{}-count.tks".format(self.data_prefix)
   phrase_count_dbm = tkrzw.DBM()
   phrase_count_dbm.Open(phrase_count_path, False).OrDie()
   it = phrase_count_dbm.MakeIterator()
   it.First()
   record = it.GetStr()
   if not record or len(record[0]) != 0:
     raise RuntimeError("invalid first record")
   num_domains = int(record[1])
   it.Next()
   logger.info("Processing phrase counts")
   num_target_records = 0
   num_pair_records = 0
   last_source = ""
   last_source_count = 0
   targets = []
   source_counts = {}
   target_counts = {}
   while True:
     record = it.GetStr()
     if not record:
       break
     source, target = record[0].split("\t")
     count = int(record[1])
     if source:
       if source != last_source:
         if last_source_count and targets:
           self.ProcessRecord(
             last_source, last_source_count, targets, target_counts)
         targets = []
         last_source = source
         num_pair_records += 1
         if num_pair_records % 10000 == 0:
           logger.info("Processing phrase pair counts: {} records".format(num_pair_records))
       if target:
         targets.append((target, count))
       else:
         last_source_count = count
         source_counts[source] = count
     else:
       target_counts[target] = count
       num_target_records += 1
       if num_target_records % 100000 == 0:
         logger.info("Reading target counts: {} records".format(num_target_records))
     it.Next()
   if last_source_count and targets:
     self.ProcessRecord(
       last_source, last_source_count, targets, target_counts)
   logger.info("Process done: elapsed_time={:.2f}s".format(
     time.time() - start_time))
 def Dump(self):
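     # Flushes the in-memory phrase counts of the current batch into a
     # SkipDBM file, keeping only phrases whose count reaches a threshold
     # scaled by the batch fill ratio, then merges batch databases when the
     # batch count hits a multiple of MERGE_DB_UNIT.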
     logger.info(
         "Batch {} aggregation done: elapsed_time={:.2f}s, RSS={:.2f}MB".
         format(self.num_batches + 1,
                time.time() - self.start_time,
                tkrzw.Utility.GetMemoryUsage() / 1024.0 / 1024))
     logger.info(
         "Batch {} dumping: sentences={}, words={}, unique_words={}".format(
             self.num_batches + 1, self.num_sentences, self.num_words,
             self.mem_phrase_count.Count()))
     start_time = time.time()
     fill_ratio = min(self.num_words / BATCH_MAX_WORDS, 1.0)
     dbm_phrase_count_path = "{}-phrase-count-{:08d}.tks".format(
         self.data_prefix, self.num_batches)
     dbm_phrase_count = tkrzw.DBM()
     dbm_phrase_count.Open(dbm_phrase_count_path,
                           True,
                           dbm="SkipDBM",
                           truncate=True,
                           insert_in_order=True,
                           offset_width=4,
                           step_unit=4,
                           max_level=12).OrDie()
     logger.info("Batch {} word count dumping: dest={}".format(
         self.num_batches + 1, dbm_phrase_count_path))
     dbm_phrase_count.Set("", self.num_sentences).OrDie()
     it = self.mem_phrase_count.MakeIterator()
     it.First()
     min_phrase_count = max(
         math.ceil(MIN_PHRASE_COUNT_IN_BATCH * fill_ratio), 2)
     while True:
         record = it.Get()
         if not record:
             break
         phrase = record[0]
         count = struct.unpack(">q", record[1])[0]
         if count >= min_phrase_count:
             dbm_phrase_count.Set(phrase, count).OrDie()
         it.Remove()
     dbm_phrase_count.Close().OrDie()
     logger.info("Dumping done: elapsed_time={:.2f}s".format(time.time() -
                                                             start_time))
     self.num_batches += 1
     merge_db_unit = 1
     while self.num_batches % (merge_db_unit * MERGE_DB_UNIT) == 0:
         merge_db_unit *= MERGE_DB_UNIT
         self.ReduceDatabases(merge_db_unit)
     self.num_words_since_cutoff = 0
Example #5
 def Run(self):
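     # Iterates over the word database and prints every JSON entry through
     # PrintRecord, logging progress every 10000 records.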
     start_time = time.time()
     logger.info("Process started: input_path={}".format(self.input_path))
     word_dbm = tkrzw.DBM()
     word_dbm.Open(self.input_path, False).OrDie()
     it = word_dbm.MakeIterator()
     it.First()
     num_records = 0
     while True:
         record = it.GetStr()
         if not record: break
         self.PrintRecord(json.loads(record[1]))
         num_records += 1
         if num_records % 10000 == 0:
             logger.info("Processing: records={}".format(num_records))
         it.Next()
     word_dbm.Close().OrDie()
     logger.info("Process done: elapsed_time={:.2f}s".format(time.time() -
                                                             start_time))
Example #6
 def ReadSynsetIndex(self):
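   # Builds an index mapping each word to the set of synset IDs in which it
   # appears, read from the items of the input word database.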
   logger.info("Reading synset index: input_path={}".format(self.input_path))
   synset_index = collections.defaultdict(set)
   input_dbm = tkrzw.DBM()
   input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
   num_words = 0
   it = input_dbm.MakeIterator()
   it.First()
   while True:
     record = it.GetStr()
     if not record: break
     key, serialized = record
     entry = json.loads(serialized)
     for item in entry["item"]:
       word = item["word"]
       synset = item["synset"]
       synset_index[word].add(synset)
     num_words += 1
     if num_words % 10000 == 0:
       logger.info("Reading synsets: words={}".format(num_words))
     it.Next()
   logger.info("Reading synset index done: records={}".format(len(synset_index)))
   return synset_index
Example #7
 def Run(self):
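   # Collects words and their normalized keys from the input database, then
   # writes the main pages, navigation, overview, style sheet, and package
   # files grouped by key prefix.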
   start_time = time.time()
   logger.info("Process started: input_path={}, output_path={}".format(
     str(self.input_path), self.output_path))
   input_dbm = tkrzw.DBM()
   input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
   os.makedirs(self.output_path, exist_ok=True)
   words = self.ListUpWords(input_dbm)
   keys = sorted(set([tkrzw_dict.NormalizeWord(word) for word, prob in words.items()]))
   key_prefixes = set()
   for key in keys:
     key_prefixes.add(GetKeyPrefix(key))
   key_prefixes = sorted(list(key_prefixes), key=lambda x: 1000 if x == "_" else ord(x))
   self.MakeMain(input_dbm, keys, words)
   self.MakeNavigation(key_prefixes)
   self.MakeOverview()
   self.MakeStyle()
   self.MakePackage(key_prefixes)
   input_dbm.Close().OrDie()
   for label, count in self.label_counters.items():
     logger.info("Adopted label: {}: {}".format(label, count))
   logger.info("Stats: num_words={}, num_trans={}, num_items={}, num_aux_items={}".format(
     self.num_words, self.num_trans, self.num_items, self.num_aux_items))
   logger.info("Process done: elapsed_time={:.2f}s".format(time.time() - start_time))
 def Run(self):
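     # Collects (word, AOA, POS, translation) records, estimating the AOA
     # from the word probability when no explicit value is given, and writes
     # the first 100000 of them sorted by AOA into a SkipDBM.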
     start_time = time.time()
     logger.info("Process started: input_path={}, output_path={}".format(
         self.input_path, self.output_path))
     input_dbm = tkrzw.DBM()
     input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
     it = input_dbm.MakeIterator()
     logger.info("Getting AOA records")
     num_entries = 0
     records = []
     it.First()
     while True:
         record = it.GetStr()
         if not record: break
         key, serialized = record
         entry = json.loads(serialized)
         for word_entry in entry:
             word = word_entry["word"]
             trans = word_entry.get("translation")
             if not trans: continue
             trans = trans[:8]
             labels = set()
             poses = {}
             for item in word_entry["item"]:
                 labels.add(item["label"])
                 poses[item["pos"]] = True
             poses = poses.keys()
             aoa = (word_entry.get("aoa") or word_entry.get("aoa_concept")
                    or word_entry.get("aoa_base"))
             if aoa:
                 aoa = float(aoa)
             else:
                 if len(labels) < 2:
                     continue
                 prob = word_entry.get("probability")
                 if not prob: continue
                 prob = float(prob)
                 if word.count(" "):
                     token_probs = []
                     for token in word.split(" "):
                         token_serialized = input_dbm.GetStr(token.lower())
                         token_prob = 0.0
                         if token_serialized:
                             for token_entry in json.loads(
                                     token_serialized):
                                 token_word = token_entry["word"]
                                 if token_word != token: continue
                                 token_prob = float(
                                     token_entry.get("probability") or 0.0)
                         token_probs.append(token_prob)
                     min_token_prob = min(token_probs)
                     if min_token_prob > prob:
                         prob = (prob * min_token_prob)**0.5
                 aoa = math.log(prob + 0.00000001) * -1 + 3.5
             record = (word, aoa, poses, trans)
             records.append(record)
         num_entries += 1
         if num_entries % 10000 == 0:
             logger.info(
                 "Getting AOA records: entries={}".format(num_entries))
         it.Next()
     logger.info("Reading done: entries={}".format(num_entries))
     input_dbm.Close().OrDie()
     records = sorted(records, key=lambda x: x[1])
     output_dbm = tkrzw.DBM()
     output_dbm.Open(self.output_path,
                     True,
                     dbm="SkipDBM",
                     truncate=True,
                     insert_in_order=True).OrDie()
     num_entries = 0
     for word, aoa, poses, trans in records:
         key = "{:05d}".format(num_entries)
         fields = [word]
         fields.append("{:.2f}".format(aoa))
         fields.append(",".join(poses))
         fields.append(",".join(trans))
         output_dbm.Set(key, "\t".join(fields)).OrDie()
         num_entries += 1
         if num_entries % 10000 == 0:
             logger.info("Writing: entries={}".format(num_entries))
         if num_entries >= 100000:
             break
     logger.info("Writing done: entries={}".format(num_entries))
     output_dbm.Close().OrDie()
     logger.info("Process done: elapsed_time={:.2f}s".format(time.time() -
                                                             start_time))
Example #9
 def AppendTranslations(self, wnjpn_trans, aux_trans, subaux_trans,
                        synset_index):
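     # Merges the wnjpn translations with the auxiliary translation sources,
     # rescues bare items by voting over translations of related words
     # (synonyms, hypernyms, hyponyms, similars, derivatives), scores and
     # sorts the results, and writes the enriched entries to the output DB.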
     start_time = time.time()
     logger.info(
         "Appending translations: input_path={}, output_path={}".format(
             self.input_path, self.output_path))
     input_dbm = tkrzw.DBM()
     input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
     phrase_prob_dbm = None
     if self.phrase_prob_path:
         phrase_prob_dbm = tkrzw.DBM()
         phrase_prob_dbm.Open(self.phrase_prob_path, False,
                              dbm="HashDBM").OrDie()
     rev_prob_dbm = None
     if self.rev_prob_path:
         rev_prob_dbm = tkrzw.DBM()
         rev_prob_dbm.Open(self.rev_prob_path, False, dbm="HashDBM").OrDie()
     tokenizer = tkrzw_tokenizer.Tokenizer()
     tran_prob_dbm = None
     if self.tran_prob_path:
         tran_prob_dbm = tkrzw.DBM()
         tran_prob_dbm.Open(self.tran_prob_path, False,
                            dbm="HashDBM").OrDie()
     output_dbm = tkrzw.DBM()
     num_buckets = input_dbm.Count() * 2
     output_dbm.Open(self.output_path,
                     True,
                     dbm="HashDBM",
                     truncate=True,
                     align_pow=0,
                     num_buckets=num_buckets).OrDie()
     num_words = 0
     num_orig_trans = 0
     num_match_trans = 0
     num_voted_trans = 0
     num_items = 0
     num_items_bare = 0
     num_items_rescued = 0
     it = input_dbm.MakeIterator()
     it.First()
     while True:
         record = it.GetStr()
         if not record: break
         key, serialized = record
         entry = json.loads(serialized)
         items = entry["item"]
         spell_ratios = {}
         for item in items:
             word = item["word"]
             phrase_prob = float(item.get("prob") or 0.0)
             spell_ratios[word] = phrase_prob + 0.00000001
         sum_prob = 0.0
         for word, prob in spell_ratios.items():
             sum_prob += prob
         for word, prob in list(spell_ratios.items()):
             spell_ratios[word] = prob / sum_prob
         for item in items:
             word = item["word"]
             pos = item["pos"]
             synset = item["synset"]
             links = item.get("link") or {}
             phrase_prob = float(item.get("prob") or 0.0)
             spell_ratio = spell_ratios[word]
             synonyms = item.get("synonym") or []
             hypernyms = item.get("hypernym") or []
             hyponyms = item.get("hyponym") or []
             similars = item.get("similar") or []
             derivatives = item.get("derivative") or []
             synonym_ids = links.get("synonym") or []
             hypernym_ids = links.get("hypernym") or []
             hyponym_ids = links.get("hyponym") or []
             similar_ids = links.get("similar") or []
             derivative_ids = links.get("derivative") or []
             item_tran_pairs = wnjpn_trans.get(synset) or []
             item_aux_trans = aux_trans.get(word) or []
             item_aux_trans.extend(subaux_trans.get(word) or [])
             self.NormalizeTranslationList(tokenizer, pos, item_aux_trans)
             item_trans = []
             hand_trans = set()
             for tran, src in item_tran_pairs:
                 if src == "mono":
                     hit = False
                     for item_aux_tran in item_aux_trans:
                         dist = tkrzw.Utility.EditDistanceLev(
                             tran, item_aux_tran)
                         dist_ratio = dist / max(len(tran),
                                                 len(item_aux_tran))
                          if dist_ratio < 0.3:
                             hit = True
                     if not hit:
                         continue
                 item_trans.append(tran)
                 if src == "hand":
                     hand_trans.add(tran)
             self.NormalizeTranslationList(tokenizer, pos, item_trans)
             num_items += 1
             bare = not item_trans
             if bare:
                 num_items_bare += 1
             num_orig_trans += len(item_trans)
             syno_tran_counts = collections.defaultdict(int)
             hyper_tran_counts = collections.defaultdict(int)
             hypo_tran_counts = collections.defaultdict(int)
             similar_tran_counts = collections.defaultdict(int)
             derivative_tran_counts = collections.defaultdict(int)
             aux_trans_set = set(item_aux_trans)
             checked_words = set()
             checked_ids = set([synset])
             voted_rel_words = set()
             voted_rel_records = set()
             for rel_words, rel_ids, tran_counts in (
                 (synonyms, synonym_ids, syno_tran_counts),
                 (hypernyms, hypernym_ids, hyper_tran_counts),
                 (hyponyms, hyponym_ids, hypo_tran_counts),
                 (similars, similar_ids, similar_tran_counts),
                 (derivatives, derivative_ids, derivative_tran_counts)):
                 for rel_word in rel_words:
                     is_similar = self.AreSimilarWords(rel_word, word)
                     rel_phrase_prob = 0.0
                     if phrase_prob_dbm:
                         rel_phrase_prob = self.GetPhraseProb(
                             phrase_prob_dbm, tokenizer, "en", rel_word)
                     mean_prob = (phrase_prob * rel_phrase_prob)**0.5
                     rel_aux_trans = []
                     if rel_word not in checked_words:
                         checked_words.add(rel_word)
                         tmp_aux_trans = aux_trans.get(rel_word)
                         if tmp_aux_trans:
                             rel_aux_trans.extend(tmp_aux_trans)
                     for rel_id in synset_index[rel_word]:
                         if rel_id not in rel_ids: continue
                         if rel_id not in checked_ids:
                             checked_ids.add(rel_id)
                             tmp_aux_trans = wnjpn_trans.get(rel_id)
                             if tmp_aux_trans:
                                 tmp_aux_trans = [
                                     x[0] for x in tmp_aux_trans
                                 ]
                                 rel_aux_trans.extend(tmp_aux_trans)
                     if rel_aux_trans:
                         self.NormalizeTranslationList(
                             tokenizer, pos, rel_aux_trans)
                         if not is_similar and mean_prob < 0.0005:
                             for item_aux_tran in item_aux_trans:
                                 if regex.fullmatch(r"[\p{Hiragana}]{,3}",
                                                    item_aux_tran):
                                     continue
                                 if item_aux_tran in rel_aux_trans:
                                     valid_pos = self.IsValidPosTran(
                                         tokenizer, pos, item_aux_tran)
                                     if valid_pos and item_aux_tran not in item_trans:
                                         item_trans.append(item_aux_tran)
                                         num_match_trans += 1
                         if mean_prob < 0.005:
                             voted_top = rel_word
                             for voted_rel_word in voted_rel_words:
                                 if self.AreSimilarWords(
                                         rel_word, voted_rel_word):
                                     voted_top = voted_rel_word
                                     break
                             voted_rel_words.add(rel_word)
                             for rel_aux_tran in set(rel_aux_trans):
                                 voted_record = (voted_top, rel_aux_tran)
                                 if voted_record in voted_rel_records:
                                     continue
                                 voted_rel_records.add(voted_record)
                                 tran_counts[rel_aux_tran] += 1
             if bare:
                 for deri_tran, count in derivative_tran_counts.items():
                      syno_tran_counts[deri_tran] += count
                 derivative_tran_counts.clear()
             for syno_tran, count in syno_tran_counts.items():
                 if regex.fullmatch(r"[\p{Hiragana}]{,3}", syno_tran):
                     continue
                 if syno_tran in hyper_tran_counts: count += 1
                 if syno_tran in hypo_tran_counts: count += 1
                 if syno_tran in similar_tran_counts: count += 1
                 if syno_tran in derivative_tran_counts: count += 1
                 if bare and syno_tran in aux_trans_set: count += 1
                 if count >= 3 and syno_tran not in item_trans:
                     valid_pos = self.IsValidPosTran(
                         tokenizer, pos, syno_tran)
                     if valid_pos and syno_tran not in item_trans:
                         item_trans.append(syno_tran)
                         num_voted_trans += 1
              item_score = 0.0
              tran_scores = []
             if item_trans:
                 if bare:
                     num_items_rescued += 1
                 if rev_prob_dbm or tran_prob_dbm:
                     item_trans, item_score, tran_scores = (
                         self.SortWordsByScore(word, item_trans, hand_trans,
                                               rev_prob_dbm, tokenizer,
                                               tran_prob_dbm))
                  item["translation"] = item_trans[:MAX_TRANSLATIONS_PER_WORD]
                 if tran_scores:
                     tran_score_map = {}
                      for tran, tran_score in tran_scores[:MAX_TRANSLATIONS_PER_WORD]:
                          tran_score_map[tran] = "{:.6f}".format(tran_score).replace("0.", ".")
                     item["translation_score"] = tran_score_map
             item_score += spell_ratio * 0.5
             item["score"] = "{:.8f}".format(item_score).replace("0.", ".")
             if "link" in item:
                 del item["link"]
         if rev_prob_dbm:
             entry["item"] = sorted(
                 items,
                 key=lambda item: float(item.get("score") or 0.0),
                 reverse=True)
         serialized = json.dumps(entry,
                                 separators=(",", ":"),
                                 ensure_ascii=False)
         output_dbm.Set(key, serialized).OrDie()
         num_words += 1
         if num_words % 10000 == 0:
             logger.info("Saving words: words={}".format(num_words))
         it.Next()
     output_dbm.Close().OrDie()
     if tran_prob_dbm:
         tran_prob_dbm.Close().OrDie()
     if rev_prob_dbm:
         rev_prob_dbm.Close().OrDie()
     if phrase_prob_dbm:
         phrase_prob_dbm.Close().OrDie()
     input_dbm.Close().OrDie()
     logger.info(
         "Aappending translations done: words={}, elapsed_time={:.2f}s".
         format(num_words,
                time.time() - start_time))
     logger.info(
         "Stats: orig={}, match={}, voted={}, items={}, bare={}, rescued={}"
         .format(num_orig_trans, num_match_trans, num_voted_trans,
                 num_items, num_items_bare, num_items_rescued))
def main():
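  # Command-line entry point: reads the word database given as the first
  # argument and prints normalized translations, either per synset (with
  # --synset) or per word.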
  args = sys.argv[1:]
  if len(args) < 1:
    raise ValueError("invalid arguments")
  input_path = args[0]
  is_synset = False
  for arg in args[1:]:
    if arg == "--synset":
      is_synset = True
    else:
      raise ValueError("invalid arguments")
  tokenizer = tkrzw_tokenizer.Tokenizer()
  dbm = tkrzw.DBM()
  dbm.Open(input_path, False).OrDie()
  it = dbm.MakeIterator()
  it.First().OrDie()
  while True:
    record = it.GetStr()
    if not record: break
    key, data = record
    entries = json.loads(data)
    for entry in entries:
      word = entry["word"]
      if is_synset:
        for item in entry["item"]:
          pos = item["pos"]
          text = item["text"]
          syn_match = regex.search(r"\[synset\]: ([-0-9a-z]+)", text)
          tran_match = regex.search(r"\[translation\]: ([^\[]+)", text)
          if syn_match and tran_match:
            syn = syn_match.group(1)
            tran = tran_match.group(1)
            tran = regex.sub(r"\([^)]+\)", "", tran)
            norm_trans = []
            uniq_trans = set()
            for syn_tran in tran.split(","):
              norm_tran = tokenizer.NormalizeJaWordForPos(pos, syn_tran.strip())
              if norm_tran and norm_tran not in uniq_trans:
                norm_trans.append(norm_tran)
                uniq_trans.add(norm_tran)
            if norm_trans:
              print("{}:{}\t{}".format(word, syn, "\t".join(norm_trans)))
      else:
        poses = set()
        tran_poses = {}
        for item in entry["item"]:
          pos = item["pos"]
          text = item["text"]
          poses.add(pos)
          tran_match = regex.search(r"\[translation\]: ([^\[]+)", text)
          if tran_match:
            tran = tran_match.group(1)
            tran = regex.sub(r"\([^)]+\)", "", tran)
            for syn_tran in tran.split(","):
              syn_tran = syn_tran.strip()
              if syn_tran and syn_tran not in tran_poses:
                tran_poses[syn_tran] = pos
        only_pos = list(poses)[0] if len(poses) == 1 else None
        translations = entry.get("translation")
        if translations:
          norm_trans = []
          uniq_trans = set()
          for tran in translations:
            pos = only_pos
            if not pos:
              pos = tran_poses.get(tran)
            norm_tran = tokenizer.NormalizeJaWordForPos(pos, tran) if pos else tran
            if norm_tran and norm_tran not in uniq_trans:
              norm_trans.append(norm_tran)
              uniq_trans.add(norm_tran)
          if norm_trans:
            print("{}\t{}".format(word, "\t".join(norm_trans)))
    it.Next()
  dbm.Close().OrDie()
 def Run(self):
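     # Builds an inflection index: each normalized inflected form is mapped
     # to its base words, sorted by a probability-based score, and written to
     # the output hash database.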
     start_time = time.time()
     logger.info("Process started: input_path={}, output_path={}".format(
         self.input_path, self.output_path))
     input_dbm = tkrzw.DBM()
     input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
     it = input_dbm.MakeIterator()
     it.First()
     num_entries = 0
     index = collections.defaultdict(list)
     infl_names = ("noun_plural", "verb_singular",
                   "verb_present_participle", "verb_past",
                   "verb_past_participle", "adjective_comparative",
                   "adjective_superative", "adverb_comparative",
                   "adverb_superative")
     while True:
         record = it.GetStr()
         if not record: break
         key, serialized = record
         entry = json.loads(serialized)
         for word_entry in entry:
             word = word_entry["word"]
             prob = max(float(word_entry.get("probability") or "0"),
                        0.0000001)
             score = prob * math.log2(len(word_entry["item"]))
             if "translation" in word_entry:
                 score *= 2
             inflections = set()
             for infl_name in infl_names:
                 inflection = word_entry.get(infl_name)
                 if inflection:
                     for infl_value in regex.split(r"[,|]", inflection):
                         infl_value = tkrzw_dict.NormalizeWord(
                             infl_value.strip())
                         if not regex.search(r"\p{Latin}", infl_value):
                             continue
                         inflections.add(infl_value)
             for inflection in inflections:
                 index[inflection].append((word, score))
         num_entries += 1
         if num_entries % 10000 == 0:
             logger.info("Reading: entries={}".format(num_entries))
         it.Next()
     input_dbm.Close().OrDie()
     logger.info("Reading done: entries={}".format(num_entries))
     output_dbm = tkrzw.DBM()
     num_buckets = len(index) * 2
     output_dbm.Open(self.output_path,
                     True,
                     dbm="HashDBM",
                     truncate=True,
                     align_pow=0,
                     num_buckets=num_buckets).OrDie()
     num_entries = 0
     for inflection, scores in index.items():
         scores = sorted(scores, key=lambda x: x[1], reverse=True)
         words = [x[0] for x in scores]
         output_dbm.Set(inflection, "\t".join(words)).OrDie()
         num_entries += 1
         if num_entries % 10000 == 0:
             logger.info("Writing: entries={}".format(num_entries))
     output_dbm.Close().OrDie()
     logger.info("Process done: elapsed_time={:.2f}s".format(time.time() -
                                                             start_time))
 def Dump(self):
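     # Flushes the in-memory word and cooccurrence counts of the current
     # batch into SkipDBM files, keeping only records whose counts and
     # IDF-weighted scores pass thresholds scaled by the batch fill ratio,
     # then merges batch databases periodically.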
     logger.info(
         "Batch {} aggregation done: elapsed_time={:.2f}s, RSS={:.2f}MB".
         format(self.num_batches + 1,
                time.time() - self.start_time,
                tkrzw.Utility.GetMemoryUsage() / 1024.0 / 1024))
     logger.info(
         ("Batch {} dumping: documents={}, sentences={}, words={}," +
          " unique_words={}, unique_cooc={}").format(
              self.num_batches + 1, self.num_documents, self.num_sentences,
              self.num_words, self.mem_word_count.Count(),
              self.mem_cooc_count.Count()))
     start_time = time.time()
     fill_ratio = min(self.num_words / BATCH_MAX_WORDS, 1.0)
     dbm_cooc_count_path = "{}-cooc-count-{:08d}.tks".format(
         self.data_prefix, self.num_batches)
     dbm_cooc_count = tkrzw.DBM()
     dbm_cooc_count.Open(dbm_cooc_count_path,
                         True,
                         dbm="SkipDBM",
                         truncate=True,
                         insert_in_order=True,
                         offset_width=5,
                         step_unit=16,
                         max_level=8).OrDie()
     logger.info("Batch {} cooc count dumping: dest={}".format(
         self.num_batches + 1, dbm_cooc_count_path))
     dbm_cooc_count.Set("", self.num_sentences).OrDie()
     it = self.mem_cooc_count.MakeIterator()
     it.First()
     min_word_count = math.ceil(MIN_WORD_COUNT_IN_BATCH * fill_ratio)
     if MIN_WORD_COUNT_IN_BATCH >= 2:
         min_word_count = max(min_word_count, 2)
     min_count = math.ceil(tkrzw_dict.COOC_BASE_SCORE *
                           MIN_COOC_COUNT_IN_BATCH * fill_ratio)
     cur_word = None
     cur_word_count = 0
     cur_word_weight = 1.0
     cooc_words = []
     while True:
         record = it.Get()
         if not record: break
         word_pair = record[0].decode()
         count = struct.unpack(">q", record[1])[0]
         word, cooc_word = word_pair.split(" ")
         if cur_word != word:
             if cur_word and cooc_words:
                 self.DumpCoocWords(cur_word, cooc_words, dbm_cooc_count)
             cur_word = word
             cur_word_count = struct.unpack(
                 ">q", self.mem_word_count.Get(cur_word))[0]
             cur_word_weight = 1.0
             if tkrzw_dict.IsNumericWord(cur_word):
                 cur_word_weight = tkrzw_dict.NUMERIC_WORD_WEIGHT
             elif tkrzw_dict.IsStopWord(self.language, cur_word):
                 cur_word_weight = tkrzw_dict.STOP_WORD_WEIGHT
             cooc_words = []
         if cur_word_count * cur_word_weight >= min_word_count:
             cooc_count = struct.unpack(
                 ">q", self.mem_word_count.Get(cooc_word))[0]
             cooc_weight = 1.0
             if tkrzw_dict.IsNumericWord(cooc_word):
                 cooc_weight = tkrzw_dict.NUMERIC_WORD_WEIGHT
             elif tkrzw_dict.IsStopWord(self.language, cooc_word):
                 cooc_weight = tkrzw_dict.STOP_WORD_WEIGHT
             cooc_prob = cooc_count / self.num_sentences
             cooc_idf = min(
                 math.log(cooc_prob) * -1, tkrzw_dict.MAX_IDF_WEIGHT)
             score = count * (cooc_idf**tkrzw_dict.IDF_POWER)
             score *= cur_word_weight * cooc_weight
             if (cooc_count * cooc_weight >= min_word_count and
                     count * cur_word_weight * cooc_weight >= min_count):
                 cooc_words.append((cooc_word, count, score))
         it.Remove()
     if cur_word and cooc_words:
         self.DumpCoocWords(cur_word, cooc_words, dbm_cooc_count)
     dbm_cooc_count.Close().OrDie()
     dbm_word_count_path = "{}-word-count-{:08d}.tks".format(
         self.data_prefix, self.num_batches)
     dbm_word_count = tkrzw.DBM()
     dbm_word_count.Open(dbm_word_count_path,
                         True,
                         dbm="SkipDBM",
                         truncate=True,
                         insert_in_order=True,
                         offset_width=4,
                         step_unit=4,
                         max_level=12).OrDie()
     logger.info("Batch {} word count dumping: dest={}".format(
         self.num_batches + 1, dbm_word_count_path))
     dbm_word_count.Set("", self.num_sentences).OrDie()
     it = self.mem_word_count.MakeIterator()
     it.First()
     while True:
         record = it.Get()
         if not record:
             break
         word = record[0]
         count = struct.unpack(">q", record[1])[0]
         if count >= min_word_count:
             dbm_word_count.Set(word, count).OrDie()
         it.Remove()
     dbm_word_count.Close().OrDie()
     logger.info("Dumping done: elapsed_time={:.2f}s".format(time.time() -
                                                             start_time))
     self.num_batches += 1
     merge_db_unit = 1
     while self.num_batches % (merge_db_unit * MERGE_DB_UNIT) == 0:
         merge_db_unit *= MERGE_DB_UNIT
         self.ReduceDatabases(merge_db_unit)
     self.num_words_since_cutoff = 0
 def Run(self):
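     # Scores every entry from its probability, AOA, translations, items,
     # labels, and children, then writes the keys to the output file in
     # descending order of the best score among the entry's words.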
     start_time = time.time()
     logger.info("Process started: input_path={}, output_path={}".format(
         self.input_path, self.output_path))
     input_dbm = tkrzw.DBM()
     input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
     tran_prob_dbm = None
     if self.tran_prob_path:
         tran_prob_dbm = tkrzw.DBM()
         tran_prob_dbm.Open(self.tran_prob_path, False,
                            dbm="HashDBM").OrDie()
     it = input_dbm.MakeIterator()
     logger.info("Getting AOA records")
     it.First()
     num_entries = 0
     aoa_records = {}
     real_aoa_probs = collections.defaultdict(list)
     while True:
         record = it.GetStr()
         if not record: break
         key, serialized = record
         entry = json.loads(serialized)
         for word_entry in entry:
             word = word_entry["word"]
             aoa = (word_entry.get("aoa") or word_entry.get("aoa_concept")
                    or word_entry.get("aoa_base"))
             if aoa:
                 aoa_records[word] = float(aoa)
             real_aoa = word_entry.get("aoa")
             prob = word_entry.get("probability")
             if real_aoa and prob:
                 real_aoa_probs[int(float(real_aoa))].append(float(prob))
         num_entries += 1
         if num_entries % 10000 == 0:
             logger.info(
                 "Getting AOA records: entries={}".format(num_entries))
         it.Next()
     aoa_prob_map = {}
     min_aoa_prob = 0.0001
     for aoa_age, probs in sorted(list(real_aoa_probs.items())):
         if aoa_age < 4 or aoa_age > 20: continue
         prob_mean = sum(probs) / len(probs)
         min_aoa_prob = min(prob_mean, min_aoa_prob)
         aoa_prob_map[aoa_age] = min(min_aoa_prob, 0.01)
     it.First()
     num_entries = 0
     scores = []
     while True:
         record = it.GetStr()
         if not record: break
         key, serialized = record
         entry = json.loads(serialized)
         max_score = 0
         for word_entry in entry:
             word = word_entry["word"]
             prob = float(word_entry.get("probability") or "0")
             aoa_prob = 0
             real_aoa = word_entry.get("aoa")
             if real_aoa:
                 aoa_prob = float(
                     aoa_prob_map.get(int(float(real_aoa))) or 0)
                 prob += aoa_prob
             prob_score = max(prob**0.5, 0.00001)
             aoa = (word_entry.get("aoa") or word_entry.get("aoa_concept")
                    or word_entry.get("aoa_base"))
             if aoa:
                 aoa = float(aoa)
             else:
                 aoa = sys.maxsize
                 tokens = word.split(" ")
                 if len(tokens) > 1:
                     max_aoa = 0
                     for token in tokens:
                         token_aoa = aoa_records.get(token)
                         if token_aoa:
                             max_aoa = max(max_aoa, float(token_aoa))
                         else:
                             max_aoa = sys.maxsize
                     if max_aoa < sys.maxsize:
                         aoa = max_aoa + len(tokens) - 1
             aoa_score = (25 - min(aoa, 20.0)) / 10.0
             tran_score = 1.0
             if "translation" in word_entry:
                 tran_score += 1.0
             if tran_prob_dbm:
                 tsv = tran_prob_dbm.GetStr(key)
                 if tsv:
                     fields = tsv.split("\t")
                     max_tran_prob = 0.0
                     for i in range(0, len(fields), 3):
                          tran_src = fields[i]
                          tran_trg = fields[i + 1]
                          tran_prob = float(fields[i + 2])
                          if tran_src != word: continue
                          if not regex.search(r"[\p{Han}]", tran_trg):
                              tran_prob *= 0.5
                         max_tran_prob = max(max_tran_prob, tran_prob)
                     tran_score += max_tran_prob
             item_score = math.log2(len(word_entry["item"]) + 1)
             labels = set()
             for item in word_entry["item"]:
                 labels.add(item["label"])
             label_score = math.log2(len(labels) + 1)
             children = word_entry.get("child")
             child_score = math.log2((len(children) if children else 0) + 4)
             score = prob_score * aoa_score * tran_score * item_score * label_score * child_score
             if regex.fullmatch(r"\d+", word):
                 score *= 0.1
             elif regex.match(r"\d", word):
                 score *= 0.3
             elif regex.search(r"^[^\p{Latin}]", word) or regex.search(
                     r"[^\p{Latin}]$", word):
                 score *= 0.5
             elif regex.search(r".[\p{Lu}]", word):
                 score *= 0.5
             max_score = max(max_score, score)
         scores.append((key, max_score))
         num_entries += 1
         if num_entries % 10000 == 0:
             logger.info("Reading: entries={}".format(num_entries))
         it.Next()
     if tran_prob_dbm:
         tran_prob_dbm.Close().OrDie()
     input_dbm.Close().OrDie()
     logger.info("Reading done: entries={}".format(num_entries))
     scores = sorted(scores, key=lambda x: x[1], reverse=True)
     with open(self.output_path, "w") as out_file:
         num_entries = 0
         for key, score in scores:
             print(key, file=out_file)
             num_entries += 1
             if num_entries % 10000 == 0:
                 logger.info("Writing: entries={}".format(num_entries))
         logger.info("Writing done: entries={}".format(num_entries))
     logger.info("Process done: elapsed_time={:.2f}s".format(time.time() -
                                                             start_time))
 def __init__(self, data_prefix, language):
     self.language = language
     self.tokenizer = tkrzw_tokenizer.Tokenizer()
     word_score_path = tkrzw_dict.GetCoocScorePath(data_prefix)
     self.word_score_dbm = tkrzw.DBM()
     self.word_score_dbm.Open(word_score_path, False, dbm="HashDBM").OrDie()
Example #15
    def DivideCoocCount(self, cooc_count_path, word_prob_path, cooc_prob_path):
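        # Converts cooccurrence counts into per-word conditional probabilities
        # and IDF-weighted scores, caching word probabilities in an on-memory
        # CacheDBM, and saves the cooccurrence list of each word.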
        start_time = time.time()
        logger.info(
            "Writing the coocccurrence probability database: src={}, dest={}".
            format(cooc_count_path, cooc_prob_path))
        cooc_count_dbm = tkrzw.DBM()
        cooc_count_dbm.Open(cooc_count_path, False, dbm="SkipDBM").OrDie()
        word_prob_dbm = tkrzw.DBM()
        word_prob_dbm.Open(word_prob_path, False, dbm="HashDBM").OrDie()
        cooc_prob_dbm = tkrzw.DBM()
        num_buckets = word_prob_dbm.Count() * 2
        cooc_prob_dbm.Open(cooc_prob_path,
                           True,
                           dbm="HashDBM",
                           truncate=True,
                           offset_width=4,
                           num_buckets=num_buckets).OrDie()
        word_prob_cache = tkrzw.DBM()
        word_prob_cache.Open("",
                             True,
                             dbm="CacheDBM",
                             cap_rec_num=PROB_CACHE_CAPACITY)

        def GetWordProb(key):
            value = word_prob_cache.Get(key)
            if value:
                return float(value)
            value = word_prob_dbm.GetStr(key)
            if value:
                word_prob_cache.Set(key, value)
                return float(value)
            return None

        it = cooc_count_dbm.MakeIterator()
        it.First()
        record = it.GetStr()
        if not record or len(record[0]) != 0:
            raise RuntimeError("invalid first record")
        num_sentences = int(record[1])
        it.Next()
        num_records = 0
        cur_word = None
        cur_word_prob = 0
        cooc_words = []
        while True:
            record = it.GetStr()
            if not record:
                break
            word_pair = record[0]
            count = int(record[1]) / tkrzw_dict.COOC_BASE_SCORE
            word, cooc_word = word_pair.split(" ")
            if cur_word != word:
                if cooc_words:
                    self.SaveCoocWords(cur_word, cooc_words, cooc_prob_dbm)
                    num_records += 1
                    if num_records % 1000 == 0:
                        logger.info(
                            "Dividing coocurrence counts: {} records".format(
                                num_records))
                cur_word = word
                cur_word_prob = GetWordProb(cur_word)
                cooc_words = []
            if cur_word_prob:
                cooc_prob = GetWordProb(cooc_word)
                if cooc_prob:
                    cooc_idf = min(
                        math.log(cooc_prob) * -1, tkrzw_dict.MAX_IDF_WEIGHT)
                    cur_word_count = max(round(cur_word_prob * num_sentences),
                                         1)
                    prob = count / cur_word_count
                    score = prob * (cooc_idf**tkrzw_dict.IDF_POWER)
                    if tkrzw_dict.IsNumericWord(cooc_word):
                        score *= tkrzw_dict.NUMERIC_WORD_WEIGHT
                    elif tkrzw_dict.IsStopWord(self.language, cooc_word):
                        score *= tkrzw_dict.STOP_WORD_WEIGHT
                    cooc_words.append((cooc_word, prob, score))
            it.Next()
        if cur_word and cooc_words:
            self.SaveCoocWords(cur_word, cooc_words, cooc_prob_dbm)
        cooc_prob_dbm.Close().OrDie()
        word_prob_dbm.Close().OrDie()
        cooc_count_dbm.Close().OrDie()
        logger.info(
            "Writing the cooccurrence probability database done: elapsed_time={:.2f}s"
            .format(time.time() - start_time))
def Run(phrase_prob_path, rev_prob_path, tran_prob_path, tran_aux_paths,
        yomi_paths, min_phrase_prob, min_tran_prob):
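    # Reads gloss lines (word, POS, text) from stdin, extracts candidate
    # translations per part of speech, scores them against the probability
    # databases, and prints the surviving entries.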
    logger.info("Start the process")
    phrase_prob_dbm = None
    if phrase_prob_path:
        logger.info("Opening the phrase prob DB: " + phrase_prob_path)
        phrase_prob_dbm = tkrzw.DBM()
        phrase_prob_dbm.Open(phrase_prob_path, False, dbm="HashDBM").OrDie()
    rev_prob_dbm = None
    if rev_prob_path:
        logger.info("Opening the reverse prob DB: " + rev_prob_path)
        rev_prob_dbm = tkrzw.DBM()
        rev_prob_dbm.Open(rev_prob_path, False, dbm="HashDBM").OrDie()
    tran_prob_dbm = None
    if tran_prob_path:
        logger.info("Opening the tran prob DB: " + tran_prob_path)
        tran_prob_dbm = tkrzw.DBM()
        tran_prob_dbm.Open(tran_prob_path, False, dbm="HashDBM").OrDie()
    aux_trans = collections.defaultdict(list)
    for tran_aux_path in tran_aux_paths.split(","):
        tran_aux_path = tran_aux_path.strip()
        if tran_aux_path:
            logger.info("Reading the tran aux file: " + tran_aux_path)
            with open(tran_aux_path) as input_file:
                uniq_keys = set()
                for line in input_file:
                    fields = line.strip().split("\t")
                    if len(fields) < 2: continue
                    word = fields[0]
                    for tran in fields[1:]:
                        uniq_key = word + ":" + tran
                        if uniq_key in uniq_keys: continue
                        aux_trans[word].append(tran)
                        uniq_keys.add(uniq_key)
    yomis = set()
    for yomi_path in yomi_paths.split(","):
        yomi_path = yomi_path.strip()
        if yomi_path:
            logger.info("Reading the yomi file: " + yomi_path)
            with open(yomi_path) as input_file:
                for line in input_file:
                    fields = line.strip().split("\t")
                    if len(fields) < 1: continue
                    yomis.add(fields[0])
    logger.info("Processing the gross.")
    tokenizer = tkrzw_tokenizer.Tokenizer()
    word_dict = collections.defaultdict(list)
    alt_source = None
    alt_targets = None
    num_lines = 0
    for line in sys.stdin:
        num_lines += 1
        if num_lines % 10000 == 0:
            logger.info("Processing the gross: {} lines: {} items".format(
                num_lines, len(word_dict)))
        fields = line.strip().split("\t")
        if len(fields) != 3: continue
        word, pos, text = fields
        if pos == "alternative":
            alt_source = word
            alt_targets = set()
            for alt in regex.split(r"[,;]", text):
                if regex.fullmatch(r"[\p{Han}\p{Hiragana}\p{Katakana}ー]+",
                                   alt):
                    alt_targets.add(alt)
            continue
        text = regex.sub(r"\.$", "", text).strip()
        for tran in regex.split(r"[,;]", text):
            tran = tran.strip()
            if pos == "verb":
                tran = regex.sub(r"^to ", "", tran)
            if pos == "noun":
                tran = regex.sub(r"^(a|an|the) ", "", tran)
            tran = regex.sub("^[-~] ", "", tran)
            tran = regex.sub(" [-~]$", "", tran)
            if not regex.fullmatch(r"[-_\p{Latin}0-9'. ]+", tran): continue
            tokens = tran.split(" ")
            if len(tokens) < 1 or len(tokens) > 4: continue
            word_dict[tran].append((pos, word))
            if alt_source == word:
                for alt in alt_targets:
                    word_dict[tran].append((pos, alt))
    norm_word_dict = collections.defaultdict(list)
    for word, trans in word_dict.items():
        scored_trans, phrase_prob = ProcessWord(word, trans, tokenizer,
                                                phrase_prob_dbm, rev_prob_dbm,
                                                tran_prob_dbm, aux_trans,
                                                yomis, min_phrase_prob,
                                                min_tran_prob)
        if scored_trans:
            key = tkrzw_dict.NormalizeWord(word)
            norm_word_dict[key].append((word, scored_trans, phrase_prob))
    for key, entries in norm_word_dict.items():
        sum_phrase_prob = 0.0
        for word, scored_trans, phrase_prob in entries:
            sum_phrase_prob += phrase_prob
        for word, scored_trans, phrase_prob in entries:
            if sum_phrase_prob > 0:
                if key == word:
                    if phrase_prob / sum_phrase_prob < 0.6: continue
                else:
                    if phrase_prob / sum_phrase_prob < 0.8: continue
            PrintEntry(word, scored_trans)
    if tran_prob_dbm:
        tran_prob_dbm.Close().OrDie()
    if phrase_prob_dbm:
        phrase_prob_dbm.Close().OrDie()
    logger.info("Process done")
Example #17
 def Run(self):
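     # Builds a translation-to-word index: each translation (and shortened
     # variants of it) is weighted by word probability and AOA, optionally
     # reranked with the translation probability DB, and the sorted word
     # lists are written to the output hash database.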
     start_time = time.time()
     logger.info("Process started: input_path={}, output_path={}".format(
         self.input_path, self.output_path))
     mem_index = tkrzw.DBM()
     mem_index.Open("", True, dbm="BabyDBM").OrDie()
     input_dbm = tkrzw.DBM()
     input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
     it = input_dbm.MakeIterator()
     it.First()
     num_entries = 0
     num_translations = 0
     tran_dict = set()
     while True:
         record = it.GetStr()
         if not record: break
         key, serialized = record
         entry = json.loads(serialized)
         for word_entry in entry:
             word = word_entry["word"]
             prob = max(float(word_entry.get("probability") or "0"),
                        0.0000001)
             aoa = min(float(word_entry.get("aoa") or "20"), 20.0)
             score = prob * ((30 - aoa) / 10)
             word_trans = word_entry.get("translation") or []
             phrase_trans = []
             phrases = word_entry.get("phrase")
             if phrases:
                 for phrase in phrases:
                     if phrase.get("p") or phrase.get("i"): continue
                     for phrase_tran in phrase.get("x"):
                         phrase_tran = regex.sub(r"\(.*?\)", "",
                                                 phrase_tran).strip()
                         if phrase_tran:
                             phrase_trans.append(phrase_tran)
             weight_word_trans = []
             for trans, weight in [(word_trans, 1.0), (phrase_trans, 0.5)]:
                 for word_tran in trans:
                     weight_word_trans.append((word_tran, weight))
                     match = regex.search(
                         r"([\p{Han}\p{Katakana}ー]{2,})(する|すること|される|されること|をする)$",
                         word_tran)
                     if match:
                         short_word_tran = word_tran[:-len(match.group(2))]
                         if short_word_tran:
                             weight_word_trans.append(
                                 (short_word_tran, weight * 0.8))
                     short_word_tran = self.tokenizer.CutJaWordNounParticle(
                         word_tran)
                     if short_word_tran != word_tran:
                         weight_word_trans.append(
                             (short_word_tran, weight * 0.8))
                     match = regex.search(
                         r"([\p{Han}\p{Katakana}ー]{2,})(的|的な|的に)$",
                         word_tran)
                     if match:
                         short_word_tran = word_tran[:-len(match.group(2))]
                         if short_word_tran:
                             weight_word_trans.append(
                                 (short_word_tran, weight * 0.8))
                     match = regex.search(
                         r"([\p{Han}]{2,})(が|の|を|に|へ|と|より|から|で|や|な|なる|たる)$",
                         word_tran)
                     if match:
                         short_word_tran = word_tran[:-len(match.group(2))]
                         if short_word_tran:
                             weight_word_trans.append(
                                 (short_word_tran, weight * 0.8))
             uniq_trans = set()
             for tran, weight in weight_word_trans:
                 norm_tran = tkrzw_dict.NormalizeWord(tran)
                 if norm_tran in uniq_trans: continue
                 uniq_trans.add(norm_tran)
                 pair = "{}\t{:.8f}".format(word, score * weight)
                 score *= 0.98
                 mem_index.Append(norm_tran, pair, "\t").OrDie()
             for item in word_entry["item"]:
                 if item["label"] in self.supplement_labels:
                     for tran in item["text"].split(","):
                         tran = tran.strip()
                         if tran:
                             tran_dict_key = word + "\t" + tran
                             tran_dict.add(tran_dict_key)
             num_translations += len(uniq_trans)
         num_entries += 1
         if num_entries % 10000 == 0:
             logger.info("Reading: entries={}, translations={}".format(
                 num_entries, num_translations))
         it.Next()
     input_dbm.Close().OrDie()
     logger.info("Reading done: entries={}, translations={}".format(
         num_entries, num_translations))
     output_dbm = tkrzw.DBM()
     num_buckets = mem_index.Count() * 2
     output_dbm.Open(self.output_path,
                     True,
                     dbm="HashDBM",
                     truncate=True,
                     align_pow=0,
                     num_buckets=num_buckets).OrDie()
     tran_prob_dbm = None
     if self.tran_prob_path:
         tran_prob_dbm = tkrzw.DBM()
         tran_prob_dbm.Open(self.tran_prob_path, False,
                            dbm="HashDBM").OrDie()
     it = mem_index.MakeIterator()
     it.First()
     num_records = 0
     while True:
         record = it.GetStr()
         if not record: break
         key, value = record
         scored_trans = []
         uniq_words = set()
         fields = value.split("\t")
         for i in range(0, len(fields), 2):
             word = fields[i]
             score = float(fields[i + 1])
             if word in uniq_words: continue
             uniq_words.add(word)
             if tran_prob_dbm:
                 prob = self.GetTranProb(tran_prob_dbm, word, key)
                 tran_dict_key = word + "\t" + key
                 prob = max(prob, 0.000001)
                 if tran_dict_key in tran_dict:
                     prob += 0.1
                 score = (score * prob)**0.5
             scored_trans.append((word, score))
         scored_trans = sorted(scored_trans,
                               key=lambda x: x[1],
                               reverse=True)
         value = "\t".join([x[0] for x in scored_trans])
         output_dbm.Set(key, value).OrDie()
         num_records += 1
         if num_records % 10000 == 0:
             logger.info("Writing: records={}".format(num_records))
         it.Next()
     if tran_prob_dbm:
         tran_prob_dbm.Close().OrDie()
     output_dbm.Close().OrDie()
     logger.info("Writing done: records={}".format(num_records))
     mem_index.Close().OrDie()
     logger.info("Process done: elapsed_time={:.2f}s".format(time.time() -
                                                             start_time))
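A note on the scoring above: the memory-index weight and the translation probability are blended as a geometric mean, with a flat +0.1 probability bonus for pairs also found in the supplement dictionary. A minimal, self-contained sketch of that arithmetic (the helper name and sample numbers are illustrative, not part of the example):

def blend_score(index_score, tran_prob, in_tran_dict):
    # Floor the probability so unseen pairs are not zeroed out entirely.
    prob = max(tran_prob, 0.000001)
    # Pairs backed by the supplement dictionary get a flat bonus.
    if in_tran_dict:
        prob += 0.1
    # The geometric mean keeps both factors influential.
    return (index_score * prob) ** 0.5

# blend_score(0.04, 0.25, False) == 0.1, while the dictionary bonus lifts
# blend_score(0.04, 0.25, True) to about 0.118.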
Exemple #18
0
 def Run(self):
   start_time = time.time()
   logger.info("Process started: input_path={}, output_path={}".format(
     self.input_path, self.output_path))
   input_dbm = tkrzw.DBM()
   input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
   it = input_dbm.MakeIterator()
   logger.info("Getting AOA records")
   it.First()
   num_entries = 0
   aoa_records = {}
   while True:
     record = it.GetStr()
     if not record: break
     key, serialized = record
     entry = json.loads(serialized)
     for word_entry in entry:
       word = word_entry["word"]
       aoa = (word_entry.get("aoa") or word_entry.get("aoa_concept") or
              word_entry.get("aoa_base"))
       if aoa:
         aoa_records[word] = float(aoa)
     num_entries += 1
     if num_entries % 10000 == 0:
       logger.info("Getting AOA records: entries={}".format(num_entries))
     it.Next()
   it.First()
   num_entries = 0
   scores = []
   while True:
     record = it.GetStr()
     if not record: break
     key, serialized = record
     entry = json.loads(serialized)
     max_score = 0
     for word_entry in entry:
       word = word_entry["word"]
       prob = float(word_entry.get("probability") or "0")
       prob_score = max(prob ** 0.5, 0.00001)
       aoa = (word_entry.get("aoa") or word_entry.get("aoa_concept") or
              word_entry.get("aoa_base"))
       if aoa:
         aoa = float(aoa)
       else:
         aoa = sys.maxsize
         tokens = word.split(" ")
         if len(tokens) > 1:
           max_aoa = 0
           for token in tokens:
             token_aoa = aoa_records.get(token)
             if token_aoa:
               max_aoa = max(max_aoa, float(token_aoa))
             else:
               max_aoa = sys.maxsize
           if max_aoa < sys.maxsize:
             aoa = max_aoa + len(tokens) - 1
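        # Map age-of-acquisition to a multiplier: an AoA of 5 gives (25 - 5) / 10 = 2.0,
        # while the cap of 20 (also hit when AoA is unknown) gives the 0.5 floor,
        # so earlier-learned words are boosted.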
       aoa_score = (25 - min(aoa, 20.0)) / 10.0
       tran_score = 1.0 if "translation" in word_entry else 0.5
       item_score = math.log2(len(word_entry["item"]) + 1)
       labels = set()
       for item in word_entry["item"]:
         labels.add(item["label"])
       label_score = len(labels) + 1
       children = word_entry.get("child")
       child_score = math.log2((len(children) if children else 0) + 4)
       score = prob_score * aoa_score * tran_score * item_score * child_score
       if regex.fullmatch(r"\d+", word):
         score *= 0.1
       elif regex.match(r"\d", word):
         score *= 0.3
       elif regex.search(r"^[^\p{Latin}]", word) or regex.search(r"[^\p{Latin}]$", word):
         score *= 0.5
       elif regex.search(r".[\p{Lu}]", word):
         score *= 0.5
       max_score = max(max_score, score)
     scores.append((key, max_score))
     num_entries += 1
     if num_entries % 10000 == 0:
       logger.info("Reading: entries={}".format(num_entries))
     it.Next()
   input_dbm.Close().OrDie()
   logger.info("Reading done: entries={}".format(num_entries))
   scores = sorted(scores, key=lambda x: x[1], reverse=True)
   with open(self.output_path, "w") as out_file:
     num_entries = 0
     for key, score in scores:
       print(key, file=out_file)
       num_entries += 1
       if num_entries % 10000 == 0:
         logger.info("Writing: entries={}".format(num_entries))
     logger.info("Writing done: entries={}".format(num_entries))
   logger.info("Process done: elapsed_time={:.2f}s".format(time.time() - start_time))
Exemple #19
0
 def __init__(self):
     self.__counter = 0
     self.__db = tkrzw.DBM()
     self.__fname = None
Exemple #20
0
def Run(rev_prob_path, min_count, enough_ef, enough_fe, omit_latin,
        min_score, min_score_large, min_score_stop, max_targets, tran_aux_paths):
  start_time = time.time()
  logger.info("Process started")
  aux_trans = collections.defaultdict(list)
  for tran_aux_path in tran_aux_paths:
    if not tran_aux_path: continue
    logger.info("Reading: " + tran_aux_path)
    with open(tran_aux_path) as input_file:
      for line in input_file:
        fields = line.strip().split("\t")
        if len(fields) < 2: continue
        source = fields[0]
        for target in fields[1:]:
          target = unicodedata.normalize('NFKC', target)
          target = regex.sub(r"[\p{Ps}\p{Pe}\p{C}]", "", target)
          target = regex.sub(r"\s+", " ", target).strip()
          if target:
            aux_trans[source].append(target)
  rev_prob_dbm = None
  if rev_prob_path:
    logger.info("Reading: " + rev_prob_path)
    rev_prob_dbm = tkrzw.DBM()
    rev_prob_dbm.Open(rev_prob_path, False, dbm="HashDBM").OrDie()
  logger.info("Processing records")
  records = {}
  for line in sys.stdin:
    fields = line.strip().split("\t")
    if len(fields) < 3: continue
    source = fields[0]
    count = int(fields[1])
    targets = []
    for field in fields[2:]:
      columns = field.split("|")
      if len(columns) != 3: continue
      targets.append((columns[0], float(columns[1]), float(columns[2])))
    records[source] = (count, targets)
  for source, (count, targets) in records.items():
    if count < min_count: continue
    if len(source) <= 1: continue
    large = bool(regex.search(r"^\p{Lu}", source))
    if large:
      cap_source = source.lower()
    else:
      cap_source = source[0].upper() + source[1:]
    cap_count, cap_targets = 0, []
    if cap_source != source:
      cap_record = records.get(cap_source)
      if cap_record:
        cap_count, cap_targets = cap_record
    if large:
      cap_count *= 5.0
    if count < cap_count: continue
    scored_targets = []
    for target, ef_prob, fe_prob in targets:
      for cap_target, cap_ef_prob, cap_fe_prob in cap_targets:
        if cap_target == target:
          fe_prob += cap_fe_prob
      ef_prob = min(1.0, ef_prob)
      fe_prob = min(1.0, fe_prob)
      score = ((ef_prob ** EF_WEIGHT) * (fe_prob ** FE_WEIGHT)) ** (1 / (EF_WEIGHT + FE_WEIGHT))
      #score = 2 * ef_prob * fe_prob / (ef_prob + fe_prob)
      scored_targets.append((target, score, ef_prob, fe_prob))
    scored_targets = sorted(scored_targets, key=lambda x: x[1], reverse=True)
    source_aux_trans = aux_trans.get(source) or []
    good_targets = []
    for target, score, ef_prob, fe_prob in scored_targets:
      if target in source_aux_trans:
        score *= 1.1
      else:
        is_prefix = False
        is_single_noun = False
        for cmp_target, cmp_score, _, _ in scored_targets:
          if target != cmp_target and cmp_target.startswith(target) and cmp_score >= min_score:
            if (cmp_target == target + "の" or cmp_target == target + "する") and regex.fullmatch(r"\p{Han}+", target):
              is_single_noun = True
            else:
              is_prefix = True
        is_stop = bool(regex.fullmatch(r"[\p{Hiragana}]+", target))
        if omit_latin and regex.search(r"[\p{Latin}]{2,}", target):
          continue
        if len(target) <= 1 and is_prefix and not is_single_noun:
          continue
        if large:
          if score < min_score_large:
            continue
        elif is_stop:
          if score < min_score_stop:
            continue
        else:
          if score < min_score:
            if (regex.search(r"[\p{Latin}]{4,}", source) and not regex.search(r"\d", source) and
                (regex.search(r"[\p{Han}]{2,}", target) or
                 regex.search(r"[\p{Han}][\p{Hiragana}]", target)) and
                (ef_prob >= enough_ef or fe_prob >= enough_fe)):
              pass
            else:
              continue
        norm_source = source.lower()
        norm_target = target.lower()
        if norm_source.find(norm_target) >= 0 or norm_target.find(norm_source) >= 0:
          continue
        if norm_target in ("する", "ます", "より", "から"):
          continue
        if norm_target.startswith("っ") or norm_target.startswith("を"):
          continue
        if norm_target.endswith("っ") or norm_target.endswith("を"):
          continue
        if regex.fullmatch(r"[\p{Hiragana}ー{Latin}]", target):
          continue
        if regex.search(r"^[\p{Hiragana}]+[\p{Han}\p{Katakana}\p{Latin}]", target):
          continue
        elif regex.search(r"[\p{Han}\{Katakana}ー\p{Latin}][は|が|を|と]", target):
          continue
      if len(target) <= 1:
        score *= 0.5
      elif len(target) <= 2:
        score *= 0.9
      if regex.fullmatch(r"[\p{Hiragana}ー]+", target):
        score *= 0.8
      elif regex.search(r"\d", target):
        score *= 0.8
      target = regex.sub(r"([\p{Han}\p{Katakana}ー\p{Latin}])だ", r"\1な", target)
      good_targets.append((target, score, ef_prob, fe_prob))
    if not good_targets: continue
    good_targets = sorted(good_targets, key=lambda x: x[1], reverse=True)
    outputs = []
    for target, score, ef_prob, fe_prob in good_targets[:max_targets]:
      if rev_prob_dbm:
        prob = GetPhraseProb(rev_prob_dbm, "ja", target)
        if prob < MIN_PROB:
          continue
      #outputs.append("{}:{:.3f}:{:.3f}:{:.3f}".format(target, score, ef_prob, fe_prob))
      outputs.append(target)
    if outputs:
      print("{}\t{}".format(source, "\t".join(outputs)))
  if rev_prob_dbm:
    rev_prob_dbm.Close().OrDie()
  logger.info("Process done: elapsed_time={:.2f}s".format(
    time.time() - start_time))
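The score that drives the filtering above is a weighted geometric mean of the forward (ef) and backward (fe) translation probabilities. A standalone sketch of that formula; EF_WEIGHT and FE_WEIGHT are module-level constants not shown here, so the values below are assumptions for illustration only:

EF_WEIGHT = 0.7  # assumed value, not taken from the example
FE_WEIGHT = 0.3  # assumed value, not taken from the example

def tran_score(ef_prob, fe_prob):
    # Weighted geometric mean: both directions must be reasonably strong.
    return ((ef_prob ** EF_WEIGHT) * (fe_prob ** FE_WEIGHT)) ** (1 / (EF_WEIGHT + FE_WEIGHT))

# tran_score(0.5, 0.1) is about 0.31: a strong forward probability cannot
# fully compensate for a weak backward one, which is the point of the blend.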
Exemple #21
0
    def Dump(self):
        logger.info(
            "Batch {} aggregation done: elapsed_time={:.2f}s, RSS={:.2f}MB".
            format(self.num_batches + 1,
                   time.time() - self.start_time,
                   tkrzw.Utility.GetMemoryUsage() / 1024.0 / 1024))
        logger.info(
            "Batch {} dumping: sentences={}, records={}, dup={}, unique_phrases={}"
            .format(self.num_batches + 1, self.num_sentences, self.num_records,
                    self.num_duplications, self.mem_phrase_count.Count()))
        start_time = time.time()
        fill_ratio = min(self.num_records / BATCH_MAX_RECORDS, 1.0)
        dbm_phrase_count_path = "{}-count-{:08d}.tks".format(
            self.data_prefix, self.num_batches)
        dbm_phrase_count = tkrzw.DBM()
        dbm_phrase_count.Open(dbm_phrase_count_path,
                              True,
                              dbm="SkipDBM",
                              truncate=True,
                              insert_in_order=True,
                              offset_width=4,
                              step_unit=4,
                              max_level=12).OrDie()
        logger.info("Batch {} word count dumping: dest={}".format(
            self.num_batches + 1, dbm_phrase_count_path))
        dbm_phrase_count.Set("", self.num_domains).OrDie()
        it = self.mem_phrase_count.MakeIterator()
        it.First()
        min_phrase_count = max(
            math.ceil(MIN_PHRASE_COUNT_IN_BATCH * fill_ratio), 2)
        re_symbol = regex.compile(r"[\p{S}\p{P}]")
        re_double_particle = regex.compile(
            r"^[\p{Hiragana}ー]+ [\p{Hiragana}ー]+")
        re_hiragana_only = regex.compile(r"[ \p{Hiragana}ー]+")
        particles = set([
            "を", "に", "が", "へ", "や", "の", "と", "から", "で", "より", "な", "は", "です",
            "ます", "この", "その", "あの", "こと", "する", "される", "た", "て", "と", "ある",
            "いる", "これ", "それ", "あれ", "れる", "という", "として", "だ", "など"
        ])
        prefixes = [x + " " for x in particles]

        def Output(src_phrase, trg_phrases):
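            # Rank the candidate target phrases for one source phrase: skip targets
            # containing symbols, discount particle-led, particle-only, hiragana-only
            # and very short targets, keep the top MAX_TARGETS_IN_BATCH, and store
            # them as "src\ttrg" -> count records.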
            scored_targets = []
            for trg_phrase, count in trg_phrases:
                score = count
                if trg_phrase:
                    if re_symbol.search(trg_phrase):
                        continue
                    if re_double_particle.search(trg_phrase):
                        score *= 0.5
                    elif trg_phrase in particles:
                        score *= 0.5
                    else:
                        hit = False
                        for prefix in prefixes:
                            if trg_phrase.startswith(prefix):
                                hit = True
                                break
                        if hit:
                            score *= 0.8
                    if re_hiragana_only.fullmatch(trg_phrase):
                        score *= 0.5
                    if len(trg_phrase) <= 1:
                        score *= 0.5
                    elif len(trg_phrase) <= 2:
                        score *= 0.8
                else:
                    score += 1
                scored_targets.append((trg_phrase, count, score))
            scored_targets = sorted(scored_targets,
                                    key=lambda x: x[2],
                                    reverse=True)
            scored_targets = scored_targets[:MAX_TARGETS_IN_BATCH]
            outputs = []
            for trg_phrase, count, score in scored_targets:
                key = src_phrase + "\t" + trg_phrase
                outputs.append((key, count))
            outputs = sorted(outputs)
            for key, value in outputs:
                dbm_phrase_count.Set(key, value).OrDie()

        last_src_phrase = ""
        trg_phrases = []
        while True:
            record = it.Get()
            if not record:
                break
            src_phrase, trg_phrase = record[0].decode().split("\t")
            count = struct.unpack(">q", record[1])[0]
            if src_phrase:
                if src_phrase != last_src_phrase:
                    if trg_phrases:
                        Output(last_src_phrase, trg_phrases)
                    trg_phrases = []
                if count >= min_phrase_count:
                    trg_phrases.append((trg_phrase, count))
                last_src_phrase = src_phrase
            else:
                if count >= min_phrase_count:
                    dbm_phrase_count.Set("\t" + trg_phrase, count).OrDie()
            it.Remove()
        if trg_phrases:
            Output(last_src_phrase, trg_phrases)
        dbm_phrase_count.Close().OrDie()
        logger.info("Dumping done: elapsed_time={:.2f}s".format(time.time() -
                                                                start_time))
        self.num_batches += 1
        merge_db_unit = 1
        while self.num_batches % (merge_db_unit * MERGE_DB_UNIT) == 0:
            merge_db_unit *= MERGE_DB_UNIT
            self.ReduceDatabases(merge_db_unit)
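The tail of Dump schedules hierarchical merges: each time the batch count crosses a power-of-MERGE_DB_UNIT boundary, progressively larger groups of batch databases are reduced. A small trace of that cadence, assuming MERGE_DB_UNIT = 4 purely for illustration (the real constant is defined elsewhere in the module):

MERGE_DB_UNIT = 4  # assumed for illustration only

for num_batches in range(1, 17):
    merge_db_unit = 1
    while num_batches % (merge_db_unit * MERGE_DB_UNIT) == 0:
        merge_db_unit *= MERGE_DB_UNIT
        print("after batch {}: reduce {} databases".format(num_batches, merge_db_unit))

# Prints a reduce of 4 databases after batches 4, 8, 12 and 16, plus a reduce
# of 16 after batch 16, mirroring the ReduceDatabases calls above.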
Exemple #22
0
def main():
    args = sys.argv[1:]
    if len(args) < 2:
        raise ValueError("invalid arguments")
    data_prefix = args[0]
    phrase_path = args[1]
    searcher = tkrzw_union_searcher.UnionSearcher(data_prefix)
    phrase_dbm = tkrzw.DBM()
    phrase_dbm.Open(phrase_path, False, dbm="HashDBM").OrDie()
    parent_index = collections.defaultdict(list)
    page_index = 1
    while True:
        result = searcher.SearchByGrade(100, page_index, True)
        if not result: break
        for entry in result:
            word = entry["word"]
            prob = max(float(entry.get("probability") or 0.0), 0.000001)
            item_labels = []
            for item in entry["item"]:
                label = item["label"]
                if label not in item_labels:
                    item_labels.append(label)
            if "wn" not in item_labels: continue
            features = GetFeatures(searcher, entry)
            rel_words = {}
            normals = []
            alternatives = entry.get("alternative") or []
            suffix_pairs = [("se", "ze"), ("ence", "ense"),
                            ("isation", "ization"), ("our", "or"),
                            ("og", "ogue"), ("re", "er"), ("l", "ll")]
            for gb_suffix, us_suffix in suffix_pairs:
                if word.endswith(gb_suffix):
                    us_word = word[:-len(gb_suffix)] + us_suffix
                    if us_word in normals: continue
                    if us_word in alternatives and searcher.CheckExact(
                            us_word):
                        normals.append(us_word)
            for alt in alternatives:
                if alt in normals: continue
                if word.count(" ") == alt.count(" "): continue
                dist = tkrzw.Utility.EditDistanceLev(word, alt)
                similar = False
                if dist == 1 and word[:3] != alt[:3]:
                    similar = True
                elif dist == 2 and word[:5] == alt[:5] and word[-2:] == alt[
                        -2:]:
                    similar = True
                if similar and searcher.CheckExact(alt):
                    word_prob = float(phrase_dbm.GetStr(word) or "0")
                    alt_prob = float(phrase_dbm.GetStr(alt) or "0")
                    if alt_prob > word_prob * 2:
                        normals.append(alt)
            parents = []
            for parent in entry.get("parent") or []:
                parent_entries = searcher.SearchBody(parent)
                if not parent_entries: continue
                parent_prob = 0
                for parent_entry in parent_entries:
                    if parent_entry["word"] != parent: continue
                    parent_prob = float(parent_entry["probability"] or "0")
                parents.append(parent)
            for parent in parent_index.get(word) or []:
                if parent not in parents:
                    parents.append(parent)
            if parents:
                weight = 1 / (min(len(parents), 5) + 1)
                for parent in parents:
                    rel_words[parent] = max(rel_words.get(parent) or 0, weight)
                    weight *= 0.9
            children = entry.get("child") or []
            if len(word) >= 5:
                for phrase in entry.get("phrase") or []:
                    phrase_word = phrase["w"]
                    if not phrase_word.startswith(word): continue
                    if phrase_word.endswith("ing") or phrase_word.endswith(
                            "ed"):
                        children.append(phrase_word)
            if children:
                weight = 1 / (min(len(parents), 5) + 2)
                for child in children:
                    rel_words[child] = max(rel_words.get(child) or 0, weight)
                    parent_index[child].append(word)
                    weight *= 0.9
            related = entry.get("related") or []
            if related:
                weight = 1 / (min(len(parents), 5) + 2)
                for rel_word in related:
                    rel_words[rel_word] = max(
                        rel_words.get(rel_word) or 0, weight)
                    weight *= 0.9
            synonyms = {}
            hypernyms = {}
            hyponyms = {}
            antonyms = {}
            similars = {}
            item_weight = 1.0
            for item in entry["item"]:
                if item["label"] != "wn": continue
                hit = False
                text = item["text"]
                for part in text.split("[-]"):
                    match = regex.search(r"\[([a-z]+)\]: (.*)", part.strip())
                    if match:
                        if match.group(1) == "synonym":
                            res_words = synonyms
                        elif match.group(1) == "hypernym":
                            res_words = hypernyms
                        elif match.group(1) == "hyponym":
                            res_words = hyponyms
                        elif match.group(1) == "antonym":
                            res_words = antonyms
                        elif match.group(1) == "similar":
                            res_words = similars
                        else:
                            continue
                        order_weight = 1.0
                        for rel_word in match.group(2).split(","):
                            rel_word = rel_word.strip()
                            if rel_word:
                                weight = item_weight * order_weight
                                res_words[rel_word] = max(
                                    res_words.get(rel_word) or 0, weight)
                                order_weight *= 0.95
                                hit = True
                if hit:
                    item_weight *= 0.95
            voted_words = set()
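            # Each relation type contributes per-word weights scaled by
            # 1 / (log(len) + penalty), so long candidate lists are discounted;
            # synonyms and hypernyms (propagate=True) also feed back into rel_words.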
            for cand_words, penalty, propagate in [(synonyms, 2, True),
                                                   (hypernyms, 2, True),
                                                   (hyponyms, 3, False),
                                                   (antonyms, 3, False),
                                                   (similars, 3, False)]:
                if not cand_words: continue
                type_weight = 1 / (math.log(len(cand_words)) + penalty)
                for cand_word, cand_weight in cand_words.items():
                    weight = cand_weight * type_weight
                    if cand_word in voted_words: continue
                    voted_words.add(cand_word)
                    features[cand_word] = (features.get(cand_word)
                                           or 0) + weight * 0.5
                    if propagate:
                        rel_words[cand_word] = max(
                            rel_words.get(cand_word) or 0, weight)
            for rel_word, weight in rel_words.items():
                AddFeatures(searcher, rel_word, weight, features)
            features.pop(word, None)
            features.pop("wikipedia", None)
            merged_features = {}
            for label, score in features.items():
                if regex.search(r"[\p{Han}\p{Katakana}\p{Hiragana}]", label):
                    label = NormalizeTran(label)
                    label = regex.sub(
                        r"[\p{Hiragana}]*(\p{Han})[\p{Hiragana}]*(\p{Han}).*",
                        r"\1\2", label)
                    label = regex.sub(r"([\p{Katakana}ー]{2,})\p{Hiragana}.*",
                                      r"\1", label)
                    label = regex.sub(r"\p{Hiragana}+([\p{Katakana}ー]{2,})",
                                      r"\1", label)
                merged_features[label] = max(
                    merged_features.get(label) or 0, score)
            features = [
                x for x in merged_features.items() if not x[0].startswith("__")
            ]
            gb_words = set()
            rel_words = [x[0] for x in features]
            rel_words.append(word)
            for rel_word in rel_words:
                for gb_suffix, us_suffix in suffix_pairs:
                    if rel_word.endswith(us_suffix):
                        gb_word = rel_word[:-len(us_suffix)] + gb_suffix
                        gb_words.add(gb_word)
            if not features: continue
            max_score = max(features, key=lambda x: x[1])[1]
            mod_features = []
            for label, score in features:
                if len(mod_features) >= 128: break
                if label in gb_words: continue
                score /= max_score
                mod_features.append((label, score))
            mod_features = sorted(mod_features,
                                  key=lambda x: x[1],
                                  reverse=True)
            fields = [word]
            fields.append(",".join(normals))
            fields.append(",".join(parents))
            fields.append(",".join(children))
            fields.append("{:.6f}".format(prob))
            for label, score in mod_features[:100]:
                fields.append(label)
                fields.append("{:.3f}".format(score))
            print("\t".join(fields))
        page_index += 1
    phrase_dbm.Close().OrDie()
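The suffix_pairs table above drives both the detection of US-spelling "normals" and the gb_words filter near the end. A hedged sketch of the same suffix rewriting in isolation (the function name is mine, not from the example):

SUFFIX_PAIRS = [("se", "ze"), ("ence", "ense"), ("isation", "ization"),
                ("our", "or"), ("og", "ogue"), ("re", "er"), ("l", "ll")]

def us_candidates(word):
    # Generate candidate US spellings for a GB-spelled headword, the same way
    # main() builds us_word before checking it against the alternatives.
    cands = []
    for gb_suffix, us_suffix in SUFFIX_PAIRS:
        if word.endswith(gb_suffix):
            cands.append(word[:-len(gb_suffix)] + us_suffix)
    return cands

# us_candidates("colour") -> ["color"]; us_candidates("analyse") -> ["analyze"].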