def __init__(self, input_path, output_path, supplement_labels, tran_prob_path):
  self.input_path = input_path
  self.output_path = output_path
  self.supplement_labels = supplement_labels
  self.tran_prob_path = tran_prob_path
  self.tokenizer = tkrzw_tokenizer.Tokenizer()
def Run(self):
  tokenizer = tkrzw_tokenizer.Tokenizer()
  start_time = time.time()
  logger.info("Process started: input_path={}, output_path={}, wnjpn_path={}".format(
    self.input_path, self.output_path, self.wnjpn_path))
  wnjpn_trans = self.ReadTranslations()
  aux_trans, subaux_trans = self.ReadAuxTranslations()
  synset_index = self.ReadSynsetIndex()
  self.AppendTranslations(wnjpn_trans, aux_trans, subaux_trans, synset_index)
  logger.info("Process done: elapsed_time={:.2f}s".format(time.time() - start_time))
def Run(self):
  tokenizer = tkrzw_tokenizer.Tokenizer()
  start_time = time.time()
  logger.info("Process started: input_path={}, output_path={}, wnjpn_path={}".format(
    self.input_path, self.output_path, self.wnjpn_path))
  wnjpn_trans = self.ReadTranslations()
  if self.feedback_path:
    feedback_trans = self.ReadFeedbackTranslations()
  else:
    feedback_trans = None
  aux_trans, subaux_trans, tran_thes = self.ReadAuxTranslations()
  synset_index = self.ReadSynsetIndex()
  tran_index = self.ReadTranIndex(synset_index)
  self.AppendTranslations(
    wnjpn_trans, feedback_trans, aux_trans, subaux_trans, tran_thes,
    synset_index, tran_index)
  logger.info("Process done: elapsed_time={:.2f}s".format(time.time() - start_time))
def __init__(self, input_path, output_path, supplement_labels, tran_prob_path,
             phrase_prob_path, rev_prob_path, yomi_first_paths, yomi_second_paths,
             tran_aux_paths, conj_verb_path, conj_adj_path, title):
  self.input_path = input_path
  self.output_path = output_path
  self.supplement_labels = supplement_labels
  self.tran_prob_path = tran_prob_path
  self.phrase_prob_path = phrase_prob_path
  self.rev_prob_path = rev_prob_path
  self.yomi_first_paths = yomi_first_paths
  self.yomi_second_paths = yomi_second_paths
  self.tran_aux_paths = tran_aux_paths
  self.conj_verb_path = conj_verb_path
  self.conj_adj_path = conj_adj_path
  self.title = title
  self.tokenizer = tkrzw_tokenizer.Tokenizer()
  self.num_words = 0
  self.num_items = 0
def main():
  args = sys.argv[1:]
  language = tkrzw_dict.GetCommandFlag(args, "--language", 1) or "en"
  lowering = tkrzw_dict.GetCommandFlag(args, "--lower", 0)
  stemming = tkrzw_dict.GetCommandFlag(args, "--stem", 0)
  max_sentences = int(tkrzw_dict.GetCommandFlag(args, "--max_sentences", 1) or "1000000")
  with_middle = tkrzw_dict.GetCommandFlag(args, "--middle", 0)
  with_readable = tkrzw_dict.GetCommandFlag(args, "--readable", 0)
  if tkrzw_dict.GetCommandFlag(args, "--quiet", 0):
    logger.setLevel(logging.ERROR)
  if args:
    raise RuntimeError("unknown arguments: {}".format(str(args)))
  logger.info(("Process started: language={}, lower={}, stem={}, max_sentences={}"
               ", middle={}, readable={}").format(
    language, lowering, stemming, max_sentences, with_middle, with_readable))
  tokenizer = tkrzw_tokenizer.Tokenizer()
  count = 0
  num_records, num_sentences, num_words = 0, 0, 0
  for line in sys.stdin:
    line = line.strip()
    if not line: continue
    count += 1
    stats = ProcessTSV(tokenizer, language, lowering, stemming,
                       max_sentences, with_middle, with_readable, line)
    if stats:
      num_records += 1
      num_sentences += stats[0]
      num_words += stats[1]
    if count % 1000 == 0:
      logger.info("Processing: {} input records, {} output records, {} sentences, {} words"
                  .format(count, num_records, num_sentences, num_words))
  logger.info("Process done: {} input records, {} output records, {} sentences, {} words"
              .format(count, num_records, num_sentences, num_words))
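# A usage sketch for the entry point above, assuming the script is saved as
# process_corpus_tsv.py (the script and data file names are hypothetical; the
# flags are the ones parsed in main() via tkrzw_dict.GetCommandFlag):
#
#   cat sentences.tsv | python3 process_corpus_tsv.py \
#     --language en --lower --stem --max_sentences 500000 --middle > out.tsv
#
# Value flags such as --language and --max_sentences consume one argument;
# presence flags such as --lower, --stem, --middle, --readable and --quiet
# take none.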
def __init__(self, input_path, output_path, keyword_path, best_labels, vetted_labels,
             preferable_labels, trustable_labels, supplement_labels, title,
             min_prob_normal, min_prob_capital, min_prob_multi, sufficient_prob,
             shrink):
  self.input_path = input_path
  self.output_path = output_path
  self.keyword_path = keyword_path
  self.best_labels = best_labels
  self.vetted_labels = vetted_labels
  self.preferable_labels = preferable_labels
  self.trustable_labels = trustable_labels
  self.supplement_labels = supplement_labels
  self.title = title
  self.min_prob_normal = min_prob_normal
  self.min_prob_capital = min_prob_capital
  self.min_prob_multi = min_prob_multi
  self.sufficient_prob = sufficient_prob
  self.shrink = shrink
  self.num_words = 0
  self.num_trans = 0
  self.num_items = 0
  self.num_aux_items = 0
  self.label_counters = collections.defaultdict(int)
  self.tokenizer = tkrzw_tokenizer.Tokenizer()
def AppendTranslations(self, wnjpn_trans, aux_trans, subaux_trans, synset_index):
  start_time = time.time()
  logger.info("Appending translations: input_path={}, output_path={}".format(
    self.input_path, self.output_path))
  input_dbm = tkrzw.DBM()
  input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
  phrase_prob_dbm = None
  if self.phrase_prob_path:
    phrase_prob_dbm = tkrzw.DBM()
    phrase_prob_dbm.Open(self.phrase_prob_path, False, dbm="HashDBM").OrDie()
  rev_prob_dbm = None
  if self.rev_prob_path:
    rev_prob_dbm = tkrzw.DBM()
    rev_prob_dbm.Open(self.rev_prob_path, False, dbm="HashDBM").OrDie()
  tokenizer = tkrzw_tokenizer.Tokenizer()
  tran_prob_dbm = None
  if self.tran_prob_path:
    tran_prob_dbm = tkrzw.DBM()
    tran_prob_dbm.Open(self.tran_prob_path, False, dbm="HashDBM").OrDie()
  output_dbm = tkrzw.DBM()
  num_buckets = input_dbm.Count() * 2
  output_dbm.Open(self.output_path, True, dbm="HashDBM", truncate=True,
                  align_pow=0, num_buckets=num_buckets).OrDie()
  num_words = 0
  num_orig_trans = 0
  num_match_trans = 0
  num_voted_trans = 0
  num_items = 0
  num_items_bare = 0
  num_items_rescued = 0
  it = input_dbm.MakeIterator()
  it.First()
  while True:
    record = it.GetStr()
    if not record: break
    key, serialized = record
    entry = json.loads(serialized)
    items = entry["item"]
    # Normalize per-spelling phrase probabilities into ratios.
    spell_ratios = {}
    for item in items:
      word = item["word"]
      phrase_prob = float(item.get("prob") or 0.0)
      spell_ratios[word] = phrase_prob + 0.00000001
    sum_prob = 0.0
    for word, prob in spell_ratios.items():
      sum_prob += prob
    for word, prob in list(spell_ratios.items()):
      spell_ratios[word] = prob / sum_prob
    for item in items:
      word = item["word"]
      pos = item["pos"]
      synset = item["synset"]
      links = item.get("link") or {}
      phrase_prob = float(item.get("prob") or 0.0)
      spell_ratio = spell_ratios[word]
      synonyms = item.get("synonym") or []
      hypernyms = item.get("hypernym") or []
      hyponyms = item.get("hyponym") or []
      similars = item.get("similar") or []
      derivatives = item.get("derivative") or []
      synonym_ids = links.get("synonym") or []
      hypernym_ids = links.get("hypernym") or []
      hyponym_ids = links.get("hyponym") or []
      similar_ids = links.get("similar") or []
      derivative_ids = links.get("derivative") or []
      item_tran_pairs = wnjpn_trans.get(synset) or []
      item_aux_trans = aux_trans.get(word) or []
      item_aux_trans.extend(subaux_trans.get(word) or [])
      self.NormalizeTranslationList(tokenizer, pos, item_aux_trans)
      item_trans = []
      hand_trans = set()
      # Accept a machine-made ("mono") translation only if it is close in edit
      # distance to some auxiliary candidate.
      for tran, src in item_tran_pairs:
        if src == "mono":
          hit = False
          for item_aux_tran in item_aux_trans:
            dist = tkrzw.Utility.EditDistanceLev(tran, item_aux_tran)
            dist_ratio = dist / max(len(tran), len(item_aux_tran))
            if dist_ratio < 0.3:
              hit = True
          if not hit: continue
        item_trans.append(tran)
        if src == "hand":
          hand_trans.add(tran)
      self.NormalizeTranslationList(tokenizer, pos, item_trans)
      num_items += 1
      bare = not item_trans
      if bare:
        num_items_bare += 1
      num_orig_trans += len(item_trans)
      # Collect candidate translations from related words and vote on them.
      syno_tran_counts = collections.defaultdict(int)
      hyper_tran_counts = collections.defaultdict(int)
      hypo_tran_counts = collections.defaultdict(int)
      similar_tran_counts = collections.defaultdict(int)
      derivative_tran_counts = collections.defaultdict(int)
      aux_trans_set = set(item_aux_trans)
      checked_words = set()
      checked_ids = set([synset])
      voted_rel_words = set()
      voted_rel_records = set()
      for rel_words, rel_ids, tran_counts in (
          (synonyms, synonym_ids, syno_tran_counts),
          (hypernyms, hypernym_ids, hyper_tran_counts),
          (hyponyms, hyponym_ids, hypo_tran_counts),
          (similars, similar_ids, similar_tran_counts),
          (derivatives, derivative_ids, derivative_tran_counts)):
        for rel_word in rel_words:
          is_similar = self.AreSimilarWords(rel_word, word)
          rel_phrase_prob = 0.0
          if phrase_prob_dbm:
            rel_phrase_prob = self.GetPhraseProb(
              phrase_prob_dbm, tokenizer, "en", rel_word)
          mean_prob = (phrase_prob * rel_phrase_prob) ** 0.5
          rel_aux_trans = []
          if rel_word not in checked_words:
            checked_words.add(rel_word)
            tmp_aux_trans = aux_trans.get(rel_word)
            if tmp_aux_trans:
              rel_aux_trans.extend(tmp_aux_trans)
          for rel_id in synset_index[rel_word]:
            if rel_id not in rel_ids: continue
            if rel_id not in checked_ids:
              checked_ids.add(rel_id)
              tmp_aux_trans = wnjpn_trans.get(rel_id)
              if tmp_aux_trans:
                tmp_aux_trans = [x[0] for x in tmp_aux_trans]
                rel_aux_trans.extend(tmp_aux_trans)
          if rel_aux_trans:
            self.NormalizeTranslationList(tokenizer, pos, rel_aux_trans)
            if not is_similar and mean_prob < 0.0005:
              for item_aux_tran in item_aux_trans:
                if regex.fullmatch(r"[\p{Hiragana}]{,3}", item_aux_tran): continue
                if item_aux_tran in rel_aux_trans:
                  valid_pos = self.IsValidPosTran(tokenizer, pos, item_aux_tran)
                  if valid_pos and item_aux_tran not in item_trans:
                    item_trans.append(item_aux_tran)
                    num_match_trans += 1
            if mean_prob < 0.005:
              voted_top = rel_word
              for voted_rel_word in voted_rel_words:
                if self.AreSimilarWords(rel_word, voted_rel_word):
                  voted_top = voted_rel_word
                  break
              voted_rel_words.add(rel_word)
              for rel_aux_tran in set(rel_aux_trans):
                voted_record = (voted_top, rel_aux_tran)
                if voted_record in voted_rel_records: continue
                voted_rel_records.add(voted_record)
                tran_counts[rel_aux_tran] += 1
      if bare:
        for deri_tran, count in derivative_tran_counts.items():
          syno_tran_counts[deri_tran] += count
        derivative_tran_counts.clear()
      for syno_tran, count in syno_tran_counts.items():
        if regex.fullmatch(r"[\p{Hiragana}]{,3}", syno_tran): continue
        if syno_tran in hyper_tran_counts: count += 1
        if syno_tran in hypo_tran_counts: count += 1
        if syno_tran in similar_tran_counts: count += 1
        if syno_tran in derivative_tran_counts: count += 1
        if bare and syno_tran in aux_trans_set: count += 1
        if count >= 3 and syno_tran not in item_trans:
          valid_pos = self.IsValidPosTran(tokenizer, pos, syno_tran)
          if valid_pos and syno_tran not in item_trans:
            item_trans.append(syno_tran)
            num_voted_trans += 1
      item_score = 0.0
      if item_trans:
        if bare:
          num_items_rescued += 1
        tran_scores = None
        if rev_prob_dbm or tran_prob_dbm:
          item_trans, item_score, tran_scores = self.SortWordsByScore(
            word, item_trans, hand_trans, rev_prob_dbm, tokenizer, tran_prob_dbm)
        item["translation"] = item_trans[:MAX_TRANSLATIONS_PER_WORD]
        if tran_scores:
          tran_score_map = {}
          for tran, tran_score in tran_scores[:MAX_TRANSLATIONS_PER_WORD]:
            tran_score_map[tran] = "{:.6f}".format(tran_score).replace("0.", ".")
          item["translation_score"] = tran_score_map
      item_score += spell_ratio * 0.5
      item["score"] = "{:.8f}".format(item_score).replace("0.", ".")
      if "link" in item:
        del item["link"]
    if rev_prob_dbm:
      entry["item"] = sorted(
        items, key=lambda item: float(item.get("score") or 0.0), reverse=True)
    serialized = json.dumps(entry, separators=(",", ":"), ensure_ascii=False)
    output_dbm.Set(key, serialized).OrDie()
    num_words += 1
    if num_words % 10000 == 0:
      logger.info("Saving words: words={}".format(num_words))
    it.Next()
  output_dbm.Close().OrDie()
  if tran_prob_dbm:
    tran_prob_dbm.Close().OrDie()
  if rev_prob_dbm:
    rev_prob_dbm.Close().OrDie()
  if phrase_prob_dbm:
    phrase_prob_dbm.Close().OrDie()
  input_dbm.Close().OrDie()
  logger.info("Appending translations done: words={}, elapsed_time={:.2f}s".format(
    num_words, time.time() - start_time))
  logger.info("Stats: orig={}, match={}, voted={}, items={}, bare={}, rescued={}".format(
    num_orig_trans, num_match_trans, num_voted_trans, num_items,
    num_items_bare, num_items_rescued))
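# A standalone sketch of the edit-distance gate used above for "mono"
# translations; tkrzw.Utility.EditDistanceLev is the real tkrzw API, while the
# sample strings are made up:
#
#   import tkrzw
#   tran, aux = "計算機", "計算器"
#   dist = tkrzw.Utility.EditDistanceLev(tran, aux)  # 1 differing character
#   ratio = dist / max(len(tran), len(aux))          # 1 / 3 = 0.333...
#   hit = ratio < 0.3                                # False: too dissimilar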
def main():
  args = sys.argv[1:]
  if len(args) < 1:
    raise ValueError("invalid arguments")
  input_path = args[0]
  is_synset = False
  for arg in args[1:]:
    if arg == "--synset":
      is_synset = True
    else:
      raise ValueError("invalid arguments")
  tokenizer = tkrzw_tokenizer.Tokenizer()
  dbm = tkrzw.DBM()
  dbm.Open(input_path, False).OrDie()
  it = dbm.MakeIterator()
  it.First().OrDie()
  while True:
    record = it.GetStr()
    if not record: break
    key, data = record
    entries = json.loads(data)
    for entry in entries:
      word = entry["word"]
      if is_synset:
        for item in entry["item"]:
          pos = item["pos"]
          text = item["text"]
          syn_match = regex.search(r"\[synset\]: ([-0-9a-z]+)", text)
          tran_match = regex.search(r"\[translation\]: ([^\[]+)", text)
          if syn_match and tran_match:
            syn = syn_match.group(1)
            tran = tran_match.group(1)
            tran = regex.sub(r"\([^)]+\)", "", tran)
            norm_trans = []
            uniq_trans = set()
            for syn_tran in tran.split(","):
              norm_tran = tokenizer.NormalizeJaWordForPos(pos, syn_tran.strip())
              if norm_tran and norm_tran not in uniq_trans:
                norm_trans.append(norm_tran)
                uniq_trans.add(norm_tran)
            if norm_trans:
              print("{}:{}\t{}".format(word, syn, "\t".join(norm_trans)))
      else:
        poses = set()
        tran_poses = {}
        for item in entry["item"]:
          pos = item["pos"]
          text = item["text"]
          poses.add(pos)
          tran_match = regex.search(r"\[translation\]: ([^\[]+)", text)
          if tran_match:
            tran = tran_match.group(1)
            tran = regex.sub(r"\([^)]+\)", "", tran)
            for syn_tran in tran.split(","):
              syn_tran = syn_tran.strip()
              if syn_tran and syn_tran not in tran_poses:
                tran_poses[syn_tran] = pos
        only_pos = list(poses)[0] if len(poses) == 1 else None
        translations = entry.get("translation")
        if translations:
          norm_trans = []
          uniq_trans = set()
          for tran in translations:
            pos = only_pos
            if not pos:
              pos = tran_poses.get(tran)
            norm_tran = tokenizer.NormalizeJaWordForPos(pos, tran) if pos else tran
            if norm_tran and norm_tran not in uniq_trans:
              norm_trans.append(norm_tran)
              uniq_trans.add(norm_tran)
          if norm_trans:
            print("{}\t{}".format(word, "\t".join(norm_trans)))
    it.Next()
  dbm.Close().OrDie()
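# A usage sketch for main() above, assuming the file is saved as
# extract_union_trans.py and the input is a tkrzw HashDBM dictionary body
# (all file names here are hypothetical):
#
#   python3 extract_union_trans.py union-body.tkh > word_trans.tsv
#   python3 extract_union_trans.py union-body.tkh --synset > synset_trans.tsv
#
# Without --synset each output line is "word<TAB>translations..."; with
# --synset it is "word:synset_id<TAB>translations...".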
def __init__(self, input_path, output_path, rev_prob_path):
  self.input_path = input_path
  self.rev_prob_path = rev_prob_path
  self.output_path = output_path
  self.tokenizer = tkrzw_tokenizer.Tokenizer()
  for target, score, ef_prob, fe_prob in good_targets[:max_targets]:
    if rev_prob_dbm:
      prob = GetPhraseProb(rev_prob_dbm, "ja", target)
      if prob < MIN_PROB: continue
    #outputs.append("{}:{:.3f}:{:.3f}:{:.3f}".format(target, score, ef_prob, fe_prob))
    outputs.append(target)
  if outputs:
    print("{}\t{}".format(source, "\t".join(outputs)))
  if rev_prob_dbm:
    rev_prob_dbm.Close().OrDie()
  logger.info("Process done: elapsed_time={:.2f}s".format(time.time() - start_time))


tokenizer = tkrzw_tokenizer.Tokenizer()


def GetPhraseProb(prob_dbm, language, word):
  base_prob = 0.000000001
  tokens = tokenizer.Tokenize(language, word, False, True)
  if not tokens: return base_prob
  max_ngram = min(3, len(tokens))
  fallback_penalty = 1.0
  for ngram in range(max_ngram, 0, -1):
    if len(tokens) <= ngram:
      cur_phrase = " ".join(tokens)
      prob = float(prob_dbm.GetStr(cur_phrase) or 0.0)
      if prob:
        return max(prob, base_prob)
      fallback_penalty *= 0.1
    else:
      # The source is truncated from here on; the rest of this branch is a
      # hedged reconstruction: score each ngram-sized window, give up on any
      # missing window, and deflate the combined probability once per extra
      # window before applying the accumulated fallback penalty.
      probs = []
      miss = False
      for index in range(len(tokens) - ngram + 1):
        cur_phrase = " ".join(tokens[index:index + ngram])
        cur_prob = float(prob_dbm.GetStr(cur_phrase) or 0.0)
        if not cur_prob:
          miss = True
          break
        probs.append(cur_prob)
      if not miss:
        prob = 1.0
        for cur_prob in probs:
          prob *= cur_prob
        prob *= 0.3 ** (len(probs) - 1)
        return max(prob * fallback_penalty, base_prob)
      fallback_penalty *= 0.1
  return base_prob
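# A worked example of the fallback arithmetic above, under the reconstructed
# else branch and with hypothetical DB contents: "big" -> 0.001 and
# "apple" -> 0.0005, with no entry for "big apple". For word="big apple",
# the ngram=2 pass misses, so fallback_penalty drops to 0.1, and the ngram=1
# pass combines the unigram probabilities:
#
#   prob = 0.001 * 0.0005 * (0.3 ** 1)  # = 1.5e-07 after one window deflation
#   result = max(prob * 0.1, 1e-09)     # = 1.5e-08 with the fallback penalty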
def Run(phrase_prob_path, rev_prob_path, tran_prob_path, tran_aux_paths, yomi_paths,
        min_phrase_prob, min_tran_prob):
  logger.info("Start the process")
  phrase_prob_dbm = None
  if phrase_prob_path:
    logger.info("Opening the phrase prob DB: " + phrase_prob_path)
    phrase_prob_dbm = tkrzw.DBM()
    phrase_prob_dbm.Open(phrase_prob_path, False, dbm="HashDBM").OrDie()
  rev_prob_dbm = None
  if rev_prob_path:
    logger.info("Opening the reverse prob DB: " + rev_prob_path)
    rev_prob_dbm = tkrzw.DBM()
    rev_prob_dbm.Open(rev_prob_path, False, dbm="HashDBM").OrDie()
  tran_prob_dbm = None
  if tran_prob_path:
    logger.info("Opening the tran prob DB: " + tran_prob_path)
    tran_prob_dbm = tkrzw.DBM()
    tran_prob_dbm.Open(tran_prob_path, False, dbm="HashDBM").OrDie()
  aux_trans = collections.defaultdict(list)
  for tran_aux_path in tran_aux_paths.split(","):
    tran_aux_path = tran_aux_path.strip()
    if tran_aux_path:
      logger.info("Reading the tran aux file: " + tran_aux_path)
      with open(tran_aux_path) as input_file:
        uniq_keys = set()
        for line in input_file:
          fields = line.strip().split("\t")
          if len(fields) < 2: continue
          word = fields[0]
          for tran in fields[1:]:
            uniq_key = word + ":" + tran
            if uniq_key in uniq_keys: continue
            aux_trans[word].append(tran)
            uniq_keys.add(uniq_key)
  yomis = set()
  for yomi_path in yomi_paths.split(","):
    yomi_path = yomi_path.strip()
    if yomi_path:
      logger.info("Reading the yomi file: " + yomi_path)
      with open(yomi_path) as input_file:
        for line in input_file:
          fields = line.strip().split("\t")
          if len(fields) < 1: continue
          yomis.add(fields[0])
  logger.info("Processing the glosses.")
  tokenizer = tkrzw_tokenizer.Tokenizer()
  word_dict = collections.defaultdict(list)
  alt_source = None
  alt_targets = None
  num_lines = 0
  for line in sys.stdin:
    num_lines += 1
    if num_lines % 10000 == 0:
      logger.info("Processing the glosses: {} lines: {} items".format(
        num_lines, len(word_dict)))
    fields = line.strip().split("\t")
    if len(fields) != 3: continue
    word, pos, text = fields
    if pos == "alternative":
      alt_source = word
      alt_targets = set()
      for alt in regex.split(r"[,;]", text):
        if regex.fullmatch(r"[\p{Han}\p{Hiragana}\p{Katakana}ー]+", alt):
          alt_targets.add(alt)
      continue
    text = regex.sub(r"\.$", "", text).strip()
    for tran in regex.split(r"[,;]", text):
      tran = tran.strip()
      if pos == "verb":
        tran = regex.sub(r"^to ", "", tran)
      if pos == "noun":
        tran = regex.sub(r"^(a|an|the) ", "", tran)
      tran = regex.sub("^[-~] ", "", tran)
      tran = regex.sub(" [-~]$", "", tran)
      if not regex.fullmatch(r"[-_\p{Latin}0-9'. ]+", tran): continue
      tokens = tran.split(" ")
      if len(tokens) < 1 or len(tokens) > 4: continue
      word_dict[tran].append((pos, word))
      if alt_source == word:
        for alt in alt_targets:
          word_dict[tran].append((pos, alt))
  norm_word_dict = collections.defaultdict(list)
  for word, trans in word_dict.items():
    scored_trans, phrase_prob = ProcessWord(
      word, trans, tokenizer, phrase_prob_dbm, rev_prob_dbm, tran_prob_dbm,
      aux_trans, yomis, min_phrase_prob, min_tran_prob)
    if scored_trans:
      key = tkrzw_dict.NormalizeWord(word)
      norm_word_dict[key].append((word, scored_trans, phrase_prob))
  for key, entries in norm_word_dict.items():
    sum_phrase_prob = 0.0
    for word, scored_trans, phrase_prob in entries:
      sum_phrase_prob += phrase_prob
    for word, scored_trans, phrase_prob in entries:
      if sum_phrase_prob > 0:
        if key == word:
          if phrase_prob / sum_phrase_prob < 0.6: continue
        else:
          if phrase_prob / sum_phrase_prob < 0.8: continue
      PrintEntry(word, scored_trans)
  if tran_prob_dbm:
    tran_prob_dbm.Close().OrDie()
  if rev_prob_dbm:
    rev_prob_dbm.Close().OrDie()
  if phrase_prob_dbm:
    phrase_prob_dbm.Close().OrDie()
  logger.info("Process done")
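# A worked example of the spelling-dominance filter above, with hypothetical
# numbers: if "Tokyo" (phrase_prob 0.0009) and "tokyo" (phrase_prob 0.0001)
# share the normalized key "tokyo", their shares of the summed probability are
# 0.9 and 0.1. The spelling equal to the key needs a share of at least 0.6 and
# any other spelling at least 0.8, so "tokyo" (0.1 < 0.6) is dropped and only
# "Tokyo" (0.9 >= 0.8) reaches PrintEntry.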
def __init__(self, data_prefix, language):
  self.language = language
  self.tokenizer = tkrzw_tokenizer.Tokenizer()
  word_score_path = tkrzw_dict.GetCoocScorePath(data_prefix)
  self.word_score_dbm = tkrzw.DBM()
  self.word_score_dbm.Open(word_score_path, False, dbm="HashDBM").OrDie()
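# A hedged usage sketch for the cooccurrence-score DB opened above; the class
# name, key scheme, and record encoding are assumptions (only DBM.Open and
# DBM.GetStr are the real tkrzw API):
#
#   searcher = CoocScoreSearcher("union", "en")  # hypothetical wrapper class
#   data = searcher.word_score_dbm.GetStr("apple")
#   if data:
#     pass  # decode the serialized cooccurrence scores for "apple"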