def compute_translation_statistics(tr, translation_lengths, long_trs, very_long_trs, translation_vocab):
    """
    Compute statistics related to a translation
    :param tr: current translation
    :param translation_lengths: list of all translation lengths
    :param long_trs: counter for long translations
    :param very_long_trs: counter for very long translations
    :param translation_vocab: vocabulary of all the words in different translations
    :return: the four last parameters to the function, after being updated for the current translation
    """
    words = tr.split()
    translation_lengths.append(len(words))
    if len(words) > 50:
        long_trs += 1
    if len(words) > 200:
        very_long_trs += 1

    for word in words:
        # Strip punctuation, but keep a standalone ellipsis as its own token.
        word = word.replace(",", "").replace("!", "").replace("?", "").replace(":", "").replace(";", "")
        if word.replace(".", "") == "":
            word = "..."
        else:
            word = word.replace(".", "")
        increment_count(translation_vocab, word)

    return translation_lengths, long_trs, very_long_trs, translation_vocab
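
# The functions in this module all rely on an increment_count helper that is
# not shown in this section. The definition below is a minimal sketch,
# assuming it simply bumps a per-key counter in a plain dict:
def increment_count(counts, key):
    # Hypothetical reconstruction: start a key at zero and add one.
    counts[key] = counts.get(key, 0) + 1


# Toy usage of compute_translation_statistics (strings and values illustrative):
# compute_translation_statistics("he went ... to the city.", [], 0, 0, {})
# returns ([6], 0, 0, {'he': 1, 'went': 1, '...': 1, 'to': 1, 'the': 1, 'city': 1})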
def hmm_preprocess(train_sents):
    """
    Collect the counts needed to train the HMM model
    :param train_sents: train sentences for the model
    :return: total token count, tag unigram/bigram/trigram counts, word-tag and tag counts,
             each word's most common tag, and each word's set of possible tags
    """
    print("Start training")
    total_tokens = 0
    q_tri_counts, q_bi_counts, q_uni_counts, e_word_tag_counts, e_tag_counts = {}, {}, {}, {}, {}

    # e_tag_counts
    for sentence in train_sents:
        for token in sentence:
            key = token[1]
            increment_count(e_tag_counts, key)

    # e_word_tag_counts
    for sentence in train_sents:
        for token in sentence:
            key = token
            increment_count(e_word_tag_counts, key)

    # Map each word to its most frequent tag, used as a fast fallback during decoding.
    most_common_tag = {}
    for word, tag in e_word_tag_counts:
        if word not in most_common_tag:
            most_common_tag[word] = (tag, e_word_tag_counts[word, tag])
        elif e_word_tag_counts[word, tag] > most_common_tag[word][1]:
            most_common_tag[word] = (tag, e_word_tag_counts[word, tag])
    most_common_tag["default"] = max(e_tag_counts, key=e_tag_counts.get)

    # Pad every sentence with two ('<s>', '<s>') tokens at the start and one
    # ('</s>', '</s>') token at the end.
    adjusted_sents = []
    for sentence in train_sents:
        adjusted_sentence = [('<s>', '<s>'), ('<s>', '<s>')]
        for token in sentence:
            adjusted_sentence.append(token)
        adjusted_sentence.append(('</s>', '</s>'))
        adjusted_sents.append(adjusted_sentence)

    # total_tokens: every token except the two start paddings (includes the closing '</s>')
    for sentence in adjusted_sents:
        total_tokens += (len(sentence) - 2)

    # q_uni_counts
    for sentence in adjusted_sents:
        for token in sentence:
            key = token[1]
            increment_count(q_uni_counts, key)

    # q_bi_counts
    for sentence in adjusted_sents:
        for i in range(1, len(sentence)):
            key = (sentence[i - 1][1], sentence[i][1])
            increment_count(q_bi_counts, key)

    # q_tri_counts
    for sentence in adjusted_sents:
        for i in range(2, len(sentence)):
            key = (sentence[i - 2][1], sentence[i - 1][1], sentence[i][1])
            increment_count(q_tri_counts, key)

    # possible tags seen for each word in training
    possible_tags = {}
    for sentence in train_sents:
        for token in sentence:
            if token[0] in possible_tags:
                possible_tags[token[0]].add(token[1])
            else:
                possible_tags[token[0]] = {token[1]}

    return total_tokens, q_tri_counts, q_bi_counts, q_uni_counts, e_word_tag_counts, e_tag_counts, \
        most_common_tag, possible_tags
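
# A small smoke test for hmm_preprocess on a toy two-sentence corpus. The
# corpus and expected values below are illustrative, not from the real data,
# and assume the increment_count sketch above.
def _demo_hmm_preprocess():
    train_sents = [
        [("the", "DT"), ("dog", "NN"), ("barks", "VB")],
        [("the", "DT"), ("cat", "NN")],
    ]
    (total_tokens, q_tri, q_bi, q_uni, e_word_tag, e_tag,
     most_common, possible) = hmm_preprocess(train_sents)
    assert total_tokens == 7                      # 5 real tokens + one '</s>' per sentence
    assert q_uni["DT"] == 2
    assert q_bi[("DT", "NN")] == 2
    assert q_tri[("<s>", "<s>", "DT")] == 2
    assert most_common["the"] == ("DT", 2)
    assert possible["the"] == {"DT"}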
def add_translation_to_file(prev_signs, signs_vocab, prev_transcription, transcription_vocab, prev_tr,
                            translation_lengths, long_trs, very_long_trs, translation_vocab, prev_text,
                            prev_start_line, prev_end_line, signs_file, transcription_file, translation_file,
                            could_divide_by_three_dots, could_not_divide, metadata=False, divide_by_three_dots=True):
    """
    Add a translation with corresponding signs and transliterations to files
    :param prev_signs: previous signs written to file
    :param signs_vocab: vocabulary of all the signs
    :param prev_transcription: previous transliterations written to file
    :param transcription_vocab: vocabulary of all the transliterations
    :param prev_tr: previous translation written to file
    :param translation_lengths: list of all translation lengths
    :param long_trs: counter for long translations
    :param very_long_trs: counter for very long translations
    :param translation_vocab: vocabulary of all the words in different translations
    :param prev_text: previous text written to file
    :param prev_start_line: previous start line written to file
    :param prev_end_line: previous end line written to file
    :param signs_file: file of all signs, being built as input for translation algorithms
    :param transcription_file: file of all transliterations, being built as input for translation algorithms
    :param translation_file: file of all translations, being built as input for translation algorithms
    :param could_divide_by_three_dots: counter for translations possible to divide based on three dots
    :param could_not_divide: counter for translations not possible to divide based on three dots
    :param metadata: whether to add the id of each sample to the files
    :param divide_by_three_dots: whether to try splitting each sample into parts based on three dots
    :return: some of the parameters to the function, after update
    """
    signs = ""
    transcription = ""
    for sign in prev_signs:
        signs += sign
        increment_count(signs_vocab, sign)
    for t, delim in prev_transcription:
        transcription += t + delim
        increment_count(transcription_vocab, t)

    signs = clean_signs_transcriptions(signs, True)
    transcription = clean_signs_transcriptions(transcription, False)

    real_key = [prev_text + "." + str(prev_start_line), prev_text + "." + str(prev_end_line)]

    splitted_signs = [s for s in signs.split("...") if s != "" and s != " "]
    splitted_transcription = [t for t in transcription.split("... ") if t != "" and t != " "]
    splitted_translation = [tr for tr in prev_tr.split("... \n") if tr != "" and tr != " "]

    # Write to files
    if len(splitted_signs) == len(splitted_transcription) == len(splitted_translation) \
            and divide_by_three_dots:
        could_divide_by_three_dots += 1
        for i in range(len(splitted_signs)):
            if metadata:
                signs_file.write(str(real_key) + "[" + str(i + 1) + "]: " + splitted_signs[i] + "\n")
                transcription_file.write(str(real_key) + "[" + str(i + 1) + "]: " + splitted_transcription[i] + "\n")
                translation_file.write(str(real_key) + "[" + str(i + 1) + "]: " + splitted_translation[i] + "\n")
            else:
                signs_file.write(splitted_signs[i] + "\n")
                transcription_file.write(splitted_transcription[i] + "\n")
                translation_file.write(splitted_translation[i] + "\n")
            translation_lengths, long_trs, very_long_trs, translation_vocab = \
                compute_translation_statistics(splitted_translation[i], translation_lengths, long_trs,
                                               very_long_trs, translation_vocab)
    else:
        could_not_divide += 1
        if metadata:
            signs_file.write(str(real_key) + ": " + signs + "\n")
            transcription_file.write(str(real_key) + ": " + transcription + "\n")
            translation_file.write(str(real_key) + ": " + prev_tr + "\n")
        else:
            signs_file.write(signs + "\n")
            transcription_file.write(transcription + "\n")
            translation_file.write(prev_tr + "\n")
        translation_lengths, long_trs, very_long_trs, translation_vocab = \
            compute_translation_statistics(prev_tr, translation_lengths, long_trs, very_long_trs,
                                           translation_vocab)

    return signs_vocab, transcription_vocab, translation_lengths, long_trs, very_long_trs, translation_vocab, \
        could_divide_by_three_dots, could_not_divide
def build_extra_decoding_arguments(train_sents):
    """
    Builds arguments for HMM, MEMM and BiLSTM decoding (unigram, bigram, trigram counts, etc.)
    :param train_sents: all sentences from the training set
    :return: all extra arguments which your decoding procedures require
    """
    extra_decoding_arguments = {}
    START_WORD, STOP_WORD = '<st>', '</s>'
    START_TAG, STOP_TAG = '*', 'STOP'
    e_word_tag_counts, e_tag_counts = {}, {}

    # possible tags seen for each word in training
    possible_tags = {}
    for sentence in train_sents:
        for token in sentence:
            if token[0] in possible_tags:
                possible_tags[token[0]].add(token[1])
            else:
                possible_tags[token[0]] = {token[1]}
    extra_decoding_arguments['possible_tags'] = possible_tags

    # Emission counts, needed below to pick each word's most common tag.
    # (In the original, e_word_tag_counts was never populated, so the loop
    # below iterated over an empty dict.)
    for sentence in train_sents:
        for token in sentence:
            increment_count(e_word_tag_counts, token)
            increment_count(e_tag_counts, token[1])

    # Map each word to its most frequent tag, used as a fast fallback during decoding.
    global most_common_tag
    most_common_tag = {}
    for word, tag in e_word_tag_counts:
        if word not in most_common_tag:
            most_common_tag[word] = (tag, e_word_tag_counts[word, tag])
        elif e_word_tag_counts[word, tag] > most_common_tag[word][1]:
            most_common_tag[word] = (tag, e_word_tag_counts[word, tag])
    # Fallback for unseen words, mirroring hmm_preprocess.
    most_common_tag["default"] = max(e_tag_counts, key=e_tag_counts.get)

    adjusted_sents = []
    for sentence in train_sents:
        adjusted_sentence = [(START_WORD, START_TAG), (START_WORD, START_TAG)]
        for token in sentence:
            adjusted_sentence.append(token)
        adjusted_sentence.append((STOP_WORD, STOP_TAG))
        adjusted_sents.append(adjusted_sentence)

    q_tri_counts, q_bi_counts, q_uni_counts = {}, {}, {}

    # q_uni_counts
    for sentence in adjusted_sents:
        for token in sentence:
            key = token[1]
            increment_count(q_uni_counts, key)
    S = set(q_uni_counts.keys())

    # q_bi_counts
    for sentence in adjusted_sents:
        for i in range(1, len(sentence)):
            key = (sentence[i - 1][1], sentence[i][1])
            increment_count(q_bi_counts, key)

    # q_tri_counts
    for sentence in adjusted_sents:
        for i in range(2, len(sentence)):
            key = (sentence[i - 2][1], sentence[i - 1][1], sentence[i][1])
            increment_count(q_tri_counts, key)

    extra_decoding_arguments['S'] = S
    # Keep the n-gram counts available to the decoders as well, instead of
    # discarding them after they are computed.
    extra_decoding_arguments['q_uni_counts'] = q_uni_counts
    extra_decoding_arguments['q_bi_counts'] = q_bi_counts
    extra_decoding_arguments['q_tri_counts'] = q_tri_counts
    extra_decoding_arguments['most_common_tag'] = most_common_tag

    cache_probability = {}
    extra_decoding_arguments['cache'] = cache_probability
    return extra_decoding_arguments
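
# Usage sketch for build_extra_decoding_arguments on the same toy corpus as
# the hmm_preprocess demo; expectations assume the corrected version above.
def _demo_build_extra_decoding_arguments():
    train_sents = [
        [("the", "DT"), ("dog", "NN"), ("barks", "VB")],
        [("the", "DT"), ("cat", "NN")],
    ]
    args = build_extra_decoding_arguments(train_sents)
    assert args['possible_tags']["dog"] == {"NN"}
    assert '*' in args['S'] and 'STOP' in args['S']
    assert args['q_bi_counts'][("DT", "NN")] == 2   # relies on the counts being stored
    assert args['cache'] == {}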