import re

import ourdict
import parse

# Tokenizer, LaplaceBigramModel, and LaplaceUnigramModel are provided by
# other modules in this project; their import paths are not shown here.


class Translator:

    def __init__(self, bigram_count_file_name, bigram_model_pickle='bigram_model.pickle'):
        self.dictionary = ourdict.dictionary
        self.tokenizer = Tokenizer()
        with open(bigram_count_file_name, 'r') as f:
            self.bigram_model = LaplaceBigramModel(f)
        with open('windows1.txt', 'r') as f:
            self.unigram_model = LaplaceUnigramModel(f)

    def translate_sentence(self, sentence):
        # PRE-processing rules: functions of the sentence (before tokenizing)
        # or of the Spanish token list (after) go here.
        tokens = self.tokenizer.tokenize(sentence)
        tokens = self.remove_se(tokens)

        # Translate token by token, scoring candidates with the language models.
        translated_tokens = ['^']  # arbitrary sentence-start token
        for i in range(len(tokens)):
            token = tokens[i].lower()
            if token == 'para' and i + 1 < len(tokens):
                translated_tokens.append(self.para_process(token, tokens[i + 1]))
            else:
                translated_tokens.append(self.find_next_word(token, translated_tokens))

        # POST-processing rules: functions of the translated token list (before
        # formatting) or of the formatted sentence (after) go here.
        translation = self.format(translated_tokens)
        translation = self.reverse_noun_adj([translation])
        return translation

    def para_process(self, para, next_word):
        '''Translate "para" as "to" before an infinitive (an -ar/-er/-ir verb)
        and as "for" otherwise.'''
        if len(next_word) > 1:
            suffix = next_word[-2:]
            print("suffix: %s" % suffix)
            if suffix in ('ar', 'er', 'ir'):
                return 'to'
        return 'for'

    def find_next_word(self, word, current_translation):
        '''Pick the dictionary candidate for the given word that maximizes the
        combined bigram + unigram language-model score given the previous word.'''
        candidate_words = self.dictionary[word]
        top_score = float('-inf')
        best = None
        prev_word = current_translation[-1]
        if prev_word in (',', '.'):
            # If the previous token is punctuation, condition on what's before it.
            prev_word = current_translation[-2]
        for candidate in candidate_words:
            score = (self.bigram_model.score([prev_word, candidate])
                     + self.unigram_model.score([candidate]))
            if score > top_score:
                best = candidate
                top_score = score
        return best

    def format(self, token_list):
        '''Takes the list of translated words and formats it nicely for printing.'''
        # Drop the leading start token and join into a spaced string.
        s = ' '.join(token_list[1:])
        # Remove whitespace before punctuation.
        s = re.sub(r' ([\.,])', r'\1', s)
        # Capitalize the sentence.
        if s:
            s = s[0].upper() + s[1:]
        return s

    def reverse_noun_adj(self, s):
        '''POST-processing rule: swap adjacent noun-adjective pairs so the
        adjective precedes the noun, matching English word order.'''
        noun_tags = set(['NNP', 'NN', 'NNS'])
        adj_tags = set(['JJ'])
        parsed = parse.parse_english(s)[0]
        words = parsed.words
        for i in range(len(words) - 1):
            if parsed.tags[i] in noun_tags and parsed.tags[i + 1] in adj_tags:
                words[i], words[i + 1] = words[i + 1], words[i]
                print(">>>> SWITCHED %s and %s" % (words[i + 1], words[i]))
        words = ['^'] + words  # re-add the start token so format() works
        return self.format(words)

    def remove_se(self, spanish_tokens):
        '''PRE-processing rule: translate the reflexive "se" as "usted".'''
        return ['usted' if t == 'se' else t for t in spanish_tokens]
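# The language models above are constructed from an open corpus file and
# expose score(tokens) -> log-probability. Below is a minimal, hypothetical
# sketch of add-one (Laplace) smoothed unigram and bigram models matching
# that assumed interface; the project's real LaplaceUnigramModel and
# LaplaceBigramModel may differ.
import math
from collections import defaultdict


class SketchLaplaceUnigramModel(object):

    def __init__(self, corpus_file):
        self.counts = defaultdict(int)
        self.total = 0
        for line in corpus_file:
            for token in line.lower().split():
                self.counts[token] += 1
                self.total += 1

    def score(self, tokens):
        # log P(w) with add-one smoothing: (count(w) + 1) / (total + |V|).
        v = len(self.counts)
        return sum(math.log(self.counts.get(t, 0) + 1) - math.log(self.total + v)
                   for t in tokens)


class SketchLaplaceBigramModel(object):

    def __init__(self, corpus_file):
        self.bigram_counts = defaultdict(int)
        self.context_counts = defaultdict(int)
        self.vocab = set()
        for line in corpus_file:
            tokens = ['^'] + line.lower().split()
            for a, b in zip(tokens, tokens[1:]):
                self.bigram_counts[(a, b)] += 1
                self.context_counts[a] += 1
                self.vocab.update((a, b))

    def score(self, pair):
        # log P(w2 | w1) with add-one smoothing:
        # (count(w1, w2) + 1) / (count(w1) + |V|).
        a, b = pair
        return (math.log(self.bigram_counts.get((a, b), 0) + 1)
                - math.log(self.context_counts.get(a, 0) + len(self.vocab)))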
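# Example usage, assuming a bigram-count corpus file at the hypothetical path
# below; the unigram corpus 'windows1.txt' is hard-coded in __init__ and must
# exist in the working directory, and every input word must appear in
# ourdict.dictionary.
if __name__ == '__main__':
    translator = Translator('bigram_counts.txt')  # hypothetical file name
    print(translator.translate_sentence('El gato negro duerme.'))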