Python Tokenizer Examples, nlp_tools.Tokenizer Python Examples

Example #1

0

Show file

File: Translator.py Project: vvperks/124

 def __init__(self,
              bigram_count_file_name,
              bigram_model_pickle='bigram_model.pickle',
              unigram_count_file_name='windows1.txt'):
     """Load the dictionary, the tokenizer and both language models.

     Args:
         bigram_count_file_name: path of the file opened and handed to
             ``LaplaceBigramModel``.
         bigram_model_pickle: accepted for backward compatibility; this
             constructor does not use it.
         unigram_count_file_name: path of the file opened and handed to
             ``LaplaceUnigramModel``. Defaults to the previously
             hard-coded ``'windows1.txt'``.
     """
     self.dictionary = ourdict.dictionary
     self.tokenizer = Tokenizer()
     # Each model is built from its open file handle; the file is closed
     # as soon as the model constructor returns.
     with open(bigram_count_file_name, 'r') as f:
         self.bigram_model = LaplaceBigramModel(f)
     with open(unigram_count_file_name, 'r') as f:
         self.unigram_model = LaplaceUnigramModel(f)

Example #2

0

Show file

File: Translator.py Project: eperkins1/MTranslation

	def __init__(self, bigram_count_file_name, bigram_model_pickle='bigram_model.pickle',
			unigram_count_file_name='windows1.txt'):
		"""Load the dictionary, the tokenizer and both language models.

		Args:
			bigram_count_file_name: path of the file opened and handed to
				``LaplaceBigramModel``.
			bigram_model_pickle: accepted for backward compatibility; this
				constructor does not use it.
			unigram_count_file_name: path of the file opened and handed to
				``LaplaceUnigramModel``. Defaults to the previously
				hard-coded ``'windows1.txt'``.
		"""
		self.dictionary = ourdict.dictionary
		self.tokenizer = Tokenizer()
		# Each model is built from its open file handle; the file is closed
		# as soon as the model constructor returns.
		with open(bigram_count_file_name, 'r') as f:
			self.bigram_model = LaplaceBigramModel(f)
		with open(unigram_count_file_name, 'r') as f:
			self.unigram_model = LaplaceUnigramModel(f)

Example #3

0

Show file

File: Translator.py Project: vvperks/124

class Translator:
    """Word-by-word sentence translator scored by bigram and unigram models.

    Every token is looked up in ``ourdict.dictionary`` and the candidate with
    the highest combined bigram + unigram score is kept. Small rule-based
    steps run before and after that loop. Judging by ``remove_se`` and
    ``parse.parse_english`` the rules target Spanish input and English
    output (confirm against the dictionary contents).
    """

    def __init__(self,
                 bigram_count_file_name,
                 bigram_model_pickle='bigram_model.pickle',
                 unigram_count_file_name='windows1.txt'):
        """Load the dictionary, the tokenizer and both language models.

        Args:
            bigram_count_file_name: path of the file opened and handed to
                ``LaplaceBigramModel``.
            bigram_model_pickle: accepted for backward compatibility; this
                constructor does not use it.
            unigram_count_file_name: path of the file opened and handed to
                ``LaplaceUnigramModel``. Defaults to the previously
                hard-coded ``'windows1.txt'``.
        """
        self.dictionary = ourdict.dictionary
        self.tokenizer = Tokenizer()
        with open(bigram_count_file_name, 'r') as f:
            self.bigram_model = LaplaceBigramModel(f)
        with open(unigram_count_file_name, 'r') as f:
            self.unigram_model = LaplaceUnigramModel(f)

    def translate_sentence(self, sentence):
        """Translate ``sentence`` and return the formatted result string.

        Pipeline: tokenize, apply token-level pre-processing
        (``remove_se``), translate token by token, format, then apply
        sentence-level post-processing (``reverse_noun_adj``).
        """
        # PRE-processing: rules that work on the raw sentence belong before
        # tokenizing; rules that work on the token list belong right after.
        tokens = self.tokenizer.tokenize(sentence)
        tokens = self.remove_se(tokens)

        translated_tokens = ['^']  # Arbitrary start token
        for i, raw_token in enumerate(tokens):
            token = raw_token.lower()
            if token == "para":
                # "para" can be the final token; an empty follower makes
                # para_process fall back to 'for' instead of raising
                # IndexError.
                next_word = tokens[i + 1] if i + 1 < len(tokens) else ''
                translated_tokens.append(
                    self.para_process(token, next_word))
            else:
                translated_tokens.append(
                    self.find_next_word(token, translated_tokens))

        # POST-processing: token-list rules go before format(), sentence
        # rules after it.
        translation = self.format(translated_tokens)
        # reverse_noun_adj receives the sentence wrapped in a list.
        translation = self.reverse_noun_adj([translation])
        return translation

    def para_process(self, para, next_word):
        """Pick the translation of "para" from the word that follows it.

        Returns 'to' when ``next_word`` ends in 'ar', 'er' or 'ir', and
        'for' otherwise (including when ``next_word`` is shorter than two
        characters). ``para`` itself is not inspected.
        """
        if len(next_word) > 1:
            suffix = next_word[-2:]
            # Single-argument parenthesized print emits the same text on
            # Python 2 and Python 3.
            print("suffix: %s" % suffix)
            if suffix in ('ar', 'er', 'ir'):
                return 'to'
        return 'for'

    def find_next_word(self, word, current_translation):
        """Return the best-scoring dictionary candidate for ``word``.

        Args:
            word: token to look up in ``self.dictionary``.
            current_translation: tokens produced so far; its last entry (or
                the one before it, when the last is ',' or '.') is the
                bigram context.

        Returns:
            The candidate maximising bigram score + unigram score. When
            ``word`` has no dictionary entry, or the entry is empty,
            ``word`` is returned unchanged instead of raising.
        """
        try:
            candidate_words = self.dictionary[word]
        except KeyError:
            candidate_words = None
        if not candidate_words:
            # Unknown word: pass it through untranslated.
            return word

        prev_word = current_translation[-1]
        if prev_word in (',', '.'):
            # If the previous token is punctuation, get what's before it
            prev_word = current_translation[-2]

        best = None
        top_score = float("-inf")
        for candidate in candidate_words:
            score = (self.bigram_model.score([prev_word, candidate]) +
                     self.unigram_model.score([candidate]))
            # `best is None` guarantees a result even if no score beats -inf.
            if best is None or score > top_score:
                best = candidate
                top_score = score
        return best

    def format(self, token_list):
        """Join translated tokens into a capitalized, printable sentence.

        The first element of ``token_list`` (the start token) is dropped,
        the rest are joined with single spaces, the space before ',' or '.'
        is removed, and the first character is upper-cased. Returns '' when
        nothing follows the start token.
        """
        s = " ".join(token_list[1:])
        s = re.sub(r' ([\.,])', r'\1', s)  # Remove whitespace before punctuation
        if not s:
            # Nothing to capitalize; avoids IndexError on s[0].
            return s
        return s[0].upper() + s[1:]

    ###########################################################
    # ADD YOUR PREPROCESSING + POSTPROCESSING FUNCTIONS HERE. #
    ###########################################################

    def reverse_noun_adj(self, s):
        """Swap each noun that is directly followed by an adjective.

        ``s`` is passed straight to ``parse.parse_english`` and only the
        first parse result is used. A word tagged NNP/NN/NNS that is
        immediately followed by a word tagged JJ trades places with it.
        Tags are not updated after a swap. The words are then re-formatted
        with ``format`` and returned as a string.
        """
        noun_tags = set(['NNP', 'NN', 'NNS'])
        adj_tags = set(['JJ'])
        parsed = parse.parse_english(s)[0]
        words = parsed.words
        for i in range(len(words) - 1):
            if parsed.tags[i] in noun_tags and parsed.tags[i + 1] in adj_tags:
                noun, adjective = words[i], words[i + 1]
                words[i], words[i + 1] = adjective, noun
                print(">>>> SWITCHED %s and %s" % (noun, adjective))
        # format() discards the first element, so prepend a start token.
        return self.format(['^'] + words)

    def remove_se(self, spanish_tokens):
        """Return a copy of the token list with every "se" replaced by "usted".

        The comparison is case-sensitive and runs before tokens are
        lower-cased, so "Se" is left untouched.
        """
        return ["usted" if t == "se" else t for t in spanish_tokens]

Example #4

0

Show file

File: Translator.py Project: eperkins1/MTranslation

class Translator:
	"""Word-by-word sentence translator scored by bigram and unigram models.

	Every token is looked up in ``ourdict.dictionary`` and the candidate with
	the highest combined bigram + unigram score is kept. Judging by
	``remove_se`` and ``parse.parse_english`` the rules target Spanish input
	and English output (confirm against the dictionary contents).
	"""

	def __init__(self, bigram_count_file_name, bigram_model_pickle='bigram_model.pickle',
			unigram_count_file_name='windows1.txt'):
		"""Load the dictionary, the tokenizer and both language models.

		Args:
			bigram_count_file_name: path of the file opened and handed to
				``LaplaceBigramModel``.
			bigram_model_pickle: accepted for backward compatibility; this
				constructor does not use it.
			unigram_count_file_name: path of the file opened and handed to
				``LaplaceUnigramModel``. Defaults to the previously
				hard-coded ``'windows1.txt'``.
		"""
		self.dictionary = ourdict.dictionary
		self.tokenizer = Tokenizer()
		with open(bigram_count_file_name, 'r') as f:
			self.bigram_model = LaplaceBigramModel(f)
		with open(unigram_count_file_name, 'r') as f:
			self.unigram_model = LaplaceUnigramModel(f)

	def translate_sentence(self, sentence):
		"""Translate ``sentence`` and return the formatted result string.

		Pipeline: tokenize, replace "se" via ``remove_se``, translate token
		by token, then ``format`` the result.
		"""
		# PRE-processing
		tokens = self.tokenizer.tokenize(sentence)
		tokens = self.remove_se(tokens)

		# Processing
		translated_tokens = ['^']  # Arbitrary start token
		for i, raw_token in enumerate(tokens):
			token = raw_token.lower()
			if token == "para":
				# "para" can be the final token; an empty follower makes
				# para_process fall back to 'for' instead of raising
				# IndexError.
				next_word = tokens[i + 1] if i + 1 < len(tokens) else ''
				translated_tokens.append(self.para_process(token, next_word))
			else:
				translated_tokens.append(self.find_next_word(token, translated_tokens))

		# POST-processing
		return self.format(translated_tokens)

	def para_process(self, para, next_word):
		"""Pick the translation of "para" from the word that follows it.

		Returns 'to' when ``next_word`` ends in 'ar', 'er' or 'ir', and
		'for' otherwise (including when ``next_word`` is shorter than two
		characters). ``para`` itself is not inspected.
		"""
		if len(next_word) > 1:
			suffix = next_word[-2:]
			# Single-argument parenthesized print emits the same text on
			# Python 2 and Python 3.
			print("suffix: %s" % suffix)
			if suffix in ('ar', 'er', 'ir'):
				return 'to'
		return 'for'

	def find_next_word(self, word, current_translation):
		"""Return the best-scoring dictionary candidate for ``word``.

		Args:
			word: token to look up in ``self.dictionary``.
			current_translation: tokens produced so far; its last entry (or
				the one before it, when the last is ',' or '.') is the
				bigram context.

		Returns:
			The candidate maximising bigram score + unigram score. When
			``word`` has no dictionary entry, or the entry is empty,
			``word`` is returned unchanged instead of raising.
		"""
		try:
			candidate_words = self.dictionary[word]
		except KeyError:
			candidate_words = None
		if not candidate_words:
			# Unknown word: pass it through untranslated.
			return word

		prev_word = current_translation[-1]
		if prev_word in (',', '.'):
			# If the previous token is punctuation, get what's before it
			prev_word = current_translation[-2]

		best = None
		top_score = float("-inf")
		for candidate in candidate_words:
			score = (self.bigram_model.score([prev_word, candidate]) +
				self.unigram_model.score([candidate]))
			# `best is None` guarantees a result even if no score beats -inf.
			if best is None or score > top_score:
				best = candidate
				top_score = score
		return best

	def format(self, token_list):
		"""Join translated tokens into a capitalized, printable sentence.

		The first element of ``token_list`` (the start token) is dropped,
		the rest are joined with single spaces, the space before ',' or '.'
		is removed, and the first character is upper-cased. Returns '' when
		nothing follows the start token.
		"""
		s = " ".join(token_list[1:])
		s = re.sub(r' ([\.,])', r'\1', s)  # Remove whitespace before punctuation
		if not s:
			# Nothing to capitalize; avoids IndexError on s[0].
			return s
		return s[0].upper() + s[1:]

	def reverse_noun_adj(self, s):
		"""Swap each noun that is directly followed by an adjective.

		``s`` is passed straight to ``parse.parse_english`` and only the
		first parse result is used. A word tagged NNP/NN/NNS that is
		immediately followed by a word tagged JJ trades places with it.
		Tags are not updated after a swap. The words are then re-formatted
		with ``format`` and returned as a string.
		"""
		noun_tags = set(['NNP', 'NN', 'NNS'])
		adj_tags = set(['JJ'])
		parsed = parse.parse_english(s)[0]
		words = parsed.words
		for i in range(len(words) - 1):
			if parsed.tags[i] in noun_tags and parsed.tags[i + 1] in adj_tags:
				words[i], words[i + 1] = words[i + 1], words[i]
		# format() discards the first element, so prepend a start token.
		return self.format(['^'] + words)

	def remove_se(self, spanish_tokens):
		"""Return a copy of the token list with every "se" replaced by "usted".

		The comparison is case-sensitive and runs before tokens are
		lower-cased, so "Se" is left untouched.
		"""
		return ["usted" if t == "se" else t for t in spanish_tokens]