Exemple #1
0
	def __init__(self, root_dir, input_text, n=4, config_dir=None):
		"""Build the training data and per-language n-gram counts.

		root_dir    -- forwarded to TrainingData as its root_dir.
		input_text  -- text to classify; passed to Classifier.__init__ and
		               kept on self.input_text.
		n           -- n-gram size stored on self.n (default 4).
		config_dir  -- optional configuration directory forwarded to
		               TrainingData; when omitted the historical hard-coded
		               location is used so existing callers keep working.

		Relies on a module-level `languages` name that is defined outside
		this method.
		"""
		Classifier.__init__(self, input_text)
		if config_dir is None:
			# NOTE(review): machine-specific absolute path kept only as the
			# backward-compatible default; pass config_dir explicitly.
			config_dir = "/Users/spiridoulaoregan/Documents/oracle/python/library/configs"

		self.root_dir = root_dir
		self.language_ratios = {}
		self.n = n
		self.languages = languages
		self.tokenizer = RegexpTokenizer("[a-zA-Z'`]+")
		self.train = TrainingData(languages=self.languages,
					  config_dir=config_dir,
					  root_dir=root_dir)
		self._train_data()
		# Keep the caller's text. The previous code reset this to "" after the
		# base initialiser ran, so the text handed to the constructor was lost.
		# NOTE(review): assumes Classifier.__init__ does not store a transformed
		# copy under the same attribute name -- confirm against the base class.
		self.input_text = input_text if input_text is not None else ""
		# One independent n-gram -> count dict per language.
		self.frequencies = {lang: {} for lang in self.languages}
		self._analyze_data()
Exemple #2
0
class NgramClassifier(Classifier):
	"""
	NgramClassifier: character n-gram algorithm to linguistically classify text.

	Inherits from the Classifier super class. A frequency profile of character
	n-grams is built for every language's training word list; the input text is
	profiled the same way and compared with each language profile using the
	"out-of-place" rank distance.

	reference: http://blog.alejandronolla.com
	"""

	def __init__(self, root_dir, input_text, n=4, config_dir=None):
		"""Build the training data and per-language n-gram counts.

		root_dir    -- forwarded to TrainingData as its root_dir.
		input_text  -- text to classify; passed to Classifier.__init__ and
		               kept on self.input_text.
		n           -- n-gram size used for training and prediction (default 4).
		config_dir  -- optional configuration directory forwarded to
		               TrainingData; when omitted the historical hard-coded
		               location is used so existing callers keep working.

		Relies on a module-level `languages` name that is defined outside
		this class.
		"""
		Classifier.__init__(self, input_text)
		if config_dir is None:
			# NOTE(review): machine-specific absolute path kept only as the
			# backward-compatible default; pass config_dir explicitly.
			config_dir = "/Users/spiridoulaoregan/Documents/oracle/python/library/configs"

		self.root_dir = root_dir
		self.language_ratios = {}
		self.n = n
		self.languages = languages
		self.tokenizer = RegexpTokenizer("[a-zA-Z'`]+")
		self.train = TrainingData(languages=self.languages,
					  config_dir=config_dir,
					  root_dir=root_dir)
		self._train_data()
		# Keep the caller's text. The previous code reset this to "" after the
		# base initialiser ran, so predict_language always saw an empty string.
		# NOTE(review): assumes Classifier.__init__ does not store a transformed
		# copy under the same attribute name -- confirm against the base class.
		self.input_text = input_text if input_text is not None else ""
		# One independent n-gram -> count dict per language.
		self.frequencies = {lang: {} for lang in self.languages}
		self._analyze_data()

	def _train_data(self):
		"""Build the training set and keep a reference to it on training_data."""
		self.train.build_training_set()
		self.training_data = self.train.data

	def _normalized_ngrams(self, text):
		"""Yield each character n-gram of text, lower-cased and space-stripped.

		Training and prediction both go through this helper so that their
		n-grams have the same size (self.n) and the same representation.
		"""
		for gram in ngrams(text, self.n, pad_left=True, pad_right=True, pad_symbol=' '):
			yield "".join(char.lower() for char in gram).strip()

	def _analyze_data(self):
		"""Count the n-grams of every language's training word list.

		Counts are added to self.frequencies[language], keyed by the
		normalized n-gram string.
		"""
		for language, profile in self.frequencies.items():
			wordlist = self.train.data[language]['wordlist']
			for ngram in self._normalized_ngrams(" ".join(wordlist)):
				profile[ngram] = profile.get(ngram, 0) + 1

	def predict_language(self):
		"""Guess the language of self.input_text from its n-gram profile.

		"Find Minimum Distance": the distance from the document profile to
		every language profile is measured and stored in
		self.language_ratios; the smallest distance is the best match.

		Returns a list of (language, distance) pairs sorted by ascending
		distance, so the first entry is the most likely language.
		"""
		tokens = self.tokenizer.tokenize(self.input_text)
		text = " ".join(token.lower().strip() for token in tokens)

		# Materialise the document profile once: a lazy n-gram iterator would
		# be exhausted after the first language and every later language
		# would be scored against an empty profile.
		document_profile = {}
		for ngram in self._normalized_ngrams(text):
			document_profile[ngram] = document_profile.get(ngram, 0) + 1

		# compare the input text profile with each language profile
		for language in self.languages:
			self.language_ratios[language] = self.compare_ngram_distances(
				document_profile, self.frequencies[language])

		# items() rather than iteritems(): works on Python 2 and Python 3.
		return sorted(self.language_ratios.items(), key=operator.itemgetter(1))

	def _ranked_ngrams(self, profile):
		"""Return the n-grams of profile ordered from most to least frequent.

		A dict is read as n-gram -> count and ranked here (ties broken by the
		n-gram itself so the order is deterministic). Any other iterable is
		taken to be already-ranked (ngram, count) pairs, of which only the
		first element of each entry is used.
		"""
		if isinstance(profile, dict):
			ordered = sorted(profile.items(), key=lambda item: (-item[1], item[0]))
			return [ngram for ngram, _ in ordered]
		return [entry[0] for entry in profile]

	def _first_positions(self, ranked):
		"""Map every n-gram in ranked to the index of its first occurrence."""
		positions = {}
		for index, ngram in enumerate(ranked):
			positions.setdefault(ngram, index)
		return positions

	def compare_ngram_distances(self, input_profile, training_profile):
		"""
		Measure how far out of place an N-gram in one profile is from its
		place in the other profile.

		input_profile / training_profile -- either a dict of n-gram -> count
		or an already-ranked iterable of (ngram, count) pairs. N-grams must
		be hashable.

		Returns the sum, over the document's n-grams, of the absolute rank
		difference between the two profiles. An n-gram missing from the
		training profile is charged the document profile's length.
		"""
		document_ngrams = self._ranked_ngrams(input_profile)
		category_ngrams = self._ranked_ngrams(training_profile)

		# Rank assigned to n-grams the training profile has never seen.
		max_out_order = len(document_ngrams)

		# Dict lookups replace repeated list.index() scans (quadratic before).
		document_rank = self._first_positions(document_ngrams)
		category_rank = self._first_positions(category_ngrams)

		document_distance = 0
		for ngram in document_ngrams:
			category_index = category_rank.get(ngram, max_out_order)
			document_distance += abs(category_index - document_rank[ngram])

		return document_distance

	def classify(self):
		"""Placeholder: currently does nothing and returns None."""
		pass