# NOTE(review): this module-level __init__ is a stray duplicate of
# NgramClassifier.__init__ below — identical body, but defined outside any
# class, so it is never used as a constructor. Looks like a paste accident;
# it should probably be deleted once confirmed.
def __init__(self, root_dir, input_text, n=4):
    # Delegate shared setup (presumably stores input_text) to the superclass.
    Classifier.__init__(self, input_text)
    self.root_dir = root_dir
    self.language_ratios = {}   # language -> out-of-place distance (filled by predict_language)
    self.n = n                  # character n-gram size
    self.languages = languages  # module-level language list — TODO confirm source
    self.tokenizer = RegexpTokenizer("[a-zA-Z'`]+")
    self.train = TrainingData(languages=self.languages, config_dir="/Users/spiridoulaoregan/Documents/oracle/python/library/configs", root_dir=root_dir)
    self._train_data()
    # NOTE(review): this clobbers whatever Classifier.__init__ did with
    # input_text — predict_language() would always tokenize "" — looks like a bug.
    self.input_text = ""
    # One empty frequency dict per language.
    self.frequencies = dict(zip([lang for lang in languages], [{} for x in languages]))
    self._analyze_data()
class NgramClassifier(Classifier):
    '''
    NgramClassifier

    Character-N-gram language classifier: builds a per-language N-gram
    frequency profile from training data, then ranks languages against the
    input text with the Cavnar & Trenkle "out-of-place" rank distance.
    Inherits from Classifier.
    reference: http://blog.alejandronolla.com
    '''

    def __init__(self, root_dir, input_text, n=4,
                 config_dir="/Users/spiridoulaoregan/Documents/oracle/python/library/configs"):
        """
        Build per-language N-gram profiles from the training corpus.

        :param root_dir: root directory handed to TrainingData
        :param input_text: text to classify (forwarded to Classifier)
        :param n: character n-gram size (default 4)
        :param config_dir: TrainingData config directory. Parameterized so the
            previously hard-coded user-specific path is only a default.
        """
        Classifier.__init__(self, input_text)
        self.root_dir = root_dir
        self.language_ratios = {}   # language -> out-of-place distance
        self.n = n
        self.languages = languages  # module-level language list — TODO confirm source
        self.tokenizer = RegexpTokenizer("[a-zA-Z'`]+")
        self.train = TrainingData(languages=self.languages,
                                  config_dir=config_dir,
                                  root_dir=root_dir)
        self._train_data()
        # BUG FIX: the original reset self.input_text to "" here, so
        # predict_language() always tokenized an empty string. Keep the text
        # the caller passed in (Classifier.__init__ presumably stored it too).
        self.input_text = input_text
        # One empty frequency dict per language.
        self.frequencies = dict((lang, {}) for lang in languages)
        self._analyze_data()

    def _train_data(self):
        # Load the training corpus and expose it as self.training_data.
        self.train.build_training_set()
        self.training_data = self.train.data

    def _analyze_data(self):
        # Count character n-gram occurrences per language over the training
        # wordlists; results land in self.frequencies[language][ngram].
        for language in self.frequencies:
            wordlist = self.train.data[language]['wordlist']
            generated_ngrams = ngrams(" ".join(wordlist), self.n,
                                      pad_left=True, pad_right=True,
                                      pad_symbol=' ')
            counts = self.frequencies[language]
            for tpl in generated_ngrams:
                ngram = "".join(e.lower() for e in tpl).strip()
                counts[ngram] = counts.get(ngram, 0) + 1

    def predict_language(self):
        """
        Guess the input text's language by computing its N-grams and
        comparing them against each language's training profile.

        "Find Minimum Distance" takes the distance measures from all of the
        category profiles to the document profile, and picks the smallest
        one.

        :return: list of (language, distance) pairs sorted ascending by
            distance — best match first.
        """
        tokens = self.tokenizer.tokenize(self.input_text)
        # BUG FIX: use self.n, not a hard-coded 4, so the constructor's n
        # parameter is honored.  Also materialize the n-grams into a list:
        # nltk's ngrams() returns a generator, which the original exhausted on
        # the first language of the loop below, leaving distance 0 for all
        # subsequent languages.
        generated_ngrams = list(ngrams(
            " ".join(["".join(e.lower() for e in tpl).strip() for tpl in tokens]),
            self.n, pad_left=True, pad_right=True, pad_symbol=' '))
        # Compare the document profile with each language's profile.
        for language in self.languages:
            distance = self.compare_ngram_distances(generated_ngrams,
                                                    self.frequencies[language])
            self.language_ratios[language] = distance
        # BUG FIX: .iteritems() is Python-2-only; .items() behaves the same
        # here and also works on Python 3.
        best_match = sorted(self.language_ratios.items(),
                            key=operator.itemgetter(1))
        return best_match

    def compare_ngram_distances(self, input_profile, training_profile):
        '''
        Measure how far out of place an N-gram in one profile is from its
        place in the other profile (Cavnar & Trenkle rank distance).

        :param input_profile: iterable of character n-gram tuples for the
            document (as produced by nltk ngrams), or of ready-made strings.
        :param training_profile: dict mapping ngram string -> frequency for
            one language.
        :return: total out-of-place distance (int); lower is better.

        BUG FIX: the original iterated the training dict and the n-gram
        tuples directly and took element [0] of each — i.e. the first
        *character* of every key — which never compared whole n-grams. The
        reference algorithm ranks the category's n-grams by descending
        frequency and compares rank positions; n-grams missing from the
        category profile get the maximum out-of-place penalty.
        '''
        # Category n-grams ranked by descending frequency; rank lookup is a
        # dict so each probe is O(1) instead of list.index()'s O(n).
        ranked = sorted(training_profile.items(),
                        key=operator.itemgetter(1), reverse=True)
        category_rank = dict((ng, pos) for pos, (ng, _count) in enumerate(ranked))
        # Normalize the document side to plain n-gram strings.
        document_ngrams = ["".join(tpl).strip() if isinstance(tpl, tuple) else tpl
                           for tpl in input_profile]
        max_out_of_place = len(document_ngrams)
        document_distance = 0
        for document_index, ngram in enumerate(document_ngrams):
            category_index = category_rank.get(ngram, max_out_of_place)
            document_distance += abs(category_index - document_index)
        return document_distance

    def classify(self):
        # Intentionally a no-op: interface hook from Classifier — TODO confirm
        # whether callers expect predict_language() instead.
        pass