def detect(self, text):
    ''' Detect the text's language '''
    words = nltk_word_tokenize(text.lower())
    trigrams = {}
    scores = dict([(lang, 0) for lang in self.language_trigrams.keys()])

    for match in words:
        for trigram in self.get_word_trigrams(match):
            if trigram not in trigrams:
                trigrams[trigram] = 0
            trigrams[trigram] += 1

    total = sum(trigrams.values())

    for trigram, count in trigrams.items():
        for lang, frequencies in self.language_trigrams.items():
            # normalize and add to the total score
            scores[lang] += (float(frequencies[trigram]) / float(frequencies.N())) * (float(count) / float(total))

    # special case
    # if all scores are 0.0 we return None
    s = 0.0
    for score in scores.itervalues():
        s += score
    if s == 0.0:
        return None

    return l_map[sorted(scores.items(), key=lambda x: x[1], reverse=True)[0][0]]
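Several of the detect variants in this listing call a get_word_trigrams helper that is not included here. The following is only a minimal sketch of what such a helper could look like, assuming it returns the character trigrams of a word padded with a leading and trailing space; the padding scheme is an assumption, not taken from the original code:

def get_word_trigrams(self, word):
    # Pad with spaces so word boundaries contribute trigrams too,
    # then slide a 3-character window over the padded word.
    padded = ' %s ' % word
    return [padded[i:i + 3] for i in range(len(padded) - 2)]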
def preprocess_text_nltk(self, text):
    sentences = self.sent_tokenize(text)
    sentences_cleaned = []
    for sent in sentences:
        if self.stopwords_remove:
            # replace_keywords returns the cleaned string; keep the result
            sent = self.stopword_remover.replace_keywords(sent)
        words = nltk_word_tokenize(sent, self.language)
        words = [w for w in words if w not in string.punctuation]
        words = [w for w in words if w not in self.extra_stopwords]
        words = [w.lower() for w in words]
        sentences_cleaned.append(" ".join(words))
    return sentences_cleaned
def word_tokenize(sentence, pt_tokenizer=True):
    """Tokenize sentence into words.

    NOTE: The default is the `TreebankWordTokenizer` (pt_tokenizer=True),
    which does not handle contractions such as "can't" well (it splits
    them into "ca" and "n't"). TODO(zcq)
    """
    if pt_tokenizer:
        return nltk_word_tokenize(sentence)
    else:
        return nltk_WP_tokenize(sentence)
def word_tokenize(sentence, tokenizer="nltk"):
    """ Tokenize the input string.

    Args:
        sentence (string): The input string
        tokenizer (string): The tokenizer to use. Default is the nltk word tokenizer

    Returns:
        List[string]: The tokens from the input string
    """
    # compare for equality rather than substring containment
    if tokenizer == "nltk":
        return nltk_word_tokenize(sentence)
    else:
        return re.split(SENTENCE_SPLIT_REGEX, sentence)
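To make the difference between the NLTK tokenizers wrapped above concrete, here is a small usage sketch that calls NLTK's public API directly (the names nltk_WP_tokenize and SENTENCE_SPLIT_REGEX come from the original modules and are not defined here):

from nltk.tokenize import word_tokenize, wordpunct_tokenize

# requires the punkt models: nltk.download('punkt')
print(word_tokenize("I can't go."))       # ['I', 'ca', "n't", 'go', '.']
print(wordpunct_tokenize("I can't go."))  # ['I', 'can', "'", 't', 'go', '.']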
def detect(self, text):
    ''' Detect the text's language '''
    #print "Detect: " + text
    try:
        self.__mutex.acquire()
        if not text:
            raise ValueError(u"Text: " + unicode(text))
        text = unicodedata.normalize("NFC", text)
        words = nltk_word_tokenize(text.lower())
        trigrams = {}
        scores = dict([(lang, 0) for lang in self.language_trigrams.keys()])

        for match in words:
            word_trigrams = self.__get_word_trigrams(match)
            #print "Match: " + match
            #print "trigrams: " + str(word_trigrams)
            for trigram in word_trigrams:
                if trigram not in trigrams:
                    trigrams[trigram] = 0
                trigrams[trigram] += 1

        total = sum(trigrams.values())

        for trigram, count in trigrams.items():
            for lang, frequencies in self.language_trigrams.items():
                # normalize and add to the total score
                try:
                    scores[lang] += (float(frequencies[trigram]) / float(frequencies.N())) * (float(count) / float(total))
                except ZeroDivisionError as e:
                    logger.error(u"Div: " + unicode(float(frequencies.N())) + u" " + unicode(float(total)))
                    raise e

        sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        #print sorted_scores
        #logger.info(u"%s: %s" % (text, unicode(sorted_scores)))
        for lang, score in sorted_scores:
            if score > 0.0001:
                return lang
        return None
    finally:
        self.__mutex.release()
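None of these snippets show how self.language_trigrams is populated. The frequencies.N() calls suggest each value is an nltk.probability.FreqDist keyed by trigram. Below is a hedged sketch of how such tables could be built from NLTK's udhr corpus; the corpus choice, the function name, and the fileid mapping are illustrative assumptions, not the original code:

from nltk.corpus import udhr
from nltk.probability import FreqDist

def build_language_trigrams(samples):
    # samples maps a language code to a udhr fileid, e.g. {'en': 'English-Latin1'}
    # requires the corpus: nltk.download('udhr')
    language_trigrams = {}
    for lang, fileid in samples.items():
        fd = FreqDist()
        for word in udhr.words(fileid):
            padded = ' %s ' % word.lower()
            for i in range(len(padded) - 2):
                fd[padded[i:i + 3]] += 1
        language_trigrams[lang] = fd
    return language_trigrams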
def preprocess_text_nltk(text):
    stopwords_remove = True
    language = 'english'
    sentences = sent_tokenize(text)
    extra_stopwords = ["''", "``", "'s"]
    sentences_cleaned = []

    if stopwords_remove:
        stopword_remover = flashtext.KeywordProcessor()
        for stopword in stopwords.words(language):
            stopword_remover.add_keyword(stopword, '')

    for sent in sentences:
        if stopwords_remove:
            # replace_keywords returns the cleaned string; keep the result
            sent = stopword_remover.replace_keywords(sent)
        words = nltk_word_tokenize(sent, language)
        words = [w for w in words if w not in string.punctuation]
        words = [w for w in words if w not in extra_stopwords]
        words = [w.lower() for w in words]
        sentences_cleaned.append(" ".join(words))
    return sentences_cleaned
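A hedged sketch of how the standalone preprocess_text_nltk above could be wired up and called; the alias nltk_word_tokenize mirrors the naming used throughout these snippets and is an assumption about the original module's imports:

import string
import flashtext
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize as nltk_word_tokenize

# one-time setup: nltk.download('punkt'); nltk.download('stopwords'); pip install flashtext
cleaned = preprocess_text_nltk("The quick brown fox jumps over the lazy dog. It was not amused.")
print(cleaned)  # one lowercased, stopword- and punctuation-stripped string per sentence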
def detect(self, text):
    words = nltk_word_tokenize(text.lower())
    trigrams = {}
    scores = dict([(lang, 0) for lang in self.language_trigrams.keys()])

    trigcount = [(trigram, 1.0) for match in words for trigram in self.get_word_trigrams(match)]
    if len(trigcount) > 0:
        trigdf = pandas.DataFrame(trigcount, columns=["key", "value"])
        trigrams = trigdf.groupby("key")["value"].sum().to_dict()
    else:
        trigrams = {}

    total = sum(trigrams.values())
    maxscore, maxid = 0, ""
    for trigram, count in trigrams.items():
        trishare = (float(count) / float(total))
        for lang, frequencies in filter(lambda (l, f): trigram in f, self.language_dicts.iteritems()):
            scores[lang] += frequencies[trigram] * trishare
            if scores[lang] > maxscore:
                maxid, maxscore = lang, scores[lang]
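The pandas groupby/sum in this variant only serves to count trigram occurrences; the same counts could be produced with the standard library, as in this small hedged sketch (the sample list is made up for illustration):

from collections import Counter

trigram_list = ['the', 'he ', ' th', 'the']   # stand-in for the list comprehension above
trigrams = dict(Counter(trigram_list))        # {'the': 2, 'he ': 1, ' th': 1}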
def mapper(self, key, tweet):
    ''' Detect the text's language '''
    obj = cjson.decode(tweet)
    text = obj['tx']
    words = nltk_word_tokenize(text.lower())
    trigrams = {}
    scores = dict([(lang, 0) for lang in self.language_trigrams.keys()])

    for match in words:
        for trigram in self.get_word_trigrams(match):
            if trigram not in trigrams:
                trigrams[trigram] = 0
            trigrams[trigram] += 1

    total = sum(trigrams.values())

    for trigram, count in trigrams.items():
        for lang, frequencies in self.language_trigrams.items():
            # normalize and add to the total score
            scores[lang] += (float(frequencies[trigram]) / float(frequencies.N())) * (float(count) / float(total))

    obj['lang'] = sorted(scores.items(), key=lambda x: x[1], reverse=True)[0][0]
    yield key, obj
def detect(self, text):
    # tokenize the words
    words = nltk_word_tokenize(text.lower())
    trigrams = {}
    scores = dict([(lang, 0) for lang in self.language_trigrams.keys()])

    # count each trigram of each word
    for match in words:
        for trigram in self.get_word_trigrams(match):
            if trigram not in trigrams:
                trigrams[trigram] = 0
            trigrams[trigram] += 1

    total = sum(trigrams.values())

    # normalize the frequencies and sort the languages by score
    for trigram, count in trigrams.items():
        for lang, frequencies in self.language_trigrams.items():
            # normalize and add to the total score
            scores[lang] += (float(frequencies[trigram]) / float(frequencies.N())) * (float(count) / float(total))

    return sorted(scores.items(), key=lambda x: x[1], reverse=True)[0][0]
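Assuming these detect variants live on a detector class whose constructor loads the per-language trigram tables (the class name LangDetect below is purely illustrative), calling the method would look like:

detector = LangDetect()  # hypothetical wrapper holding language_trigrams
print(detector.detect("the quick brown fox jumps over the lazy dog"))   # e.g. 'english'
print(detector.detect("el rapido zorro marron salta sobre el perro"))   # e.g. 'spanish'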