def raw_words(self, length=100):
    """Generate a list of ``length`` words from a bigram NgramModel.

    The model is built lazily on first call and cached on the instance
    as ``_ngram_model``. Generation is seeded with one random word from
    ``self.words``; the seed itself is dropped from the returned list.
    """
    if not hasattr(self, '_ngram_model'):
        # Lidstone smoothing with gamma=0.2, matching the model
        # construction used elsewhere in this class.
        smoothing = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        self._ngram_model = NgramModel(2, self.model, estimator=smoothing)
    seed = [random.choice(self.words)]
    generated = self._ngram_model.generate(length, seed)
    return generated[1:]
def language_ngrams_tags(n, training):
    """Train one tag-level NgramModel per language.

    Args:
        n: order of the ngram model.
        training: iterable of ``(comment, language)`` pairs, where each
            comment is a list of statements and each statement is a list
            of ``(word, tag)`` tuples.

    Returns:
        Dict mapping each language in ``LANGUAGES`` to an ``NgramModel``
        of order ``n`` trained on that language's POS-tag sequences,
        using the module-level ``_estimator`` for smoothing.
    """
    # Accumulate the flat tag sequence observed for each language.
    tags_by_language = {lang: [] for lang in LANGUAGES}
    for comment, lang in training:
        for statement in comment:
            for _word, tag in statement:
                tags_by_language[lang].append(tag)
    return {
        lang: NgramModel(n, tags, _estimator)
        for lang, tags in tags_by_language.items()
    }
def __init__(self, dataset, capitalize=False):
    """Build a bigram word model from a newline-separated tweet dump.

    Tweets containing "@" anywhere or starting with "RT" are skipped
    entirely. From the remaining tweets, words whose first character is
    one of @ # : ( ) 2, or that contain "http://" or "https://", are
    filtered out. The surviving words seed ``self.words``,
    ``self.model`` (an ``nltk.Text``) and a cached bigram
    ``NgramModel`` with Lidstone smoothing (gamma=0.2).
    """
    self.capitalize = capitalize
    bad_first_chars = ("@", "#", ":", "(", ")", "2")
    kept = []
    for tweet in dataset.split("\n"):
        # Drop retweets and any tweet that mentions another user.
        if "@" in tweet or tweet.startswith("RT"):
            continue
        for word in tweet.split():
            if word[0] in bad_first_chars:
                continue
            if "http://" in word or "https://" in word:
                continue
            kept.append(word)
    self.words = kept
    self.model = nltk.Text(kept)
    smoothing = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    self._ngram_model = NgramModel(2, self.model, estimator=smoothing)