Exemple #1
0
 def raw_words(self, length=100):
     """Generate a list of words from a lazily built bigram NgramModel.

     The model is built from ``self.model`` the first time it is needed
     and cached on the instance as ``_ngram_model``; later calls reuse it.
     Generation is seeded with a single word chosen at random from
     ``self.words``, and the first element of the generated sequence
     (presumably the seed word itself -- confirm against the model's
     ``generate``) is dropped from the result.

     :param length: forwarded unchanged to the model's ``generate``.
     :return: the generated sequence without its first element.
     """
     try:
         bigrams = self._ngram_model
     except AttributeError:
         # No cached model on this instance yet: build one now. The
         # estimator ignores ``bins`` and always uses
         # LidstoneProbDist(fdist, 0.2).
         def lidstone(fdist, bins):
             return LidstoneProbDist(fdist, 0.2)

         bigrams = self._ngram_model = NgramModel(
             2, self.model, estimator=lidstone)

     seed = [random.choice(self.words)]
     generated = bigrams.generate(length, seed)
     return generated[1:]
Exemple #2
0
def language_ngrams_tags(n, training):
    """Build one tag-level n-gram model per language in ``LANGUAGES``.

    ``training`` is iterated as ``(comment, language)`` pairs. Each comment
    is iterated as statements, and each statement as ``(word, tag)`` pairs;
    only the tags are kept. All tags of all comments in the same language
    are concatenated, in order, into a single training sequence.

    :param n: order of the n-gram models, passed to ``NgramModel``.
    :param training: iterable of ``(comment, language)`` pairs.
    :return: dict mapping every language in ``LANGUAGES`` to an
        ``NgramModel`` built from that language's tag sequence with
        ``_estimator`` as third positional argument.
    :raises KeyError: if a training pair names a language that is not in
        ``LANGUAGES``.
    """
    tags_by_language = {language: [] for language in LANGUAGES}

    for comment, language in training:
        # Collect the whole comment first, then append it to its language.
        comment_tags = []
        for statement in comment:
            comment_tags.extend(tag for _word, tag in statement)
        tags_by_language[language] += comment_tags

    return {
        language: NgramModel(n, tags_by_language[language], _estimator)
        for language in LANGUAGES
    }
Exemple #3
0
 def __init__(self, dataset, capitalize=False):
     """Build the word list and bigram model from newline-separated tweets.

     ``dataset`` is split on ``"\\n"``; each resulting line is treated as
     one tweet. Tweets that contain ``"@"`` anywhere or start with ``"RT"``
     are skipped entirely. From the remaining tweets, whitespace-separated
     words are kept unless they start with one of ``@ # : ( ) 2`` or
     contain ``"http://"`` or ``"https://"``.

     :param dataset: a single string holding one tweet per line.
     :param capitalize: stored as ``self.capitalize``; not otherwise used
         here.
     """
     self.capitalize = capitalize

     skipped_prefixes = ("@", "#", ":", "(", ")", "2")
     kept = []
     for line in dataset.split("\n"):
         if line.startswith("RT") or "@" in line:
             continue
         for token in line.split():
             # split() never yields empty strings, so a prefix test is
             # equivalent to inspecting the first character.
             if token.startswith(skipped_prefixes):
                 continue
             if "http://" in token or "https://" in token:
                 continue
             kept.append(token)

     self.words = kept
     self.model = nltk.Text(kept)

     # The estimator ignores ``bins`` and always uses
     # LidstoneProbDist(fdist, 0.2).
     def lidstone(fdist, bins):
         return LidstoneProbDist(fdist, 0.2)

     self._ngram_model = NgramModel(2, self.model, estimator=lidstone)