Example #1
def raw_words(self, length=100):
    """Generates a list of words using an NLTK NgramModel."""
    if not hasattr(self, '_ngram_model'):
        estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        self._ngram_model = NgramModel(2, self.model, estimator=estimator)
    return self._ngram_model.generate(length,
                                      [random.choice(self.words)])[1:]
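This method assumes a class holding self.model (an nltk.Text) and self.words. A minimal, self-contained sketch of that setup might look like the following; it uses the legacy NLTK 2.x API, where NgramModel still existed (it was removed in NLTK 3, and module paths may vary by version), and a toy corpus stands in for real data:

import random
import nltk
from nltk.model import NgramModel            # NLTK 2.x only
from nltk.probability import LidstoneProbDist

corpus = "the cat sat on the mat and the dog sat on the rug".split()
text = nltk.Text(corpus)

# Lidstone smoothing with gamma = 0.2, as in most examples on this page.
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
model = NgramModel(2, text, estimator=estimator)

# Seed with a random word and drop it from the output, as raw_words() does.
print(model.generate(20, [random.choice(corpus)])[1:])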
Example #2
def generateContentFromTokens(text_length, ngram_length, token_list):
    estimator = lambda fdist, bins: WittenBellProbDist(fdist, len(fdist) + 1)
    source_ngrams = NgramModel(ngram_length, token_list, estimator)

    # Generate a throwaway run and keep its last two words as seed context.
    seed_words = source_ngrams.generate(text_length)[-2:]
    generated_text = source_ngrams.generate(text_length, seed_words)
    return ' '.join(generated_text)
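An illustrative call (the Brown corpus is an assumption, not from the source):

from nltk.corpus import brown
print(generateContentFromTokens(50, 3, list(brown.words()[:20000])))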
Example #3
def dos2(words):
    # Bigram model. There must be a way to avoid precomputing this?
    # This is not working yet, as I haven't succeeded in getting NLTK's
    # smoothing implementations to work.
    # Also, a better way would be to systematically try all possibilities
    # and store the ones with a probability above a threshold
    # (there must be an algorithm for doing this efficiently).
    model = NgramModel(2, words)
    lengths = [len(word) for word in words]
    # Iterate over the possible number of morphemes.
    for n in range(2, max(lengths) + 1):
        # Sample as many words as there are words with this number of morphemes.
        for m in range(lengths.count(n)):
            yield model.generate(n)
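The comment above wishes for an efficient way to enumerate every string whose probability clears a threshold. A hedged sketch of that idea is depth-first enumeration with pruning; the vocab argument and the reliance on prob(word, context) from the NLTK 2.x model interface are assumptions (in particular, behavior with an empty context depends on the model's backoff):

def enumerate_above_threshold(model, vocab, length, threshold,
                              prefix=(), p=1.0):
    # Depth-first search over continuations, pruning any prefix whose
    # running probability has already fallen below the threshold. This
    # is safe because extending a prefix can only lower its probability.
    if len(prefix) == length:
        yield prefix, p
        return
    for w in vocab:
        q = p * model.prob(w, list(prefix[-1:]))
        if q >= threshold:
            for item in enumerate_above_threshold(model, vocab, length,
                                                  threshold, prefix + (w,), q):
                yield item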
Example #4
def __init__(self, dataset, capitalize=False):
    self.capitalize = capitalize
    tweets = dataset.split("\n")
    words = []
    for tweet in tweets:
        # Skip tweets that mention users or are retweets.
        if "@" in tweet or tweet.startswith("RT"):
            continue
        words += [
            word for word in tweet.split()
            if word[0] not in ["@", "#", ":", "(", ")", "2"]
            and "http://" not in word and "https://" not in word
        ]
    self.words = words
    self.model = nltk.Text(words)
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    self._ngram_model = NgramModel(2, self.model, estimator=estimator)
Example #5
def generate(body_tokens):
    # Return a 5-line string following the Cinquain syllable pattern.
    # The pattern rule is stored here; this could be parameterized to
    # handle other formats.
    syl_per_line = [5, 7, 5, 7, 7]
    line_syl_counts = [0] * len(syl_per_line)
    lines = [""] * len(syl_per_line)

    # Much like the generator code for random article text.
    estimator = lambda fdist, bins: WittenBellProbDist(fdist, len(fdist) + 1)
    source = NgramModel(min(syl_per_line), body_tokens, estimator)
    seed_words = source.generate(100)[-2:]
    generated_text = source.generate(sum(syl_per_line) * 2, seed_words)

    # Fill each line word by word until its syllable target is met.
    for i in range(len(syl_per_line)):
        target = syl_per_line[i]
        while generated_text:
            word = generated_text.pop(0)
            s = syll_count(word)
            if s + line_syl_counts[i] < target:
                line_syl_counts[i] += s
                lines[i] += word + " "
            elif s + line_syl_counts[i] == target:
                line_syl_counts[i] += s
                lines[i] += word + " "
                break
            else:
                # Word would overshoot the target; discard it and move on.
                break

    # If a line came up short, look for a word with exactly the missing
    # syllable count.
    for i in range(len(lines)):
        if line_syl_counts[i] < syl_per_line[i]:
            target = syl_per_line[i] - line_syl_counts[i]
            for word in generated_text:
                if syll_count(word) == target:
                    lines[i] += word
                    break

    return "\n".join(lines)
Example #6
def __init__(self, dataset, capitalize=False):
    self.capitalize = capitalize
    tweets = dataset.split("\n")
    words = []
    for tweet in tweets:
        if "@" in tweet or tweet.startswith("RT"):
            continue
        words += [
            word for word in tweet.split()
            if word[0] not in ["@", "#"]
            and "http://" not in word and "https://" not in word
        ]
    self.words = words
    self.model = nltk.Text(words)
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    self._ngram_model = NgramModel(2, self.model, estimator=estimator)
Example #7
def language_ngrams_tags(n, training):
    language_ngrams = {}
    languages = {}
    for language in LANGUAGES:
        languages[language] = []
    for comment, language in training:
        tags_of_a_comment = [
            tag for statement in comment for word, tag in statement
        ]
        languages[language].extend(tags_of_a_comment)
    for language in LANGUAGES:
        language_ngrams[language] = NgramModel(n, languages[language],
                                               _estimator)
    return language_ngrams
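A plausible companion to these per-language tag models is classification by cross-entropy: pick the language whose model finds a new comment's tag sequence least surprising. entropy() was part of the NLTK 2.x model API; the helper below is an assumption, not from the source:

def guess_language(language_ngrams, tags):
    # Lower entropy means the model explains the tag sequence better.
    return min(language_ngrams,
               key=lambda lang: language_ngrams[lang].entropy(tags))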
Example #8
class Generator:
    def __init__(self, dataset, capitalize=False):
        self.capitalize = capitalize
        tweets = dataset.split("\n")
        words = []
        for tweet in tweets:
            if "@" in tweet or tweet.startswith("RT"):
                continue
            words += [
                word for word in tweet.split()
                if word[0] not in ["@", "#"]
                and "http://" not in word and "https://" not in word
            ]
        self.words = words
        self.model = nltk.Text(words)
        estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        self._ngram_model = NgramModel(2, self.model, estimator=estimator)

    def raw_words(self, length=100):
        """Generates a list of words using an NLTK NgramModel."""
        return self._ngram_model.generate(length, [random.choice(self.words)])[1:]

    def smart_trim(self, genwords):
        """Trims to tweet-size while attempting to respect sentence boundaries."""
        new_words = genwords[:]

        # Cleverly trim to tweet size
        stoppers = r'[.?!]'
        while True:
            short_enough = (sum([len(word)+1 for word in new_words]) < 140)
            if short_enough and re.search(stoppers, new_words[-1]):
                break
            if len(new_words) <= 1:
                new_words = genwords[:]
                break
            new_words.pop()

        # Proper sentence markings
        for i, word in enumerate(new_words):
            if i == 0 or re.search(stoppers, new_words[i-1][-1]):
                new_words[i] = word.capitalize()

        return new_words

    def tweetworthy(self):
        """Generate some tweetable text."""
        genwords = self.raw_words()

        if self.capitalize:
            genwords = self.smart_trim(genwords)

        while len(genwords) > 1 and sum([len(word)+1 for word in genwords]) > 140:
            genwords.pop()
            if self.capitalize:
                genwords[-1] += random.choice(['.', '!', '?'])

        product = " ".join(genwords)
        if len(product) > 140:
            product = product[:140]

        # Remove mismatched enclosures
        for pair in [['(', ')'], ['{', '}'], ['[', ']']]:
            if product.count(pair[0]) != product.count(pair[1]):
                product = product.replace(pair[0], '').replace(pair[1], '')

        for enc in ['"', '*']:
            if product.count(enc) % 2 != 0:
                product = product.replace(enc, '')

        return product
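A hedged usage sketch for this Generator; the filename and the newline-separated-tweets format are assumptions inferred from __init__:

with open('tweets.txt') as f:        # hypothetical dataset file
    gen = Generator(f.read(), capitalize=True)
print(gen.tweetworthy())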
Example #9
def raw_words(self, length=100):
    """Generates a list of words using an NLTK NgramModel."""
    if not hasattr(self, '_ngram_model'):
        estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        self._ngram_model = NgramModel(2, self.model, estimator)
    return self._ngram_model.generate(length, [random.choice(self.words)])[1:]
Example #10
class Generator:
    def __init__(self):
        dataset = open(os.path.join(TOPDIR, 'dataset.txt')).read()
        words = [
            word for word in dataset.split()
            if re.match(r'[a-zA-Z0-9 \.,?:\'"!_\(\)]+', word)
        ]
        self.words = words
        self.model = nltk.Text(words)

    def raw_words(self, length=100):
        """Generates a list of words using an NLTK NgramModel."""
        if not hasattr(self, '_ngram_model'):
            estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
            self._ngram_model = NgramModel(2, self.model, estimator=estimator)
        return self._ngram_model.generate(length,
                                          [random.choice(self.words)])[1:]

    def smart_trim(self, genwords):
        """Trims to tweet-size while attempting to respect sentence boundaries."""
        new_words = genwords[:]

        # Cleverly trim to tweet size
        stoppers = r'[.?!]'
        while True:
            short_enough = (sum([len(word) + 1 for word in new_words]) < 140)
            if short_enough and re.search(stoppers, new_words[-1]):
                break
            if len(new_words) <= 1:
                new_words = genwords[:]
                break
            new_words.pop()

        # Proper sentence markings
        for i, word in enumerate(new_words):
            if i == 0 or re.search(stoppers, new_words[i - 1][-1]):
                new_words[i] = word.capitalize()

        return new_words

    def tweetworthy(self):
        """Generate some tweetable text."""
        genwords = self.raw_words()

        genwords = self.smart_trim(genwords)

        while len(genwords) > 1 and sum([len(word) + 1
                                         for word in genwords]) > 140:
            genwords.pop()

        # genwords[-1] += random.choice(['.', '!', '?'])

        product = " ".join(genwords)
        if len(product) > 140:
            product = product[:140]

        # Remove mismatched enclosures
        for pair in [['(', ')'], ['{', '}'], ['[', ']']]:
            if product.count(pair[0]) != product.count(pair[1]):
                product = product.replace(pair[0], '').replace(pair[1], '')

        for enc in ['"', '*']:
            if product.count(enc) % 2 != 0:
                product = product.replace(enc, '')

        return product
Example #11
class Generator:
    def __init__(self, dataset, capitalize=False):
        self.capitalize = capitalize
        tweets = dataset.split("\n")
        words = []
        for tweet in tweets:
            if "@" in tweet or tweet.startswith("RT"):
                continue
            words += [
                word for word in tweet.split()
                if word[0] not in ["@", "#", ":", "(", ")", "2"]
                and "http://" not in word and "https://" not in word
            ]
        self.words = words
        self.model = nltk.Text(words)
        estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        self._ngram_model = NgramModel(2, self.model, estimator=estimator)

    def raw_words(self, length=100):
        """Generates a list of words using an NLTK NgramModel."""
        return self._ngram_model.generate(length,
                                          [random.choice(self.words)])[1:]

    def smart_trim(self, genwords):
        """Trims to tweet-size while attempting to respect sentence boundaries."""
        new_words = genwords[:]

        # Cleverly trim to tweet size
        stoppers = r'[.?!]'
        while True:
            short_enough = (sum([len(word) + 1 for word in new_words]) < 140)
            if short_enough and re.search(stoppers, new_words[-1]):
                break
            if len(new_words) <= 1:
                new_words = genwords[:]
                break
            new_words.pop()

        # Proper sentence markings
        for i, word in enumerate(new_words):
            if i == 0 or re.search(stoppers, new_words[i - 1][-1]):
                new_words[i] = word.capitalize()

        return new_words

    def tweetworthy(self):
        """Generate some tweetable text."""
        genwords = self.raw_words()

        if self.capitalize:
            genwords = self.smart_trim(genwords)

        while len(genwords) > 1 and sum([len(word) + 1
                                         for word in genwords]) > 140:
            genwords.pop()
            if self.capitalize:
                genwords[-1] += random.choice(['.', '!', '?'])

        product = " ".join(genwords)
        if len(product) > 140:
            product = product[:140]

        # Remove mismatched enclosures
        for pair in [['(', ')'], ['{', '}'], ['[', ']']]:
            if product.count(pair[0]) != product.count(pair[1]):
                product = product.replace(pair[0], '').replace(pair[1], '')

        for enc in ['"', '*']:
            if product.count(enc) % 2 != 0:
                product = product.replace(enc, '')

        return product
Example #12
def markov(n, tokenized_content):
    content_model = NgramModel(n, tokenized_content)
    # Seed with a one-word context; NgramModel expects a sequence of words,
    # so a bare string like 'Emma' would be treated as a list of characters.
    generated_content = content_model.generate(50, ['Emma'])
    return ' '.join(generated_content)
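An illustrative call; the seed word suggests Austen's Emma, so the Gutenberg corpus is a natural (assumed) source of tokenized_content:

from nltk.corpus import gutenberg
print(markov(3, list(gutenberg.words('austen-emma.txt'))))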