-
Notifications
You must be signed in to change notification settings - Fork 0
/
generator.py
55 lines (34 loc) · 1.34 KB
/
generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
"""
My first Python program.
Structured to allow me to explore the language features and learn.
"""
import json
import getopt
import nltk
import sys
from nltk import NgramModel
from nltk.probability import WittenBellProbDist
def generateContentFromTokens(text_length, ngram_length, token_list):
    """Generate random text from an n-gram analysis of ``token_list``.

    An ``NgramModel`` of order ``ngram_length`` is built from
    ``token_list``. A first run of ``text_length`` words is generated only
    so that its last two items can seed a second run; the second run is
    the one that is returned.

    Args:
        text_length: Word count handed to each ``generate`` call.
        ngram_length: Order of the n-gram model.
        token_list: Training tokens (presumably a list of strings loaded
            from the pre-processed JSON -- confirm against the producer).

    Returns:
        The generated tokens joined with single spaces, i.e. one string in
        paragraph form rather than a list.
    """
    def witten_bell(fdist, bins):
        # ``bins`` is accepted to match the estimator call signature but is
        # ignored; the bin count is taken from the distribution itself.
        return WittenBellProbDist(fdist, len(fdist) + 1)

    model = NgramModel(ngram_length, token_list, witten_bell)

    # Throwaway run whose final two items become the seed context.
    warmup = model.generate(text_length)
    seed = warmup[-2:]

    words = model.generate(text_length, seed)
    return ' '.join(words)
if __name__ == '__main__':
    # Script entry point: load pre-tokenized title/body data from a JSON
    # file and print a randomly generated title followed by a generated body.
    #
    # Usage: generator.py [input.json]
    # The input file must be a JSON object with "title_tokens" and
    # "body_tokens" entries.

    # constants
    GENERATED_TEXT_LENGTH = 400
    GENERATED_TITLE_LENGTH = 10
    NGRAM_LENGTH = 5

    # First command-line argument selects the input file; otherwise fall
    # back to the bundled default data set.
    filename = sys.argv[1] if len(sys.argv) > 1 else 'foxNewsInputSet.json'

    # read the pre-processed tokens back from json serialization
    with open(filename, 'r') as fp:
        tokens = json.load(fp)
    body_tokens = tokens["body_tokens"]
    title_tokens = tokens["title_tokens"]

    # Single-argument, parenthesized print produces the same output under
    # Python 2 (print statement) and Python 3 (print function), so the
    # module no longer fails to parse on Python 3.
    # The explicit '\n' leaves a blank line between title and body.
    title = generateContentFromTokens(
        GENERATED_TITLE_LENGTH, NGRAM_LENGTH, title_tokens)
    print(title + '\n')

    body = generateContentFromTokens(
        GENERATED_TEXT_LENGTH, NGRAM_LENGTH, body_tokens)
    print(body)