-
Notifications
You must be signed in to change notification settings - Fork 0
/
cinquain.py
108 lines (77 loc) · 3.2 KB
/
cinquain.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""
Generates a cinquain from the stored token data.
A cinquain is a poetic form with a five-line pattern;
the variant used here is the tanka, an unrhymed syllable pattern of 5-7-5-7-7.
"""
import curses, nltk, re, sys, json
from curses.ascii import isdigit
from nltk import NgramModel
from nltk.corpus import cmudict
from nltk.probability import WittenBellProbDist
dictionary= cmudict.dict()
#utilizes CMU Pronunciation Dictionary to count syllables by counting those marked for stress with an integer
#returns the minimum syllable count to accommodate words with multiple pronunciations.
def syll_count(word):
    """Return the number of syllables in ``word``.

    Primary strategy: look the word up in the CMU Pronouncing Dictionary and
    count the phonemes that carry a stress marker (a trailing digit). ``min``
    is taken because a word may have several pronunciations. Words missing
    from the dictionary fall back to a vowel-cluster heuristic.
    """
    word = word.lower()
    if not word:
        # Robustness fix: the original crashed (IndexError on word[-1]) for
        # an empty token; an empty string has zero syllables.
        return 0
    try:
        pronunciations = dictionary[word]
    except KeyError:
        # Word not found in the CMU dict -- approximate by counting vowel
        # clusters. re.split leaves empty strings when the word begins or
        # ends with a consonant run; filter them with a comprehension
        # instead of the original's delete-while-iterating loop, which only
        # worked by accident for the patterns re.split happens to emit.
        clusters = [c for c in re.split("[^aeiouy]+", word) if c]
        count = len(clusters)
        if count == 0:
            # Consonant-only token (e.g. "hmm"): the original's mutating
            # loop left one empty string behind and reported 1 -- keep that.
            count = 1
        # Remove a (likely) silent trailing 'e' from the syllable count.
        if word.endswith('e') and count > 1:
            count -= 1
        return count
    # Count stress-marked phonemes (those ending in a digit) per
    # pronunciation; report the smallest such count.
    return min(len([ph for ph in pron if isdigit(ph[-1])])
               for pron in pronunciations)
def generate(body_tokens):
    """Return a five-line poem string following the tanka syllable
    pattern (5-7-5-7-7), built from an n-gram model trained on
    ``body_tokens``.

    Lines are assembled greedily: words are consumed from the generated
    stream while they fit the line's syllable budget; a word that would
    overflow the line is discarded. Short lines are then padded with a
    single later word of exactly the missing syllable count, if one exists.
    """
    # Stores the pattern rule for the poem; this could be parameterized
    # to handle other cinquain formats.
    syl_per_line = [5, 7, 5, 7, 7]
    line_syl_counts = [0] * len(syl_per_line)
    lines = [""] * len(syl_per_line)
    # Much like the generator code for random article text.
    estimator = lambda fdist, bins: WittenBellProbDist(fdist, len(fdist) + 1)
    source = NgramModel(min(syl_per_line), body_tokens, estimator)
    # Warm the model up, keep the last two words as a seed, then generate
    # twice as many words as the poem could possibly need.
    seed_words = source.generate(100)[-2:]
    generated_text = source.generate(sum(syl_per_line) * 2, seed_words)
    for i, target in enumerate(syl_per_line):
        # Robustness fix: guard on the remaining stream instead of letting
        # an exhausted list raise IndexError as the original did.
        while generated_text:
            word = generated_text.pop(0)
            s = syll_count(word)
            if line_syl_counts[i] + s <= target:
                # Word fits: take it (merged the original's duplicated
                # '<' and '==' branches and the redundant syll_count call).
                line_syl_counts[i] += s
                lines[i] += word + " "
                if line_syl_counts[i] == target:
                    break
            else:
                # Word would overflow the line: discard it and move on.
                break
    # Pad any line still short of its target with one word whose syllable
    # count exactly fills the deficit.
    for i in range(len(lines)):
        deficit = syl_per_line[i] - line_syl_counts[i]
        if deficit > 0:
            for word in generated_text:
                if syll_count(word) == deficit:
                    # BUG FIX: the original did ``text += word`` on the
                    # loop-local string from enumerate(lines), so the
                    # padding word was silently discarded. Append to the
                    # actual line and record the syllables.
                    lines[i] += word
                    line_syl_counts[i] += deficit
                    break
    return "\n".join(lines)
if __name__ == '__main__':
    # Optional CLI argument: path to a JSON file of pre-processed tokens;
    # defaults to the bundled Fox News input set.
    if len(sys.argv) > 1:
        filename = sys.argv[1]
    else:
        filename = 'foxNewsInputSet.json'
    # Read the pre-processed tokens back from JSON serialization.
    with open(filename, 'r') as fp:
        tokens = json.load(fp)
    body_tokens = tokens["body_tokens"]
    # Parenthesized single-argument print is equivalent under Python 2 and
    # also valid Python 3 (the bare print statement was Python-2-only).
    print(generate(body_tokens))