nanogenmo.py
from collections import Counter
import datetime
import itertools
import random

import markovify
import spacy
from nltk.corpus import gutenberg
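
# Overview of the script:
#   1. Load a selection of Project Gutenberg texts bundled with NLTK.
#   2. Parse each text with spaCy and swap proper nouns and pronouns for
#      generic words ("someone", "they", "their") so sentences from different
#      books merge more readily.
#   3. Build a small markovify model for every individual sentence.
#   4. For each output sentence, combine the models from roughly the same
#      relative position in every book and sample a new sentence, until
#      `sentence_target` sentences have been generated.
#   5. Write the result to a timestamped "novel_<timestamp>.txt" file.
#
# Prerequisites (not installed by this script): the NLTK gutenberg corpus
# (nltk.download('gutenberg')) and the spaCy model en_core_web_lg
# (python -m spacy download en_core_web_lg).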


def doc_to_text(doc):
    """Transform a spaCy doc into text that merges more easily across books.

    1. Replaces proper nouns and pronouns with generic words
       ('someone', 'they', 'their').
    2. Deduplicates consecutive duplicate words.
    """
    text_parts = []
    for tok in doc:
        if tok.tag_ == 'NNP':
            new_part = 'someone' + tok.whitespace_
        elif tok.tag_ == 'NNPS':
            new_part = 'they' + tok.whitespace_
        elif tok.tag_ == 'PRP':
            new_part = 'they' + tok.whitespace_
        elif tok.tag_ == 'PRP$':
            new_part = 'their' + tok.whitespace_
        else:
            new_part = tok.text_with_ws
        text_parts.append(new_part)
    anon_text = ''.join(text_parts)
    # collapse runs of identical words, e.g. "they they" -> "they"
    split_words = anon_text.split(' ')
    no_consec_duplicates = [word for word, _ in itertools.groupby(split_words)]
    output_text = ' '.join(no_consec_duplicates)
    return output_text
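
# Rough illustration of doc_to_text (the actual replacements depend on how the
# spaCy model tags each token, so treat the exact output as an assumption):
#   doc_to_text(nlp("Bob Bob handed her book to Alice."))
#   # proper nouns become "someone", "her" becomes "their", and the resulting
#   # consecutive duplicate "someone someone" is collapsed, giving roughly:
#   # "someone handed their book to someone."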

# how many sentences to generate for the output text
sentence_target = 3500

nlp = spacy.load("en_core_web_lg")

nltk_gutenberg_text_names = [
    'austen-emma.txt',
    'austen-persuasion.txt',
    'austen-sense.txt',
    'blake-poems.txt',
    'bryant-stories.txt',
    'burgess-busterbrown.txt',
    'carroll-alice.txt',
    'chesterton-ball.txt',
    'chesterton-brown.txt',
    'chesterton-thursday.txt',
    'edgeworth-parents.txt',
    'milton-paradise.txt',
    'shakespeare-caesar.txt',
    'shakespeare-hamlet.txt',
    'shakespeare-macbeth.txt',
    'whitman-leaves.txt',
]

data = [{'name': name, 'raw': gutenberg.raw(name)} for name in nltk_gutenberg_text_names]

# parse each text document with spaCy
for record in data:
    doc = nlp(record['raw'])
    record['doc'] = doc

# create an alternate text version without pronouns or proper nouns
for record in data:
    doc = record['doc']
    anon_text = doc_to_text(doc)
    record['anon_text'] = anon_text
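
# Build one small Markov model per sentence (rather than one per book), so the
# generation loop below can combine just the models from a narrow window of
# each book and drift through the corpora in roughly their original order.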
for record in data:
    doc = record['doc']
    sents = list(doc.sents)
    sent_texts = [doc_to_text(sent) for sent in sents]
    single_sentence_models = []
    for sent_text in sent_texts:
        try:
            model = markovify.Text(sent_text, state_size=2)
            single_sentence_models.append(model)
        except Exception:
            # markovify can fail on sentences that are too short; skip them
            pass
    record['single_sentence_models'] = single_sentence_models
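
# Generate sentences: for each output position, take the per-sentence models
# from approximately the same relative position in every book (a window of
# about 50/sentence_target of each book), merge them into one model per book,
# then combine the per-book models weighted by book length (in sentences) and
# sample a new sentence from the blended model.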
outputs = []
max_len = max(len(record['single_sentence_models']) for record in data)
weights = [len(record['single_sentence_models']) / max_len for record in data]
for i in range(sentence_target):
    progress = i / sentence_target
    end_window_norm = (i + 50) / sentence_target
    book_models = []
    for record in data:
        sentence_count = len(record['single_sentence_models'])
        start = int(progress * sentence_count)
        end = int(end_window_norm * sentence_count)
        end = end if end > start else start + 1
        combined_model = markovify.combine(record['single_sentence_models'][start:end])
        book_models.append(combined_model)
    multi_model = markovify.combine(book_models, weights)
    new_sent = multi_model.make_sentence(tries=1000)
    if new_sent:
        outputs.append(new_sent)

output_text = ' '.join(outputs)

timestamp = str(int(datetime.datetime.now().timestamp()))
filename = "novel_" + timestamp + ".txt"
with open(filename, "w") as text_file:
    text_file.write(output_text)

print('words', len(output_text.split(' ')))
print('sentences', len(outputs))
print('filename', filename)