/
cookie.py
145 lines (109 loc) · 5.22 KB
/
cookie.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import json
import os
import re
import urllib.request
from shutil import copyfile

import markovify
import nltk
# We need a temporary(ish) place to store the data we retrieve.
# If you are running this in a docker container you may want to mount a volume and use it.
# Also be sure to make a symlink between it and the assets directory. See our dockerfile for an example!
# The DATA_DIR environment variable overrides the default location; an unset or
# empty value falls back to the default.
datadir = os.environ.get("DATA_DIR") or "./web/assets/data"
# makedirs (rather than mkdir) also creates missing parent directories, and
# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs(datadir, exist_ok=True)
# Basically the example from the markovify documentation that uses parts of speech and stuff to make better sentences
class POSifiedText(markovify.Text):
    """markovify.Text subclass whose tokens carry a part-of-speech tag.

    Each word is stored as ``word::TAG`` so the chain distinguishes the same
    spelling used as different parts of speech; the tag is stripped again
    when a sentence is rendered.
    """

    def word_split(self, sentence):
        """Split *sentence* into words and return them as ``word::TAG`` strings."""
        tokens = re.split(self.word_split_pattern, sentence)
        tagged_pairs = nltk.pos_tag(tokens)
        return ["::".join(pair) for pair in tagged_pairs]

    def word_join(self, words):
        """Drop the tag from every ``word::TAG`` token and join with spaces."""
        # NOTE(review): only the text before the FIRST "::" is kept, so a word
        # that itself ends in ":" (e.g. "Note:" -> "Note:::NN") comes back
        # without its colon. Kept as-is to preserve current output.
        plain_words = [token.split("::")[0] for token in words]
        return " ".join(plain_words)
# Seed texts to fetch once and cache in datadir, as (cache file name, source URL):
# a list of fortunes from Github, the US constitution raw text, and a dump of tweets.
_CORPUS_SOURCES = (
    ("cookie.txt", "https://raw.githubusercontent.com/ianli/fortune-cookies-galore/master/fortunes.txt"),
    ("const.txt", "https://www.usconstitution.net/const.txt"),
    ("tweeter.txt", "https://raw.githubusercontent.com/ElDeveloper/tweets/master/tweets_text.txt"),
)
for _corpus_name, _corpus_url in _CORPUS_SOURCES:
    _corpus_path = datadir + "/" + _corpus_name
    if os.path.exists(_corpus_path):
        # Already cached by an earlier run; don't hit the network again.
        continue
    # Download next to the final path and rename only once the transfer has
    # finished. Writing straight to the cache path would leave a truncated file
    # behind on an interrupted download, and the exists() check above would
    # then skip re-fetching it on every later run.
    _partial_path = _corpus_path + ".part"
    urllib.request.urlretrieve(_corpus_url, _partial_path)
    os.replace(_partial_path, _corpus_path)
# Read all three cached files into variables
def _read_corpus(file_name):
    """Return the entire contents of the file named *file_name* inside datadir."""
    # NOTE(review): no encoding is given, so the platform default is used.
    with open(datadir + "/" + file_name) as handle:
        return handle.read()


text = _read_corpus("cookie.txt")
tswext = _read_corpus("const.txt")
tweetext = _read_corpus("tweeter.txt")
# Break up the text into lines to make it more workable
cookie_text_split, const_text_split, tweet_text_split = (
    corpus.split("\n") for corpus in (text, tswext, tweetext)
)
# Some cleanup to remove things in the fortune cookie file that aren't really fortunes.
# (There are some odd facts and quotes in here. This is a bit barbaric, but this is a fun project anyway! No need for perfection...)
def excluded(string):
    """Return True when *string* should be KEPT as a fortune, False otherwise.

    Despite the name, this is a keep-predicate: a line is rejected (False)
    when it starts with "Q:", contains a double quote, or contains "--".
    """
    is_question = string.startswith("Q:")
    has_quote = "\"" in string
    has_attribution_dash = "--" in string
    return not (is_question or has_quote or has_attribution_dash)
# Same thing for the constitution text - this just removes the comment at the top.
def exwifted(string):
    """Return True when *string* should be kept, i.e. it contains no "[" character."""
    return "[" not in string
# Apply the cleanups from above. Slice assignment filters each list in place;
# the tweet lines are left unfiltered.
cookie_text_split[:] = list(filter(excluded, cookie_text_split))
const_text_split[:] = list(filter(exwifted, const_text_split))
# Merge the text back into one big blob like markovify expects. (There's probably a better way
# to do this, but again, fun project. Efficiency's not that important...)
def _model_from_lines(lines):
    """Build a POSifiedText model from a list of lines joined by newlines."""
    return POSifiedText("\n".join(lines))


cookie_text_model = _model_from_lines(cookie_text_split)
const_text_model = _model_from_lines(const_text_split)
tweet_text_model = _model_from_lines(tweet_text_split)
# Combine them into a terrifying structure. The weight lists line up with the
# model lists; the first combination passes no weights at all.
const_and_cookie_model = markovify.combine([cookie_text_model, const_text_model])
tweet_and_cookie_model = markovify.combine(
    [cookie_text_model, tweet_text_model],
    [4, 1],
)
everything_model = markovify.combine(
    [cookie_text_model, const_text_model, tweet_text_model],
    [4, 1, 1],
)
# Print a few sample lines from the constitution + cookie model to the terminal
# to show that everything's working...
print("Examples:")
for _ in range(5):
    sample = const_and_cookie_model.make_short_sentence(240, tries=25)
    print(sample)
# Now, open a temporary file and write some javascript surrounding our story.
def _write_js_array(out, js_name, model, count=250):
    """Write ``window.<js_name>=[...];`` containing generated sentences to *out*.

    Parameters:
        out: writable text file object.
        js_name: name of the global JavaScript array to assign.
        model: object with a markovify-style ``make_short_sentence`` method.
        count: number of sentences wanted in the array.

    make_short_sentence is treated as possibly returning None; such attempts
    are skipped and retried. Retries are capped at 4 * count so that a model
    that can never produce a sentence cannot hang the script (the array is
    then simply shorter).
    """
    out.write("window." + js_name + "=[\n")
    written = 0
    for _ in range(count * 4):
        if written == count:
            break
        sentence = model.make_short_sentence(240, tries=25)
        if sentence is None:
            # Concatenating None into the output used to raise TypeError.
            continue
        # json.dumps yields a double-quoted, fully escaped string literal that
        # is also valid JavaScript, so quotes or backslashes in the generated
        # text can no longer break (or inject into) the emitted script.
        # Leaving the trailing comma is ok, as javascript doesn't care.
        out.write(json.dumps(sentence) + ",\n")
        written += 1
    # Close it up!
    out.write("];")


# (progress message, JavaScript global name, model), in output order.
_JS_ARRAYS = (
    ("Running cookie", "fortuneCookies", cookie_text_model),
    ("Running const + cookie", "constCookies", const_and_cookie_model),
    ("Running const only", "constLines", const_text_model),
    ("Running tweet only", "tweetLines", tweet_text_model),
    ("Running tweet cookie", "tweetCookie", tweet_and_cookie_model),
    ("Running everything", "everythingCookie", everything_model),
)

with open(datadir + "/cookie.js.new", "w") as file:
    for _label, _js_name, _model in _JS_ARRAYS:
        print(_label)
        _write_js_array(file, _js_name, _model)
# Finally, copy our temp file over the old one, so clients can start seeing it.
# This happens only after the file above has been closed, i.e. fully written.
copyfile(datadir + "/cookie.js.new", datadir + "/cookie.js")