-
Notifications
You must be signed in to change notification settings - Fork 4
/
emma.py
443 lines (381 loc) · 17.5 KB
/
emma.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
# -*- coding: utf-8 -*-
import random
import pickle
import logging
import os
import re
import cgi
import time
import pattern.en
import pattern.vector
import sqlite3 as sql
from mastodon import Mastodon, StreamListener
import flags
import pronouns
import wordpatternfinder
import associationtrainer
import replybuilder
import misc
import apikeys
# Setup stuff
# Set up logging level (this should go in misc.py but eh)
logging.root.setLevel(logging.INFO)

# Pre-flight engine checks
# Check for emma.db; the schema is created further down if it isn't there
logging.info("Checking for association model")
databaseExists = os.path.isfile('emma.db')
if databaseExists:
    logging.debug("Association model found!")
else:
    logging.warning("Association model not found! Creating")

# Set up SQL
# A single connection is used both for bootstrapping and for the rest of the
# program, so no throw-away connection is left open behind us.
connection = sql.connect('emma.db')
connection.text_factory = str
cursor = connection.cursor()

if not databaseExists:
    # sql.connect() has just created an empty database file; give it its schema
    cursor.executescript("""
        DROP TABLE IF EXISTS associationmodel;
        DROP TABLE IF EXISTS dictionary;
        CREATE TABLE associationmodel(word TEXT, association_type TEXT, target TEXT, weight DOUBLE);
        CREATE TABLE dictionary(word TEXT, part_of_speech TEXT, affinity DOUBLE)
    """)
    logging.debug("Association model created.")

# Dumb chrome
print(u"\n .ooooo. ooo. .oo. .oo. ooo. .oo. .oo. .oooo.\nd88' \u006088b \u0060888P\"Y88bP\"Y88b \u0060888P\"Y88bP\"Y88b \u0060P )88b\n888ooo888 888 888 888 888 888 888 .oP\"888\n888 ., 888 888 888 888 888 888 d8( 888\n\u0060Y8bod8P' o888o o888o o888o o888o o888o o888o \u0060Y888\"\"8o\n\n ELECTRONIC MODEL of MAPPED ASSOCIATIONS\n Version " + misc.versionNumber + "\n")

# Let SQLite count the rows instead of pulling both tables into memory
with connection:
    cursor.execute("SELECT COUNT(*) FROM associationmodel")
    associationModelItems = "{:,d}".format(cursor.fetchone()[0])
    cursor.execute("SELECT COUNT(*) FROM dictionary")
    dictionaryItems = "{:,d}".format(cursor.fetchone()[0])
print("Database contains {0} associations for {1} words.".format(associationModelItems, dictionaryItems))
# Check for and load the file containing the history of mood values or create it if it isn't there
logging.info("Loading mood history...")
moodHistory = None
if os.path.isfile('moodHistory.p'):
    logging.debug("Mood history found!")
    try:
        # NOTE: pickle must only ever be pointed at files this program wrote itself
        with open('moodHistory.p', 'rb') as moodFile:
            moodHistory = pickle.load(moodFile)
    except (EOFError, pickle.UnpicklingError) as error:
        # A truncated or corrupt file (e.g. the process died mid-write) should
        # not stop the program from starting; start over with a fresh history.
        logging.warning("Mood history file is unreadable (%s)! Recreating...", error)
    else:
        logging.debug("Mood history loaded!")
else:
    logging.warning("Mood history file not found! Creating...")

if moodHistory is None:
    # Ten neutral entries, written out immediately so the file exists from now on
    moodHistory = [0.0] * 10
    with open('moodHistory.p', 'wb') as moodFile:
        pickle.dump(moodHistory, moodFile)
    logging.debug("Mood history file created.")
# Mood-related things
def add_mood_value(text):
    """
    Score ``text`` and record the score as the newest mood history entry

    The score is the first element of ``pattern.en.sentiment(text)``. It is put
    at the front of the module-level ``moodHistory`` list, the entry at the back
    is discarded, and the updated list is pickled to ``moodHistory.p``.

    Returns the score that was recorded.
    """
    score = pattern.en.sentiment(text)[0]

    logging.debug("Adding mood value {0} to mood history {1}...".format(score, moodHistory))
    # Newest value in at the front, oldest value out at the back
    moodHistory.insert(0, score)
    moodHistory.pop()
    logging.debug("New mood history is {0}".format(moodHistory))

    # Persist the history so it is still there on the next start
    logging.debug("Saving mood history...")
    with open('moodHistory.p', 'wb') as historyFile:
        pickle.dump(moodHistory, historyFile)

    return score
def calculate_mood(history=None):
    """
    Mood is calculated with a weighted mean average formula, skewed towards more recent moods

    The newest entry counts three times, the second newest twice and the
    following eight entries once each. For the usual ten-entry history that is
    a total weight of 13.

    history     list    Mood values, newest first. Defaults to the module-level
                        ``moodHistory`` list.

    Returns the weighted mean as a float. A history shorter than ten entries is
    averaged over the weights that were actually used, and an empty history
    gives 0.0.
    """
    logging.debug("Calculating mood...")
    if history is None:
        history = moodHistory

    weights = [3, 2] + [1] * 8
    # zip() stops at the shorter sequence, so short histories are handled too
    weightedValues = [weight * value for weight, value in zip(weights, history)]
    if weightedValues:
        usedWeight = sum(weights[:len(weightedValues)])
        # float() keeps the division exact under Python 2 even for integer input
        mood = sum(weightedValues) / float(usedWeight)
    else:
        mood = 0.0

    logging.debug("Mood: {0}".format(mood))
    return mood
def express_mood(moodValue):
    """
    Returns a string which can be attached to a post as a tag expressing Emma's mood

    The scale is cut into bands that are 0.2 wide. Everything below -0.8 falls
    in the lowest band and everything from 0.8 upwards in the highest. A value
    that compares false against every bound (e.g. NaN) yields None.
    """
    logging.debug("Expressing mood...")

    # (exclusive upper bound, tag), ordered from the lowest band upwards
    bands = (
        (-0.8, u"feeling abysmal \ud83d\ude31"),
        (-0.6, u"feeling dreadful \ud83d\ude16"),
        (-0.4, u"feeling bad \ud83d\ude23"),
        (-0.2, u"feeling crummy \ud83d\ude41"),
        (0.0, u"feeling blah \ud83d\ude15"),
        (0.2, u"feeling alright \ud83d\ude10"),
        (0.4, u"feeling good \ud83d\ude42"),
        (0.6, u"feeling great \ud83d\ude09"),
        (0.8, u"feeling fantastic \ud83d\ude00"),
    )
    for upperBound, tag in bands:
        if moodValue < upperBound:
            return tag

    if moodValue >= 0.8:
        return u"feeling glorious \ud83d\ude1c"
    return None
# Preparing our datatypes
# Let's start by defining some classes for NLU stuff:
class Word:
    """
    A single word of a Sentence together with its tags

    Built from one parsed token (an indexable sequence) and the token's position.

    Class variables:
    word            str     String representation of the Word (token field 0)
    partOfSpeech    str     Penn Treebank II part-of-speech tag (token field 1)
    chunk           str     Part of the Sentence (noun-phrase, verb-phrase, etc.) (token field 2)
    subjectObject   str     If the Word is a noun, this indicates whether it is the subject or object of the Sentence (token field 4)
    lemma           str     String representation of the root form of the Word (token field 5)
    index           int     The word's position in the sentence (0-indexed)
    """

    def __init__(self, word, index):
        token = word
        self.index = index
        self.word = token[0]
        self.partOfSpeech = token[1]
        self.chunk = token[2]
        # Token field 3 is not used
        self.subjectObject = token[4]
        self.lemma = token[5]

    def __str__(self):
        return self.word
class Sentence:
    """
    One sentence of a Message; parses itself into Word objects on construction

    Creating a Sentence also records its mood in the mood history, because the
    mood value is obtained through add_mood_value().

    Class variables:
    sentence                str                     String representation of the Sentence
    words                   list                    Ordered list of Word objects in the Sentence
    mood                    float                   Positive or negative sentiment in the Sentence
    length                  int                     Number of Word objects in the Sentence
    domain                  str                     The sentence's domain as determined by the wordpatternfinder module
    interrogativePackage    InterrogativePackage    If the sentence domain is INTERROGATIVE, this represents the question that they're asking
    """

    def __init__(self, sentence):
        self.sentence = sentence
        self.mood = add_mood_value(self.sentence)
        # Placeholder: holds the ``str`` type itself until a real domain is assigned
        self.domain = str
        self.interrogativePackage = None

        # Tag the sentence; the text is not tokenized again at this stage
        parsedSentence = pattern.en.parse(
            self.sentence,
            tokenize=False,
            tags=True,
            chunks=True,
            relations=True,
            lemmata=True,
            encoding='utf-8'
        )
        # Only the first sentence of the parser output is used
        taggedTokens = parsedSentence.split()[0]
        self.words = [Word(token, position) for position, token in enumerate(taggedTokens)]
        self.length = len(self.words)

    def __str__(self):
        return self.sentence
class Message:
    """
    Defines a collection of Sentences and its attributes, auto-generates and fills itself with Sentence objects

    Class Variables
    message     str     String representation of the Message
    sentences   list    Ordered list of Sentence objects in the Message
    avgMood     float   Average of the mood value of all the Sentences in the Message
    keywords    list    The message's main topics (only words found in the dictionary table)
    sender      str     The name of the person who sent the message
    """

    def __init__(self, message, sender=(u'Anonymous')):
        self.message = message
        self.sender = sender
        self.keywords = []

        # Tokenize the message; the parser output is split on newlines and
        # every line becomes one Sentence
        parsedMessage = pattern.en.parse(
            self.message,
            tokenize=True,
            tags=False,
            chunks=False,
            relations=False,
            lemmata=False,
            encoding='utf-8'
        )
        self.sentences = [Sentence(sentence) for sentence in parsedMessage.split('\n')]

        # Average Sentence moods and record the value
        # (str.split always yields at least one item, so the list is never empty)
        moods = [sentence.mood for sentence in self.sentences]
        self.avgMood = sum(moods) / len(moods)

        # Find sentences' domains and InterrogativePackages (if applicable)
        # NOTE(review): the return value is not stored, so this relies on
        # find_patterns() updating the Sentence in place -- confirm
        for sentence in self.sentences:
            wordpatternfinder.find_patterns(sentence)

        # Use pattern.vector to find keywords
        for keyword in pattern.vector.Document(self.message).keywords():
            self.keywords.append(pattern.en.lemma(keyword[1]))

        # If pattern.vector couldn't find any keywords, use the old method
        if not self.keywords:
            logging.warning("No keywords detected by pattern.en. Using old method...")
            for sentence in self.sentences:
                for word in sentence.words:
                    if word.partOfSpeech in misc.nounCodes and word.lemma not in self.keywords:
                        self.keywords.append(word.lemma)

        # Check keywords against words that we have in the dictionary
        with connection:
            cursor.execute('SELECT * FROM dictionary;')
            knownWords = set(row[0] for row in cursor.fetchall())
        self.keywords = self._filter_known_keywords(self.keywords, knownWords)

        # If we don't have any keywords, that's bad
        if not self.keywords:
            logging.error("No keywords detected in message! This will cause a critical failure when we try to reply!")

    @staticmethod
    def _filter_known_keywords(keywords, knownWords):
        """
        Return the keywords that occur in knownWords, in their original order

        A new list is built instead of removing items from ``keywords`` while
        looping over it; removing during iteration skips the element after each
        removal and lets unknown keywords slip through.
        """
        knownKeywords = []
        for keyword in keywords:
            if keyword in knownWords:
                knownKeywords.append(keyword)
            else:
                # Lazy logging arguments: nothing is formatted (and no unicode
                # keyword is forced through an ASCII encode) unless it is emitted
                logging.debug(u"Removing unknown word %s from keyword list", keyword)
        return knownKeywords

    def __str__(self):
        return self.message
def train(message):
    """
    Learn from a Message and store what was learned in the database

    Pronoun and possessive references are resolved first. Every word whose part
    of speech is not in misc.trashPOS and whose lemma is not yet in the
    dictionary table is inserted there, and finally the message is handed to
    associationtrainer.find_associations().

    message     Message     The message to learn from
    """
    logging.info("Consuming message...")
    message = pronouns.determine_pronoun_references(message)
    message = pronouns.determine_posessive_references(message)

    logging.info("Looking for new words...")
    # Gather the words we already know, in the form they are stored in the database
    with connection:
        cursor.execute('SELECT * FROM dictionary;')
        knownWords = set(row[0] for row in cursor.fetchall())

    # Compare them against each word from the message
    for sentence in message.sentences:
        for word in sentence.words:
            if word.partOfSpeech in misc.trashPOS:
                continue

            # The dictionary stores lemmas UTF-8 encoded and run through
            # re.escape(), so the lookup has to use exactly that form. Comparing
            # the raw unicode lemma never matches lemmas containing punctuation
            # or non-ASCII characters, and they would be inserted again on
            # every run.
            encodedLemma = word.lemma.encode('utf-8', 'ignore')
            storedLemma = re.escape(encodedLemma)
            if storedLemma in knownWords:
                continue

            # If it's a word we don't have in the database, add it
            logging.info("Learned new word: \'{0}\'!".format(encodedLemma))
            logging.debug("Prev. word POS: \'{0}\'".format(word.partOfSpeech))
            knownWords.add(storedLemma)
            with connection:
                cursor.execute('INSERT INTO dictionary VALUES (?, ?, 0);', (storedLemma, word.partOfSpeech))

    logging.info("Finding associations...")
    associationtrainer.find_associations(message)
def filter_message(messageText):
    """
    Make it easier for the computer to read messages

    Adds closing punctuation when it is missing, expands internet abbreviations
    listed in misc.netspeak, turns a stand-alone "n't" into "not", drops a
    stand-alone "'s" and strips double quote characters.

    messageText     str     The message, either as UTF-8 encoded bytes or as unicode text

    Returns the filtered message as unicode text.
    """
    # Decode once at the boundary and work on unicode text from here on
    if isinstance(messageText, bytes):
        messageText = messageText.decode('utf-8')

    # Add punctuation if it isn't already present (also covers an empty message)
    if not messageText.endswith((u'!', u'?', u'.')):
        messageText += u"."

    doubleQuotes = (u'"', u'\u201c', u'\u201d')

    # Translate internet slang and fix weird parsing stuff
    filtered = []
    for word in messageText.split(u' '):
        lowered = word.lower()
        # Translate internet abbreviations
        if lowered in misc.netspeak:
            logging.debug(u"Translating '%s' from net speak...", word)
            # NOTE(review): extend() assumes each netspeak value is a list of
            # words; a plain string would be added one character at a time -- confirm
            filtered.extend(misc.netspeak[lowered])
        # Change "n't" to "not"
        elif lowered in (u"n\'t", u"n\u2019t", u"n\u2018t"):
            logging.debug("Replacing \"n\'t\" with \"not\"...")
            filtered.append(u'not')
        # Remove "'s"
        elif lowered == u"\'s":
            continue
        # Remove double quote characters, but keep the word they are attached to
        elif any(quote in word for quote in doubleQuotes):
            for quote in doubleQuotes:
                word = word.replace(quote, u'')
            if word:
                filtered.append(word)
        else:
            filtered.append(word)

    return u' '.join(filtered)
# ooo ooooo o8o ooooo
# `88. .888' `"' `888'
# 888b d'888 .oooo. oooo ooo. .oo. 888 .ooooo. .ooooo. oo.ooooo.
# 8 Y88. .P 888 `P )88b `888 `888P"Y88b 888 d88' `88b d88' `88b 888' `88b
# 8 `888' 888 .oP"888 888 888 888 888 888 888 888 888 888 888
# 8 Y 888 d8( 888 888 888 888 888 o 888 888 888 888 888 888
# o8o o888o `Y888""8o o888o o888o o888o o888ooooood8 `Y8bod8P' `Y8bod8P' 888bod8P'
# 888
# o888o
# TODO: remove class
class Ask:
    """
    Bundles an incoming ask with its sender and id

    Class variables:
    sender      The sender as passed in
    askid       The ask's id as passed in
    message     Message     The ask's text, UTF-8 encoded, run through filter_message() and wrapped in a Message
    """

    def __init__(self, message, sender, askid):
        self.sender = sender
        self.askid = askid
        encodedText = message.encode('utf-8', 'ignore')
        filteredText = filter_message(encodedText)
        self.message = Message(filteredText, self.sender)
# Create the Mastodon API client used for streaming notifications and posting replies
# (the access token value is presumably the name of a credentials file -- confirm)
mastodon = Mastodon(access_token='emma_usercred.secret', api_base_url='https://botsin.space')
# Create listener
class Listener(StreamListener):
    """
    Listens for Mastodon activity

    Class Variables
    message     Message     The most recent mention, filtered and parsed
    sender      str         Username of person who sent the Message
    tootID      int         ID of the Toot so that we can reply
    reply       str         Emma's reply to the Message (UTF-8 encoded)
    """

    # Words pattern.en lists as profanity that Emma accepts anyway
    _allowedProfanity = frozenset(['gay', 'queer', 'hell'])

    def _load_banned_words(self):
        """Return the set of banned words: bannedwords.txt plus pattern.en's profanity list minus the allowed words"""
        with open('bannedwords.txt', 'r') as bannedWordsFile:
            # Messages are compared lower-cased, so store the entries lower-cased
            # too. Blank lines are skipped: an empty entry would match the empty
            # "word" produced by two consecutive spaces in a message.
            bannedWords = set(line.strip().lower() for line in bannedWordsFile if line.strip())
        # The allow-list only applies to the built-in profanity list; anything
        # listed in bannedwords.txt stays banned
        bannedWords.update(set(pattern.en.wordlist.PROFANITY) - self._allowedProfanity)
        return bannedWords

    def on_notification(self, status):
        """
        Handle a notification: learn from a mention and reply to it

        Returns True when a reply was posted. Returns False when the
        notification is not a mention, comes from a bot account, contains a
        banned word, or no reply could be generated.
        """
        if status.type != 'mention':
            return False
        toot = status.status

        # Don't reply to bot posts
        if toot.account.bot:
            logging.info("Message is from a bot account. Skipping...")
            return False

        # Get status, remove HTML markup
        # NOTE(review): only tags are stripped; HTML entities in the content are
        # passed on as they are -- confirm whether they should be decoded
        self.message = re.sub('<[^<]+?>', '', toot.content)

        # Get other useful variables
        self.sender = toot.account.username
        self.tootID = toot.id

        # TODO: Block nsfw CWs

        # Lazy logging arguments with a unicode format string: formatting
        # non-ASCII text through a byte-string .format() raises
        # UnicodeEncodeError under Python 2. This line also replaces the bare
        # print of the message, which could fail the same way on a
        # non-UTF-8 stdout.
        logging.info(u"@%s says: %s", self.sender, self.message)

        # Format message for easier manipulation and more accurate understanding
        self.message = self.message.encode('utf-8', 'ignore')
        self.message = filter_message(self.message)
        self.message = Message(self.message, self.sender)
        logging.debug(u"Filtered message: %s", self.message.message)

        # Remove profanity and banned words
        logging.debug("Searching for profanity & banned words...")
        bannedWords = self._load_banned_words()
        for word in self.message.message.split(' '):
            if word.lower() in bannedWords:
                logging.info(u"Banned word %s found in message. Skipping...", word)
                return False

        # Learn from and reply to the message
        train(self.message)
        reply = replybuilder.reply(self.message, calculate_mood())
        if not reply:
            # Sentence generation failed (an empty reply cannot be posted either)
            logging.error("Sentence generation failed.")
            return False

        # Submit reply. The text is posted unescaped, exactly as before; the
        # HTML-escaped copy that used to be made was overwritten before use.
        logging.info(u"Reply: %s", reply)
        self.reply = reply.encode('utf-8', 'ignore')
        logging.debug("Posting status to Mastodon...")
        mastodon.status_reply(
            to_status=toot,
            status=self.reply
        )
        return True
if not flags.enableDebugMode:
    # Activate listener
    logging.info("Activating listener...")
    print(mastodon.stream_user(
        listener=Listener()
    ))
else:
    # Debug stuff: process a single message from the test strings or the console
    if flags.useTestingStrings:
        inputText = random.choice(flags.testingStrings)
    else:
        inputText = raw_input("Message >> ")

    # raw_input() already returns bytes; calling .encode() on a byte string makes
    # Python 2 decode it as ASCII first, which fails on any non-ASCII input.
    # Only unicode text needs encoding here.
    if not isinstance(inputText, bytes):
        inputText = inputText.encode('utf-8', 'ignore')

    message = Message(filter_message(inputText), "You")
    logging.debug(u"Message: %s", message.message)
    train(message)
    reply = replybuilder.reply(message, calculate_mood())
    if reply:
        print(reply)
    else:
        # Sentence generation failed
        logging.error("Sentence generation failed.")