def set_personality(self, personality):
    """Change Markov personality to given table name in Markov database.

    Normalizes the requested name first; on success, rebuilds the
    case-insensitive PostgresMarkov instance on the existing DB connection
    and reloads the personality-specific configuration.

    Raises:
        Bad_personality_error: if the normalized name is not a valid
            personality.  (NOTE(review): name is inconsistent with
            BadPersonalityError used elsewhere in this file — confirm
            which exception class callers actually catch.)
    """
    personality = self.normalize_personality(personality)
    # Guard clause: reject unknown personalities before touching any state.
    if not self.is_personality_valid(personality):
        raise Bad_personality_error
    self.markov = markov.PostgresMarkov(
        self.dbconn, personality, case_sensitive=False)
    self.personality = personality
    self._set_personality_config(personality)
def update(pname, reset, infile, update_datestamp):
    """Train the Markov tables for personality *pname* from an IRC log stream.

    Reads ``<nick> message`` lines from *infile* (blank lines mark a context
    break), feeds matching lines through a TrainingCorpus, then writes the
    resulting Markov/context rows to the per-personality PostgreSQL tables.

    Args:
        pname: personality key into personality_regexes.
        reset: if true, archive the current tables to ".markov.old" /
            ".context.old", recreate them, and bulk-load everything;
            otherwise incrementally merge context rows one at a time.
        infile: iterable of log lines to train from.
        update_datestamp: if true, upsert the personality's row in the
            ".last-updated" table (committed only if the whole run succeeds).

    Raises:
        BadPersonalityError: if pname is not a known personality.
    """
    try:
        pregex = personality_regexes[pname]
    except KeyError:
        raise BadPersonalityError
    stdout.write('Starting {} Markov generation.\n'.format(pname))
    conn = psycopg2.connect('dbname=markovmix user=markovmix')
    mk = markov.PostgresMarkov(conn, pname, case_sensitive=False)
    corpus = TrainingCorpus(pregex, mk)
    mk.begin()

    # Accumulate training lines; a blank line ends the current conversation
    # context so unrelated messages are not chained together.
    for line in infile:
        line = line.strip()
        if line:
            m = re.match(r'^<(.*?)> ?(.*)', line, re.I)
            if m:
                corpus.check_line(m.group(1), m.group(2))
        else:
            corpus.new_context()

    if reset:
        # Write Markov data
        # Snapshot the old row count so we can report growth at the end.
        mk.doquery('SELECT COUNT(*) FROM "{}"'.format(mk.table_name))
        old_row_count = mk.cursor.fetchone()[0]
        stdout.write('Current Markov row count: %d\n' % old_row_count)
        mk.clear()
        stdout.write('Reinitializing tables...\n')
        # Keep one generation of backups: drop the previous ".old" copies,
        # rename the live tables into their place, then recreate fresh ones.
        mk.doquery('DROP TABLE IF EXISTS ".markov.old"')
        mk.doquery('DROP TABLE IF EXISTS ".context.old"')
        mk.doquery('ALTER TABLE "{}" RENAME TO ".markov.old"'.format(
            mk.table_name))
        mk.doquery('ALTER TABLE "{}" RENAME TO ".context.old"'.format(
            mk.context_table_name))
        mk.create_tables()
        for progress, rows in corpus.markov_rows():
            mk.add_markov_rows(rows)
            stdout.write('Inserting Markov data {}/{}...\r'.format(
                progress[0], progress[1]))
            stdout.flush()
        stdout.write('\n')

    # Write context data
    # On reset, bulk-insert batches directly; otherwise merge single rows
    # through add_context (which presumably updates existing frequencies).
    for progress, rows in corpus.context_rows(PROGRESS_EVERY if reset else 1):
        if reset:
            mk.cursor.executemany(
                'INSERT INTO "{}" (inword, outword, freq) VALUES'
                ' (%s, %s, %s)'.format(mk.context_table_name), rows)
        else:
            inword, outword, freq = rows[0]
            mk.add_context(inword, outword, freq)
        stdout.write('Inserting context data {}/{}...\r'.format(
            progress[0], progress[1]))
        stdout.flush()
    stdout.write('\n')

    if reset:
        stdout.write('Indexing tables...\n')
        mk.index_tables()
        mk.doquery('SELECT COUNT(*) FROM "{}"'.format(mk.table_name))
        new_row_count = mk.cursor.fetchone()[0]
        row_count_increase = new_row_count - old_row_count
        stdout.write('New Markov row count: %d\n' % new_row_count)
        if old_row_count:
            # NOTE(review): under Python 2 this division truncates to an
            # integer before the *100 scaling — confirm interpreter version.
            stdout.write('Row count change: %+d (%d%%)\n' %
                         (row_count_increase,
                          round(row_count_increase / old_row_count * 100)))

    # Update last-updated date if enabled (will only be written to DB if
    # entire process finishes to the commit call at the end of the
    # function)
    if update_datestamp:
        mk.doquery(
            'UPDATE ".last-updated" SET updated = NOW() WHERE name=%s',
            (pname, ))
        if not mk.cursor.rowcount:
            # No existing row was updated: first run for this personality.
            mk.doquery('INSERT INTO ".last-updated" VALUES (%s)', (pname, ))
    else:
        stdout.write('Skipping datestamp update.\n')

    stdout.write('Closing...\n')
    mk.commit()
    conn.close()
    stdout.write('Finished!\n\n')
def output_corpus(pname, reset):
    """Scrape all known IRC log sources and print the training corpus lines
    for personality *pname* (via print_line / print_context_break).

    On *reset*, every historical log source (Konversation, retro #tcpa,
    #calcgames, misc dump file, manual corpus files, irssi logs) is parsed;
    otherwise only WeeChat log lines newer than the personality's
    ".last-updated" timestamp (and not in the future) are emitted.
    Progress messages go to stderr so stdout stays a clean corpus stream.

    Raises:
        BadPersonalityError: if pname is not a known personality.
    """
    NEVER_UPDATED = datetime(1970, 1, 1, 0, 0)
    home = os.environ['HOME']
    try:
        pregex = personality_regexes[pname]
    except KeyError:
        raise BadPersonalityError
    stderr.write('Starting {} corpus search.\n'.format(pname))

    # Get last updated date
    conn = psycopg2.connect('dbname=markovmix user=markovmix')
    mk = markov.PostgresMarkov(conn, pname, case_sensitive=False,
                               create_tables=False)
    mk.begin()
    mk.doquery(
        'CREATE TABLE IF NOT EXISTS ".last-updated" '
        '(name VARCHAR PRIMARY KEY, updated TIMESTAMP NOT NULL DEFAULT NOW())')
    mk.doquery('SELECT updated FROM ".last-updated" WHERE name=%s', (pname, ))
    target_date = datetime.now()
    if not reset and mk.cursor.rowcount:
        last_updated = mk.cursor.fetchone()[0]
    else:
        last_updated = NEVER_UPDATED

    if reset:
        ## Never updated yet ##
        stderr.write('Parsing old logs...\n')

        # Parse old logs this first time only

        # Old Konversation logs
        for fn in [os.path.join('log_irc_konversation', x) for x in
                   ('calcgames.log', 'cemetech.log', 'tcpa.log', 'ti.log',
                    'efnet_#tiasm.log', 'omnimaga.log')]:
            with open(os.path.join(home, fn), 'r') as f:
                for line in f:
                    line = line.strip()
                    # Prefer the saxjax-relay form (real nick inside the
                    # message); fall back to a plain <nick> line.
                    m = re.match(
                        r'^\[.*\] \[.*\] <saxjax>\t\(.\) \[?(.*?)[:\]] (.*)',
                        line, re.I)
                    if not m:
                        m = re.match(r'^\[.*\] \[.*\] <(.*?)>\t(.*)',
                                     line, re.I)
                    if m:
                        print_line(m.group(1), m.group(2))
            # Each log file is an independent conversation context.
            print_context_break()

        # Old #tcpa logs from elsewhere
        log_path = os.path.join('/home/tcparetro',
                                os.path.join('log_irc_retro'))
        for dn in [os.path.join(log_path, x) for x in
                   sorted(os.listdir(log_path))]:
            for fn in sorted(os.listdir(dn)):
                with open(os.path.join(log_path, os.path.join(dn, fn)),
                          'r') as f:
                    for line in f:
                        line = line.strip()
                        m = re.match(
                            r'^\[[0-9]{2}:[0-9]{2}:[0-9]{2}\] <[ @+]?(.*?)> (.*)',
                            line, re.I)
                        if m:
                            print_line(m.group(1), m.group(2))
                print_context_break()

        # Old #calcgames logs from elsewhere
        log_path = os.path.join('/home/tcparetro',
                                os.path.join('log_calcgames'))
        for fn in sorted(os.listdir(log_path)):
            with open(os.path.join(log_path, fn), 'r') as f:
                for line in f:
                    line = line.strip()
                    m = re.match(
                        r'^[0-9]{2}:[0-9]{2}:[0-9]{2} <[ @+]?(.*?)> (.*)',
                        line, re.I)
                    if m:
                        print_line(m.group(1), m.group(2))
            print_context_break()

        # More miscellaneous junk I threw in a separate huge file because it
        # was too scattered around my system
        with open('misc_irc_lines.txt', 'r') as f:
            for line in f:
                line = line.strip()
                # pregex[1] appears to flag saxjax-relayed personalities —
                # TODO confirm against personality_regexes' tuple layout.
                if pregex[1]:
                    m = re.match(
                        r'^\[?[0-9]{2}:[0-9]{2}(:[0-9]{2})?\]? <[ @+]?saxjax> (.*?): (.*)',
                        line, re.I)
                    if not m:
                        m = re.match(
                            r'^\[?[0-9]{2}:[0-9]{2}(:[0-9]{2})?\]? <[ @+]?(.*?)> (.*)',
                            line, re.I)
                    if m:
                        # group(1) is the optional seconds field; nick/msg
                        # are groups 2 and 3.
                        print_line(m.group(2), m.group(3))
        print_context_break()

        # Stuff from elsewhere or not in my logs that I wanted to add
        log_path = [os.path.join('manual_corpus', x) for x in
                    os.listdir('manual_corpus') if
                    x.endswith('.txt') and not x.startswith('.') and
                    not x.startswith('#')]
        for fn in log_path:
            with open(fn, 'r') as f:
                for line in f:
                    line = line.strip()
                    if line:
                        m = re.match(r'^<(.*?)> (.*)', line, re.I)
                        if m:
                            print_line(m.group(1), m.group(2))
                    else:
                        # Blank line inside a manual file = context break.
                        print_context_break()
            print_context_break()

        # irssi logs
        log_path = os.path.join(home, os.path.join('log_irc_irssi'))
        for dn in [os.path.join(log_path, x) for x in os.listdir(log_path)]:
            try:
                last_channel = None
                for fn in sorted(os.listdir(dn)):
                    fm = re.match(
                        '#(.*)_([0-9]{4})-([0-9]{2})-([0-9]{2})\.log', fn)
                    if fm:
                        channel, year, month, day = fm.groups()
                        if (channel in
                                ('calcgames', 'cemetech', 'flood', 'hp48',
                                 'inspired', 'nspire-lua', 'prizm', 'tcpa',
                                 'ti', 'caleb', 'wikiti', 'markov')):
                            if channel != last_channel:
                                print_context_break()
                                last_channel = channel
                            with open(os.path.join(log_path, dn, fn),
                                      'r') as f:
                                for line in f:
                                    line = line.strip()
                                    # Try relay formats first (saxjax, then
                                    # OmnomIRC), then a plain <nick> line.
                                    m = re.match(
                                        r'^[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} <[ @+]?saxjax> \(.\) \[?(.*?)[:\]] (.*)',
                                        line, re.I)
                                    if not m:
                                        m = re.match(
                                            r'^[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} <[ @+]?omnomirc.?> (?:\(.\))?<(.*?)> (.*)',
                                            line, re.I)
                                    if not m:
                                        m = re.match(
                                            r'^[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} <[ @+]?(.*?)> (.*)',
                                            line, re.I)
                                    if m:
                                        nick, msg = m.groups()
                                        # Special case to handle our silly
                                        # nikky/nikkybot nick-swapping stunt
                                        if datetime(
                                                year=int(year),
                                                month=int(month),
                                                day=int(day)) >= datetime(
                                                    2014, 3, 9):
                                            if nick.lower().startswith(
                                                    'nikkybot'):
                                                nick = 'nikky'
                                            elif nick.lower().startswith(
                                                    'nikky'):
                                                nick = 'nikkybot'
                                        print_line(nick, msg)
            except OSError as e:
                # errno 20 (ENOTDIR): stray non-directory entry; skip it.
                # NOTE(review): other OSErrors fall through silently —
                # confirm that is intentional best-effort behavior.
                if e.errno == 20:
                    continue
            print_context_break()

    # Parse current weechat logs
    stderr.write('Parsing current logs...\n')
    for fn in [os.path.join('log_irc_weechat',
                            'irc.efnet.#' + x + '.weechatlog') for x in
               ('calcgames', 'cemetech', 'tcpa', 'ti', 'omnimaga', 'flood',
                'caleb', 'caleb-spam', 'hp48', 'markov', 'nspired',
                'nspire-lua', 'prizm', 'wikiti', 'cemetech-mc', 'codewalrus',
                'gbadev', 'kinginfinity', 'thebutton', 'thebuttondev')]:
        with open(os.path.join(home, fn), 'r') as f:
            for line in f:
                line = line.strip()
                # Two timestamp layouts exist: ISO-ish and RFC-2822-ish.
                m1 = re.match(
                    r'^([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2})\t[+@]?(.*?)\t(.*)',
                    line)
                m2 = re.match(
                    r'^(..., [0-9]{2} ... [0-9]{4} [0-9]{2}:[0-9]{2}:[0-9]{2}) [-+][0-9]{4}\t[+@]?(.*?)\t(.*)',
                    line)
                if m1:
                    date, nick, msg = m1.groups()
                    date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
                elif m2:
                    date, nick, msg = m2.groups()
                    date = datetime.strptime(date, '%a, %d %b %Y %H:%M:%S')
                else:
                    continue

                # Okay, Weechat's default log format kind of stinks for parsing.
                # Skip join/part/mode/action pseudo-nicks.
                if not nick or nick in (' <== ', ' ==> ', ' --- ', ' * ',
                                        '[chanstat]'):
                    continue

                # Special case to handle our silly nikky/nikkybot nick-swapping
                # stunt
                if date < datetime(year=2014, month=5, day=2):
                    if nick.lower().startswith('nikkybot'):
                        nick = 'nikky'
                    elif nick.lower().startswith('nikky'):
                        nick = 'nikkybot'

                # Only emit lines in the (last_updated, now] window.
                if date < last_updated or date > target_date:
                    continue
                # Relay bots: unwrap the real nick/message from the payload.
                if (nick.lower().startswith('saxjax') or
                        nick.lower().startswith('cemetecmc')):
                    m = re.match(r'^(?:\(.\) )?\[(.*?)\] (.*)', msg, re.I)
                elif nick.lower().startswith('omnomirc'):
                    m = re.match(r'^(?:\(.\))?<(.*?)> (.*)', msg, re.I)
                elif (nick.lower().startswith('walriibot') or
                        nick.lower().startswith('wb') or
                        nick.lower().startswith('i|') or
                        nick.lower().startswith('l|') or
                        nick.lower().startswith('j|') or
                        nick.lower().startswith('yukitg')):
                    m = re.match(
                        r'^(?:\(.*?\))?(?:<(?:[ijl]\||yukitg)> )?<(.*?)> (.*)',
                        msg, re.I)
                else:
                    m = None
                if m:
                    nick, msg = m.group(1), m.group(2)
                # Non-relay lines are printed with the original nick/msg.
                print_line(nick, msg)
        print_context_break()

    mk.commit()
    conn.close()
    stderr.write('Finished!\n\n')
def update(pname, reset):
    """Scrape IRC logs and (re)train the Markov tables for *pname*.

    Self-contained variant of the trainer: finds its own input by parsing
    historical log sources (on *reset*) plus WeeChat logs newer than the
    ".last-updated" timestamp, feeds them through a TrainingCorpus, then
    writes Markov/context rows to the per-personality PostgreSQL tables.

    Args:
        pname: personality key into personality_regexes.
        reset: if true, parse all historical logs, archive the current
            tables to ".markov.old" / ".context.old", recreate them, and
            bulk-load everything; otherwise incrementally merge only new
            WeeChat lines.

    Raises:
        BadPersonalityError: if pname is not a known personality.

    Fix: the OmnomIRC relay check used startswith('omnomnirc') (extra 'n'),
    which can never match the relay nick spelled 'omnomirc' everywhere else
    in this file, leaving the pregex[2] unwrap branch dead.
    """
    NEVER_UPDATED = datetime(1970, 1, 1, 0, 0)
    home = os.environ['HOME']
    try:
        pregex = personality_regexes[pname]
    except KeyError:
        raise BadPersonalityError
    stdout.write('Starting {} Markov generation.\n'.format(pname))

    # Get last updated date
    conn = psycopg2.connect('dbname=markovmix user=markovmix')
    mk = markov.PostgresMarkov(conn, pname, case_sensitive=False)
    mk.begin()
    mk.doquery(
        'CREATE TABLE IF NOT EXISTS ".last-updated" '
        '(name VARCHAR PRIMARY KEY, updated TIMESTAMP NOT NULL DEFAULT NOW())')
    mk.doquery('SELECT updated FROM ".last-updated" WHERE name=%s', (pname, ))
    target_date = datetime.now()
    if not reset and mk.cursor.rowcount:
        last_updated = mk.cursor.fetchone()[0]
    else:
        last_updated = NEVER_UPDATED

    # Updated last updated date (will only be written to DB if entire process
    # finishes to the commit call at the end of the script)
    mk.doquery('UPDATE ".last-updated" SET updated = NOW() WHERE name=%s',
               (pname, ))
    if not mk.cursor.rowcount:
        # No existing row was updated: first run for this personality.
        mk.doquery('INSERT INTO ".last-updated" VALUES (%s)', (pname, ))

    corpus = TrainingCorpus(pregex, mk)

    if reset:
        ## Never updated yet ##
        stdout.write('Parsing old logs...\n')

        # Parse old logs this first time only

        # Old Konversation logs
        for fn in [os.path.join('log_irc_konversation', x) for x in
                   ('calcgames.log', 'cemetech.log', 'tcpa.log', 'ti.log',
                    'efnet_#tiasm.log', 'omnimaga.log')]:
            with open(os.path.join(home, fn), 'r') as f:
                for line in f:
                    line = line.strip()
                    # Plain <nick> line first; for saxjax-relayed
                    # personalities also try the relay payload form.
                    m = re.match(r'^\[.*\] \[.*\] <(.*?)>\t(.*)', line, re.I)
                    if not m and pregex[1]:
                        m = re.match(
                            r'^\[.*\] \[.*\] <saxjax>\t\(.\) \[?(.*?)[:\]] (.*)',
                            line, re.I)
                    if m:
                        corpus.check_line(m.group(1), m.group(2))
            # Each log file is an independent conversation context.
            corpus.new_context()

        # Old #tcpa logs from elsewhere
        log_path = os.path.join('/home/tcparetro',
                                os.path.join('log_irc_retro'))
        for dn in [os.path.join(log_path, x) for x in
                   sorted(os.listdir(log_path))]:
            for fn in sorted(os.listdir(dn)):
                with open(os.path.join(log_path, os.path.join(dn, fn)),
                          'r') as f:
                    for line in f:
                        line = line.strip()
                        m = re.match(
                            r'^\[[0-9]{2}:[0-9]{2}:[0-9]{2}\] <[ @+]?(.*?)> (.*)',
                            line, re.I)
                        if m:
                            corpus.check_line(m.group(1), m.group(2))
                corpus.new_context()

        # Old #calcgames logs from elsewhere
        log_path = os.path.join('/home/tcparetro',
                                os.path.join('log_calcgames'))
        for fn in sorted(os.listdir(log_path)):
            with open(os.path.join(log_path, fn), 'r') as f:
                for line in f:
                    line = line.strip()
                    m = re.match(
                        r'^[0-9]{2}:[0-9]{2}:[0-9]{2} <[ @+]?(.*?)> (.*)',
                        line, re.I)
                    if m:
                        corpus.check_line(m.group(1), m.group(2))
            corpus.new_context()

        # More miscellaneous junk I threw in a separate huge file because it
        # was too scattered around my system
        with open('misc_irc_lines.txt', 'r') as f:
            for line in f:
                line = line.strip()
                m = re.match(
                    r'^\[?[0-9]{2}:[0-9]{2}(:[0-9]{2})?\]? <[ @+]?(.*?)> (.*)',
                    line, re.I)
                if m:
                    # group(1) is the optional seconds field; nick/msg are
                    # groups 2 and 3.
                    corpus.check_line(m.group(2), m.group(3))
        corpus.new_context()

        # Stuff from elsewhere or not in my logs that I wanted to add
        log_path = [os.path.join('manual_corpus', x) for x in
                    os.listdir('manual_corpus') if
                    x.endswith('.txt') and not x.startswith('.') and
                    not x.startswith('#')]
        for fn in log_path:
            with open(fn, 'r') as f:
                for line in f:
                    line = line.strip()
                    if line:
                        m = re.match(r'^<(.*?)> (.*)', line, re.I)
                        if m:
                            corpus.check_line(m.group(1), m.group(2))
                    else:
                        # Blank line inside a manual file = context break.
                        corpus.new_context()
            corpus.new_context()

        # irssi logs
        log_path = os.path.join(home, os.path.join('log_irc_irssi'))
        for dn in [os.path.join(log_path, x) for x in os.listdir(log_path)]:
            try:
                last_channel = None
                for fn in sorted(os.listdir(dn)):
                    m = re.match(
                        '#(.*)_([0-9]{4})-([0-9]{2})-([0-9]{2})\.log', fn)
                    if m:
                        channel, year, month, day = m.groups()
                        if (channel in
                                ('calcgames', 'cemetech', 'flood', 'hp48',
                                 'inspired', 'nspire-lua', 'prizm', 'tcpa',
                                 'ti', 'caleb', 'wikiti', 'markov')):
                            if channel != last_channel:
                                corpus.new_context()
                                last_channel = channel
                            with open(os.path.join(log_path, dn, fn),
                                      'r') as f:
                                for line in f:
                                    line = line.strip()
                                    m = re.match(
                                        r'^[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} <[ @+]?(.*?)> (.*)',
                                        line, re.I)
                                    if m:
                                        nick, msg = m.groups()
                                        # Special case to handle our silly
                                        # nikky/nikkybot nick-swapping stunt
                                        if datetime(
                                                year=int(year),
                                                month=int(month),
                                                day=int(day)) >= datetime(
                                                    2014, 3, 9):
                                            if nick.lower().startswith(
                                                    'nikkybot'):
                                                nick = 'nikky'
                                            elif nick.lower().startswith(
                                                    'nikky'):
                                                nick = 'nikkybot'
                                        corpus.check_line(nick, msg)
                                    # Also try the relay-bot payload forms
                                    # appropriate to this personality.
                                    if pregex[1]:
                                        m = re.match(
                                            r'^[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} <[ @+]?saxjax> \(.\) \[?(.*?)[:\]] (.*)',
                                            line, re.I)
                                        if m:
                                            corpus.check_line(
                                                m.group(1), m.group(2))
                                    elif pregex[2]:
                                        m = re.match(
                                            r'^[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} <[ @+]?omnomirc.?> (?:\(.\))?<(.*?)> (.*)',
                                            line, re.I)
                                        if m:
                                            corpus.check_line(
                                                m.group(1), m.group(2))
            except OSError as e:
                # errno 20 (ENOTDIR): stray non-directory entry; skip it.
                if e.errno == 20:
                    continue
            corpus.new_context()

    # Parse current weechat logs
    stdout.write('Parsing current logs...\n')
    for fn in [os.path.join('log_irc_weechat',
                            'irc.efnet.#' + x + '.weechatlog') for x in
               ('calcgames', 'cemetech', 'tcpa', 'ti', 'omnimaga', 'flood',
                'caleb', 'caleb-spam', 'hp48', 'markov', 'nspired',
                'nspire-lua', 'prizm', 'wikiti', 'cemetech-mc', 'codewalrus',
                'gbadev', 'kinginfinity')]:
        with open(os.path.join(home, fn), 'r') as f:
            for line in f:
                line = line.strip()
                # Two timestamp layouts exist: ISO-ish and RFC-2822-ish.
                m1 = re.match(
                    r'^([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2})\t[+@]?(.*?)\t(.*)',
                    line)
                m2 = re.match(
                    r'^(..., [0-9]{2} ... [0-9]{4} [0-9]{2}:[0-9]{2}:[0-9]{2}) [-+][0-9]{4}\t[+@]?(.*?)\t(.*)',
                    line)
                if m1:
                    date, nick, msg = m1.groups()
                    date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
                elif m2:
                    date, nick, msg = m2.groups()
                    date = datetime.strptime(date, '%a, %d %b %Y %H:%M:%S')
                else:
                    continue

                # Special case to handle our silly nikky/nikkybot nick-swapping
                # stunt
                if date < datetime(year=2014, month=5, day=2):
                    if nick.lower().startswith('nikkybot'):
                        nick = 'nikky'
                    elif nick.lower().startswith('nikky'):
                        nick = 'nikkybot'

                # Only train on lines in the (last_updated, now] window.
                if date < last_updated or date > target_date:
                    continue
                # Relay bots: unwrap the real nick/message from the payload.
                if pregex[1] and (nick.lower().startswith('saxjax') or
                                  nick.lower().startswith('cemetecmc')):
                    m = re.match(r'^\(.\) \[?(.*?)[:\]] (.*)', msg, re.I)
                    if not m:
                        m = re.match(r'^(?:\(.\) )?(?:[[*](.*?)[]]?) (.*)',
                                     msg, re.I)
                # BUGFIX: was startswith('omnomnirc') (extra 'n'), which never
                # matched the 'omnomirc' relay nick used everywhere else here.
                elif pregex[2] and nick.lower().startswith('omnomirc'):
                    m = re.match(r'^(?:\(.\))?<(.*?)> (.*)', msg, re.I)
                elif pregex[3] and (nick.lower().startswith('walriibot') or
                                    nick.lower().startswith('wb') or
                                    nick.lower().startswith('i|') or
                                    nick.lower().startswith('l|') or
                                    nick.lower().startswith('yukitg')):
                    m = re.match(r'^(?:\(.*?\))?<(.*?)> (.*)', msg, re.I)
                else:
                    m = None
                if m:
                    nick, msg = m.group(1), m.group(2)
                # Non-relay lines are trained with the original nick/msg.
                corpus.check_line(nick, msg)
        corpus.new_context()

    if reset:
        mk.clear()

    # Write Markov data
    if reset:
        stdout.write('Reinitializing tables...\n')
        # Keep one generation of backups: drop the previous ".old" copies,
        # rename the live tables into their place, then recreate fresh ones.
        mk.doquery('DROP TABLE IF EXISTS ".markov.old"')
        mk.doquery('DROP TABLE IF EXISTS ".context.old"')
        mk.doquery('ALTER TABLE "{}" RENAME TO ".markov.old"'.format(
            mk.table_name))
        mk.doquery('ALTER TABLE "{}" RENAME TO ".context.old"'.format(
            mk.context_table_name))
        mk.create_tables()
        for progress, rows in corpus.markov_rows():
            mk.add_markov_rows(rows)
            stdout.write('Inserting Markov data {}/{}...\r'.format(
                progress[0], progress[1]))
            stdout.flush()
        stdout.write('\n')

    # Write context data
    # On reset, bulk-insert batches directly; otherwise merge single rows
    # through add_context.
    for progress, rows in corpus.context_rows(PROGRESS_EVERY if reset else 1):
        if reset:
            mk.cursor.executemany(
                'INSERT INTO "{}" (inword, outword, freq) VALUES'
                ' (%s, %s, %s)'.format(mk.context_table_name), rows)
        else:
            inword, outword, freq = rows[0]
            mk.add_context(inword, outword, freq)
        stdout.write('Inserting context data {}/{}...\r'.format(
            progress[0], progress[1]))
        stdout.flush()
    stdout.write('\n')

    if reset:
        stdout.write('Indexing tables...\n')
        mk.index_tables()
    stdout.write('Closing...\n')
    mk.commit()
    conn.close()
    stdout.write('Finished!\n\n')