Example #1
def set_personality(self, personality):
    """Change Markov personality to given table name in Markov database"""
    personality = self.normalize_personality(personality)
    if self.is_personality_valid(personality):
        self.markov = markov.PostgresMarkov(self.dbconn, personality,
                                            case_sensitive=False)
        self.personality = personality
        self._set_personality_config(personality)
    else:
        raise Bad_personality_error
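A minimal usage sketch, assuming a bot instance whose class defines this method together with normalize_personality(), is_personality_valid(), and an open dbconn; the bot variable and personality name below are illustrative:

# 'bot' is a hypothetical instance of the class that defines set_personality()
try:
    bot.set_personality('nikky')
except Bad_personality_error:
    print('Unknown personality; keeping the current Markov table')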
Example #2
def update(pname, reset, infile, update_datestamp):
    try:
        pregex = personality_regexes[pname]
    except KeyError:
        raise BadPersonalityError

    stdout.write('Starting {} Markov generation.\n'.format(pname))

    conn = psycopg2.connect('dbname=markovmix user=markovmix')
    mk = markov.PostgresMarkov(conn, pname, case_sensitive=False)
    corpus = TrainingCorpus(pregex, mk)

    mk.begin()

    for line in infile:
        line = line.strip()
        if line:
            m = re.match(r'^<(.*?)> ?(.*)', line, re.I)
            if m:
                corpus.check_line(m.group(1), m.group(2))
        else:
            corpus.new_context()

    if reset:
        # Snapshot the current row count, then reinitialize the tables

        mk.doquery('SELECT COUNT(*) FROM "{}"'.format(mk.table_name))
        old_row_count = mk.cursor.fetchone()[0]
        stdout.write('Current Markov row count: %d\n' % old_row_count)

        mk.clear()

        stdout.write('Reinitializing tables...\n')
        mk.doquery('DROP TABLE IF EXISTS ".markov.old"')
        mk.doquery('DROP TABLE IF EXISTS ".context.old"')
        mk.doquery('ALTER TABLE "{}" RENAME TO ".markov.old"'.format(
            mk.table_name))
        mk.doquery('ALTER TABLE "{}" RENAME TO ".context.old"'.format(
            mk.context_table_name))
        mk.create_tables()

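    # Write Markov data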
    for progress, rows in corpus.markov_rows():
        mk.add_markov_rows(rows)
        stdout.write('Inserting Markov data {}/{}...\r'.format(
            progress[0], progress[1]))
        stdout.flush()
    stdout.write('\n')

    # Write context data
    for progress, rows in corpus.context_rows(PROGRESS_EVERY if reset else 1):
        if reset:
            mk.cursor.executemany(
                'INSERT INTO "{}" (inword, outword, freq) VALUES'
                ' (%s, %s, %s)'.format(mk.context_table_name), rows)
        else:
            inword, outword, freq = rows[0]
            mk.add_context(inword, outword, freq)
        stdout.write('Inserting context data {}/{}...\r'.format(
            progress[0], progress[1]))
        stdout.flush()

    stdout.write('\n')
    if reset:
        stdout.write('Indexing tables...\n')
        mk.index_tables()
        mk.doquery('SELECT COUNT(*) FROM "{}"'.format(mk.table_name))
        new_row_count = mk.cursor.fetchone()[0]
        row_count_increase = new_row_count - old_row_count
        stdout.write('New Markov row count: %d\n' % new_row_count)
        if old_row_count:
            # Use float math so the percentage is not truncated by integer
            # division under Python 2
            stdout.write('Row count change: %+d (%d%%)\n' %
                         (row_count_increase,
                          round(row_count_increase * 100.0 / old_row_count)))

    # Update the last-updated date if enabled (it will only be written to the
    # DB if the entire process reaches the commit call at the end of the
    # function)
    if update_datestamp:
        mk.doquery('UPDATE ".last-updated" SET updated = NOW() WHERE name=%s',
                   (pname, ))
        if not mk.cursor.rowcount:
            mk.doquery('INSERT INTO ".last-updated" VALUES (%s)', (pname, ))
    else:
        stdout.write('Skipping datestamp update.\n')

    stdout.write('Closing...\n')
    mk.commit()
    conn.close()
    stdout.write('Finished!\n\n')
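A hedged driver sketch for this variant; the file name and flag values are made up, and the function expects an open text file of '<nick> message' lines with blank lines marking context breaks (as parsed above):

if __name__ == '__main__':
    # Hypothetical invocation: rebuild the 'nikky' tables from a pre-extracted
    # corpus file and stamp the update date on success
    with open('nikky_corpus.txt', 'r') as infile:
        update('nikky', reset=True, infile=infile, update_datestamp=True)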
Example #3
def output_corpus(pname, reset):
    NEVER_UPDATED = datetime(1970, 1, 1, 0, 0)
    home = os.environ['HOME']
    try:
        pregex = personality_regexes[pname]
    except KeyError:
        raise BadPersonalityError

    stderr.write('Starting {} corpus search.\n'.format(pname))

    # Get last updated date
    conn = psycopg2.connect('dbname=markovmix user=markovmix')
    mk = markov.PostgresMarkov(conn,
                               pname,
                               case_sensitive=False,
                               create_tables=False)
    mk.begin()
    mk.doquery(
        'CREATE TABLE IF NOT EXISTS ".last-updated" '
        '(name VARCHAR PRIMARY KEY, updated TIMESTAMP NOT NULL DEFAULT NOW())')
    mk.doquery('SELECT updated FROM ".last-updated" WHERE name=%s', (pname, ))
    target_date = datetime.now()
    if not reset and mk.cursor.rowcount:
        last_updated = mk.cursor.fetchone()[0]
    else:
        last_updated = NEVER_UPDATED

    if reset:

        ## Never updated yet ##
        stderr.write('Parsing old logs...\n')

        # Parse old logs this first time only

        # Old Konversation logs
        for fn in [
                os.path.join('log_irc_konversation', x)
                for x in ('calcgames.log', 'cemetech.log', 'tcpa.log',
                          'ti.log', 'efnet_#tiasm.log', 'omnimaga.log')
        ]:
            with open(os.path.join(home, fn), 'r') as f:
                for line in f:
                    line = line.strip()
                    m = re.match(
                        r'^\[.*\] \[.*\] <saxjax>\t\(.\) \[?(.*?)[:\]] (.*)',
                        line, re.I)
                    if not m:
                        m = re.match(r'^\[.*\] \[.*\] <(.*?)>\t(.*)', line,
                                     re.I)
                    if m:
                        print_line(m.group(1), m.group(2))
            print_context_break()

        # Old #tcpa logs from elsewhere
        log_path = os.path.join('/home/tcparetro', 'log_irc_retro')
        for dn in [
                os.path.join(log_path, x) for x in sorted(os.listdir(log_path))
        ]:
            for fn in sorted(os.listdir(dn)):
                with open(os.path.join(log_path, dn, fn), 'r') as f:
                    for line in f:
                        line = line.strip()
                        m = re.match(
                            r'^\[[0-9]{2}:[0-9]{2}:[0-9]{2}\] <[ @+]?(.*?)> (.*)',
                            line, re.I)
                        if m:
                            print_line(m.group(1), m.group(2))
        print_context_break()

        # Old #calcgames logs from elsewhere
        log_path = os.path.join('/home/tcparetro', 'log_calcgames')
        for fn in sorted(os.listdir(log_path)):
            with open(os.path.join(log_path, fn), 'r') as f:
                for line in f:
                    line = line.strip()
                    m = re.match(
                        r'^[0-9]{2}:[0-9]{2}:[0-9]{2} <[ @+]?(.*?)> (.*)',
                        line, re.I)
                    if m:
                        print_line(m.group(1), m.group(2))
        print_context_break()

        # More miscellaneous junk I threw in a separate huge file because it
        # was too scattered around my system
        with open('misc_irc_lines.txt', 'r') as f:
            for line in f:
                line = line.strip()
                m = None
                if pregex[1]:
                    m = re.match(
                        r'^\[?[0-9]{2}:[0-9]{2}(:[0-9]{2})?\]? <[ @+]?saxjax> (.*?): (.*)',
                        line, re.I)
                if not m:
                    m = re.match(
                        r'^\[?[0-9]{2}:[0-9]{2}(:[0-9]{2})?\]? <[ @+]?(.*?)> (.*)',
                        line, re.I)
                if m:
                    print_line(m.group(2), m.group(3))
        print_context_break()

        # Stuff from elsewhere or not in my logs that I wanted to add
        log_path = [
            os.path.join('manual_corpus', x)
            for x in os.listdir('manual_corpus') if x.endswith('.txt')
            and not x.startswith('.') and not x.startswith('#')
        ]
        for fn in log_path:
            with open(fn, 'r') as f:
                for line in f:
                    line = line.strip()
                    if line:
                        m = re.match(r'^<(.*?)> (.*)', line, re.I)
                        if m:
                            print_line(m.group(1), m.group(2))
                    else:
                        print_context_break()
            print_context_break()

        # irssi logs
        log_path = os.path.join(home, 'log_irc_irssi')
        for dn in [os.path.join(log_path, x) for x in os.listdir(log_path)]:
            try:
                last_channel = None
                for fn in sorted(os.listdir(dn)):
                    fm = re.match(
                        r'#(.*)_([0-9]{4})-([0-9]{2})-([0-9]{2})\.log', fn)
                    if fm:
                        channel, year, month, day = fm.groups()
                        if (channel
                                in ('calcgames', 'cemetech', 'flood', 'hp48',
                                    'inspired', 'nspire-lua', 'prizm', 'tcpa',
                                    'ti', 'caleb', 'wikiti', 'markov')):
                            if channel != last_channel:
                                print_context_break()
                                last_channel = channel
                            with open(os.path.join(log_path, dn, fn),
                                      'r') as f:
                                for line in f:
                                    line = line.strip()
                                    m = re.match(
                                        r'^[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} <[ @+]?saxjax> \(.\) \[?(.*?)[:\]] (.*)',
                                        line, re.I)
                                    if not m:
                                        m = re.match(
                                            r'^[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} <[ @+]?omnomirc.?> (?:\(.\))?<(.*?)> (.*)',
                                            line, re.I)
                                    if not m:
                                        m = re.match(
                                            r'^[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} <[ @+]?(.*?)> (.*)',
                                            line, re.I)
                                    if m:
                                        nick, msg = m.groups()

                                        # Special case to handle our silly
                                        # nikky/nikkybot nick-swapping stunt
                                        if datetime(year=int(year),
                                                    month=int(month),
                                                    day=int(day)) >= datetime(
                                                        2014, 3, 9):
                                            if nick.lower().startswith(
                                                    'nikkybot'):
                                                nick = 'nikky'
                                            elif nick.lower().startswith(
                                                    'nikky'):
                                                nick = 'nikkybot'

                                        print_line(nick, msg)

            except OSError as e:
                # errno 20 == ENOTDIR: dn is a plain file rather than a log
                # directory, so just skip it
                if e.errno == 20:
                    continue
        print_context_break()

    # Parse current weechat logs
    stderr.write('Parsing current logs...\n')
    for fn in [
            os.path.join('log_irc_weechat', 'irc.efnet.#' + x + '.weechatlog')
            for x in ('calcgames', 'cemetech', 'tcpa', 'ti', 'omnimaga',
                      'flood', 'caleb', 'caleb-spam', 'hp48', 'markov',
                      'nspired', 'nspire-lua', 'prizm', 'wikiti',
                      'cemetech-mc', 'codewalrus', 'gbadev', 'kinginfinity',
                      'thebutton', 'thebuttondev')
    ]:
        with open(os.path.join(home, fn), 'r') as f:
            for line in f:
                line = line.strip()

                m1 = re.match(
                    r'^([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2})\t[+@]?(.*?)\t(.*)',
                    line)
                m2 = re.match(
                    r'^(..., [0-9]{2} ... [0-9]{4} [0-9]{2}:[0-9]{2}:[0-9]{2}) [-+][0-9]{4}\t[+@]?(.*?)\t(.*)',
                    line)
                if m1:
                    date, nick, msg = m1.groups()
                    date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
                elif m2:
                    date, nick, msg = m2.groups()
                    date = datetime.strptime(date, '%a, %d %b %Y %H:%M:%S')
                else:
                    continue
                # Okay, Weechat's default log format kind of stinks for parsing.
                if not nick or nick in ('  <==  ', '  ==>  ', '  ---  ', ' * ',
                                        '[chanstat]'):
                    continue

                # Special case to handle our silly nikky/nikkybot nick-swapping
                #   stunt
                if date < datetime(year=2014, month=5, day=2):
                    if nick.lower().startswith('nikkybot'):
                        nick = 'nikky'
                    elif nick.lower().startswith('nikky'):
                        nick = 'nikkybot'

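                # Skip lines outside the last_updated..target_date window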
                if date < last_updated or date > target_date:
                    continue
                if (nick.lower().startswith('saxjax')
                        or nick.lower().startswith('cemetecmc')):
                    m = re.match(r'^(?:\(.\) )?\[(.*?)\] (.*)', msg, re.I)
                elif nick.lower().startswith('omnomirc'):
                    m = re.match(r'^(?:\(.\))?<(.*?)> (.*)', msg, re.I)
                elif (nick.lower().startswith('walriibot')
                      or nick.lower().startswith('wb')
                      or nick.lower().startswith('i|')
                      or nick.lower().startswith('l|')
                      or nick.lower().startswith('j|')
                      or nick.lower().startswith('yukitg')):
                    m = re.match(
                        r'^(?:\(.*?\))?(?:<(?:[ijl]\||yukitg)> )?<(.*?)> (.*)',
                        msg, re.I)
                else:
                    m = None
                if m:
                    nick, msg = m.group(1), m.group(2)
                print_line(nick, msg)
        print_context_break()

    mk.commit()
    conn.close()
    stderr.write('Finished!\n\n')
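A hedged driver sketch; the script name and personality are illustrative, and since status messages go to stderr, the corpus lines emitted through print_line()/print_context_break() presumably go to stdout and can be redirected to a file:

import sys

if __name__ == '__main__':
    # Hypothetical usage:  python output_corpus.py > nikky_corpus.txt
    try:
        output_corpus('nikky', reset=True)
    except BadPersonalityError:
        sys.exit('Unknown personality name')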
Example #4
def update(pname, reset):
    NEVER_UPDATED = datetime(1970, 1, 1, 0, 0)
    home = os.environ['HOME']
    try:
        pregex = personality_regexes[pname]
    except KeyError:
        raise BadPersonalityError

    stdout.write('Starting {} Markov generation.\n'.format(pname))

    # Get last updated date
    conn = psycopg2.connect('dbname=markovmix user=markovmix')
    mk = markov.PostgresMarkov(conn, pname, case_sensitive=False)
    mk.begin()
    mk.doquery(
        'CREATE TABLE IF NOT EXISTS ".last-updated" '
        '(name VARCHAR PRIMARY KEY, updated TIMESTAMP NOT NULL DEFAULT NOW())')
    mk.doquery('SELECT updated FROM ".last-updated" WHERE name=%s', (pname, ))
    target_date = datetime.now()
    if not reset and mk.cursor.rowcount:
        last_updated = mk.cursor.fetchone()[0]
    else:
        last_updated = NEVER_UPDATED
    # Update the last-updated date (it will only be written to the DB if the
    # entire process reaches the commit call at the end of the script)
    mk.doquery('UPDATE ".last-updated" SET updated = NOW() WHERE name=%s',
               (pname, ))
    if not mk.cursor.rowcount:
        mk.doquery('INSERT INTO ".last-updated" VALUES (%s)', (pname, ))

    corpus = TrainingCorpus(pregex, mk)

    if reset:

        ## Never updated yet ##
        stdout.write('Parsing old logs...\n')

        # Parse old logs this first time only

        # Old Konversation logs
        for fn in [
                os.path.join('log_irc_konversation', x)
                for x in ('calcgames.log', 'cemetech.log', 'tcpa.log',
                          'ti.log', 'efnet_#tiasm.log', 'omnimaga.log')
        ]:
            with open(os.path.join(home, fn), 'r') as f:
                for line in f:
                    line = line.strip()
                    m = re.match(r'^\[.*\] \[.*\] <(.*?)>\t(.*)', line, re.I)
                    if not m and pregex[1]:
                        m = re.match(
                            r'^\[.*\] \[.*\] <saxjax>\t\(.\) \[?(.*?)[:\]] (.*)',
                            line, re.I)
                    if m:
                        corpus.check_line(m.group(1), m.group(2))
            corpus.new_context()

        # Old #tcpa logs from elsewhere
        log_path = os.path.join('/home/tcparetro', 'log_irc_retro')
        for dn in [
                os.path.join(log_path, x) for x in sorted(os.listdir(log_path))
        ]:
            for fn in sorted(os.listdir(dn)):
                with open(os.path.join(log_path, dn, fn), 'r') as f:
                    for line in f:
                        line = line.strip()
                        m = re.match(
                            r'^\[[0-9]{2}:[0-9]{2}:[0-9]{2}\] <[ @+]?(.*?)> (.*)',
                            line, re.I)
                        if m:
                            corpus.check_line(m.group(1), m.group(2))
        corpus.new_context()

        # Old #calcgames logs from elsewhere
        log_path = os.path.join('/home/tcparetro', 'log_calcgames')
        for fn in sorted(os.listdir(log_path)):
            with open(os.path.join(log_path, fn), 'r') as f:
                for line in f:
                    line = line.strip()
                    m = re.match(
                        r'^[0-9]{2}:[0-9]{2}:[0-9]{2} <[ @+]?(.*?)> (.*)',
                        line, re.I)
                    if m:
                        corpus.check_line(m.group(1), m.group(2))
        corpus.new_context()

        # More miscellaneous junk I threw in a separate huge file because it
        # was too scattered around my system
        with open('misc_irc_lines.txt', 'r') as f:
            for line in f:
                line = line.strip()
                m = re.match(
                    r'^\[?[0-9]{2}:[0-9]{2}(:[0-9]{2})?\]? <[ @+]?(.*?)> (.*)',
                    line, re.I)
                if m:
                    corpus.check_line(m.group(2), m.group(3))
        corpus.new_context()

        # Stuff from elsewhere or not in my logs that I wanted to add
        log_path = [
            os.path.join('manual_corpus', x)
            for x in os.listdir('manual_corpus') if x.endswith('.txt')
            and not x.startswith('.') and not x.startswith('#')
        ]
        for fn in log_path:
            with open(fn, 'r') as f:
                for line in f:
                    line = line.strip()
                    if line:
                        m = re.match(r'^<(.*?)> (.*)', line, re.I)
                        if m:
                            corpus.check_line(m.group(1), m.group(2))
                    else:
                        corpus.new_context()
            corpus.new_context()

        # irssi logs
        log_path = os.path.join(home, 'log_irc_irssi')
        for dn in [os.path.join(log_path, x) for x in os.listdir(log_path)]:
            try:
                last_channel = None
                for fn in sorted(os.listdir(dn)):
                    m = re.match(
                        r'#(.*)_([0-9]{4})-([0-9]{2})-([0-9]{2})\.log', fn)
                    if m:
                        channel, year, month, day = m.groups()
                        if (channel
                                in ('calcgames', 'cemetech', 'flood', 'hp48',
                                    'inspired', 'nspire-lua', 'prizm', 'tcpa',
                                    'ti', 'caleb', 'wikiti', 'markov')):
                            if channel != last_channel:
                                corpus.new_context()
                                last_channel = channel
                            with open(os.path.join(log_path, dn, fn),
                                      'r') as f:
                                for line in f:
                                    line = line.strip()
                                    m = re.match(
                                        r'^[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} <[ @+]?(.*?)> (.*)',
                                        line, re.I)
                                    if m:
                                        nick, msg = m.groups()

                                        # Special case to handle our silly
                                        # nikky/nikkybot nick-swapping stunt
                                        if datetime(year=int(year),
                                                    month=int(month),
                                                    day=int(day)) >= datetime(
                                                        2014, 3, 9):
                                            if nick.lower().startswith(
                                                    'nikkybot'):
                                                nick = 'nikky'
                                            elif nick.lower().startswith(
                                                    'nikky'):
                                                nick = 'nikkybot'

                                        corpus.check_line(nick, msg)

                                    if pregex[1]:
                                        m = re.match(
                                            r'^[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} <[ @+]?saxjax> \(.\) \[?(.*?)[:\]] (.*)',
                                            line, re.I)
                                        if m:
                                            corpus.check_line(
                                                m.group(1), m.group(2))
                                        elif pregex[2]:
                                            m = re.match(
                                                r'^[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} <[ @+]?omnomirc.?> (?:\(.\))?<(.*?)> (.*)',
                                                line, re.I)
                                            if m:
                                                corpus.check_line(
                                                    m.group(1), m.group(2))

            except OSError as e:
                # errno 20 == ENOTDIR: dn is a plain file rather than a log
                # directory, so just skip it
                if e.errno == 20:
                    continue
        corpus.new_context()

    # Parse current weechat logs
    stdout.write('Parsing current logs...\n')
    for fn in [
            os.path.join('log_irc_weechat', 'irc.efnet.#' + x + '.weechatlog')
            for x in ('calcgames', 'cemetech', 'tcpa', 'ti', 'omnimaga',
                      'flood', 'caleb', 'caleb-spam', 'hp48', 'markov',
                      'nspired', 'nspire-lua', 'prizm', 'wikiti',
                      'cemetech-mc', 'codewalrus', 'gbadev', 'kinginfinity')
    ]:
        with open(os.path.join(home, fn), 'r') as f:
            for line in f:
                line = line.strip()

                m1 = re.match(
                    r'^([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2})\t[+@]?(.*?)\t(.*)',
                    line)
                m2 = re.match(
                    r'^(..., [0-9]{2} ... [0-9]{4} [0-9]{2}:[0-9]{2}:[0-9]{2}) [-+][0-9]{4}\t[+@]?(.*?)\t(.*)',
                    line)
                if m1:
                    date, nick, msg = m1.groups()
                    date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
                elif m2:
                    date, nick, msg = m2.groups()
                    date = datetime.strptime(date, '%a, %d %b %Y %H:%M:%S')
                else:
                    continue

                # Special case to handle our silly nikky/nikkybot nick-swapping
                #   stunt
                if date < datetime(year=2014, month=5, day=2):
                    if nick.lower().startswith('nikkybot'):
                        nick = 'nikky'
                    elif nick.lower().startswith('nikky'):
                        nick = 'nikkybot'

                if date < last_updated or date > target_date:
                    continue
                if pregex[1] and (nick.lower().startswith('saxjax')
                                  or nick.lower().startswith('cemetecmc')):
                    m = re.match(r'^\(.\) \[?(.*?)[:\]] (.*)', msg, re.I)
                    if not m:
                        m = re.match(r'^(?:\(.\) )?(?:[[*](.*?)[]]?) (.*)',
                                     msg, re.I)
                elif pregex[2] and nick.lower().startswith('omnomirc'):
                    m = re.match(r'^(?:\(.\))?<(.*?)> (.*)', msg, re.I)
                elif pregex[3] and (nick.lower().startswith('walriibot')
                                    or nick.lower().startswith('wb')
                                    or nick.lower().startswith('i|')
                                    or nick.lower().startswith('l|')
                                    or nick.lower().startswith('yukitg')):
                    m = re.match(r'^(?:\(.*?\))?<(.*?)> (.*)', msg, re.I)
                else:
                    m = None
                if m:
                    nick, msg = m.group(1), m.group(2)
                corpus.check_line(nick, msg)
        corpus.new_context()

    if reset:
        mk.clear()

    # Write Markov data
    if reset:
        stdout.write('Reinitializing tables...\n')
        mk.doquery('DROP TABLE IF EXISTS ".markov.old"')
        mk.doquery('DROP TABLE IF EXISTS ".context.old"')
        mk.doquery('ALTER TABLE "{}" RENAME TO ".markov.old"'.format(
            mk.table_name))
        mk.doquery('ALTER TABLE "{}" RENAME TO ".context.old"'.format(
            mk.context_table_name))
        mk.create_tables()

    for progress, rows in corpus.markov_rows():
        mk.add_markov_rows(rows)
        stdout.write('Inserting Markov data {}/{}...\r'.format(
            progress[0], progress[1]))
        stdout.flush()
    stdout.write('\n')

    # Write context data
    for progress, rows in corpus.context_rows(PROGRESS_EVERY if reset else 1):
        if reset:
            mk.cursor.executemany(
                'INSERT INTO "{}" (inword, outword, freq) VALUES'
                ' (%s, %s, %s)'.format(mk.context_table_name), rows)
        else:
            inword, outword, freq = rows[0]
            mk.add_context(inword, outword, freq)
        stdout.write('Inserting context data {}/{}...\r'.format(
            progress[0], progress[1]))
        stdout.flush()

    stdout.write('\n')
    if reset:
        stdout.write('Indexing tables...\n')
        mk.index_tables()

    stdout.write('Closing...\n')
    mk.commit()
    conn.close()
    stdout.write('Finished!\n\n')
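A hedged command-line driver sketch; the argument names are made up, while personality_regexes and BadPersonalityError come from the surrounding module:

if __name__ == '__main__':
    # Hypothetical CLI: incrementally retrain one personality, or rebuild its
    # tables from scratch when --reset is given
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('pname',
                        help='personality name (key of personality_regexes)')
    parser.add_argument('--reset', action='store_true',
                        help='rebuild the Markov tables from all logs')
    args = parser.parse_args()
    try:
        update(args.pname, args.reset)
    except BadPersonalityError:
        parser.error('unknown personality: {}'.format(args.pname))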