def create_table(self):
    """Create the frequency, sentence and link tables plus their indices.

    Every statement uses ``IF NOT EXISTS``, so calling this on an already
    initialised database is harmless. Table names are interpolated into
    the SQL text (identifiers cannot be bound as parameters), so they
    must already be validated by the caller.

    The frequency table gets one TEXT column per entry of
    ``self.fieldnames`` (the first ``self.fields`` of them) and a UNIQUE
    constraint spanning all of those columns.
    """
    columns = [self.fieldnames[i] for i in range(self.fields)]

    # create freq table
    sql = (u'CREATE TABLE IF NOT EXISTS %s ( '
           u'wid INTEGER PRIMARY KEY, freq INTEGER' % self.freq_table)
    sql += u''.join(u', %s TEXT' % name for name in columns)
    if columns:
        # An empty "UNIQUE ()" clause is a syntax error, so only emit the
        # constraint when there is at least one field column.
        sql += u', UNIQUE (%s)' % u', '.join(columns)
    sql += u')'
    self.c.execute(sql)

    # create sentence table
    sql = (u'CREATE TABLE IF NOT EXISTS %s ( '
           u'sid INTEGER PRIMARY KEY, sentence TEXT, len INTEGER)'
           % self.sentence_table)
    self.c.execute(sql)

    # create link table
    sql = (u'CREATE TABLE IF NOT EXISTS %s ( '
           u'wid INTEGER, sid INTEGER, '
           u'FOREIGN KEY(wid) REFERENCES %s(wid), '
           u'FOREIGN KEY(sid) REFERENCES %s(sid))'
           % (self.link_table, self.freq_table, self.sentence_table))
    self.c.execute(sql)

    # create indices for faster lookup
    # NOTE(review): the index names are fixed while the table names are
    # not; if several table sets share one database file, IF NOT EXISTS
    # would skip the indices for all but the first set -- confirm.
    index_statements = (
        'CREATE INDEX IF NOT EXISTS freq_index ON %s (freq DESC)'
        % self.freq_table,
        'CREATE INDEX IF NOT EXISTS len_index ON %s (len ASC)'
        % self.sentence_table,
        'CREATE INDEX IF NOT EXISTS link_wid_index ON %s (wid ASC)'
        % self.link_table,
    )
    for sql in index_statements:
        self.c.execute(sql)

    self.conn.commit()
    logger.out('created database tables')
def __enter__(self):
    """Open the SQLite connection and set up cursors and queries.

    Connects to ``self.filename``, creates the three cursors used by the
    rest of the class and then calls ``self.prepare_queries()``.

    Returns:
        The database object itself, so that
        ``with Database(...) as db:`` binds a usable object.

    Raises:
        Whatever ``sqlite3.connect`` or ``prepare_queries`` raises; the
        connection is closed again before the exception propagates.
    """
    logger.out('connecting to database')
    self.conn = sqlite3.connect(self.filename)
    try:
        self.c = self.conn.cursor()   # Cursor for word frequency queries
        self.c2 = self.conn.cursor()  # Cursor for sentence queries
        self.c3 = self.conn.cursor()  # Cursor for option selections
        self.prepare_queries()
    except Exception:
        # __exit__ is not invoked when __enter__ fails, so release the
        # connection here instead of leaking it.
        self.conn.close()
        raise
    return self
def __exit__(self, typ, value, traceback):
    """Close the cursors, commit pending changes and close the connection.

    The exception arguments are not inspected: pending changes are
    committed whether or not the ``with`` body raised, and ``None`` is
    returned so any such exception keeps propagating.
    """
    try:
        for cursor in (self.c, self.c2, self.c3):
            cursor.close()
        self.conn.commit()
    finally:
        # Always release the connection, even if closing a cursor or the
        # final commit fails.
        self.conn.close()
    logger.out('disconnected from database')
def analyze(filename, formatter, parser, encoding, db):
    """Read one text file and feed every trimmed line to the parser.

    Args:
        filename: Path of the file to read.
        formatter: Object providing ``new_file()`` (called once before
            reading) and ``trim(line)`` (called on each line).
        parser: Object providing ``parse(text, db)``; its return value is
            not used here.
        encoding: Codec name used to decode the file.
        db: Database object handed through to ``parser.parse``.

    A file that cannot be opened or decoded is reported through
    ``logger.err`` and skipped; no exception is raised for it. Lines
    handled before a decoding error have already been passed to the
    parser.
    """
    logger.out('reading %s' % filename)
    formatter.new_file()
    try:
        fp = codecs.open(filename, 'r', encoding)
    except IOError as e:
        logger.err('error opening %s: %s' % (filename, e))
        return

    with fp:
        try:
            # process the file line by line
            for line in fp:
                parser.parse(formatter.trim(line), db)
        except UnicodeDecodeError as e:
            # Decoding happens lazily while iterating, so a wrong encoding
            # or a binary file only surfaces here. Treat it like an open
            # failure: report it and move on to the next file.
            logger.err('error reading %s: %s' % (filename, e))
def main():
    """Parse the command line and open the frequency GUI on the database.

    Options:
        -h, --help        print the module docstring and exit
        -n, --number N    positive list size handed to the GUI
                          (default: ``config.list_number``)
        -t, --tablename   table name, restricted to identifier characters
                          (default: ``config.tablename``)

    Invalid options or option values are logged and end the program with
    exit status 2. Database errors are logged.
    """
    # parse command line options
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hn:t:',
                                   ['help', 'number=', 'tablename='])
    except getopt.error as opterr:
        logger.err(opterr)
        logger.err('for help use --help')
        sys.exit(2)

    # process config and options
    list_number = config.list_number
    tablename = config.tablename
    for o, a in opts:
        if o in ('-h', '--help'):
            logger.out(__doc__)
            sys.exit(0)
        if o in ('-n', '--number'):
            try:
                # Store the value in list_number: that is the variable
                # which is validated below and passed to the GUI.
                list_number = int(a)
            except ValueError:
                logger.err('invalid argument for top number: %s' % a)
                sys.exit(2)
            if list_number <= 0:
                logger.err('invalid top number: %s' % list_number)
                sys.exit(2)
        if o in ('-t', '--tablename'):
            tablename = a
            # The name ends up inside SQL text, so only plain identifiers
            # are accepted.
            if not re.match(r'^[_a-zA-Z][_a-zA-Z0-9]*$', tablename):
                logger.err('invalid table name: %s' % tablename)
                sys.exit(2)

    # open gui with database
    try:
        db = database.Database(tablename)
        with db:
            ui = gui.FreqGUI(db, list_number)
            ui.show()
    except sqlite3.Error as e:
        logger.err('database error: %s' % e)
def main():
    """Parse the command line and analyze the given text files.

    Options:
        -h, --help        print the module docstring and exit
        -f, --format      input format: plain, aozora or html
        -e, --encoding    codec used to read the files
        -d, --droptable   drop the tables before (re)creating them
        -s, --sentences   flag passed to the MeCab parser
        -t, --tablename   table name, restricted to identifier characters
        -r, --recursive   treat the arguments as directories and walk them

    ``-o`` is accepted with an argument but not acted upon. Invalid
    options or option values are logged and end the program with exit
    status 2. Database errors are logged.
    """
    # Result is not used here; the call is kept in case it has side effects.
    config.get_basedir()

    # parse command line options
    long_options = ['help', 'format=', 'encoding=', 'droptable',
                    'recursive', 'tablename=', 'sentences']
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hf:e:o:rdt:s', long_options)
    except getopt.error as opterr:
        logger.err(opterr)
        logger.err('for help use --help')
        sys.exit(2)

    # defaults come from the config module, options override them
    format_name = config.formatter
    encoding = config.encoding
    tablename = config.tablename
    drop_first = False
    walk_dirs = False
    with_sentences = False

    for opt, value in opts:
        if opt in ('-h', '--help'):
            logger.out(__doc__)
            sys.exit(0)
        elif opt in ('-f', '--format'):
            format_name = value
            if format_name not in ('plain', 'aozora', 'html'):
                logger.err('format not supported: %s' % format_name)
                sys.exit(2)
        elif opt in ('-e', '--encoding'):
            encoding = value
            try:
                codecs.lookup(encoding)
            except LookupError:
                logger.err('encoding not found: %s' % encoding)
                sys.exit(2)
        elif opt in ('-d', '--droptable'):
            drop_first = True
        elif opt in ('-s', '--sentences'):
            with_sentences = True
        elif opt in ('-t', '--tablename'):
            tablename = value
            if not re.match(r'^[_a-zA-Z][_a-zA-Z0-9]*$', tablename):
                logger.err('invalid table name: %s' % tablename)
                sys.exit(2)
        elif opt in ('-r', '--recursive'):
            walk_dirs = True

    # create formatter and parser
    if format_name == 'aozora':
        text_format = formats.AozoraFormat()
    elif format_name == 'html':
        text_format = formats.HTMLFormat()
    else:
        text_format = formats.Format()
    parser = mecab.PyMeCab(with_sentences)

    # access database
    try:
        db = database.Database(tablename)
        with db:
            if drop_first:
                db.drop_table()
            db.create_table()

            def analyze_path(path):
                """Analyze one file with the settings chosen above."""
                analyze(path, text_format, parser, encoding, db)

            # process files
            logger.out('analyzing text files')
            if not walk_dirs:
                for path in args:
                    analyze_path(path)
            else:
                for top in args:
                    for dirpath, _subdirs, names in os.walk(top):
                        logger.out('going through directory %s' % dirpath)
                        for name in names:
                            analyze_path(os.path.join(dirpath, name))
            logger.out('done analyzing')
    except sqlite3.Error as e:
        logger.err('database error: %s' % e)
def clear_table(self):
    """Delete all rows from the link, frequency and sentence tables.

    The tables themselves are kept. The link table is emptied first
    because its rows reference the other two tables; with SQLite
    foreign-key enforcement switched on, deleting the referenced rows
    first would fail. The deletions are committed before returning.
    """
    for table in (self.link_table, self.freq_table, self.sentence_table):
        self.c.execute(u'DELETE FROM %s' % table)
    self.conn.commit()
    logger.out('cleared database tables')
def drop_table(self):
    """Drop the link, frequency and sentence tables if they exist.

    The link table is dropped first because it references the other two;
    with SQLite foreign-key enforcement switched on, dropping a
    referenced table while link rows still point at it would fail. The
    change is committed before returning.
    """
    for table in (self.link_table, self.freq_table, self.sentence_table):
        self.c.execute(u'DROP TABLE IF EXISTS %s' % table)
    self.conn.commit()
    logger.out('dropped database tables')