def __init__(self, tablename):
   """Derive the database file path, table names and data field names.

   tablename is the common prefix; the frequency, sentence and link table
   names are formed by appending '_freqs', '_sentences' and '_links'.
   """
   self.filename = os.path.join(config.get_basedir(), config.dbfile)
   self.freq_table, self.sentence_table, self.link_table = [
     tablename + suffix for suffix in ('_freqs', '_sentences', '_links')
   ]
   # one field for the word itself plus one per part-of-speech column
   pos_count = config.mecab_fields
   self.fields = pos_count + 1
   self.fieldnames = [u'word']
   self.fieldnames.extend(u'pos' + str(n) for n in range(pos_count))
 def __init__(self):
   """Initialise the line counter and load the gaiji code table.

   Reads the file named by config.gaijifile (joined onto the base
   directory) and fills self.gaiji_codes, mapping each JIS code (as an
   int) to its one- or two-character unicode replacement. If the file
   cannot be opened, the error is logged and the table is left empty.
   """
   self.linecount = 0
   # load gaiji codes
   self.gaiji_codes = {}
   gaijifile = os.path.join(config.get_basedir(), config.gaijifile)
   # a mapping line looks like: 0x<hex JIS code><TAB>U+<hex>[+<hex>]
   # compiled once here rather than on every line of the file
   gaiji_line = re.compile(r'^0x(?P<JisCode>[0-9A-Fa-f]+)\tU\+(?P<UtfCode>[0-9a-fA-F]+)\+?(?P<UtfCode2>[0-9a-fA-F]+)?')
   try:
     fp = open(gaijifile, 'r')
   except IOError as e:
     # report the path that was actually tried together with the OS error
     logger.err('error opening gaiji codes file %s: %s' % (gaijifile, e))
   else:
     with fp:
       for line in fp:
         match = gaiji_line.match(line)
         if not match:
           # lines that do not match the mapping pattern are skipped
           continue
         gaiji_code = int(match.group('JisCode'), 16)
         utf_char = unichr(int(match.group('UtfCode'), 16))
         if match.group('UtfCode2'): # 2-character representation
           utf_char = utf_char + unichr(int(match.group('UtfCode2'), 16))
         self.gaiji_codes[gaiji_code] = utf_char
def main():
  """Command-line entry point: parse options, then analyze the given files.

  Defaults for format, encoding and table name come from config and can be
  overridden with -f/--format, -e/--encoding and -t/--tablename. The flags
  -d/--droptable, -r/--recursive and -s/--sentences switch the matching
  behaviour on; -h/--help prints the module docstring and exits with 0.
  Remaining arguments are files, or directories to walk when --recursive
  is given. Invalid options exit with status 2; sqlite3 errors are logged.
  """
  # NOTE(review): the result is not used below; the call is kept in case
  # get_basedir() has side effects - confirm before removing.
  basedir = config.get_basedir()
  # parse command line options
  # 'o:' is accepted by getopt but no branch below handles -o
  short_opts = 'hf:e:o:rdt:s'
  long_opts = ['help', 'format=', 'encoding=', 'droptable', 'recursive',
               'tablename=', 'sentences']
  try:
    options, paths = getopt.getopt(sys.argv[1:], short_opts, long_opts)
  except getopt.error as opterr:
    logger.err(opterr)
    logger.err('for help use --help')
    sys.exit(2)
  # start from the configured defaults, then apply the options
  format_name = config.formatter
  encoding = config.encoding
  tablename = config.tablename
  drop = False
  recursive = False
  sentences = False
  for flag, value in options:
    if flag in ('-h', '--help'):
      logger.out(__doc__)
      sys.exit(0)
    elif flag in ('-f', '--format'):
      format_name = value
      if format_name not in ('plain', 'aozora', 'html'):
        logger.err('format not supported: %s' % format_name)
        sys.exit(2)
    elif flag in ('-e', '--encoding'):
      encoding = value
      try:
        codecs.lookup(encoding)
      except LookupError:
        logger.err('encoding not found: %s' % encoding)
        sys.exit(2)
    elif flag in ('-d', '--droptable'):
      drop = True
    elif flag in ('-s', '--sentences'):
      sentences = True
    elif flag in ('-t', '--tablename'):
      tablename = value
      # only plain identifiers are accepted as table names
      if not re.match(r'^[_a-zA-Z][_a-zA-Z0-9]*$', tablename):
        logger.err('invalid table name: %s' % tablename)
        sys.exit(2)
    elif flag in ('-r', '--recursive'):
      recursive = True
  # pick the formatter class; anything other than aozora/html gets Format
  if format_name == 'aozora':
    format_class = formats.AozoraFormat
  elif format_name == 'html':
    format_class = formats.HTMLFormat
  else:
    format_class = formats.Format
  formatter = format_class()
  parser = mecab.PyMeCab(sentences)
  # access database
  try:
    db = database.Database(tablename)
    with db:
      if drop:
        db.drop_table()
      db.create_table()
      # process files
      logger.out('analyzing text files')
      if not recursive:
        for filename in paths:
          analyze(filename, formatter, parser, encoding, db)
      else:
        # every argument is a directory tree to walk
        for root in paths:
          for dirpath, _subdirs, filenames in os.walk(root):
            logger.out('going through directory %s' % dirpath)
            for filename in filenames:
              full_path = os.path.join(dirpath, filename)
              analyze(full_path, formatter, parser, encoding, db)
      logger.out('done analyzing')
  except sqlite3.Error as e:
    logger.err('database error: %s' % e)