def __init__(self, tablename):
    """Record the database file path and the table/field names to use.

    tablename is the common prefix; the frequency, sentence and link
    tables are named by appending '_freqs', '_sentences' and '_links'.
    """
    self.filename = os.path.join(config.get_basedir(), config.dbfile)

    # one table per kind of data, all sharing the same prefix
    self.freq_table = tablename + '_freqs'
    self.sentence_table = tablename + '_sentences'
    self.link_table = tablename + '_links'

    # data fields are the word itself followed by its parts of speech
    pos_count = config.mecab_fields
    self.fields = pos_count + 1
    self.fieldnames = [u'word'] + [u'pos' + str(idx) for idx in range(pos_count)]
def __init__(self):
    """Reset the line counter and load the gaiji code table.

    The file named by config.gaijifile (resolved against
    config.get_basedir()) is read line by line and self.gaiji_codes is
    filled with a mapping from the integer JIS code to its one- or
    two-character Unicode replacement. Lines that do not match the
    expected layout are skipped. If the file cannot be opened the error
    is logged and the table stays empty.
    """
    self.linecount = 0

    # load gaiji codes
    self.gaiji_codes = {}
    gaijifile = os.path.join(config.get_basedir(), config.gaijifile)

    # Expected line layout: "0x<hex JIS code>\tU+<hex>[+<hex>]".
    # Compiled once here instead of on every line; the pattern is plain
    # ASCII, so an ordinary raw string is sufficient.
    gaiji_line = re.compile(
        r'^0x(?P<JisCode>[0-9A-Fa-f]+)\tU\+(?P<UtfCode>[0-9a-fA-F]+)\+?(?P<UtfCode2>[0-9a-fA-F]+)?')

    try:
        fp = open(gaijifile, 'r')
    except IOError as e:
        # Report the path that was actually tried together with the cause.
        logger.err('error opening gaiji codes file %s: %s' % (gaijifile, e))
    else:
        with fp:
            for line in fp:
                match = gaiji_line.match(line)
                if not match:
                    continue
                gaiji_code = int(match.group('JisCode'), 16)
                utf_char = unichr(int(match.group('UtfCode'), 16))
                if match.group('UtfCode2'):
                    # 2-character representation
                    utf_char = utf_char + unichr(int(match.group('UtfCode2'), 16))
                self.gaiji_codes[gaiji_code] = utf_char
def main():
    """Command-line entry point: analyze text files into the database.

    Options are parsed from sys.argv; format, encoding and table name
    fall back to the values in config. Every file argument is passed to
    analyze(), or, with --recursive, every file found below each
    directory argument. Bad options exit with status 2, --help prints
    the module docstring and exits with status 0. Database errors are
    logged rather than re-raised.
    """
    basedir = config.get_basedir()

    # parse command line options
    # NOTE: '-o' is accepted by the option string but never acted upon.
    short_flags = 'hf:e:o:rdt:s'
    long_flags = ['help','format=','encoding=', 'droptable', 'recursive', 'tablename=', 'sentences']
    try:
        opts, args = getopt.getopt(sys.argv[1:], short_flags, long_flags)
    except getopt.error as opterr:
        logger.err(opterr)
        logger.err('for help use --help')
        sys.exit(2)

    # start from the configured defaults, then let options override them
    format_name = config.formatter
    encoding = config.encoding
    tablename = config.tablename
    drop = False
    recursive = False
    sentences = False
    for flag, value in opts:
        if flag in ('-h', '--help'):
            logger.out(__doc__)
            sys.exit(0)
        elif flag in ('-f', '--format'):
            format_name = value
            if format_name not in ('plain', 'aozora', 'html'):
                logger.err('format not supported: %s' % format_name)
                sys.exit(2)
        elif flag in ('-e', '--encoding'):
            encoding = value
            try:
                codecs.lookup(encoding)
            except LookupError:
                logger.err('encoding not found: %s' % encoding)
                sys.exit(2)
        elif flag in ('-d', '--droptable'):
            drop = True
        elif flag in ('-s', '--sentences'):
            sentences = True
        elif flag in ('-t', '--tablename'):
            tablename = value
            # only plain identifiers are accepted as table names
            if not re.match(r'^[_a-zA-Z][_a-zA-Z0-9]*$', tablename):
                logger.err('invalid table name: %s' % tablename)
                sys.exit(2)
        elif flag in ('-r', '--recursive'):
            recursive = True

    # create formatter and parser; anything other than 'aozora' or 'html'
    # gets the base format
    if format_name == 'aozora':
        formatter = formats.AozoraFormat()
    elif format_name == 'html':
        formatter = formats.HTMLFormat()
    else:
        formatter = formats.Format()
    parser = mecab.PyMeCab(sentences)

    # access database
    try:
        db = database.Database(tablename)
        with db:
            if drop:
                db.drop_table()
            db.create_table()

            def process(path):
                # every file is analyzed with the same formatter/parser/db
                analyze(path, formatter, parser, encoding, db)

            # process files
            logger.out('analyzing text files')
            if not recursive:
                for path in args:
                    process(path)
            else:
                for top in args:
                    for dirpath, _subdirs, files in os.walk(top):
                        logger.out('going through directory %s' % dirpath)
                        for name in files:
                            process(os.path.join(dirpath, name))
            logger.out('done analyzing')
    except sqlite3.Error as e:
        logger.err('database error: %s' % e)