def __init__(self, lookupfunc):
    """Set up an empty mapping plus the parser and index state it uses.

    Args:
        lookupfunc: Stored unchanged on the instance as ``self.lookupfunc``;
            it is not called here.
    """
    dict.__init__(self)

    # Keyword tables are loaded from the standard csv directory and shared
    # with the XML parser built from them.
    keywords = jdb.Kwds(jdb.std_csv_dir())
    self.kw = keywords
    self.jmparser = jmxml.Jmparser(keywords)

    # Two independent indexes; a missing key yields a fresh empty set.
    self.ridx = collections.defaultdict(set)
    self.kidx = collections.defaultdict(set)

    self.lookupfunc = lookupfunc
def setUp(_):
    """Prepare the per-test fixture: keyword tables, xml getter and parser.

    The instance parameter is named ``_`` rather than ``self``.  The
    module-level ``KW`` is created only if it is still falsy, in which
    case it is also published as ``jdb.KW``.
    """
    global KW
    if not KW:
        keywords = jdb.Kwds(jdb.std_csv_dir())
        jdb.KW = keywords
        KW = keywords

    def fetch_xml(testid):
        # Mode 'b' is passed to getxml because Jmparser.parse_entry()
        # needs undecoded utf-8: it hands the data to ElementTree,
        # which needs utf-8.
        return getxml('data/jmxml/parse_entry.xml', testid, 'b')

    _.getxml = fetch_xml
    _.jmparser = jmxml.Jmparser(KW)
def main(args, opts):
    """Parse an XML dictionary file and write its contents via ``pgi``.

    Args:
        args: Positional arguments; ``args[0]`` is the path of the XML
            file to read.
        opts: Options object.  The attributes read here are ``database``,
            ``lang``, ``tempdir``, ``logfile``, ``encoding``, ``begin``,
            ``count``, ``extract``, ``sequence`` (indexed at 0 and 1),
            ``corpus``, ``output`` and ``keep``.

    Side effects:
        Sets the module global ``KW`` (and ``jdb.KW`` when no database is
        given), writes a progress dot to stdout on entries 1, 1801,
        3601, ..., and closes the XML input file and any log file it
        opened once parsing ends, whether normally or by an exception.
    """
    global KW
    import contextlib  # local import: only needed for the cleanup below

    if opts.database:
        jdb.dbOpen(opts.database, **jdb.dbopts(opts))
        KW = jdb.KW
    else:
        jdb.KW = KW = jdb.Kwds(jdb.std_csv_dir())

    xlang = None
    if opts.lang:
        xlang = [KW.LANG[x].id for x in opts.lang.split(',')]

    with contextlib.ExitStack() as cleanup:
        #FIXME: we open the xml file with utf-8 encoding even though
        # its encoding may be given within the file and may be different.
        xmlfile = cleanup.enter_context(open(args[0], encoding='utf-8'))
        inpf = jmxml.JmdictFile(xmlfile)
        tmpfiles = pgi.initialize(opts.tempdir)

        if not opts.logfile:
            # sys.stderr is deliberately not registered for closing.
            logfile = sys.stderr
        else:
            logfile = cleanup.enter_context(
                open(opts.logfile, "w", encoding=opts.encoding))

        eid = 0
        jmparser = jmxml.Jmparser(KW, logfile=logfile)
        for typ, entr in jmparser.parse_xmlfile(
                inpf, opts.begin, opts.count, opts.extract, xlang,
                toptag=True,
                seqnum_init=opts.sequence[0],
                seqnum_incr=opts.sequence[1]):
            if typ == 'entry':
                eid += 1
                if not ((eid - 1) % 1800):
                    # Progress indicator; the log is flushed alongside it.
                    sys.stdout.write('.')
                    sys.stdout.flush()
                    logfile.flush()
                if not getattr(entr, 'src', None):
                    # NOTE(review): 'corpid' is bound only after a 'root'
                    # item was handled without KeyError; otherwise this
                    # line raises UnboundLocalError.  Confirm that
                    # parse_xmlfile always yields 'root' first.
                    entr.src = corpid
                jdb.setkeys(entr, eid)
                pgi.wrentr(entr, tmpfiles)
            elif typ == 'corpus':
                pgi.wrcorp(entr, tmpfiles)
            elif typ == 'grpdef':
                pgi.wrgrpdef(entr, tmpfiles)
            elif typ == 'root':
                # Note that 'entr' here is actually the tag name of the
                # top-level element in the xml file, typically either
                # "JMdict" or "JMnedict".
                try:
                    corpid, corprec = pgi.parse_corpus_opt(
                        opts.corpus, entr, inpf.created, kw=KW)
                except KeyError:
                    pass
                else:
                    if corprec:
                        pgi.wrcorp(corprec, tmpfiles)

    sys.stdout.write('\n')
    pgi.finalize(tmpfiles, opts.output, not opts.keep)
def globalSetup():
    """Open the database and build the shared lexer/parser objects once.

    Returns:
        False if ``Cur`` is already truthy (nothing is done), True after
        the module globals ``Cur``, ``KW``, ``Lexer``, ``Parser`` and
        ``Jmparser`` have been set.
    """
    global Cur, KW, Lexer, Parser, Jmparser

    if Cur:
        return False

    # Connection settings come from an optional 'dbauth' module; without
    # it a default database name is used.
    try:
        import dbauth
        db_params = dbauth.auth
    except ImportError:
        db_params = {'database': 'jmdict'}
    # Set on whichever mapping was chosen above (dbauth.auth itself is
    # updated in place when it is the one in use).
    db_params['autocommit'] = True

    Cur = jdb.dbOpen(None, **db_params)
    KW = jdb.KW

    Lexer, token_names = jellex.create_lexer()
    Parser = jelparse.create_parser(Lexer, token_names)
    Jmparser = jmxml.Jmparser(KW)
    return True