Example #1
0
 def __init__(self, lookupfunc):
     """Initialise the (empty) dict base plus parser and index attributes.

     lookupfunc -- Stored unchanged as `self.lookupfunc`; it is not
         called by this method.

     Attributes set:
       kw -- The object returned by jdb.Kwds(jdb.std_csv_dir()).
       jmparser -- A jmxml.Jmparser built from that same object.
       ridx, kidx -- Two separate defaultdict(set) instances, both
         initially empty.
     """
     dict.__init__(self)
     keywords = jdb.Kwds(jdb.std_csv_dir())
     self.kw = keywords
       # The parser shares the keyword object held in self.kw.
     self.jmparser = jmxml.Jmparser(keywords)
     self.ridx = collections.defaultdict(set)
     self.kidx = collections.defaultdict(set)
     self.lookupfunc = lookupfunc
Example #2
0
 def setUp (_):
     """Attach a test-data fetcher and a Jmparser to the test instance.

     The instance parameter is named `_` rather than `self`.  The
     module-global KW is created (and mirrored into jdb.KW) only when
     it is currently falsy; otherwise the existing value is reused.
     """
     global KW
     if not KW:
         fresh_kw = jdb.Kwds (jdb.std_csv_dir())
         jdb.KW = fresh_kw
         KW = fresh_kw

     def fetch_xml (testid):
           # Use mode='b' in the getxml call because we need undecoded
           # utf-8 for Jmparser.parse_entry() (which gives it to
           # ElementTree which needs utf-8.)
         return getxml ('data/jmxml/parse_entry.xml', testid, 'b')

     _.getxml = fetch_xml
     _.jmparser = jmxml.Jmparser (KW)
Example #3
0
def main(args, opts):
    """Run the XML parser over an input file and hand each item to `pgi`.

    Items yielded by jmxml.Jmparser.parse_xmlfile() are dispatched on
    their type tag: 'entry' -> pgi.wrentr, 'corpus' -> pgi.wrcorp,
    'grpdef' -> pgi.wrgrpdef, and 'root' is used to work out the default
    corpus id via pgi.parse_corpus_opt().  Afterwards pgi.finalize() is
    called with opts.output.

    args -- Sequence; args[0] is the path of the XML file to read.
    opts -- Options object.  Attributes read directly here: database,
        lang, tempdir, logfile, encoding, begin, count, extract,
        sequence (items [0] and [1]), corpus, output, keep.
        jdb.dbopts(opts) is also called when opts.database is set.

    Side effects: rebinds the module-global KW (and jdb.KW when no
    database is given), writes progress dots and a final newline to
    sys.stdout, and writes the log to opts.logfile or sys.stderr.

    The input file, and the log file when this function opened one, are
    closed before returning, including when an exception propagates.

    Raises:
        ValueError -- an entry has no 'src' value and no default corpus
            id has been determined (no 'root' item was seen yet, or
            pgi.parse_corpus_opt() raised KeyError for it).  Previously
            this situation surfaced as an UnboundLocalError.
    """
    global KW

    if opts.database:
        jdb.dbOpen(opts.database, **jdb.dbopts(opts))
        KW = jdb.KW
    else:
        jdb.KW = KW = jdb.Kwds(jdb.std_csv_dir())

    xlang = None
    if opts.lang:
        xlang = [KW.LANG[x].id for x in opts.lang.split(',')]

    # Sentinel meaning "no default corpus id assigned yet".  A dedicated
    # object is used rather than None so that whatever value
    # pgi.parse_corpus_opt() returns is passed through untouched.
    no_corpus = object()
    corpid = no_corpus

    logfile = sys.stderr

    #FIXME: we open the xml file with utf-8 encoding even though
    # its encoding may be given within the file and may be different.
    xmlfile = open(args[0], encoding='utf-8')
    try:
        inpf = jmxml.JmdictFile(xmlfile)
        tmpfiles = pgi.initialize(opts.tempdir)
        if opts.logfile:
            logfile = open(opts.logfile, "w", encoding=opts.encoding)
        eid = 0
        jmparser = jmxml.Jmparser(KW, logfile=logfile)
        for typ, entr in jmparser.parse_xmlfile(inpf,
                                                opts.begin,
                                                opts.count,
                                                opts.extract,
                                                xlang,
                                                toptag=True,
                                                seqnum_init=opts.sequence[0],
                                                seqnum_incr=opts.sequence[1]):
            if typ == 'entry':
                eid += 1
                # Progress indicator: one dot for the first entry and
                # then every 1800 entries after it.
                if (eid - 1) % 1800 == 0:
                    sys.stdout.write('.')
                    sys.stdout.flush()
                    logfile.flush()
                if not getattr(entr, 'src', None):
                    if corpid is no_corpus:
                        raise ValueError(
                            "entry #%d has no 'src' value and no default "
                            "corpus id could be determined from the root "
                            "element and opts.corpus" % eid)
                    entr.src = corpid
                jdb.setkeys(entr, eid)
                pgi.wrentr(entr, tmpfiles)
            elif typ == 'corpus':
                pgi.wrcorp(entr, tmpfiles)
            elif typ == 'grpdef':
                pgi.wrgrpdef(entr, tmpfiles)
            elif typ == 'root':
                # Note that 'entr' here is actually the tag name of the
                # top-level element in the xml file, typically either
                # "JMdict" or "JMnedict".
                try:
                    corpid, corprec = pgi.parse_corpus_opt(
                        opts.corpus, entr, inpf.created, kw=KW)
                except KeyError:
                    # Deliberately tolerated: 'corpid' keeps its previous
                    # value, so only entries lacking their own 'src'
                    # are affected (see the check above).
                    pass
                else:
                    if corprec: pgi.wrcorp(corprec, tmpfiles)

        sys.stdout.write('\n')
        pgi.finalize(tmpfiles, opts.output, not opts.keep)
    finally:
        xmlfile.close()
        # Never close sys.stderr; only a log file opened above.
        if logfile is not sys.stderr:
            logfile.close()
Example #4
0
def globalSetup():
    """Initialise the module-global cursor, keywords, lexer and parsers once.

    Returns False without doing anything when the global `Cur` is
    already truthy.  Otherwise opens the database with jdb.dbOpen(),
    binds the globals Cur, KW, Lexer, Parser and Jmparser, and returns
    True.

    Connection arguments come from `dbauth.auth` when the `dbauth`
    module can be imported, else default to {'database': 'jmdict'}.
    In both cases 'autocommit' is forced to True.
    """
    global Cur, KW, Lexer, Parser, Jmparser
    if Cur: return False
    try:
        import dbauth
    except ImportError:
        kwargs = {'database': 'jmdict'}
    else:
        # Work on a copy so that setting 'autocommit' below does not
        # modify the shared dbauth.auth object for other users of it.
        kwargs = dict(dbauth.auth)
    kwargs['autocommit'] = True
    Cur = jdb.dbOpen(None, **kwargs)
    KW = jdb.KW
    Lexer, tokens = jellex.create_lexer()
    Parser = jelparse.create_parser(Lexer, tokens)
    Jmparser = jmxml.Jmparser(KW)
    return True