Example #1
0
def main(args, opts):
    """Extract selected items from an XML file and print them to stdout.

    `args[0]` is the input file name.  When `opts.seqfile` is not set,
    the remaining items of `args` are "seq[,count]" strings (count
    defaults to 1) naming what to extract; otherwise the list comes from
    parse_seqfile(opts.seqfile).

    The first element of every item yielded by jmxml.extract() is echoed
    to stderr.  When `opts.dtd` is set, the first item yielded is treated
    as (top-level tag name, DTD lines): the DTD is printed followed by
    the opening tag, and a matching closing tag is printed at the end.
    """
    # Re-open stdout when its encoding differs from the requested one.
    if sys.stdout.encoding != opts.encoding:
        sys.stdout = open(sys.stdout.fileno(), 'w', encoding=opts.encoding)
    jdb.KW = jdb.Kwds(jdb.std_csv_dir())

    infn = args.pop(0)
    if opts.seqfile:
        seqlist = parse_seqfile(opts.seqfile)
    else:
        seqlist = []
        for arg in args:
            seq, _sep, cnt = arg.partition(',')
            seqlist.append((int(seq), int(cnt or 1)))

    # True until the DTD/top-level item has been consumed; also tells us
    # afterwards whether an opening tag was printed and needs closing.
    first = True
    toplev = None

    # The context manager guarantees the input file is closed, including
    # when extraction raises part way through.
    with open(infn, encoding="utf_8_sig") as fin:
        if not seqlist:
            print("No seq numbers!", file=sys.stderr)
            return
        for seq, entr in jmxml.extract(fin, seqlist, opts.dtd, opts.all):
            print(seq, file=sys.stderr)
            if opts.dtd and first:
                toplev, dtd = seq, entr
                print('\n'.join(dtd))
                print("<%s>" % toplev)
                first = False
                continue
            print('\n'.join(entr))

    # Only close the top-level element if it was actually opened; the
    # extractor may have yielded nothing at all.
    if opts.dtd and not first:
        print("</%s>" % toplev)
Example #2
0
def main(args, opts):
    """Parse the XML file `args[0]` and write the result through `pgi`.

    Resets the module globals Opts, Char, Lineno and KW, normalises the
    log (`opts.l`) and output (`opts.o`) options in place, parses the
    file with parse_xmlfile(), writes a 'kanjidic' corpus record with
    id 4 and finalizes the work files.
    """
    global Opts, Char, Lineno, KW
    Opts, Char, Lineno = opts, '', 1

    jdb.KW = KW = jdb.Kwds(jdb.std_csv_dir())

    # Log destination: the named file if one was given, else stderr.
    opts.l = open(opts.l, "w", encoding=opts.e) if opts.l else sys.stderr

    # Output name: "-" becomes None; an unset name is derived from the
    # input file's base name with its extension replaced by ".pgi".
    if opts.o == "-":
        opts.o = None
    elif not opts.o:
        stem, _ext = os.path.splitext(os.path.basename(args[0]))
        opts.o = stem + ".pgi"

    # Optional comma-separated language codes, mapped through
    # iso639_1_to_2 and then to KW.LANG ids.
    langs = None
    if opts.g:
        langs = [KW.LANG[iso639_1_to_2[code]].id
                 for code in opts.g.split(',')]

    workfiles = pgi.initialize(opts.t)
    srcdate = parse_xmlfile(args[0], 4, workfiles, opts.b, opts.c, langs)
    corpus_rec = jdb.Obj(
        id=4,
        kw='kanjidic',
        descr='kanjidic2.xml',
        dt=srcdate,
        seq='seq_kanjidic',
        srct=KW.SRCT['kanjidic'].id,
    )
    pgi.wrcorp(corpus_rec, workfiles)
    pgi.finalize(workfiles, opts.o, not opts.k)
    print("\nDone!", file=sys.stderr)
Example #3
0
def main(args, opts):
    """Parse the examples file `args[0]` and write its entries via `pgi`.

    Sets the module globals Opts, KW and msg.  When `opts.noaction` is
    true the input is still parsed (and progress dots are printed) but
    nothing is written: no work files are created, no corpus record or
    entries are emitted, and finalize is skipped.  Parsing stops early
    once `opts.count` entries have been seen, if that option is set.
    """
    global msg, Opts, KW
    Opts = opts
    jdb.KW = KW = jdb.Kwds(jdb.std_csv_dir())

    # Create a globally accessible function, msg(), that has 'logfile'
    # and 'opts.verbose' already bound and which will be called
    # elsewhere when there is a need to write a message to the logfile.
    logfile = sys.stderr
    if opts.logfile:
        logfile = open(opts.logfile, "w", encoding=opts.encoding)

    def msg(message):
        _msg(logfile, opts.verbose, message)

    fin = ABPairReader(args[0], encoding='utf-8')
    # FIXME: following gives localtime, change to utc or lt+tz.
    mtime = datetime.date.fromtimestamp(os.stat(args[0]).st_mtime)
    corpid, corprec = pgi.parse_corpus_opt(
        opts.corpus, "examples", mtime, KW.SRCT['examples'].id)

    # Initialize the work files exactly once, and only when something
    # will be written to them.  (Previously pgi.initialize() ran twice
    # in the normal case and once, pointlessly, with opts.noaction.)
    tmpfiles = None
    if not opts.noaction:
        tmpfiles = pgi.initialize(opts.tempdir)
        if corprec:
            pgi.wrcorp(corprec, tmpfiles)

    for eid, entr in enumerate(parse_ex(fin, opts.begin)):
        if not opts.noaction:
            entr.src = corpid
            jdb.setkeys(entr, eid + 1)
            pgi.wrentr(entr, tmpfiles)
        # Progress indicator: one dot per 2000 entries.
        if eid % 2000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        if opts.count and eid + 1 >= opts.count:
            break
    sys.stdout.write('\n')

    if not opts.noaction:
        pgi.finalize(tmpfiles, opts.output, not opts.keep)
Example #4
0
 def __init__(self, lookupfunc):
     """Set up an empty dict with keyword tables, a parser and two indexes.

     `lookupfunc` is stored unchanged on the instance for later use.
     """
     dict.__init__(self)
     self.lookupfunc = lookupfunc
     # Two set-valued indexes that start out empty.
     # NOTE(review): presumably keyed by reading (ridx) and kanji
     # (kidx) -- confirm against the methods that fill them.
     self.ridx, self.kidx = (collections.defaultdict(set)
                             for _ in range(2))
     # Keyword tables come from the standard CSV directory and are
     # shared with the XML parser.
     keywords = jdb.Kwds(jdb.std_csv_dir())
     self.kw = keywords
     self.jmparser = jmxml.Jmparser(keywords)
Example #5
0
def global_setup(loadname):
    """Load keyword tables and the edparse fixtures for `loadname`.

    Does nothing when `loadname` is already the loaded data set (as
    recorded in the module global Loaded); otherwise fills
    Test_indata / Test_expdata and records the new name.
    """
    global Loaded, Test_indata, Test_expdata
    if Loaded == loadname:
        return
    jdb.KW = jdb.Kwds(jdb.std_csv_dir())
    input_path = 'data/edparse/%s.txt' % loadname
    expected_path = 'data/edparse/%s.xml' % loadname
    Test_indata = readedict(input_path)
    Test_expdata = readxml(expected_path)
    Loaded = loadname
Example #6
0
 def setUp (_):
     """Prepare a test: keyword tables, an XML loader and a parser.

     The keyword tables are built only while the module global KW is
     still falsy; later calls reuse it.
     """
     global KW
     if not KW:
         KW = jdb.Kwds (jdb.std_csv_dir())
         jdb.KW = KW

     def fetch_test_xml (testid):
         # Mode 'b' is used because Jmparser.parse_entry() needs
         # undecoded utf-8 (it hands the data to ElementTree, which
         # needs utf-8).
         return getxml ('data/jmxml/parse_entry.xml', testid, 'b')

     _.getxml = fetch_test_xml
     _.jmparser = jmxml.Jmparser (KW)
Example #7
0
def main(args, opts):
    """Parse the XML file `args[0]` and write its contents via `pgi`.

    Keyword tables come from the database when `opts.database` is set,
    otherwise from the standard CSV directory.  Each item yielded by
    Jmparser.parse_xmlfile() is dispatched on its type: entries,
    corpus records and group definitions are written to the work
    files; the 'root' item (the top-level tag name) is used to
    determine the default corpus.

    Raises:
        ValueError: if an entry has no `src` of its own and no default
            corpus id has been determined yet.
    """
    global KW
    import contextlib

    if opts.database:
        jdb.dbOpen(opts.database, **jdb.dbopts(opts))
        KW = jdb.KW
    else:
        jdb.KW = KW = jdb.Kwds(jdb.std_csv_dir())

    xlang = None
    if opts.lang:
        xlang = [KW.LANG[x].id for x in opts.lang.split(',')]

    # Sentinel meaning "no default corpus id yet".  A private object is
    # used so that any value returned by parse_corpus_opt() is accepted.
    no_corpus = object()
    corpid = no_corpus

    # The ExitStack closes the input file and, if we opened one, the
    # log file -- also when parsing fails part way through.
    with contextlib.ExitStack() as cleanup:
        #FIXME: we open the xml file with utf-8 encoding even though
        # its encoding may be given within the file and may be different.
        xmlfile = cleanup.enter_context(open(args[0], encoding='utf-8'))
        inpf = jmxml.JmdictFile(xmlfile)
        tmpfiles = pgi.initialize(opts.tempdir)
        if opts.logfile:
            logfile = cleanup.enter_context(
                open(opts.logfile, "w", encoding=opts.encoding))
        else:
            logfile = sys.stderr

        eid = 0
        jmparser = jmxml.Jmparser(KW, logfile=logfile)
        for typ, entr in jmparser.parse_xmlfile(inpf,
                                                opts.begin,
                                                opts.count,
                                                opts.extract,
                                                xlang,
                                                toptag=True,
                                                seqnum_init=opts.sequence[0],
                                                seqnum_incr=opts.sequence[1]):
            if typ == 'entry':
                eid += 1
                # Progress indicator: one dot per 1800 entries.
                if (eid - 1) % 1800 == 0:
                    sys.stdout.write('.')
                    sys.stdout.flush()
                    logfile.flush()
                if not getattr(entr, 'src', None):
                    if corpid is no_corpus:
                        # Previously this surfaced as an opaque
                        # UnboundLocalError on 'corpid'.
                        raise ValueError(
                            "entry %d has no corpus ('src') and no default "
                            "corpus could be determined" % eid)
                    entr.src = corpid
                jdb.setkeys(entr, eid)
                pgi.wrentr(entr, tmpfiles)
            elif typ == 'corpus':
                pgi.wrcorp(entr, tmpfiles)
            elif typ == 'grpdef':
                pgi.wrgrpdef(entr, tmpfiles)
            elif typ == 'root':
                # Note that 'entr' here is actually the tag name of the
                # top-level element in the xml file, typically either
                # "JMdict" or "JMnedict".
                try:
                    corpid, corprec = pgi.parse_corpus_opt(
                        opts.corpus, entr, inpf.created, kw=KW)
                except KeyError:
                    # Deliberate best-effort: no default corpus is set;
                    # entries must then carry their own 'src'.
                    pass
                else:
                    if corprec:
                        pgi.wrcorp(corprec, tmpfiles)

    sys.stdout.write('\n')
    pgi.finalize(tmpfiles, opts.output, not opts.keep)
Example #8
0
 def setUp(_):
     """Give the test a shared Kwds object, creating it on first use.

     The object is cached in the module global
     Test_actual_csv_test_object and reused while that is not None.
     """
     global Test_actual_csv_test_object
     shared = Test_actual_csv_test_object
     if shared is None:
         shared = jdb.Kwds(jdb.std_csv_dir())
         Test_actual_csv_test_object = shared
     _.o = shared
Example #9
0
def main():
    """Load the keyword tables, then build the lexer and the parser.

    The parser is created with table writing and debugging enabled and
    optimization disabled, using 'jelparse_tab' as the table module.
    """
    jdb.KW = jdb.Kwds(jdb.std_csv_dir())

    lexer_debug = 0 >> 8  # evaluates to 0, i.e. lexer debugging off
    lexer, tokens = jellex.create_lexer(debug=lexer_debug)

    parser_options = {
        'module': jelparse,
        'tabmodule': 'jelparse_tab',
        'write_tables': 1,
        'optimize': 0,
        'debug': 1,
    }
    jelparse.create_parser(lexer, tokens, **parser_options)