def main(args, opts):
    """Print selected entries from an XML file to stdout.

    args -- list whose first item is the input file name.  When
        opts.seqfile is not given, the remaining items are "SEQ" or
        "SEQ,COUNT" strings (COUNT defaults to 1).  The first item is
        removed from the list.
    opts -- options object; reads .encoding, .seqfile, .dtd and .all.

    Each (seq, entr) pair produced by jmxml.extract() has its 'seq' echoed
    to stderr and its 'entr' lines printed to stdout.  When opts.dtd is
    true, the first pair is treated as (top-level tag name, DTD lines):
    the DTD and an opening tag are printed first and a matching closing
    tag is printed after the last entry.
    """
    # Rebind stdout if it does not already use the requested encoding.
    if sys.stdout.encoding != opts.encoding:
        sys.stdout = open(sys.stdout.fileno(), 'w', encoding=opts.encoding)
    jdb.KW = jdb.Kwds(jdb.std_csv_dir())

    infn = args.pop(0)
    if opts.seqfile:
        seqlist = parse_seqfile(opts.seqfile)
    else:
        seqlist = []
        for arg in args:
            seq, _, cnt = arg.partition(',')
            seqlist.append((int(seq), int(cnt or 1)))

    # The input file is opened before 'seqlist' is checked (as before) so
    # that a bad file name is still reported first; 'with' guarantees the
    # handle is closed on every exit path.
    with open(infn, encoding="utf_8_sig") as fin:
        if not seqlist:
            print("No seq numbers!", file=sys.stderr)
            return

        toplev = None   # Set once the DTD/top-level-tag item has been seen.
        for seq, entr in jmxml.extract(fin, seqlist, opts.dtd, opts.all):
            print(seq, file=sys.stderr)
            if opts.dtd and toplev is None:
                # First item: 'seq' is the top-level tag name and
                # 'entr' holds the DTD lines.
                toplev, dtd = seq, entr
                print('\n'.join(dtd))
                print("<%s>" % toplev)
                continue
            print('\n'.join(entr))
        # Only close the top-level element if it was actually opened;
        # previously this raised UnboundLocalError when nothing was
        # extracted.
        if opts.dtd and toplev is not None:
            print("</%s>" % toplev)
def main(args, opts):
    """Parse the XML file named by args[0] and write it out through pgi.

    args -- list whose first item is the input XML file name.
    opts -- options object.  Reads .l (log file name), .e (log file
        encoding), .o (output name; "-" becomes None, empty becomes the
        input's base name plus ".pgi"), .g (comma-separated language
        codes), .t, .b, .c and .k.  Note that opts.l and opts.o are
        overwritten in place.

    Also resets the module globals Opts, Char, Lineno and KW.
    """
    global Opts, Char, Lineno, KW
    Opts = opts
    Char = ''
    Lineno = 1
    jdb.KW = KW = jdb.Kwds(jdb.std_csv_dir())

    # Replace the log file name with an open stream (stderr by default).
    opts.l = open(opts.l, "w", encoding=opts.e) if opts.l else sys.stderr

    # Work out the output name.
    if opts.o == "-":
        opts.o = None
    elif not opts.o:
        stem = os.path.splitext(os.path.basename(args[0]))[0]
        opts.o = stem + ".pgi"

    # Map the requested language codes to keyword ids.
    langs = ([KW.LANG[iso639_1_to_2[code]].id for code in opts.g.split(',')]
             if opts.g else None)

    corpus_id = 4
    workfiles = pgi.initialize(opts.t)
    srcdate = parse_xmlfile(args[0], corpus_id, workfiles,
                            opts.b, opts.c, langs)
    corpus = jdb.Obj(
        id=corpus_id,
        kw='kanjidic',
        descr='kanjidic2.xml',
        dt=srcdate,
        seq='seq_kanjidic',
        srct=KW.SRCT['kanjidic'].id,
    )
    pgi.wrcorp(corpus, workfiles)
    pgi.finalize(workfiles, opts.o, not opts.k)
    print("\nDone!", file=sys.stderr)
def main(args, opts):
    """Parse the examples file named by args[0] and write it via pgi.

    args -- list whose first item is the input file name.
    opts -- options object; reads .logfile, .encoding, .verbose, .corpus,
        .tempdir, .noaction, .begin, .count, .output and .keep.

    Rebinds the module globals Opts, KW and msg.  When opts.noaction is
    true the input is still parsed but nothing is written through pgi.
    A '.' is written to stdout every 2000 entries as a progress marker.
    """
    global msg, Opts, KW
    Opts = opts
    jdb.KW = KW = jdb.Kwds(jdb.std_csv_dir())

    # Create a globally accessible function, msg(), that has 'logfile'
    # and 'opts.verbose' already bound and which will be called
    # elsewhere when there is a need to write a message to the logfile.
    logfile = sys.stderr
    if opts.logfile:
        logfile = open(opts.logfile, "w", encoding=opts.encoding)

    def msg(message):
        _msg(logfile, opts.verbose, message)

    fin = ABPairReader(args[0], encoding='utf-8')
    # FIXME: following gives localtime, change to utc or lt+tz.
    mtime = datetime.date.fromtimestamp(os.stat(args[0]).st_mtime)
    corpid, corprec = pgi.parse_corpus_opt(
        opts.corpus, "examples", mtime, KW.SRCT['examples'].id)

    # Initialize the work files exactly once, and only when output will
    # actually be produced.  (This used to be called twice, discarding
    # the first result, and ran even with opts.noaction set.)
    tmpfiles = None
    if not opts.noaction:
        tmpfiles = pgi.initialize(opts.tempdir)
        if corprec:
            pgi.wrcorp(corprec, tmpfiles)

    for eid, entr in enumerate(parse_ex(fin, opts.begin)):
        if not opts.noaction:
            entr.src = corpid
            jdb.setkeys(entr, eid + 1)
            pgi.wrentr(entr, tmpfiles)
        if not (eid % 2000):
            sys.stdout.write('.')
            sys.stdout.flush()
        if opts.count and eid + 1 >= opts.count:
            break
    sys.stdout.write('\n')

    if not opts.noaction:
        pgi.finalize(tmpfiles, opts.output, not opts.keep)
def __init__(self, lookupfunc):
    """Set up an empty dict with a keyword table, parser and indexes.

    lookupfunc -- stored unchanged as self.lookupfunc; it is not called
        here.
    """
    dict.__init__(self)
    keywords = jdb.Kwds(jdb.std_csv_dir())
    self.kw = keywords
    self.jmparser = jmxml.Jmparser(keywords)
    # Two initially empty indexes whose missing keys default to a set().
    self.ridx = collections.defaultdict(set)
    self.kidx = collections.defaultdict(set)
    self.lookupfunc = lookupfunc
def global_setup(loadname):
    """Load the test data set 'loadname' unless it is already loaded.

    Does nothing when the module global Loaded already equals 'loadname'.
    Otherwise it resets jdb.KW, reads 'data/edparse/<loadname>.txt' and
    'data/edparse/<loadname>.xml' into the globals Test_indata and
    Test_expdata, and records 'loadname' in Loaded.  Returns None.
    """
    global Loaded, Test_indata, Test_expdata
    if Loaded == loadname:
        return
    jdb.KW = jdb.Kwds(jdb.std_csv_dir())
    input_path = 'data/edparse/%s.txt' % loadname
    expected_path = 'data/edparse/%s.xml' % loadname
    Test_indata = readedict(input_path)
    Test_expdata = readxml(expected_path)
    Loaded = loadname
def setUp(_):
    """Give the test instance a getxml() helper and a Jmparser.

    The keyword table is created on first use and kept in the module
    global KW (and in jdb.KW) so later calls reuse it.
    """
    global KW
    if not KW:
        KW = jdb.Kwds(jdb.std_csv_dir())
        jdb.KW = KW

    # Use mode='b' in the getxml call because we need undecoded
    # utf-8 for Jmparser.parse_entry() (which gives it to
    # ElementTree which needs utf-8.)
    def fetch_xml(testid):
        return getxml('data/jmxml/parse_entry.xml', testid, 'b')

    _.getxml = fetch_xml
    _.jmparser = jmxml.Jmparser(KW)
def main(args, opts):
    """Parse the XML file named by args[0] and write its items via pgi.

    args -- list whose first item is the input XML file name.
    opts -- options object; reads .database, .lang (comma-separated
        language keywords), .tempdir, .logfile, .encoding, .begin,
        .count, .extract, .sequence (two items, passed as seqnum_init
        and seqnum_incr), .corpus, .output and .keep.

    The keyword table comes from the database when opts.database is
    given, otherwise from the standard csv directory; either way it is
    stored in the module global KW.  A '.' is written to stdout every
    1800 entries as a progress marker.

    Raises ValueError if an entry has no 'src' value and no default
    corpus id has been determined by the time it is processed.
    """
    global KW
    if opts.database:
        jdb.dbOpen(opts.database, **jdb.dbopts(opts))
        KW = jdb.KW
    else:
        jdb.KW = KW = jdb.Kwds(jdb.std_csv_dir())

    xlang = None
    if opts.lang:
        xlang = [KW.LANG[x].id for x in opts.lang.split(',')]

    # 'corpid' is only assigned when the 'root' item is processed and
    # pgi.parse_corpus_opt() succeeds.  A private sentinel distinguishes
    # "never determined" from any value parse_corpus_opt() may return.
    no_corpus = object()
    corpid = no_corpus
    eid = 0

    #FIXME: we open the xml file with utf-8 encoding even though
    # its encoding may be given within the file and may be different.
    with open(args[0], encoding='utf-8') as xmlfile:
        inpf = jmxml.JmdictFile(xmlfile)
        tmpfiles = pgi.initialize(opts.tempdir)
        if not opts.logfile:
            logfile = sys.stderr
        else:
            logfile = open(opts.logfile, "w", encoding=opts.encoding)
        jmparser = jmxml.Jmparser(KW, logfile=logfile)
        for typ, entr in jmparser.parse_xmlfile(
                inpf, opts.begin, opts.count, opts.extract, xlang,
                toptag=True,
                seqnum_init=opts.sequence[0],
                seqnum_incr=opts.sequence[1]):
            if typ == 'entry':
                eid += 1
                if not ((eid - 1) % 1800):
                    sys.stdout.write('.')
                    sys.stdout.flush()
                    logfile.flush()
                if not getattr(entr, 'src', None):
                    if corpid is no_corpus:
                        # Previously this surfaced as an obscure
                        # UnboundLocalError on 'corpid'.
                        raise ValueError(
                            "entry has no 'src' value and no default "
                            "corpus could be determined")
                    entr.src = corpid
                jdb.setkeys(entr, eid)
                pgi.wrentr(entr, tmpfiles)
            elif typ == 'corpus':
                pgi.wrcorp(entr, tmpfiles)
            elif typ == 'grpdef':
                pgi.wrgrpdef(entr, tmpfiles)
            elif typ == 'root':
                # Note that 'entr' here is actually the tag name of the
                # top-level element in the xml file, typically either
                # "JMdict" or "JMnedict".
                try:
                    corpid, corprec = pgi.parse_corpus_opt(
                        opts.corpus, entr, inpf.created, kw=KW)
                except KeyError:
                    # No default corpus; entries must then carry their
                    # own 'src' value (checked above).
                    pass
                else:
                    if corprec:
                        pgi.wrcorp(corprec, tmpfiles)
    sys.stdout.write('\n')
    pgi.finalize(tmpfiles, opts.output, not opts.keep)
def setUp(_):
    """Attach the shared jdb.Kwds object to the test instance as '.o'.

    The object is built on first use and cached in the module global
    Test_actual_csv_test_object, so later calls reuse the same one.
    """
    global Test_actual_csv_test_object
    shared = Test_actual_csv_test_object
    if shared is None:
        shared = jdb.Kwds(jdb.std_csv_dir())
        Test_actual_csv_test_object = shared
    _.o = shared
def main():
    """Load the keyword table, then build the lexer and the parser.

    jelparse.create_parser() is called with write_tables=1 and the table
    module name 'jelparse_tab'; its return value is not used.
    """
    jdb.KW = jdb.Kwds(jdb.std_csv_dir())
    # The lexer's debug argument was written as 0>>8, which is simply 0.
    lexer, tokens = jellex.create_lexer(debug=0)
    parser_options = dict(
        module=jelparse,
        tabmodule='jelparse_tab',
        write_tables=1,
        optimize=0,
        debug=1,
    )
    jelparse.create_parser(lexer, tokens, **parser_options)