def main(args, opts):
    """Parse an examples file and write its entries to loader work files.

    args -- Sequence whose first item is the path of the examples file.
    opts -- Options object.  The attributes read here are: logfile,
        encoding, verbose, corpus, tempdir, noaction, begin, count,
        output and keep.

    Side effects: binds the module globals 'Opts', 'KW' (also stored
    as jdb.KW) and 'msg', and prints a progress dot to stdout for
    every 2000 entries.
    """
    global msg
    global Opts
    global KW
    Opts = opts
    jdb.KW = KW = jdb.Kwds(jdb.std_csv_dir())

    # Create a globally accessible function, msg(), that has
    # 'logfile' and 'opts.verbose' already bound and which will
    # be called elsewhere when there is a need to write a message
    # to the logfile.  The logfile is deliberately left open since
    # msg() keeps a reference to it after this function returns.
    logfile = sys.stderr
    if opts.logfile:
        logfile = open(opts.logfile, "w", encoding=opts.encoding)

    def msg(message):
        _msg(logfile, opts.verbose, message)

    fin = ABPairReader(args[0], encoding='utf-8')
    # FIXME: following gives localtime, change to utc or lt+tz.
    # Item 8 of the os.stat() result tuple is the modification time.
    mtime = datetime.date.fromtimestamp(os.stat(args[0])[8])
    corpid, corprec = pgi.parse_corpus_opt(
        opts.corpus, "examples", mtime, KW.SRCT['examples'].id)

    # Only set up the work files when we are really going to write
    # something.  (Previously pgi.initialize() was additionally called
    # unconditionally, so an extra set of work files was always created
    # and never handed to pgi.finalize().)
    write_output = not opts.noaction
    tmpfiles = None
    if write_output:
        tmpfiles = pgi.initialize(opts.tempdir)
        if corprec:
            pgi.wrcorp(corprec, tmpfiles)

    for eid, entr in enumerate(parse_ex(fin, opts.begin)):
        if write_output:
            entr.src = corpid
            jdb.setkeys(entr, eid + 1)
            pgi.wrentr(entr, tmpfiles)
        # Progress indicator: one dot per 2000 entries.
        if not (eid % 2000):
            sys.stdout.write('.')
            sys.stdout.flush()
        # A false 'opts.count' (None or 0) means "no limit".
        if opts.count and eid + 1 >= opts.count:
            break
    sys.stdout.write('\n')
    if write_output:
        pgi.finalize(tmpfiles, opts.output, not opts.keep)
def main(args, opts):
    """Parse a JMdict-style XML file and write its contents to work files.

    args -- Sequence whose first item is the path of the XML file.
    opts -- Options object.  The attributes read here are: database,
        lang, tempdir, logfile, encoding, begin, count, extract,
        sequence (a pair: initial value, increment), corpus, output
        and keep.

    Side effects: binds the module global 'KW' (and jdb.KW when no
    database is given), and prints a progress dot to stdout for every
    1800 entries.

    Raises ValueError if an entry has no 'src' of its own and no
    default corpus could be determined from the XML root element.
    """
    global KW
    if opts.database:
        jdb.dbOpen(opts.database, **jdb.dbopts(opts))
        KW = jdb.KW
    else:
        jdb.KW = KW = jdb.Kwds(jdb.std_csv_dir())

    xlang = None
    if opts.lang:
        xlang = [KW.LANG[x].id for x in opts.lang.split(',')]

    # Default corpus id for entries that do not carry their own 'src'.
    # It only becomes known when the 'root' item has been processed
    # successfully, so track that explicitly rather than relying on
    # the variable happening to be bound.
    corpid = None
    have_corpid = False

    #FIXME: we open the xml file with utf-8 encoding even though
    # its encoding may be given within the file and may be different.
    xmlfile = open(args[0], encoding='utf-8')
    logfile = sys.stderr
    try:
        inpf = jmxml.JmdictFile(xmlfile)
        tmpfiles = pgi.initialize(opts.tempdir)
        if opts.logfile:
            logfile = open(opts.logfile, "w", encoding=opts.encoding)
        eid = 0
        jmparser = jmxml.Jmparser(KW, logfile=logfile)
        items = jmparser.parse_xmlfile(
            inpf, opts.begin, opts.count, opts.extract, xlang,
            toptag=True,
            seqnum_init=opts.sequence[0],
            seqnum_incr=opts.sequence[1])
        for typ, entr in items:
            if typ == 'entry':
                eid += 1
                # Progress indicator: one dot per 1800 entries.
                if not ((eid - 1) % 1800):
                    sys.stdout.write('.')
                    sys.stdout.flush()
                    logfile.flush()
                if not getattr(entr, 'src', None):
                    if not have_corpid:
                        raise ValueError(
                            "entry %d has no corpus and no default corpus "
                            "could be determined from the XML root element"
                            % eid)
                    entr.src = corpid
                jdb.setkeys(entr, eid)
                pgi.wrentr(entr, tmpfiles)
            elif typ == 'corpus':
                pgi.wrcorp(entr, tmpfiles)
            elif typ == 'grpdef':
                pgi.wrgrpdef(entr, tmpfiles)
            elif typ == 'root':
                # Note that 'entr' here is actually the tag name of the
                # top-level element in the xml file, typically either
                # "JMdict" or "JMnedict".
                try:
                    corpid, corprec = pgi.parse_corpus_opt(
                        opts.corpus, entr, inpf.created, kw=KW)
                except KeyError:
                    # Best effort: no default corpus is available.  This
                    # only matters if a later entry lacks its own 'src'.
                    pass
                else:
                    have_corpid = True
                    if corprec:
                        pgi.wrcorp(corprec, tmpfiles)

        sys.stdout.write('\n')
        pgi.finalize(tmpfiles, opts.output, not opts.keep)
    finally:
        # Release the files this function opened; never close stderr.
        xmlfile.close()
        if logfile is not sys.stderr:
            logfile.close()
def parse_xmlfile(infn, srcid, workfiles, start, count, langs):
    """Parse a kanjidic XML file and write out its <character> entries.

    Uses ElementTree.iterparse() to walk the file incrementally.  For
    each completed <character> element, do_chr() builds the entry,
    jdb.setkeys() numbers it and pgi.wrentr() writes it to 'workfiles'.

    infn -- Path of the XML file (opened as utf-8).
    srcid, langs -- Passed through unchanged to do_chr().
    workfiles -- Passed through unchanged to pgi.wrentr().
    start -- Line number (the -b option); <character> elements that
        begin before this line are skipped.  A false value (None or 0)
        means nothing is skipped.
    count -- Maximum number of entries to process (the -c option).
        None means no limit.

    Returns the text of the header's <date_of_creation> element, or
    None if no <header> element was seen.

    Side effects: updates the module global 'Lineno' (used by warn()
    for its messages) and writes progress output to stderr.
    """
    global Lineno
    xmldate = None
    # The 'with' guarantees the input file is closed however we leave.
    with open(infn, encoding='utf-8') as rawfile:
        inpf = LnFile(rawfile)
        context = iter(ElementTree.iterparse(inpf, ("start", "end")))
        # The first event delivers the top-level element; keep a
        # reference so already-processed children can be discarded.
        event, root = next(context)
        if start and start > 1:
            print("Skipping initial entries...", file=sys.stderr)
        cntr = 0
        for event, elem in context:
            # We get here every time a tag is opened (event will be
            # "start") or closed (event will be "end").  "elem" is the
            # element, which is empty on "start" and contains all of
            # its attributes and child elements on "end".

            if elem.tag == "character" and event == "start":
                # Remember the line the <character> tag is on, for
                # warning messages created by warn().
                Lineno = inpf.lineno
                # While skipping entries cntr stays 0.  Otherwise, stop
                # once the number of entries requested with the -c
                # option has been processed.
                if count is not None and cntr >= count:
                    break

            if elem.tag == 'header' and event == 'end':
                xmldate = elem.find('date_of_creation').text
                dbversion = elem.find('database_version').text
                if (elem.find('file_version').text != '4'
                        or dbversion != KANJIDIC_VERSION):
                    warn('Kanjidic XML version is %s but we expected %s.'
                         '\nThis program may or may not work on this file.'
                         % (dbversion, KANJIDIC_VERSION))

            # From here on we only care about the "end" event of
            # <character> elements.
            if elem.tag != "character" or event != "end":
                continue

            # If we haven't reached the starting line number (given by
            # the -b option) yet, don't process this entry, but still
            # clear the parsed data below to avoid excessive memory
            # consumption.
            if not start or Lineno >= start:
                cntr += 1
                if cntr == 1:
                    print("Parsing...", file=sys.stderr)
                # Process and write this entry.
                entr = do_chr(elem, srcid, langs)
                jdb.setkeys(entr, cntr)
                pgi.wrentr(entr, workfiles)
                # A progress bar.  The modulo number is picked to
                # provide slightly less than 80 dots for a full
                # kanjidic2 file.
                if (cntr - 1) % 166 == 0:
                    sys.stderr.write(".")

            # We no longer need the parsed xml info for this item so
            # dump it to reduce memory consumption.
            root.clear()
    return xmldate