def p_entr_1(p):
    '''entr : preentr'''
    # NOTE(review): the docstring above is kept verbatim; it is presumably
    # the grammar production consumed by the parser generator -- confirm
    # before ever editing it.
    p.lexer.begin('INITIAL')
    entry = p[1]

    # Freq objects attached to readings are independent of the ones
    # attached to kanji; merge_freqs() merges the values they share.
    merge_freqs(entry)

    # Foreign key ids must be assigned first because mk_restrs() below
    # relies on them.
    jdb.setkeys(entry, None)

    # At this point the reading and sense restrictions are plain lists of
    # text strings naming the *allowed* readings or kanji.  mk_restrs()
    # rewrites them into the canonical form, which records the index
    # numbers of the *disallowed* readings or kanji.  Each row below is
    # (restriction attribute, attribute of the restricted items,
    # attribute of the items they are restricted to); a conversion is
    # only attempted when the entry has both lists.
    conversions = (
        ("_RESTR", '_rdng', '_kanj'),
        ("_STAGK", '_sens', '_kanj'),
        ("_STAGR", '_sens', '_rdng'),
    )
    for restr_attr, owner_attr, target_attr in conversions:
        if not (hasattr(entry, owner_attr) and hasattr(entry, target_attr)):
            continue
        problem = mk_restrs(restr_attr,
                            getattr(entry, owner_attr),
                            getattr(entry, target_attr))
        if problem:
            perror(p, problem, loc=False)

    # The entry handed back may carry an _XREF list on its senses, but
    # those xref records are incomplete.  Database access is not assumed
    # while parsing, so the xrefs cannot be looked up here to find the
    # target entry id numbers, to check that the kanji/reading (if given)
    # are unique, that the target senses exist, and so on.  The caller is
    # expected to resolve the xrefs (with something like
    # jdb.resolv_xref()) before using the object.
    p[0] = entry
def main(args, opts):
    """Parse an examples file and write its entries to the work files.

    Args:
        args: positional command-line arguments; args[0] is the path of
            the examples file to read.
        opts: parsed options.  The attributes read here are logfile,
            encoding, verbose, corpus, tempdir, noaction, begin, count,
            output and keep.

    Side effects:
        Rebinds the module globals msg, Opts and KW (and jdb.KW), and
        writes progress dots to stdout.
    """
    global msg
    global Opts
    global KW
    Opts = opts
    jdb.KW = KW = jdb.Kwds(jdb.std_csv_dir())

    # Create a globally accessible function, msg(), that has 'logfile'
    # and 'opts.verbose' already bound and which is called elsewhere
    # whenever a message has to be written to the logfile.  The logfile
    # is deliberately left open: msg() keeps a reference to it.
    logfile = sys.stderr
    if opts.logfile:
        logfile = open(opts.logfile, "w", encoding=opts.encoding)

    def msg(message):
        _msg(logfile, opts.verbose, message)

    fin = ABPairReader(args[0], encoding='utf-8')
    # FIXME: following gives localtime, change to utc or lt+tz.
    mtime = datetime.date.fromtimestamp(os.stat(args[0]).st_mtime)
    corpid, corprec = pgi.parse_corpus_opt(
        opts.corpus, "examples", mtime, KW.SRCT['examples'].id)

    # The work files used to be initialized twice (once unconditionally
    # and once more when not opts.noaction), discarding the first result.
    # A single call is enough; it stays unconditional so that the
    # --noaction path behaves exactly as before.
    tmpfiles = pgi.initialize(opts.tempdir)
    if not opts.noaction and corprec:
        pgi.wrcorp(corprec, tmpfiles)

    for eid, entr in enumerate(parse_ex(fin, opts.begin)):
        if not opts.noaction:
            entr.src = corpid
            jdb.setkeys(entr, eid + 1)
            pgi.wrentr(entr, tmpfiles)
        # Progress indicator: one dot per 2000 entries.
        if not (eid % 2000):
            sys.stdout.write('.')
            sys.stdout.flush()
        # Stop once the requested number of entries has been handled.
        if opts.count and eid + 1 >= opts.count:
            break
    sys.stdout.write('\n')

    if not opts.noaction:
        pgi.finalize(tmpfiles, opts.output, not opts.keep)
def test_000030(_):
    # Scenario: the first reading (rdng 1) is deleted from the edited entry.
    parent, edited = _.getpair(3000010)
    del edited._rdng[0]
    jdb.setkeys(edited)

    returned = realign_xrers(edited, parent)

    # The xref is expected in the list handed back by realign_xrers()
    # and no longer on the first sense of the edited entry.
    expected = [Xref(3000020, 1, 1, 3, 3000010, 1, 1, None, None)]
    _.assertEqual(returned, expected)
    _.assertEqual(edited._sens[0]._xrer, [])
def test_000120(_):
    # Scenario: the first kanji (kanj 1) is deleted from the edited entry.
    parent, edited = _.getpair(3000050)
    del edited._kanj[0]
    jdb.setkeys(edited)

    returned = realign_xrers(edited, parent)

    # The xref is expected in the list handed back by realign_xrers()
    # and no longer on the first sense of the edited entry.
    expected = [Xref(3000060, 1, 1, 3, 3000050, 1, 1, 1, None)]
    _.assertEqual(returned, expected)
    _.assertEqual(edited._sens[0]._xrer, [])
def test_000020(_):
    # Scenario: readings 1 and 2 trade places in the edited entry.
    parent, edited = _.getpair(3000010)
    edited._rdng = edited._rdng[::-1]
    jdb.setkeys(edited)

    returned = realign_xrers(edited, parent)

    # Nothing is expected back from realign_xrers(); the xref is expected
    # to stay on the first sense with its rdng field now reading 2.
    _.assertEqual(returned, [])
    expected = [Xref(3000020, 1, 1, 3, 3000010, 1, 2, None, None)]
    _.assertEqual(edited._sens[0]._xrer, expected)
def test_000090(_):
    # Scenario: the kanji of the edited entry are put in reverse order.
    parent, edited = _.getpair(3000050)
    edited._kanj = edited._kanj[::-1]
    jdb.setkeys(edited)

    returned = realign_xrers(edited, parent)

    # Nothing is expected back from realign_xrers(); the xref is expected
    # to stay on the first sense with its kanj field now reading 2.
    _.assertEqual(returned, [])
    expected = [Xref(3000060, 1, 1, 3, 3000050, 1, 1, 2, None)]
    _.assertEqual(edited._sens[0]._xrer, expected)
def main(args, opts):
    """Parse a JMdict-style XML file and write its contents to work files.

    Args:
        args: positional command-line arguments; args[0] is the path of
            the XML file to read.
        opts: parsed options.  The attributes read here are database,
            lang, tempdir, logfile, encoding, begin, count, extract,
            sequence, corpus, output and keep.

    Raises:
        ValueError: an entry without a 'src' value was parsed but no
            corpus id could be derived from opts.corpus and the file's
            root element.  (This situation previously surfaced as an
            UnboundLocalError on 'corpid'.)

    Side effects:
        Rebinds the module global KW (and possibly jdb.KW) and writes
        progress dots to stdout.
    """
    global KW
    if opts.database:
        jdb.dbOpen(opts.database, **jdb.dbopts(opts))
        KW = jdb.KW
    else:
        jdb.KW = KW = jdb.Kwds(jdb.std_csv_dir())

    xlang = None
    if opts.lang:
        xlang = [KW.LANG[x].id for x in opts.lang.split(',')]

    # Marks "no corpus id determined yet".  A dedicated object is used
    # rather than None so that whatever pgi.parse_corpus_opt() returns is
    # passed through unchanged.
    no_corpus = object()
    corpid = no_corpus

    # FIXME: we open the xml file with utf-8 encoding even though
    #  its encoding may be given within the file and may be different.
    with open(args[0], encoding='utf-8') as xmlfile:
        inpf = jmxml.JmdictFile(xmlfile)
        tmpfiles = pgi.initialize(opts.tempdir)
        if not opts.logfile:
            logfile = sys.stderr
        else:
            logfile = open(opts.logfile, "w", encoding=opts.encoding)

        eid = 0
        jmparser = jmxml.Jmparser(KW, logfile=logfile)
        for typ, entr in jmparser.parse_xmlfile(
                inpf, opts.begin, opts.count, opts.extract, xlang,
                toptag=True,
                seqnum_init=opts.sequence[0], seqnum_incr=opts.sequence[1]):
            if typ == 'entry':
                eid += 1
                # Progress indicator: one dot per 1800 entries.
                if not ((eid - 1) % 1800):
                    sys.stdout.write('.')
                    sys.stdout.flush()
                    logfile.flush()
                if not getattr(entr, 'src', None):
                    if corpid is no_corpus:
                        raise ValueError(
                            "entry %d has no 'src' value and no corpus id "
                            "could be determined from the corpus option "
                            "and the XML root element" % eid)
                    entr.src = corpid
                jdb.setkeys(entr, eid)
                pgi.wrentr(entr, tmpfiles)
            elif typ == 'corpus':
                pgi.wrcorp(entr, tmpfiles)
            elif typ == 'grpdef':
                pgi.wrgrpdef(entr, tmpfiles)
            elif typ == 'root':
                # Note that 'entr' here is actually the tag name of the
                # top-level element in the xml file, typically either
                # "JMdict" or "JMnedict".
                try:
                    corpid, corprec = pgi.parse_corpus_opt(
                        opts.corpus, entr, inpf.created, kw=KW)
                except KeyError:
                    # Best effort: 'corpid' stays unset, which is only a
                    # problem if an entry without its own 'src' shows up.
                    pass
                else:
                    if corprec:
                        pgi.wrcorp(corprec, tmpfiles)

    sys.stdout.write('\n')
    pgi.finalize(tmpfiles, opts.output, not opts.keep)
def test_000140(_):
    # Scenario: the kanji are reversed and the first reading is deleted.
    parent, edited = _.getpair(3000050)
    edited._kanj = edited._kanj[::-1]
    del edited._rdng[0]
    jdb.setkeys(edited)

    returned = realign_xrers(edited, parent)

    # The kanj number in the returned xref stays 1 rather than becoming
    # 2: realign() handles rdng before kanj.  (Compare test_000130.)
    expected = [Xref(3000060, 1, 1, 3, 3000050, 1, 1, 1, None)]
    _.assertEqual(returned, expected)
    _.assertEqual(edited._sens[0]._xrer, [])
def test_000130(_):
    # Scenario: the readings are reversed and the first kanji is deleted.
    parent, edited = _.getpair(3000050)
    edited._rdng = edited._rdng[::-1]
    del edited._kanj[0]
    jdb.setkeys(edited)

    returned = realign_xrers(edited, parent)

    # The rdng number in the returned xref does change from 1 to 2:
    # realign() handles rdng before kanj.  (Compare test_000140.)
    expected = [Xref(3000060, 1, 1, 3, 3000050, 1, 2, 1, None)]
    _.assertEqual(returned, expected)
    _.assertEqual(edited._sens[0]._xrer, [])
def parse_xmlfile(infn, srcid, workfiles, start, count, langs):
    """Parse a kanjidic XML file and write one entry per <character>.

    The file is read incrementally with ElementTree.iterparse().  For
    every completed <character> element at or beyond line 'start',
    do_chr() builds the entry, jdb.setkeys() numbers it and pgi.wrentr()
    writes it to 'workfiles'.

    Args:
        infn: path of the XML file (opened as utf-8).
        srcid: passed through to do_chr().
        workfiles: passed through to pgi.wrentr().
        start: line number (the -b option) below which <character>
            elements are skipped.  A falsy value (None or 0) means
            nothing is skipped.
        count: maximum number of entries to process (the -c option).
        langs: passed through to do_chr().

    Returns:
        The text of the <date_of_creation> element inside <header>, or
        None when no complete <header> was seen before parsing stopped
        (this case previously raised UnboundLocalError).

    Side effects:
        Updates the module global Lineno (used for warning messages
        created by warn()) and writes progress output to stderr.
    """
    global Lineno

    # Stays None if no <header> element is ever completed.
    xmldate = None
    # The banner check below already treats 'start' as possibly falsy;
    # normalise it once so the line comparison further down cannot fail
    # on None.
    first_line = start or 0

    with open(infn, encoding='utf-8') as xmlfile:
        inpf = LnFile(xmlfile)
        context = iter(ElementTree.iterparse(inpf, ("start", "end")))
        # The first event delivers the root element; it is kept so that
        # processed children can be discarded from it later.
        event, root = next(context)
        if start and start > 1:
            print("Skipping initial entries...", file=sys.stderr)
        cntr = 0
        for event, elem in context:
            # We get here every time a tag is opened (event is "start")
            # or closed (event is "end").  "elem" is the element object;
            # it is empty on "start" and holds all of the element's
            # attributes and child elements on "end".  elem.tag is the
            # name of the tag.
            if elem.tag == "character" and event == "start":
                # Remember where this <character> begins.
                Lineno = inpf.lineno    # For warning messages created by warn().
                # While entries are still being skipped cntr is 0.
                # Otherwise stop once the number of entries requested
                # with the -c option has been processed.
                if cntr >= count:
                    break

            if elem.tag == 'header' and event == 'end':
                xmldate = elem.find('date_of_creation').text
                if (elem.find('file_version').text != '4'
                        or elem.find('database_version').text
                        != KANJIDIC_VERSION):
                    warn('Kanjidic XML version is %s but we expected %s.'
                         '\nThis program may or may not work on this file.'
                         % (elem.find('database_version').text,
                            KANJIDIC_VERSION))

            # From here on only the "end" events of <character>
            # elements are of interest.
            if elem.tag != "character" or event != "end":
                continue

            # Entries located before the starting line (the -b option)
            # are not processed, but the parsed data still has to be
            # cleared below to avoid excessive memory consumption.
            if Lineno >= first_line:
                cntr += 1
                if cntr == 1:
                    print("Parsing...", file=sys.stderr)

                # Process and write this entry.
                entr = do_chr(elem, srcid, langs)
                jdb.setkeys(entr, cntr)
                pgi.wrentr(entr, workfiles)

                # A progress bar.  The modulo number is picked to give
                # slightly less than 80 dots for a full kanjidic2 file.
                if (cntr - 1) % 166 == 0:
                    sys.stderr.write(".")

            # The parsed xml info for this item is no longer needed, so
            # drop it to reduce memory consumption.
            root.clear()

    return xmldate