def rmgroup(rmg, langs=None):
    """Parse one reading/meaning group into readings, glosses and cinf items.

    Args:
        rmg: Element-like object providing ``findall()``.  Its 'reading'
            and 'meaning' children are processed.
        langs: Optional collection of language keyword ids.  When given
            (and non-empty) only glosses whose language id is in it are
            kept; otherwise every gloss is kept.

    Returns:
        A ``(rdngs, glosses, cinf)`` tuple of lists holding ``jdb.Rdng``,
        ``jdb.Gloss`` and ``jdb.Cinf`` objects respectively.

    Raises:
        KeyError: If a reading's type attribute is not one of the handled
            values, or if a keyword lookup in ``KW`` fails.
    """
    rdngs = []
    glosses = []
    cinf = []

    seen = {}
    for elem in rmg.findall('reading'):
        rtype = None
        rstat = None
        # Both 'r_type' and 'on_type' set the reading type; whichever is
        # iterated last wins.
        for attr_name, attr_value in elem.items():
            if attr_name in ('r_type', 'on_type'):
                rtype = attr_value
            elif attr_name == 'r_status':
                rstat = attr_value

        if rtype in ('pinyin', 'korean_r', 'korean_h', 'vietnam'):
            # These readings are recorded as Cinf items, de-duplicated on
            # the (type, text) pair.
            if (rtype, elem.text) in seen:
                warn("Duplicate reading ignored: %s, %s" % (rtype, elem.text))
                continue
            seen[(rtype, elem.text)] = True
            cinf.append(jdb.Cinf(kw=KW.CINF[rtype].id, value=elem.text))
        elif rtype in ('ja_on', 'ja_kun'):
            # These become Rdng objects, de-duplicated on the text alone.
            if elem.text in seen:
                warn('Duplicate reading ignored: %s' % elem.text)
                continue
            seen[elem.text] = True
            rdng = jdb.Rdng(txt=elem.text, _inf=[])
            # Tag with the reading type.  Use 'rtype' explicitly rather than
            # the attribute-loop variable, which holds whatever attribute
            # happened to be iterated last (e.g. the r_status value).
            rdng._inf.append(
                jdb.Rinf(kw=KW.RINF[Xml2db.RINF.get(rtype, rtype)].id))
            if rstat:
                rdng._inf.append(
                    jdb.Rinf(kw=KW.RINF[Xml2db.RINF.get(rstat, rstat)].id))
            rdngs.append(rdng)
        else:
            raise KeyError('Unkown r_type attribute: %s' % rtype)

    seen = {}
    for elem in rmg.findall('meaning'):
        lang = elem.get('m_lang', 'en')
        # Resolve the language first so an unknown language raises even
        # when the meaning is a duplicate.
        langkw = KW.LANG[Xml2db.LANG.get(lang, lang)].id
        if (lang, elem.text) in seen:
            warn("Duplicate lang,meaning pair ignored: %s:%s"
                 % (lang, elem.text))
            continue
        seen[(lang, elem.text)] = True
        if not langs or langkw in langs:
            glosses.append(jdb.Gloss(txt=elem.text, lang=langkw, ginf=1))

    return rdngs, glosses, cinf
def bld_rdng(r, taglist=None):
    """Apply a list of parsed tags to the reading object *r*.

    Args:
        r: Reading object that receives the tag information as attributes
            (``_inf``, ``_FREQ``, ``_RESTR``, ``_NOKANJI``).
        taglist: Optional list of tags.  Each tag is a list whose first
            item is the tag type, or None when the type has to be looked
            up from the tag text.  NOTE: the first item is popped off each
            tag, so the caller's tag lists are modified.

    Returns:
        A string with one error message per line; empty when no problems
        were found.
    """
    if taglist is None:
        taglist = []
    conflict_msg = "Can't use both kanji and \"nokanji\" in 'restr' tags"
    errs = []
    nokanj = False
    for t in taglist:
        typ = t.pop(0)
        if typ is None:
            # Untyped tag: resolve it among the reading-applicable domains.
            v = lookup_tag(t[0], ('RINF', 'FREQ'))
            if not v:
                errs.append("Unknown reading tag '%s'" % t[0])
            else:
                typ, t = v[0][0], v[0][1:]
        if typ == 'RINF':
            append(r, '_inf', jdb.Rinf(kw=t[0]))
        elif typ == 'FREQ':
            # _freq objects are referenced by both the reading and
            # kanji _freq lists.  Since we don't have access to
            # the kanj here, temporarily save the freq (kw, value)
            # tuple in attribute "._FREQ".  When the full entry is
            # processed, the info in here will be removed, merged
            # with parallel info from the kanj objects, and proper
            # ._freq objects created.
            append(r, '_FREQ', (t[0], t[1]))
        elif typ == 'RESTR':
            # We can't generate real restr records here because the real
            # records are the disallowed kanji.  We have the allowed
            # kanji here and need the set of all kanji in order to get
            # the disallowed set, and we don't have that now.  So we
            # just save the allowed kanji as given, and will convert it
            # after the full entry is built and we have all the info we
            # need.
            #
            # With the type popped, the tag must hold exactly two items:
            # a reading text and a kanji text.  For a reading restr only
            # the kanji text is expected to be set.
            rtxt, ktxt = t
            if rtxt == "nokanji":
                nokanj = True
                r._NOKANJI = 1
                # Catch the conflict when "nokanji" comes *after* a kanji
                # restriction; the check below only sees the other order.
                if getattr(r, '_RESTR', None):
                    errs.append(conflict_msg)
                continue
            if rtxt:
                errs.append("Reading restrictions must be kanji only: " + rtxt)
            append(r, "_RESTR", ktxt)
            if hasattr(r, '_RESTR') and nokanj:
                errs.append(conflict_msg)
        elif typ:
            errs.append("Cannot use '%s' tag in a reading" % typ)
    return "\n".join(errs)
def reading_meaning(rm, rdng, sens, cinf, langs):
    """Process a reading/meaning element, extending the given lists in place.

    Args:
        rm: Element-like object providing ``findall()``.  Its 'rmgroup'
            and 'nanori' children are processed.
        rdng: List of reading objects; new readings are appended.
        sens: List of senses; one ``jdb.Sens`` is appended per rmgroup.
        cinf: List of cinf items; items from every rmgroup are appended.
        langs: Language filter passed through to ``rmgroup()``.

    Returns:
        None.  Results are delivered by mutating *rdng*, *sens* and *cinf*.
    """
    KW_NANORI = KW.RINF[Xml2db.RINF.get('nanori', 'nanori')].id

    for group in rm.findall('rmgroup'):
        group_rdngs, group_glosses, group_cinf = rmgroup(group, langs)
        rdng.extend(group_rdngs)
        sens.append(jdb.Sens(_gloss=group_glosses))
        # Collect cinf per group: doing this after the loop would raise
        # NameError when there is no rmgroup and would drop the cinf items
        # of all but the last group.
        cinf.extend(group_cinf)

    # Index the readings parsed so far by their text.
    rlookup = {r.txt: r for r in rdng}

    # A nanori reading may duplicate an on/kun reading that was already
    # parsed.  If so, just add the nanori RINF tag to the existing reading;
    # otherwise create a new reading record.
    for elem in rm.findall('nanori'):
        r = rlookup.get(elem.text)
        if r is None:
            # Not seen before: create a new reading and remember it.
            r = jdb.Rdng(txt=elem.text)
            rdng.append(r)
            rlookup[r.txt] = r
        elif any(i.kw == KW_NANORI for i in getattr(r, '_inf', [])):
            # Already tagged as nanori, so this instance is a duplicate.
            warn('Duplicate nanori reading: "%s"' % elem.text)
            continue
        # Here 'r' is either new or a previously seen non-nanori reading.
        if not hasattr(r, '_inf'):
            r._inf = []
        r._inf.append(jdb.Rinf(kw=KW_NANORI))