def reformat(ktxt, rtxt, stxt, entr):
    """Round-trip edict2-formatted entry text into JEL-formatted text.

    The kanji, reading and sense strings are parsed (in that order) into
    jmdictdb objects and then formatted back to JEL text.  Readings are
    only parsed if the kanji parsed, and senses only if the readings
    parsed.

    The conversion is all-or-nothing: the JEL strings are produced only
    when all three parts parse.  Otherwise every returned string is the
    corresponding input prefixed with "!unparsed!" and a newline.

    Args:
        ktxt: edict2-formatted kanji text.
        rtxt: edict2-formatted reading text.
        stxt: edict2-formatted sense text.
        entr: An existing entry, or a false value.  When true, its
            '_rdng' and '_kanj' lists are handed to copy_tags() together
            with the freshly parsed readings and kanji (per the original
            author's note this carries over kinf, rinf, freq and restr
            items, which the wwwjdic submission data does not include).

    Returns:
        A 3-tuple (kanji text, reading text, sense text).
    """

    def report(part, excep):
        # Printing the exception text can itself raise UnicodeError (e.g.
        # on an output stream that cannot encode it); fall back to a
        # fixed message so the failure is still reported.
        try:
            print("reformat %s failed: %s" % (part, str(excep)))
        except UnicodeError:
            print("reformat %s failed: (unprintable exception)" % part)

    kanjs = rdngs = None
    fmap = {}

    # Assume the worst and overwrite the following if things work ok...
    jktxt = "!unparsed!\n" + ktxt
    jrtxt = "!unparsed!\n" + rtxt
    jstxt = "!unparsed!\n" + stxt

    try:
        kanjs = edparse.parse_krpart(ktxt, fmap)
    except eParseError as excep:
        report("kanj", excep)

    # 'kanjs' is None if the kanji parse failed, in which case we can't
    # parse readings or senses.
    if kanjs is not None:
        try:
            rdngs = edparse.parse_krpart(rtxt, fmap, kanjs)
        except eParseError as excep:
            report("rdng", excep)

    # 'rdngs' is None if the reading parse failed (or was never tried),
    # in which case we can't parse senses.
    if rdngs is not None:
        if entr:
            # The wwwjdic submission data does not apparently include
            # tags from the original entry so we copy them here.
            copy_tags(entr._rdng, entr._kanj, rdngs, kanjs)
        e = jdb.Entr(_rdng=rdngs, _kanj=kanjs)
        try:
            edparse.parse_spart(stxt, e, fmap)
            senss = e._sens
            jktxt = fmtjel.kanjs(kanjs)
            jrtxt = fmtjel.rdngs(rdngs, kanjs)
            jstxt = fmtjel.senss(senss, kanjs, rdngs)
        except eParseError as excep:
            report("sens", excep)

    return jktxt, jrtxt, jstxt
def create_entr(cursor, parsed):
    """Build an Entr object from a dict of wwwjdic submission values.

    Per the original author's note, this creates the same kind of data
    that cgi/edform.py creates internally for the edit form template: an
    Entr object with some attached extra data ('ktxt', 'rtxt', 'stxt').
    The object is returned to the caller (who will serialize it and
    write it to a file).

    Args:
        cursor: Database cursor passed through to jmcgi.get_entrs().
        parsed: Submission values.  Keys read here: 'subtype', 'seqnum'
            (amendments only), and the optional lists 'headw', 'kana',
            'english', 'pos', 'misc' and 'crossref'.

    Raises:
        ParseError: An amendment names a seq# for which no entry was
            returned by the database lookup.
    """
    if parsed['subtype'] != 'new':
        # Any subtype other than 'new' is handled as an 'amend'.
        seqnum = parsed['seqnum']
        errs = []
        # FIXME: following assumes seqnum is an entry in jmdict.
        found = jmcgi.get_entrs(cursor, None, [seqnum], errs,
                                active=True, corpus='jmdict')
        if errs:
            print('\n'.join(errs))
        if not found:
            raise ParseError("Unable to get entry seq# %s from database" % seqnum)
        entr = found[0]
    else:
        entr = jdb.Entr()
        entr.src = jdb.KW.SRC['jmdict'].id

    # Headwords that jdb.jstr_reb() accepts are filed as readings, all
    # others as kanji; explicit 'kana' values follow the readings.
    kanj, rdng = [], []
    for headword in parsed.get('headw', []):
        bucket = rdng if jdb.jstr_reb(headword) else kanj
        bucket.append(headword)
    rdng.extend(parsed.get('kana', []))

    ktxt = ';'.join(kanj)
    rtxt = ';'.join(rdng)
    stxt = ' / '.join(parsed.get('english', []))
    pos = ','.join(parsed.get('pos', []))
    misc = ','.join(parsed.get('misc', []))
    xref = ','.join(parsed.get('crossref', []))

    #FIXME: Note that including pos, xref. et.al. can break
    # a sense parse that would otherwise be ok.  Maybe if the
    # parse fails, we should try again without this stuff,
    # and if that works, append this stuff as "unparsable"-
    # tagged extra text.
    # However, senses other than the first may have this
    # information embedded in the text and it seems a bit
    # much to try pulling it out...
    prefix = []
    if pos:
        prefix.append('(' + pos + ')')
    if xref:
        prefix.append('(See ' + xref + ')')
    if misc:
        prefix.append('(' + misc + ')')
    if prefix:
        # A single space separates the tag prefix from the gloss text.
        prefix.append(' ')
    stxt = ''.join(prefix) + stxt

    #FIXME: What do about 'date', 'entlangnam' fields?
    # I don't think we care about 'sendNotJS'.
    entr.ktxt, entr.rtxt, entr.stxt = reformat(ktxt, rtxt, stxt, entr)
    return entr
def mkentr(jtxt, etxt):
    """Create an entry object for the "A" line text of an example sentence.

    Args:
        jtxt: Japanese text.  It is stored as the entry's single reading
            when jdb.jstr_reb() accepts it, otherwise as its single kanji.
        etxt: English text, stored as the only gloss of the only sense.

    Returns:
        A jdb.Entr with stat KW.STAT_A, unap False, and 'srcnote' set to
        the string form of the module global 'Lnnum'.
    """
    global Lnnum  # Read only; presumably the current input line number.

    entry = jdb.Entr(stat=KW.STAT_A, unap=False)
    entry.srcnote = str(Lnnum)

    if not jdb.jstr_reb(jtxt):
        entry._kanj = [jdb.Kanj(txt=jtxt)]
    else:
        entry._rdng = [jdb.Rdng(txt=jtxt)]

    gloss = jdb.Gloss(txt=etxt, ginf=KW.GINF_equ, lang=KW.LANG_eng)
    entry._sens = [jdb.Sens(_gloss=[gloss])]
    return entry
def mkentr(jtxt, etxt, kwds):
    """Create an entry object for the "A" line text of an example sentence.

    Args:
        jtxt: Japanese text.  It is stored as the entry's single reading
            when jdb.jstr_reb() accepts it, otherwise as its single kanji.
        etxt: English text, stored as the only gloss of the only sense.
        kwds: Sequence of items, each a sequence holding a kw id number
            and, optionally, a note string as its second element.  Every
            id becomes a Misc item on the sense; the notes, joined with
            "; ", become the sense's 'notes' (None when there are none).

    Returns:
        A jdb.Entr with stat KW.STAT_A, unap False, and 'srcnote' set to
        the string form of the module global 'Lnnum'.
    """
    global Lnnum  # Read only; presumably the current input line number.

    entry = jdb.Entr(stat=KW.STAT_A, unap=False)
    entry.srcnote = str(Lnnum)

    kw_ids = [item[0] for item in kwds]
    joined_notes = "; ".join(item[1] for item in kwds if len(item) > 1)
    # An empty joined string means no item carried a note.
    sens_note = joined_notes if joined_notes else None

    if not jdb.jstr_reb(jtxt):
        entry._kanj = [jdb.Kanj(txt=jtxt)]
    else:
        entry._rdng = [jdb.Rdng(txt=jtxt)]

    gloss = jdb.Gloss(lang=KW.LANG_eng, ginf=KW.GINF_equ, txt=etxt)
    miscs = [jdb.Misc(kw=kw_id) for kw_id in kw_ids]
    entry._sens = [jdb.Sens(notes=sens_note, _gloss=[gloss], _misc=miscs)]
    return entry
def do_entr(self, elem, seq, xlit=False, xlang=None, corp_dict=None, grpdefs=None):
    """Create an entr object from a parsed ElementTree entry element.

    Args:
        elem: The ElementTree element for the entry.
        seq: Sequence number to give the entry.  When false (None, 0)
            the number is read from the element's <ent_seq> child.
        xlit, xlang: Passed through unchanged to self.do_senss().
        corp_dict: Mapping from corpus name to an object with an 'id'
            attribute.  Only consulted when the entry has an <ent_corp>
            element, in which case it must be provided.
        grpdefs: Passed through unchanged to self.do_groups().

    Returns:
        The new jdb.Entr object.

    Raises:
        ParseError: No <ent_seq> element when one is needed, an
            <ent_seq> value that is not an integer, a sequence number
            that is not positive, or an unrecognised 'status' value.

    Note that the entry object returned is different from one read
    from the database in the following respects:
    * The 'entr' record will have no .src (aka corpus) attribute
      if there is no <ent_corp> element in the entry.  In this
      case the .src attribute is expected to be added by the
      caller.  If there is a <ent_corp> element, it will be used
      to find a corpus in 'corp_dict', which in turn will provide
      an id number used in .src.
    * Items in sense's _xref list are unresolved xrefs, not
      resolved xrefs as in a database entr object.
      jdb.resolv_xref() or similar can be used to resolve the
      xrefs.
    * Attributes will be missing if the corresponding xml
      information is not present.  For example, if a particular
      entry has no <ke_ele> elements, the entr object will not
      have a '._kanj' attribute.  In an entr object read from
      the database, it will have a '._kanj' attribute with a
      value of [].
    * The entr object does not have many of the foreign key
      attributes: gloss.gloss, xref.xref, <anything>.entr, etc.
      However, it does have rdng.rdng, kanj.kanj, and sens.sens
      attributes since these are required when adding restr,
      stagr, stagk, and freq objects.
    """
    XKW = self.XKW

    entr = jdb.Entr()

    if not seq:
        elemseq = elem.find('ent_seq')
        if elemseq is None:
            raise ParseError("No <ent_seq> element found")
        try:
            # An empty <ent_seq/> has text None, for which int() raises
            # TypeError rather than ValueError.
            seq = int(elemseq.text)
        except (TypeError, ValueError) as err:
            raise ParseError("Invalid 'ent_seq' value, '%s'"
                             % elemseq.text) from err
    if seq <= 0:
        # Report the offending number itself, whether it was parsed
        # from <ent_seq> or supplied by the caller.
        raise ParseError("Invalid 'ent_seq' value, '%s'" % seq)
    entr.seq = seq

    entr_id = elem.get('id')
    if entr_id is not None:
        entr.id = int(entr_id)
    dfrm = elem.get('dfrm')
    if dfrm is not None:
        entr.dfrm = int(dfrm)

    # A missing or empty 'status' attribute defaults to the 'A' status.
    stat = elem.get('status') or jdb.KW.STAT['A'].id
    try:
        stat = XKW.STAT[stat].id
    except KeyError:
        raise ParseError("Invalid <status> element value, '%s'" % stat)
    entr.stat = stat
    entr.unap = elem.get('appr') == 'n'

    corpname = elem.findtext('ent_corp')
    if corpname is not None:
        entr.src = corp_dict[corpname].id

    # 'fmap' is filled in by do_kanjs() and do_rdngs() and then turned
    # into freq objects; any problems are reported as warnings.
    fmap = defaultdict(lambda: ([], []))
    self.do_kanjs(elem.findall('k_ele'), entr, fmap)
    self.do_rdngs(elem.findall('r_ele'), entr, fmap)
    if fmap:
        freq_errs = jdb.make_freq_objs(fmap, entr)
        for typ, r, k, kw, val in freq_errs:
            kwstr = XKW.FREQ[kw].kw + str(val)
            self.freq_warn(typ, r, k, kwstr)

    # Senses may appear as either <sense> or <trans> elements.
    self.do_senss(elem.findall('sense'), entr, xlit, xlang)
    self.do_senss(elem.findall('trans'), entr, xlit, xlang)
    self.do_info(elem.findall("info"), entr)
    self.do_audio(elem.findall("audio"), entr, jdb.Entrsnd)
    self.do_groups(elem.findall("group"), entr, grpdefs)
    return entr
def p_preentr_3(p):
    '''preentr : kanjsect NL NL senses'''
    # NOTE: the docstring above is the grammar production (yacc/PLY-style
    # rule function), not documentation; it must not be reworded.
    # This variant has a kanji section and senses but no reading section
    # between the two NL tokens.
    kanjs, senses = p[1], p[4]
    p[0] = jdb.Entr(_kanj=kanjs, _sens=senses)
def p_preentr_2(p):
    '''preentr : NL rdngsect NL senses'''
    # NOTE: the docstring above is the grammar production (yacc/PLY-style
    # rule function), not documentation; it must not be reworded.
    # This variant has a reading section and senses but no kanji section
    # ahead of the first NL token.
    rdngs, senses = p[2], p[4]
    p[0] = jdb.Entr(_rdng=rdngs, _sens=senses)
def p_preentr_1(p):
    '''preentr : kanjsect NL rdngsect NL senses'''
    # NOTE: the docstring above is the grammar production (yacc/PLY-style
    # rule function), not documentation; it must not be reworded.
    # This variant has all three parts: kanji section, reading section
    # and senses, separated by NL tokens.
    kanjs, rdngs, senses = p[1], p[3], p[5]
    p[0] = jdb.Entr(_kanj=kanjs, _rdng=rdngs, _sens=senses)
def main(args, opts):
    """Collect the entries to edit and render the edit form page.

    Entries come from the request's form values:
      entr -- serialized entry data (see serialize.unserialize()).
      j    -- entry text parsed by edparse.entr().
      e, q -- passed to jmcgi.get_entrs() as its second and third
              arguments, together with 'a' as 'active'.
    Other form values used:
      c    -- default corpus (id number or keyword) for entries that
              have no corpus.
      f    -- stored on each entry as NOCORPOPT (per the original
              comment: force the default corpus for new entries).
      dbg  -- passed through to the page template.

    If no entry source was given at all, a blank new entry is shown.
    If sources were given but produced no entries, or any other error
    was recorded, an error page is produced instead.

    'args' and 'opts' are not used by this function.
    """
    jdb.reset_encoding(sys.stdout, 'utf-8')
    errs = []
    entrs = []
    try:
        form, svc, dbg, cur, sid, sess, parms, cfg = jmcgi.parseform()
    except Exception as excep:
        jmcgi.err_page([str(excep)])

    fv = form.getfirst
    fl = form.getlist
    is_editor = jmcgi.is_editor(sess)
    dbg = fv('dbg')

    def_corp = fv('c')  # Default corpus for new entries.
    defcorpid = None
    if def_corp:
        # The corpus may be given as an id number or as a keyword.
        try:
            def_corp = int(def_corp)
        except ValueError:
            pass
        try:
            defcorpid = jdb.KW.SRC[def_corp].id
        except KeyError:
            errs.append("Bad url parameter: c=%s" % def_corp)
    force_corp = fv('f')  # Force default corpus for new entries.

    sentrs = fl("entr")
    for sentr in sentrs:
        try:
            restored = serialize.unserialize(sentr)
        except Exception as excep:
            errs.append("Bad 'entr' value, unable to unserialize: %s"
                        % str(excep))
        else:
            # Accumulate into 'entrs' rather than replacing it, so that
            # entries from earlier values are kept.
            # NOTE(review): the shape of the unserialized value is not
            # visible here; both a single entry and a list of entries
            # are accepted -- confirm against serialize.unserialize().
            if isinstance(restored, (list, tuple)):
                entrs.extend(restored)
            else:
                entrs.append(restored)

    jentrs = fl('j')
    for jentr in jentrs:
        try:
            entr = edparse.entr(jentr)
        except Exception as excep:
            errs.append("Bad 'j' value, unable to parse: %s" % str(excep))
        else:
            entr.src = None
            entrs.append(entr)

    elist, qlist, active = fl('e'), fl('q'), fv('a')
    if elist or qlist:
        entrs.extend(
            jmcgi.get_entrs(cur, elist or [], qlist or [], errs,
                            active=active, corpus=def_corp) or [])
    cur.close()

    if (elist or qlist or jentrs or sentrs) and not entrs:
        # The caller explicitly specified an entry to edit but we
        # didn't find it (or them).  Rather than treating this as
        # though no entries were given and displaying a blank edit
        # form, show an error message.
        errs.append("No matching entries were found")
    if errs:
        jmcgi.err_page(errs)

    srcs = sorted(jdb.KW.recs('SRC'), key=lambda x: x.kw.lower())
    #srcs.insert (0, jdb.Obj (id=0, kw='', descr=''))
    if not entrs:
        # This is a blank new entry.
        # The following dummy entry will produce the default
        # text for new entries: no kanji, no reading, and sense
        # text "[1][n]".
        entr = jdb.Entr(
            _sens=[jdb.Sens(_pos=[jdb.Pos(kw=jdb.KW.POS['n'].id)])],
            src=None)
        entrs = [entr]

    for e in entrs:
        if not is_editor:
            remove_freqs(e)
        e.ISDELETE = (e.stat == jdb.KW.STAT['D'].id) or None
        # Provide a default corpus.
        if not e.src:
            e.src = defcorpid
            e.NOCORPOPT = force_corp

    # Attach the JEL-formatted kanji, reading and sense text to each
    # entry.
    for e in entrs:
        e.ktxt = fmtjel.kanjs(e._kanj)
        e.rtxt = fmtjel.rdngs(e._rdng, e._kanj)
        e.stxt = fmtjel.senss(e._sens, e._kanj, e._rdng)

    jmcgi.jinja_page('edform.jinja', parms=parms, extra={}, entrs=entrs,
                     srcs=srcs, is_editor=is_editor, svc=svc, dbg=dbg,
                     sid=sid, session=sess, cfg=cfg, this_page='edform.py')
def p_preentr_2(p):
    '''preentr : FF rdngsect FF senses'''
    # NOTE: the docstring above is the grammar production (yacc/PLY-style
    # rule function), not documentation; it must not be reworded.
    # This variant has a reading section and senses but no kanji section
    # ahead of the first FF token.
    rdngs, senses = p[2], p[4]
    p[0] = jdb.Entr(_rdng=rdngs, _sens=senses)
def do_chr(elem, srcid, langs):
    """Process a <character> element into an entry object.

    Args:
        elem: The <character> element as parsed by xml ElementTree.  It
            must contain a <literal> child.
        srcid: Value for the new entry's 'src' attribute.
        langs: Passed through unchanged to reading_meaning().

    Returns:
        A jdb.Entr whose 'chr' attribute is a jdb.Chr for the character
        and whose single kanji item is the character itself.

    Side effects:
        Sets the module global 'Char' to the character text, for use in
        warning messages created by warn().
    """
    global Char

    chtxt = elem.find('literal').text
    Char = chtxt  # For warning messages created by warn().
    c = jdb.Chr(chr=chtxt, _cinf=[])
    e = jdb.Entr(src=srcid, stat=KW.STAT_A, seq=jdb.uord(chtxt),
                 unap=False, chr=c, _kanj=[jdb.Kanj(txt=chtxt)],
                 _rdng=[], _sens=[], _krslv=[])

    for cp_value in elem.findall('codepoint/cp_value'):
        codepoint(cp_value, c, chtxt)
    for rad_value in elem.findall('radical/rad_value'):
        radical(rad_value, c)

    # findtext() returns None when the element is absent and '' when it
    # has no text; either way the value is skipped.
    freq = elem.findtext('misc/freq')
    if freq:
        if c.freq is not None:
            warn('Duplicate "freq" element ignored: %s' % freq)
        else:
            c.freq = int(freq)

    grade = elem.findtext('misc/grade')
    if grade:
        if c.grade is not None:
            warn('Duplicate "grade" element ignored: %s' % grade)
        else:
            c.grade = int(grade)

    for n, stroke_count in enumerate(elem.findall('misc/stroke_count')):
        strokes(stroke_count, n, c)

    # Multiple radical names are joined with U+3001 (ideographic comma).
    rn = '\u3001'.join([x.text for x in elem.findall('misc/rad_name')])
    if rn:
        c.radname = rn

    for rm in elem.findall('reading_meaning'):
        reading_meaning(rm, e._rdng, e._sens, c._cinf, langs)

    dic_number = elem.find('dic_number')
    if dic_number is not None:
        dicnum(dic_number, c._cinf)

    query_code = elem.find('query_code')
    if query_code is not None:
        qcode(query_code, c._cinf)

    for var in elem.findall('misc/variant'):
        e._krslv.append(variant(var))

    jlpt = elem.find('misc/jlpt')
    if jlpt is not None:
        jlptnum(jlpt, c)

    return e