def parse_kitem(ktxt, tags, fmap):
    """Parse one kanji field into a Kanj object.

    ktxt -- Kanji text; must pass jdb.jstr_keb() or ParseError is raised.
    tags -- Iterable of tag strings (KINF or FREQ tags); empty strings
        are ignored.
    fmap -- Mapping (defaultdict) collecting FREQ-tagged kanji; the
        kanji list is element [1] of each mapped value.

    Returns the new Kanj object.
    Raises ParseError on non-kanji text or an unrecognized tag.
    """
    if not jdb.jstr_keb(ktxt):
        raise ParseError('Kanji field not kanji: "%s".' % ktxt)
    kanj = Kanj(txt=ktxt)
    for tagtxt in tags:
        if not tagtxt:
            continue
        matches = lookup_tag(tagtxt, ['KINF', 'FREQ'])
        if not matches:
            raise ParseError('Unknown tag "%s" on kanji "%s"' % (tagtxt, ktxt))
        typ, val = matches[0]
        if typ == 'KINF':
            kanj._inf.append(Kinf(kw=val))
        elif typ == 'FREQ':
            # Key fmap by the remaining lookup result; the kanji slot
            # of the mapped pair is index [1].
            fmap[matches[1:]][1].append(kanj)
    return kanj
def parse_restrs(rdng, tag, kanjs):
    """Attach reading restrictions to a reading.

    rdng -- Rdng object to receive the '_restr' restrictions.
    tag -- ";"-separated string of kanji texts the reading is
        restricted to.
    kanjs -- List of the entry's Kanj objects to match against.

    Raises ParseError if any restriction text matches none of 'kanjs';
    only the first unmatched text is reported.
    """
    texts = [part.strip(' ') for part in tag.split(';')]
    unmatched = []
    jdb.txt2restr(texts, rdng, kanjs, '_restr', unmatched)
    if unmatched:
        raise ParseError(
            'Reading restriction "%s" doesn\'t match any kanji' % unmatched[0])
def entr(text, simple=False):
    """Parse one EDICT2-format line into an Entr object.

    text -- Full dictionary line: a kanji/reading part and a senses
        part separated by the first "/" (optionally surrounded by
        ASCII or ideographic whitespace).
    simple -- Accepted for interface compatibility; not used here.

    Returns the populated Entr object.
    Raises ParseError if the "/" separator is missing or any frequency
    tags are inconsistent (per jdb.make_freq_objs); only the first
    frequency problem is reported.
    """
    freqmap = collections.defaultdict(lambda: [list(), list()])
    pieces = re.split('[ \t\u3000]*/[ \t\u3000]*', text, maxsplit=1)
    if len(pieces) != 2:
        raise ParseError('Missing KR-S separator, "/"')
    krtxt, stxt = pieces
    kanjs, rdngs = parse_jppart(krtxt, freqmap)
    e = Entr(_kanj=kanjs, _rdng=rdngs)
    # parse_spart() attaches the senses to 'e' itself; its return
    # value is not needed.
    parse_spart(stxt.lstrip(), e, freqmap)
    for errtyp, r, k, kw, val in jdb.make_freq_objs(freqmap, e):
        raise ParseError("%s freq tag(s) %s%s in %s%s%s"
                         % (errtyp, KW.FREQ[kw].kw, val, k or '',
                            '\u30FB' if k and r else '', r or ''))
    return e
def parse_stags(tag, sens, kanjs, rdngs):
    """Parse a sense restriction ("... only") tag string.

    tag -- Comma-separated reading and/or kanji texts (the " only"
        suffix already removed by the caller).
    sens -- Sens object to receive '_stagr'/'_stagk' restrictions.
    kanjs, rdngs -- The entry's Kanj and Rdng lists to match against.

    Raises ParseError if a word is neither reading nor kanji, or if a
    restriction text does not occur in the entry's readings/kanji.
    """
    readings, kanji = [], []
    for word in (w.strip() for w in tag.split(',')):
        if jdb.jstr_reb(word):
            readings.append(word)
        elif jdb.jstr_keb(word):
            kanji.append(word)
        else:
            raise ParseError(
                'stagx restriction word neither reading or kanji: "%s"' % word)
    bad = []
    jdb.txt2restr(readings, sens, rdngs, '_stagr', bad=bad)
    if bad:
        raise ParseError('Stagr text not in readings: "%s"' % '","'.join(bad))
    bad = []
    jdb.txt2restr(kanji, sens, kanjs, '_stagk', bad=bad)
    if bad:
        raise ParseError('Stagk text not in kanji: "%s"' % '","'.join(bad))
def parse_tags(tagtxt, sens, snum):
    """Parse a comma-separated candidate tag string for a sense.

    tagtxt -- Candidate tag text (e.g. "n,uk" or "ksb:").
    sens -- Sens object (currently unused here; kept for interface
        compatibility with callers).
    snum -- 1-based sense number, used in error messages.

    Returns a 5-tuple (failed, pos, misc, fld, dial) where 'failed' is
    the count of strings that were not recognized as tags and the rest
    are lists of Pos/Misc/Fld/Dial objects for those that were.
    Because of the loosy-goosy edict syntax, unrecognized strings may
    really be a sense note or gloss prefix, so we return everything we
    found and let the caller decide whether to keep the valid tags or
    throw the whole lot out.

    Raises ParseError on an empty tag or an ambiguous tag.
    """
    pos = [];  misc = [];  fld = [];  dial = []
    failed = 0
    for tag in tagtxt.split(','):
        tag = tag.strip()
        if not tag:
            # BUG FIX: message was previously passed as two separate
            # arguments ("...%d.", (snum)) and never interpolated.
            raise ParseError('Empty tag in sense %d.' % snum)
        # Dialect tags carry a trailing ":"; strip it on a temp
        # variable since we don't yet know this really is a tag.
        tagx = tag[:-1] if tag[-1] == ':' else tag
        t = lookup_tag(tagx, ['POS', 'MISC', 'DIAL', 'FLD'])
        if t:
            typ, val = t[0]
            if len(t) > 1:
                raise ParseError(
                    'Ambiguous tag "%s", interpreting as "%s" but could be "%s"'
                    % (tag, typ, '","'.join([x[0] for x in t[1:]])))
            if typ == 'POS': pos.append(Pos(kw=val))
            elif typ == 'MISC': misc.append(Misc(kw=val))
            elif typ == 'FLD': fld.append(Fld(kw=val))
            elif typ == 'DIAL': dial.append(Dial(kw=val))
            else: raise ValueError(typ)
        else:
            # Don't report failed tag lookups because likely we are
            # mistakenly processing a sense note or gloss prefix which
            # will be correctly handled by the caller.
            failed += 1
    return failed, pos, misc, fld, dial
def parse_xrefs(txt, sens):
    """Parse a cross-reference tag ("see ..." / "ant ...") into Xrslv
    objects attached to sense 'sens' (list attribute '_xrslv').

    txt -- Xref tag text: a type designator ("see", "ant", ...)
        separated from the xref targets by colon and/or spaces; the
        targets are separated by commas or spaces.

    Raises ParseError on any malformed xref.
    """
    # Following regex is used to allow any xref type designator
    # separated from the xref text by either or both colon or spaces.
    p = re.split(r'^(?:([a-zA-Z]+)(?:[: ]+))', txt)
    if len(p) != 3:
        # BUG FIX: removed unreachable 'return' that followed this
        # raise (leftover from an earlier warning-based flow).
        raise ParseError('Xref "%s", bad format' % txt)
    typ, xtxt = p[1:3]
    xtyp = jdb.KW.XREF[typ.lower()].id
    xrefs = re.split(r'[, ]', xtxt)
    xrsvs = []
    for n, x in enumerate(xrefs):
        if not x: continue
        krs = x.split('\u30FB')
        if len(krs) > 3 or len(krs) == 0:
            raise ParseError('Xref "%s", bad format' % x)
        # 'krs' has 1, 2, or 3 items.  Using "x" to indicate a non-
        # existent item, the valid arrangements of kanji, reading,
        # and sense number are:
        #   Kxx, KRx, KRS, KSx, Rxx, RSx
        # or rephrased in terms of what part of the xref can be in
        # what item:
        #   [0]:KR, [1]:RS, [2]:S
        ktxt = None;  rtxt = None;  tsens = None
        # BUG FIX: the inner loop index was also named 'n', clobbering
        # the outer xref index so that 'ord' below recorded the number
        # of parts in the xref instead of the xref's ordinal.  Also
        # removed the dead 'break' statements that followed raises.
        for i, v in enumerate(krs):
            if i == 0:
                # v is K or R.
                if jdb.jstr_reb(v): rtxt = v
                else: ktxt = v
            elif i == 1:
                # v is R or S (if [0] was K) or S (if [0] was R).
                if v.isdigit():
                    tsens = int(v)
                elif jdb.jstr_reb(v):
                    if rtxt:
                        raise ParseError(
                            'Xref "%s", two reading parts present' % x)
                    rtxt = v
                else:
                    raise ParseError('Xref "%s", two kanji parts present' % x)
            else:
                # v is S ([1] must have been R).
                if not v.isdigit():
                    raise ParseError('Xref "%s", "%s" is not a sense number'
                                     % (x, v))
                if tsens:
                    raise ParseError('Xref "%s", has two sense numbers' % x)
                tsens = int(v)
        xrsvs.append(
            Xrslv(typ=xtyp, ord=n + 1, ktxt=ktxt, rtxt=rtxt, tsens=tsens))
    if xrsvs:
        if not getattr(sens, '_xrslv', None): sens._xrslv = []
        sens._xrslv.extend(xrsvs)
def parse_ritem(rtxt, tags, fmap, kanjs):
    """Parse one reading field into a Rdng object.

    rtxt -- Reading text.
    tags -- Iterable of tag strings; empty strings are skipped.
        Non-gloss (Japanese) tag text is treated as a restriction
        list and routed to parse_restrs(); otherwise the tag must be
        a RINF or FREQ tag.
    fmap -- Mapping (defaultdict) collecting FREQ-tagged readings; the
        reading list is element [0] of each mapped value.
    kanjs -- The entry's Kanj objects, needed for restrictions.

    Returns the new Rdng object.
    Raises ParseError on an unrecognized tag.

    FIXME: A jdb.jstr_reb(rtxt) validity check was removed here
    because, as currently written, it is too strict and rejects some
    texts that should be allowed (see IS-26).  More immediately, jwb
    uses edform.py's 'j' option to parse an edict line with a question
    mark (not sure if ascii or jis) in the reading field.
    """
    rdng = Rdng(txt=rtxt)
    for tagtxt in tags:
        if not tagtxt:
            continue
        if not jdb.jstr_gloss(tagtxt):
            # Not plain gloss-style text per jdb.jstr_gloss();
            # presumably a kanji restriction list -- parse_restrs()
            # will validate it.
            parse_restrs(rdng, tagtxt, kanjs)
            continue
        matches = lookup_tag(tagtxt, ['RINF', 'FREQ'])
        if not matches:
            raise ParseError('Unknown tag "%s" on reading "%s"'
                             % (tagtxt, rtxt))
        typ, val = matches[0]
        if typ == 'RINF':
            rdng._inf.append(Rinf(kw=val))
        elif typ == 'FREQ':
            # Key fmap by the remaining lookup result; the reading
            # slot of the mapped pair is index [0].
            fmap[matches[1:]][0].append(rdng)
    return rdng
def extract_lsrc_or_ginf(gtxt):
    """Split a trailing lsource or ginf clause off a gloss text.

    See the comments below for the return value layout.  Relies on
    module-level parse_lsrc() to decode each language/text pair.
    """
    # This will find lsrc or ginf descriptions where the text before
    # the colon is a three-letter language code, or "wasei:", or a
    # ginf tag ("lit:", "fig:", "expl:").
    # We extract only a single clause which must occur at the end
    # of a gloss.
    #
    # Return a 3-tuple:
    #   [0] -- Gloss with lsrc/ginf removed.
    #   [1] -- (None,None) or 2-tuple:
    #       [0] -- GINF keyword id number.
    #       [1] -- Ginf text.
    #   [2] -- List (possibly empty) of 3-tuples:
    #       [0] -- Language id number.
    #       [1] -- True if "wasei".
    #       [2] -- Lsource text.
    #
    # The following regex will match a substring like "(ger: xxxx)".
    # The "xxxx" part may have parenthesised text but not nested.
    # Thus, "eng: foo (on you) dude" will be correctly parsed, but
    # "eng: (foo (on you)) dude" won't.  Also note that an lsrc
    # string may contain multiple comma-separated sections:
    # "(ger: xxx, fre: yyy)"
    # A similar regex is used in jmxml.extract_lit() so if a
    # revision is needed here, that function should be checked
    # as well.
    KW = jdb.KW
    regex = r'\s*\(([a-z]{3,5}):\s*((([^()]+)|(\([^)]+\)))+?)\)\s*$'
    mo = re.search(regex, gtxt)
    if not mo:
        # No trailing parenthesised clause at all.
        return gtxt, (None, None), []
    tag, ptext = mo.group(1, 2)
    div = mo.start()  # Division point between gloss and ptext.
    # First check if 'tag' is a GINF tag.
    tagid = None
    rec = KW.GINF.get(tag)
    if rec:
        tagid = rec.id
    if tagid:
        # It was; return the gloss sans ptext, and the ginf text tuple.
        return gtxt[:div].strip(), (tagid, ptext), []
    # Check for lsource.  There may be multiple, comma-separated
    # lsource clauses within the parens.  But the lsource text may
    # also contain commas so we need to do better than splitting
    # on commas.
    # Get the matched clause which is prefixed and suffixed with
    # parens and whitespace.  Strip whitespace and leading paren.
    # There will be at most one "(".
    fulltxt = mo.group().strip(' (')
    # There may be multiple ")" and we must remove only one.
    if fulltxt[-1] == ')':
        fulltxt = fulltxt[:-1]
    lsrctxts = []
    # Split on a pattern that matches the tag part of the lsource,
    # e.g. "ger:".  Require the tag to be at start of string or preceded
    # by a non-alpha character to avoid matching a non-lang tag like
    # "xxxx:".  Special case "wasei".
    lsrcx = re.split(r'(?:^|[^a-z])((?:[a-z]{3}|wasei):)', fulltxt)
    if len(lsrcx) < 3:
        # Not an lsources text string.
        return gtxt, (None, None), []
    # The list from the split has alternating tag and text elements,
    # with empty elements interspersed.  Collect each tag element
    # (identified by a ":" suffix) together with a possible following
    # lsource text, as pairs in 'lsrctxts'.
    # NOTE(review): 'pair' is carried over between loop iterations;
    # the len(lsrcx) >= 3 check above ensures a tag element is seen
    # before any text element, so 'pair' is bound before first use.
    for x in lsrcx:
        x = x.strip(' ,')
        if len(x) > 3 and x[-1] == ':':
            pair = [x[:-1], '']
            lsrctxts.append(pair)
        elif x:
            if pair[1]: raise ValueError(pair[1])  # Should not happen.
            pair[1] = x
    lsrctuples = []
    for lang, txt in lsrctxts:
        # Give each lang,txt pair to parse_lsrc() to decipher.
        # It will throw an exception if "lang" is not recognised.
        # If that happens, abort processing of all the lsource
        # specs and presume the entire (tentative) lsource text
        # is part of the gloss.
        try:
            lsrctuple = parse_lsrc(lang, txt)
        except KeyError as e:
            raise ParseError('Lsrc keyerror on "%s" in "%s"'
                             % (str(e), ptext))
            # NOTE(review): the following return is unreachable dead
            # code -- the raise above preempts the "treat it as gloss
            # text" behavior the preceding comment describes.
            return (gtxt, (None, None), [])
        # If it parsed ok, add it to the collection.
        if lsrctuple:
            lsrctuples.append(lsrctuple)
    return gtxt[:div].strip(), (None, None), lsrctuples
def process_sense(tags, glosstxts, snum, prev_pos, kanjs, rdngs):
    """Build a Sens object from its parenthesised tag strings and its
    gloss texts.

    tags -- List of tag strings extracted from leading parens.
    glosstxts -- List of gloss texts for this sense (may be mutated:
        unrecognized tags are pushed back onto the first gloss).
    snum -- Expected 1-based sense number, for order checking.
    prev_pos -- POS list of the previous sense, inherited when this
        sense has none (by reference -- see FIXME below).
    kanjs, rdngs -- The entry's kanji and reading lists, needed for
        restriction (stag) processing.

    Returns the new Sens object.
    Raises ParseError (directly or via the parse_* helpers) on
    malformed tags.
    """
    sens = Sens()
    # Tags may be (in the order listed):
    #       After           Before
    #       ~2010-06-22     ~2010-06-22
    #       -----------     -----------
    #       POS             POS
    #       sense_num       sense_num
    #       STAG            s_inf
    #       MISC            STAG
    #       FLD             see, ant
    #       DIAL            MISC
    #       s_inf           DIAL
    #       see, ant        FLD
    #       gloss           gloss
    #       lsrc            lsrc
    #       (P)             (P)
    #
    # DON'T FIXME:
    # Earlier rev of this file commented that we could use
    # tag order to resolve ambiguities (in some cases) and
    # provide better error messages.  However the 2010-06-22
    # change of tag order makes such a change less appealing.
    for tag in tags:
        # Classify the type of tag...
        if tag.isdigit():
            # Sense number.
            # Don't do anything with it other than check
            # that it is what was expected.  There is no check
            # for duplicates.
            if int(tag) != snum:
                raise ParseError('Sense number "%s" out of order' % tag)
            continue
        if tag.lower().startswith('see') or \
                tag.lower().startswith('ant'):
            # XREF
            parse_xrefs(tag, sens)
            continue
        if tag.endswith(' only'):
            # STAG
            parse_stags(tag[:-5], sens, kanjs, rdngs)
            continue
        # Strip off any trailing ":" (which dialect tags will have) but
        # change it on a temp variable because we don't know for sure
        # that this is a tag yet.  (The stripping itself happens inside
        # parse_tags().)
        if re.match(r'[a-zA-Z0-9,:-]+$', tag):
            # Could be pos, misc, dial or fld tags.
            failed, pos, misc, fld, dial = parse_tags(tag, sens, snum)
            if not failed:
                if pos: sens._pos.extend(pos)
                if misc: sens._misc.extend(misc)
                if fld: sens._fld.extend(fld)
                if dial: sens._dial.extend(dial)
                continue
            # If not all the tags were ok, fall through to following code
            # to process as sense note or gloss prefix.
        if 1:
            # The "1 or ..." bit below disables the extraction of
            # leading parenthesised text into the sense note.
            # This is a temporary change until such behavior can
            # be parameterized (or refined as noted in the comments
            # below re field order), and is to support the need of
            # the tools/jbsubs.py script which uses edparse and where
            # leading parenthesised text is usually text and rarely
            # a note.
            if 1 or sens.notes:
                # If we already found a sense note, then put this
                # current unidentifiable text back onto the first gloss.
                # FIXME? May lose whitespace that was in original line.
                # FIXME: If multiple tags are pushed back onto glosses,
                # they are put back in reversed (wrong) order and white-
                # space between them is lost.
                # FIXME: I believe that at least for JB's edict2 file,
                # fields are always in the same order and that s_inf
                # comes before stagr/stagk, see/ant, MISC, DIAL, FLD,
                # so if we have seen any of those fields, we can say
                # that we are looking at gloss text now.
                # UPDATE: Around 2010-06-22, jwb changed the order
                # of the tags in the EDRDG Edict2 file.
                # The new order, announced on the edict-jmdict list
                # is: POS, sense number, restrictions, e.g. (XXX only),
                # misc, field/domain, sense_inf/note, xref/see, xref/ant,
                # gloss, lsrc stuff, (P) if not on the kanji/kana.
                #raise ParseError ('note="%s", dup="%s"' % (sens.notes, tag))
                if len(glosstxts) < 1: glosstxts.append('')
                glosstxts[0] = '(' + tag + ') ' + glosstxts[0]
            else:
                # Unreachable while the "1 or" above is in place.
                sens.notes = tag.strip()
    if not sens._pos and prev_pos:
        # If this sense had no part-of-speech tags, inherit them
        # from the previous sense, if any.
        # FIXME: Should make a copy of 'prev_pos' rather than using ref.
        sens._pos = prev_pos
    for gtxt in glosstxts:
        # parse_gloss() attaches glosses to 'sens'; its return value
        # is unused here.
        glosses = parse_gloss(gtxt, sens)
    return sens
def parse_spart(txt, entr, fmap):
    """Parse the senses ("/"-separated glosses) part of an edict line.

    txt -- The sense part of the line (text after the first "/").
    entr -- Entr object being built; receives '_sens' and possibly
        'seq' (from a trailing "EntL<n>" pseudo-gloss).
    fmap -- Frequency-tag map shared with the kanji/reading parsers;
        used here to handle a sense-level "(P)" tag.

    Raises ParseError on misplaced SEQ pseudo-glosses or inconsistent
    "(P)" tags.
    """
    kanjs = getattr(entr, '_kanj', [])
    rdngs = getattr(entr, '_rdng', [])
    gtxts = txt.split('/')
    if gtxts[-1] == '':
        gtxts.pop()
    else:
        pass
        #raise ParseError ('No trailing "/" on last gloss')
    senslist = []
    for n, gtxt in enumerate(gtxts):
        new_sense = (n == 0)
        front_opts_list = []
        gtxt = gtxt.strip()
        if gtxt == '':
            continue
        elif gtxt == '(P)':
            fklist, frlist = fmap[(jdb.KW.FREQ['spec'].id, 1)]
            # If a "spec1" has already been applied to any kanji or
            # reading, this sense (P) tag is redundant and can be ignored.
            if fklist or frlist:
                continue
            if len(kanjs) > 1 or len(rdngs) > 1:
                # If there is more than 1 kanji or reading, then at least
                # one of them should have a "spec1" tag applied as a result
                # of a required explicit kanji or reading P tag.
                raise ParseError(
                    "P tag in sense, but not in kanji or readings")
            # If there is only one reading and/or kanji a P tag on them is
            # not required so we assign the corresponding "spec1" tag on
            # them here.
            if kanjs: add_spec1(fmap, kanjs[0], "k")
            if rdngs: add_spec1(fmap, rdngs[0], "r")
            continue
        mo = re.match(r'^EntL([0-9]+)', gtxt)
        if mo:
            # "EntL<n>" pseudo-gloss: the entry's sequence number.
            # It is only valid as the last gloss on the line.
            if n == len(gtxts) - 1:
                entr.seq = int(mo.group(1))
            else:
                raise ParseError(
                    "SEQ number pseuo-gloss '%s' in wrong place, %r"
                    % (gtxt, (n, len(gtxts))))
            continue
        # The following regex will match an arbitrary number
        # of sequential parenthesised texts, optionally followed
        # by one curly-bracketed one, at the start of a text string.
        # Note that although most such tags occur on the first
        # gloss of a sense, that is not a requirement, we will
        # apply tags to the current sense regardless of the gloss
        # it occurs with.
        mo = re.match(r'(\([^)]*\)\s*)+\s*({[^}]*})?\s*', gtxt)
        if mo:
            front_opts_txt = mo.group()
            # FIXME: We throw away info on whether a tag occurred in
            # parens or brackets, which implies that we must rely
            # on FLD tag values being distinct from other tags.
            front_opts_list = re.split(r'[)}]\s*[({]',
                                       front_opts_txt.strip('(){} '))
            # Strip the leading paren'd text off 'gtxt'.
            gtxt = gtxt[mo.end():]
            if not new_sense:
                # See if there is a sense number in the paren'd
                # options.  If so, this gloss is the first one of
                # a new sense.  No need for this check if n==0
                # since the first gloss is always a new sense.
                for x in front_opts_list:
                    if x.isdigit(): new_sense = True
                if not new_sense:
                    # We have a gloss that is not the first of a sense
                    # but that has parenthesised leading text.  Put the
                    # text back on the gloss.
                    gtxt = front_opts_txt + gtxt
                    front_opts_list = []
        if new_sense:
            glosses = []
            senslist.append((front_opts_list, glosses))
        # NOTE(review): when new_sense is false, 'glosses' here is the
        # list created in the iteration that started the current sense
        # (deliberate reuse of the loop-carried variable).
        if gtxt:
            glosses.append(gtxt)
    senss = parse_senses(senslist, kanjs, rdngs)
    entr._sens = senss