Example #1
0
def parse_kitem(ktxt, tags, fmap):
    # Build a Kanj object from kanji text 'ktxt' and attach any KINF
    # or FREQ tags listed in 'tags'.  FREQ tags are not applied
    # directly; they are accumulated in 'fmap' for later resolution
    # (see jdb.make_freq_objs()).  Raises ParseError on non-kanji
    # text or an unrecognised tag.
    if not jdb.jstr_keb(ktxt):
        raise ParseError('Kanji field not kanji: "%s".' % ktxt)
    kanj = Kanj(txt=ktxt)
    for tagtxt in tags:
        if not tagtxt:
            continue
        matches = lookup_tag(tagtxt, ['KINF', 'FREQ'])
        if not matches:
            raise ParseError('Unknown tag "%s" on kanji "%s"' % (tagtxt, ktxt))
        typ, val = matches[0]
        if typ == 'KINF':
            kanj._inf.append(Kinf(kw=val))
        elif typ == 'FREQ':
            fmap[matches[1:]][1].append(kanj)
    return kanj
Example #2
0
def parse_restrs(rdng, tag, kanjs):
    # Apply the reading restrictions in 'tag' (a semicolon-separated
    # list of kanji texts) to reading 'rdng'.  jdb.txt2restr() reports
    # any restriction text not matching a kanji in 'kanjs' via the
    # 'bad' list; the first such text raises ParseError.
    restr_texts = [s.strip(' ') for s in tag.split(';')]
    bad = []
    jdb.txt2restr(restr_texts, rdng, kanjs, '_restr', bad)
    for badtxt in bad:
        raise ParseError('Reading restriction "%s" doesn\'t match any kanji' %
                         badtxt)
Example #3
0
def entr(text, simple=False):
    """Parse one complete edict line 'text' into an Entr object.

    The line is split into the kanji/reading part and the sense part
    at the first "/" (optionally surrounded by ascii or ideographic
    whitespace).  Frequency tag info gathered during parsing is
    collected in 'fmap' and converted to objects at the end; any
    error reported by jdb.make_freq_objs() raises ParseError.
    """
    fmap = collections.defaultdict(lambda: ([list(), list()]))
    try:
        krtxt, stxt = re.split('[ \t\u3000]*/[ \t\u3000]*', text, 1)
    except ValueError as e:
        raise ParseError('Missing KR-S separator, "/"')
    kanjs, rdngs = parse_jppart(krtxt, fmap)
    entry = Entr(_kanj=kanjs, _rdng=rdngs)
    parse_spart(stxt.lstrip(), entry, fmap)
    for errtyp, r, k, kw, val in jdb.make_freq_objs(fmap, entry):
        sep = '\u30FB' if k and r else ''
        raise ParseError("%s freq tag(s) %s%s in %s%s%s" %
                         (errtyp, KW.FREQ[kw].kw, val, k or '', sep, r or ''))
    return entry
Example #4
0
def parse_stags(tag, sens, kanjs, rdngs):
    # Parse a sense restriction ("stagr"/"stagk") tag: 'tag' is a
    # comma-separated list of words, each of which must be either a
    # reading or a kanji text.  The words are sorted by script and
    # applied to 'sens' via jdb.txt2restr(); any word not found in
    # 'rdngs'/'kanjs' raises ParseError.
    rwords, kwords = [], []
    for word in (w.strip() for w in tag.split(',')):
        if jdb.jstr_reb(word):
            rwords.append(word)
        elif jdb.jstr_keb(word):
            kwords.append(word)
        else:
            raise ParseError(
                'stagx restriction word neither reading or kanji: "%s"' % word)
    bad = []
    jdb.txt2restr(rwords, sens, rdngs, '_stagr', bad=bad)
    if bad:
        raise ParseError('Stagr text not in readings: "%s"' % '","'.join(bad))
    bad = []
    jdb.txt2restr(kwords, sens, kanjs, '_stagk', bad=bad)
    if bad:
        raise ParseError('Stagk text not in kanji: "%s"' % '","'.join(bad))
Example #5
0
def parse_tags(tagtxt, sens, snum):
    """Parse comma-separated sense tag words in 'tagtxt'.

    Each word is looked up as a POS, MISC, DIAL or FLD keyword (a
    trailing ":", used by dialect tags, is stripped before lookup).
    Returns a 5-tuple (failed, pos, misc, fld, dial) where 'failed'
    is the number of words that were not recognised as tags and the
    rest are lists of Pos/Misc/Fld/Dial objects for those that were.
    Raises ParseError on an empty tag or an ambiguous one.
    """
    tags = tagtxt.split(',')
    failed = 0
    pos, misc, fld, dial = [], [], [], []
    for tag in tags:
        tag = tag.strip()
        if not tag:
            # Bug fix: 'snum' was previously passed as a second
            # constructor argument instead of %-formatted into the
            # message; the unreachable 'continue' after the raise
            # has also been removed.
            raise ParseError("Empty tag in sense %d." % snum)
        # Strip a trailing ":" (dialect tags have one) on a temp
        # variable since we don't yet know this is really a tag.
        tagx = tag[:-1] if tag[-1] == ':' else tag
        t = lookup_tag(tagx, ['POS', 'MISC', 'DIAL', 'FLD'])
        if t:
            typ, val = t[0]
            if len(t) > 1:
                raise ParseError(
                    'Ambiguous tag "%s", interpreting as "%s" but could be "%s"'
                    % (tag, typ, '","'.join([x[0] for x in t[1:]])))
            if typ == 'POS': pos.append(Pos(kw=val))
            elif typ == 'MISC': misc.append(Misc(kw=val))
            elif typ == 'FLD': fld.append(Fld(kw=val))
            elif typ == 'DIAL': dial.append(Dial(kw=val))
            else: raise ValueError(typ)
        else:
            failed += 1
            # Don't report failed tag lookups because likely
            # we are mistakenly processing a sense note or gloss
            # prefix which will be correctly handled by the caller.

    # Because of the loosy-goosy edict syntax, if there were
    # erroneous tags, we don't know if they were really erroneous
    # tags or we were trying to parse a sense note or something.
    # So return all the information we have and let the caller
    # decide if she wants to use the valid tags or throw out
    # everything.
    return failed, pos, misc, fld, dial
Example #6
0
def parse_xrefs(txt, sens):
    """Parse xref text 'txt' (e.g. "see xxx\u30FByyy\u30FB2, zzz") and append
    unresolved xref (Xrslv) objects to sens._xrslv.

    Raises ParseError (or KeyError via jdb.KW.XREF lookup) on a
    malformed xref.
    """
    # Following regex is used to allow any xref type designator
    # separated from the xref text by either or both colon or spaces.
    p = re.split(r'^(?:([a-zA-Z]+)(?:[: ]+))', txt)
    if len(p) != 3:
        raise ParseError('Xref "%s", bad format' % txt)
    typ, xtxt = p[1:3]
    xtyp = jdb.KW.XREF[typ.lower()].id
    xrefs = re.split(r'[, ]', xtxt)
    xrsvs = []
    for n, x in enumerate(xrefs):
        if not x: continue
        krs = x.split('\u30FB')
        if len(krs) > 3 or len(krs) == 0:
            raise ParseError('Xref "%s", bad format' % x)

        # 'krs' has 1, 2, or 3 items.  Using "x" to indicate a non-
        # existent item, the valid arrangements if kanji, reading,
        # and sense number are:
        #   Kxx, KRx, KRS, KSx, Rxx RSx
        # or rephrased in terms of what part of the xref can be in
        # what item:
        #    [0]:KR, [1]:RS, [2]:S

        ktxt = None
        rtxt = None
        tsens = None
        # Bug fix: this loop previously reused 'n' as its index,
        # clobbering the enumerate index over 'xrefs' that is used
        # for Xrslv.ord below; it now uses 'i'.  The unreachable
        # statements after each raise have also been removed, which
        # makes the former for/else equivalent to a plain append.
        for i, v in enumerate(krs):
            if i == 0:  # v is K or R
                if jdb.jstr_reb(v): rtxt = v
                else: ktxt = v
            elif i == 1:  # v is R or S (if i==0 was K) or S (if i==0 was R)
                if v.isdigit(): tsens = int(v)
                elif jdb.jstr_reb(v):
                    if rtxt:
                        raise ParseError(
                            'Xref "%s", two reading parts present' % x)
                    rtxt = v
                else:
                    raise ParseError('Xref "%s", two kanji parts present' % x)
            else:  # v is S (i==1 must have been R)
                if not v.isdigit():
                    raise ParseError('Xref "%s", "%s" is not a sense number' %
                                     (x, v))
                if tsens:
                    raise ParseError('Xref "%s", has two sense numbers' % x)
                tsens = int(v)
        xrsvs.append(
            Xrslv(typ=xtyp, ord=n + 1, ktxt=ktxt, rtxt=rtxt, tsens=tsens))
    if xrsvs:
        if not getattr(sens, '_xrslv', None): sens._xrslv = []
        sens._xrslv.extend(xrsvs)
Example #7
0
def parse_ritem(rtxt, tags, fmap, kanjs):
    # Build a Rdng object from reading text 'rtxt' and attach any
    # RINF or FREQ tags in 'tags'; non-gloss (Japanese) tag text is
    # treated as a restriction list and handed to parse_restrs().
    # FREQ tags are accumulated in 'fmap' for later resolution.
    #
    # FIXME: A kana-only check on 'rtxt' (jdb.jstr_reb) was disabled
    #  because that test as currently written is too strict and
    #  rejects some texts that should be allowed (see IS-26).  More
    #  immediately, jwb uses edform.py's 'j' option to parse an edict
    #  line with a question mark (not sure if ascii or jis) in the
    #  reading field.
    rdng = Rdng(txt=rtxt)
    for tagtxt in tags:
        if not tagtxt:
            continue
        if not jdb.jstr_gloss(tagtxt):
            # Japanese text here is a restriction list, not a tag.
            parse_restrs(rdng, tagtxt, kanjs)
            continue
        matches = lookup_tag(tagtxt, ['RINF', 'FREQ'])
        if not matches:
            raise ParseError('Unknown tag "%s" on reading "%s"' % (tagtxt, rtxt))
        typ, val = matches[0]
        if typ == 'RINF':
            rdng._inf.append(Rinf(kw=val))
        elif typ == 'FREQ':
            fmap[matches[1:]][0].append(rdng)
    return rdng
Example #8
0
def extract_lsrc_or_ginf(gtxt):
    """Split a single trailing parenthesised lsource or ginf clause
    off the end of gloss text 'gtxt'."""
    # This will find lsrc or ginf descriptions where the text before
    # the colon is a three-letter language code, or "wasei:", or a
    # ginf tag ("lit:", "fig:", "expl:")
    # We extract only a single clause which must occur at the end
    # of a gloss.
    #
    # Return a 3-tuple:
    #   [0] -- Gloss with lsrc/ginf removed.
    #   [1] -- (None,None) or 2-tuple:
    #       [0] -- GINF keyword id number.
    #       [1] -- Ginf text.
    #   [2] -- List (possibly empty) of 3-tuples:
    #       [0] -- Language id number.
    #       [1] -- True if "wasei".
    #       [2] -- Lsource text.
    #

    # The following regex will match a substring like "(ger: xxxx)".
    # The "xxxx" part may have parenthesised text but not nested.
    # Thus, "eng: foo (on you) dude" will be correctly parsed, but
    # "eng: (foo (on you)) dude" won't.  Also note that an lsrc
    # string may contain multiple comma-separated sections:
    # "(ger: xxx, fre: yyy)"
    # A similar regex is used in jmxml.extract_lit() so if a
    # revision is needed here, that function should be checked
    # as well.
    KW = jdb.KW
    regex = r'\s*\(([a-z]{3,5}):\s*((([^()]+)|(\([^)]+\)))+?)\)\s*$'
    mo = re.search(regex, gtxt)
    if not mo: return gtxt, (None, None), []

    tag, ptext = mo.group(1, 2)
    div = mo.start()  # Division point between gloss and ptext.

    # First check if 'tag' is a GINF tag.
    tagid = None
    rec = KW.GINF.get(tag)
    if rec: tagid = rec.id
    if tagid:
        # It was, return the gloss sans ptext, and the ginf text tuple.
        return gtxt[:div].strip(), (tagid, ptext), []

    # Check for lsource.  There may be multiple, comma-separated
    # lsource clauses within the parens.  But the lsource text may
    # also contain commas so we need to do better than splitting
    # on commas.

    # Get the matched clause which is prefixed and suffixed with
    # paren and whitespace.  Strip whitespace and leading paren.
    # There will be at most one "(".
    fulltxt = mo.group().strip(' (')
    # There may be multiple ")" and we must remove only one.
    if fulltxt[-1] == ')': fulltxt = fulltxt[:-1]
    lsrctxts = []
    # Split on a pattern that matches the tag part of the lsource,
    # e.g. "ger:".  Require the tag to be at start of string or preceded
    # by a non-alpha character to avoid matching a non-lang tag like
    # "xxxx:".  Special case "wasei".
    lsrcx = re.split(r'(?:^|[^a-z])((?:[a-z]{3}|wasei):)', fulltxt)
    if len(lsrcx) < 3:  # Not an lsources text string.
        return gtxt, (None, None), []
    # The list from the split has alternating tag and text elements,
    # with empty elements interspersed.  Collect each tag element
    # (identified by a ":" suffix) together with a possible following
    # lsource text, as pairs in 'lsrctxts'.
    # NOTE(review): 'pair' would be unbound if the split ever yielded
    #  non-empty text before the first tag element; the split pattern
    #  appears to prevent that but it is worth confirming.
    for x in lsrcx:
        x = x.strip(' ,')
        if len(x) > 3 and x[-1] == ':':
            pair = [x[:-1], '']
            lsrctxts.append(pair)
        elif x:
            if pair[1]: raise ValueError(pair[1])  # Should not happen.
            pair[1] = x

    lsrctuples = []
    for lang, txt in lsrctxts:
        # Give each lang,txt pair to parse_lsrc() to decipher.
        # It will throw an exception if "lang" is not recognised.
        # If that happens, abort processing of all the lsource
        # specs and presume the entire (tentative) lsource text
        # is part of the gloss.
        try:
            lsrctuple = parse_lsrc(lang, txt)
        except KeyError as e:
            raise ParseError('Lsrc keyerror on "%s" in "%s"' % (str(e), ptext))
            return (gtxt, (None, None), [])  # NOTE(review): unreachable after raise.
        # If it parsed ok, add it to the collection.
        if lsrctuple: lsrctuples.append(lsrctuple)
    return gtxt[:div].strip(), (None, None), lsrctuples
Example #9
0
def process_sense(tags, glosstxts, snum, prev_pos, kanjs, rdngs):
    """Build a Sens object for sense number 'snum' from the
    parenthesised tag texts 'tags' and the gloss texts 'glosstxts'.

    'prev_pos' is the previous sense's POS list; it is inherited (by
    reference, see FIXME below) when this sense has no POS tags of
    its own.  'kanjs' and 'rdngs' are the entry's kanji and reading
    lists, needed to resolve STAG restrictions.  Raises ParseError
    on out-of-order sense numbers or malformed tags.
    """
    sens = Sens()
    # Tags may be (in the order listed):
    #     After       Before
    #     ~2010-06-22 ~2010-06-22
    #     ----------- -----------
    #     POS         POS
    #     sense_num   sense_num
    #     STAG        s_inf
    #     MISC        STAG
    #     FLD         see, ant
    #     DIAL        MISC
    #     s_inf       DIAL
    #     see, ant    FLD
    #     gloss       gloss
    #     lsrc        lsrc
    #     (P)         (P)
    #
    # DON'T FIXME:
    # Earlier rev of this file commented that we could use
    # tag order to resolve ambiguities (in some cases) and
    # provide better error messages.  However the 2010-06-22
    # change of tag order makes such a change less appealing.

    for tag in tags:
        # Classify the type of tag...
        if tag.isdigit():  # Sense number.
            # Don't do anything with it other than check
            # that it is what was expected.  There is no check
            # for duplicates.
            if int(tag) != snum:
                raise ParseError('Sense number "%s" out of order' % tag)
            continue
        if tag.lower().startswith ('see') or \
              tag.lower().startswith ('ant'):       # XREF
            parse_xrefs(tag, sens)
            continue
        if tag.endswith(' only'):  # STAG
            parse_stags(tag[:-5], sens, kanjs, rdngs)
            continue
        # Strip off any trailing ":" (which dialect tags will have) but
        # change it on a temp variable because we don't know for sure
        # that this is a tag yet,
        if re.match(r'[a-zA-Z0-9,:-]+$',
                    tag):  # Could be pos, misc, dial or fld tags.
            failed, pos, misc, fld, dial = parse_tags(tag, sens, snum)
            if not failed:
                if pos: sens._pos.extend(pos)
                if misc: sens._misc.extend(misc)
                if fld: sens._fld.extend(fld)
                if dial: sens._dial.extend(dial)
                continue
            # If not all the tags were ok, fallthough to following code
            # to process as sense note or gloss prefix.
        if 1:
            # The "1 or ..." bit below disables the extraction of
            # leading parenthesised text into the sense note.
            # This is a temporary change until such behavior can
            # be parameterized (or refined as noted in the comments
            # below re field order), and is to support the need of
            # the tools/jbsubs.py script which uses edparse and where
            # leading parenthesised text is usually text and rarely
            # a note.
            if 1 or sens.notes:
                # If we already found a sense note, then put this
                # current unidentifiable text back onto the first gloss.
                # FIXME? May loose whitespace that was in original line.
                # FIXME: If multiple tags are pushed back onto gloses,
                #  they are put back in reversed (wrong) order and white-
                #  space between then is lost.
                # FIXME: I believe that at least for JB's edict2 file,
                #  fields are always in the same order and that s_inf
                #  comes before stagr/stagk, see/ant, MISC, DIAL, FLD,
                #  so if we have seen any of those fields, we can say
                #  that we are looking at gloss text now.
                #  UPDATE: Around 2010-06-22, jwb changed the order
                #  of the tags in the EDRDG Edict2 file.
                #  The new order, announced on the edict-jmdict list
                #  is: POS, sense number, restrictions, e,g, (XXX only),
                #  misc, field/domain, sense_inf/note, xref/see, xref/ant,
                #  gloss, lsrc stuff, (P) if not on the kanji/kana.
                #raise ParseError ('note="%s", dup="%s"' % (sens.notes, tag))
                if len(glosstxts) < 1: glosstxts.append('')
                glosstxts[0] = '(' + tag + ') ' + glosstxts[0]
            else:
                sens.notes = tag.strip()

    if not sens._pos and prev_pos:
        # If this sense had no part-of-speech tags, inherit them
        # from the previous sense, if any.
        # FIXME: Should make a copy of 'prev_pos' rather than using ref.
        sens._pos = prev_pos

    for gtxt in glosstxts:
        # NOTE(review): return value unused; presumably parse_gloss()
        #  attaches the glosses to 'sens' itself -- confirm.
        glosses = parse_gloss(gtxt, sens)

    return sens
Example #10
0
def parse_spart(txt, entr, fmap):
    """Parse the sense part 'txt' of an edict line and attach the
    resulting Sens objects to 'entr._sens'.

    'txt' is split on "/" into gloss texts; leading parenthesised
    and curly-bracketed texts are collected as per-sense tags, and
    glosses are grouped into senses on sense-number tags.  The
    pseudo-glosses "(P)" (frequency) and "EntL<n>" (sequence number)
    are handled specially.  FREQ info is accumulated in 'fmap';
    raises ParseError on malformed input.
    """
    kanjs = getattr(entr, '_kanj', [])
    rdngs = getattr(entr, '_rdng', [])
    gtxts = txt.split('/')
    if gtxts[-1] == '': gtxts.pop()
    else: pass  #raise ParseError ('No trailing "/" on last gloss')
    senslist = []
    for n, gtxt in enumerate(gtxts):
        new_sense = (n == 0)
        front_opts_list = []
        gtxt = gtxt.strip()
        if gtxt == '': continue
        elif gtxt == '(P)':
            fklist, frlist = fmap[(jdb.KW.FREQ['spec'].id, 1)]
            # If a "spec1" has already been applied to any kanji or
            # reading, this sense (P) tag is redundent and can be ignored.
            if fklist or frlist: continue
            if len(kanjs) > 1 or len(rdngs) > 1:
                # If there is more than 1 kanji or reading, then at least
                # one of them should have a "spec1" tag applied as a result
                # of a required explicit kanji or reading P tag.
                raise ParseError(
                    "P tag in sense, but not in kanji or readings")
            # If there is only one reading and/or kanj a P tag on them is
            # not required so we assign the corresponding "spec1" tag on
            # them here.  (Also assign if multiple kanji/readings since
            # warning has been given.)
            if kanjs: add_spec1(fmap, kanjs[0], "k")
            if rdngs: add_spec1(fmap, rdngs[0], "r")
            continue
        mo = re.match(r'^EntL([0-9]+)', gtxt)
        if mo:
            # A sequence-number pseudo-gloss; valid only as the last gloss.
            if n == len(gtxts) - 1: entr.seq = int(mo.group(1))
            else:
                raise ParseError(
                    "SEQ number pseuo-gloss '%s' in wrong place, %r" %
                    (gtxt, (n, len(gtxts))))
            continue

        # The following regex will match an arbitrary number
        # of sequential parenthesised texts, optionally followed
        # by one curly-bracketed one, at the start of a text string.
        # Note that although most such tags occur on the first
        # gloss of a sense, that is not a requirement, we will
        # apply tags to the current sense regardles of the gloss
        # it occurs with.
        mo = re.match(r'(\([^)]*\)\s*)+\s*({[^}]*})?\s*', gtxt)
        if mo:
            front_opts_txt = mo.group()
            # FIXME: We throw away info on whether a tag occured in
            #  parens or brackets, which implies that we must rely
            #  on FLD tag values being distinct from other tags.
            front_opts_list = re.split(r'[)}]\s*[({]',
                                       front_opts_txt.strip('(){} '))
            # Strip the leading paren'd text off 'gtxt'.
            gtxt = gtxt[mo.end():]

            if not new_sense:
                # See if there is a sense number in the paren'd
                # options.  If so, this gloss is the first one of
                # a new sense.  No need for this check if n==0
                # since the first gloss is always a new sense.
                for x in front_opts_list:
                    if x.isdigit(): new_sense = True
            if not new_sense:
                # We have a gloss that is not the first of a sense
                # but that has parenthesised leading text.  Put the
                # text back on the gloss.
                gtxt = front_opts_txt + gtxt
                front_opts_list = []
        if new_sense:
            # Start a new sense; 'glosses' stays aliased into
            # 'senslist' so later iterations append to the same list.
            glosses = []
            senslist.append((front_opts_list, glosses))
        if gtxt: glosses.append(gtxt)
    senss = parse_senses(senslist, kanjs, rdngs)
    entr._sens = senss