Beispiel #1
0
def merge_freqs (entr):
    """Merge independently parsed freq (a.k.a. prio) tags of an entry.

    Code that builds Entr objects from a textual description parses
    the freq tags of readings and kanji separately and leaves them
    in a temporary "_FREQ" attribute on each Rdng and Kanj object.
    That attribute is a list of 2-tuples: (freq table kw id, freq
    value).  Before the entry is used, tags that are equal must be
    combined so that a rdng/kanj pair sharing a tag refers to one
    Freq object.

    This function groups the readings and kanji by tag, deletes
    every "_FREQ" attribute it encounters, and hands the grouping
    to jdb.make_freq_objs(), which is expected to create the linked
    Freq objects on 'entr'.

    Returns whatever jdb.make_freq_objs() returns (the error list
    it reports).
    """
    # One slot pair per distinct (kw, value) tag:
    #   [0] -> readings carrying the tag, [1] -> kanji carrying it.
    fmap = defaultdict (lambda:([list(),list()]))

    def collect (items, slot):
        # File each item under every tag it carries, then strip the
        # temporary attribute from the item.
        for item in items:
            for kw_val in getattr (item, '_FREQ', []):
                members = fmap[(kw_val)][slot]
                # jdb.isin() serves as the membership test so that
                # an item is never listed twice under one tag
                # (presumably an identity-based "in"; see jdb).
                if jdb.isin (item, members): continue
                members.append (item)
            if hasattr (item, '_FREQ'): del item._FREQ

    # Readings are processed before kanji, as the tag order in
    # 'fmap' follows first appearance.
    collect (getattr (entr, '_rdng', []), 0)
    collect (getattr (entr, '_kanj', []), 1)

    # Every reading/kanji combination recorded for a tag is turned
    # into Freq objects by the jdb helper.
    return jdb.make_freq_objs (fmap, entr)
Beispiel #2
0
def entr(text, simple=False):
    """Parse a textual entry description and return an Entr object.

    'text' consists of a kanji/reading part and a sense part that
    are separated by the first "/" in the string.  Spaces, tabs and
    ideographic spaces (U+3000) directly around that separator are
    discarded.

    Parameters:
      text -- The entry description string.
      simple -- Not used by this function; retained so existing
        callers that pass it keep working.

    Returns:
      The Entr object built from the kanji and readings produced
      by parse_jppart(), after parse_spart() has processed the
      sense part and jdb.make_freq_objs() has processed the
      collected freq tags.

    Raises:
      ParseError -- if 'text' contains no "/" separator, or if
        jdb.make_freq_objs() reports a problem with the freq tags
        (only the first reported problem is raised).
    """
    # Shared freq-tag map that both parse steps fill in and that
    # jdb.make_freq_objs() consumes afterwards.
    fmap = collections.defaultdict(lambda: ([list(), list()]))

    # 'maxsplit' is passed by keyword: the positional form is
    # deprecated as of Python 3.13.  With maxsplit=1 and no capture
    # groups the result has either one item (no separator found)
    # or exactly two.
    parts = re.split('[ \t\u3000]*/[ \t\u3000]*', text, maxsplit=1)
    if len(parts) != 2:
        raise ParseError('Missing KR-S separator, "/"')
    krtxt, stxt = parts

    kanjs, rdngs = parse_jppart(krtxt, fmap)
    entry = Entr(_kanj=kanjs, _rdng=rdngs)
    # Called for its effect on 'entry' and 'fmap'; the return value
    # is not needed here.
    parse_spart(stxt.lstrip(), entry, fmap)

    # Raising inside the loop means the first error aborts the parse.
    for errtyp, r, k, kw, val in jdb.make_freq_objs(fmap, entry):
        raise ParseError("%s freq tag(s) %s%s in %s%s%s" %
                         (errtyp, KW.FREQ[kw].kw, val, k
                          or '', '\u30FB' if k and r else '', r or ''))
    return entry
Beispiel #3
0
    def do_entr(self,
                elem,
                seq,
                xlit=False,
                xlang=None,
                corp_dict=None,
                grpdefs=None):
        """
        Create an entr object from a parsed ElementTree entry
        element, 'elem'.

        Parameters:
          elem -- ElementTree element for one entry.
          seq -- Sequence number to give the entry.  If false
            (None, 0), the number is read from the entry's
            <ent_seq> child element instead.
          xlit, xlang -- Passed unchanged to self.do_senss().
          corp_dict -- Mapping from corpus name to an object with
            an .id attribute.  Only consulted when the entry has
            an <ent_corp> element.
          grpdefs -- Passed unchanged to self.do_groups().

        Returns:
          The newly created jdb.Entr object.

        Raises:
          ParseError -- if no sequence number is available, if it
            is not a positive integer, or if the 'status' attribute
            is not a recognised value.

        Note that the entry object returned is different from one
        read from the database in the following respects:
        * The 'entr' record will have no .src (aka corpus) attribute
          if there is no <ent_corp> element in the entry.  In this
          case the .src attribute is expected to be added by the
          caller.  If there is a <ent_corp> element, it will be
          used to find a corpus in 'corp_dict', which in turn will
          provide an id number used in .src.
        * Items in sense's _xref list are unresolved xrefs, not
          resolved xrefs as in a database entr object.
          jdb.resolv_xref() or similar can be used to resolve the
          xrefs.
        * Attributes will be missing if the corresponding xml
          information is not present.  For example, if a particular
          entry has no <k_ele> elements, the entr object will not
          have a '._kanj' attribute.  In an entr object read from
          the database, it will have a '._kanj' attribute with a
          value of [].
        * The entr object does not have many of the foreign key
          attributes: gloss.gloss, xref.xref, <anything>.entr, etc.
          However, it does have rdng.rdng, kanj.kanj, and sens.sens
          attributes since these are required when adding restr,
          stagr, stagk, and freq objects.
        """
        XKW = self.XKW

        entr = jdb.Entr()

        if not seq:
            elemseq = elem.find('ent_seq')
            if elemseq is None:
                raise ParseError("No <ent_seq> element found")
            try:
                seq = int(elemseq.text)
            except (ValueError, TypeError):
                # TypeError covers an empty <ent_seq/>, whose .text
                # is None.  Report the <ent_seq> text itself, not
                # the text of the enclosing entry element.
                raise ParseError("Invalid 'ent_seq' value, '%s'"
                                 % elemseq.text)
        if seq <= 0:
            # 'seq' may have come from the caller or from the xml;
            # either way it is the value that was rejected.
            raise ParseError("Invalid 'ent_seq' value, '%s'" % (seq,))
        entr.seq = seq

        entr_id = elem.get('id')
        if entr_id is not None:
            entr.id = int(entr_id)
        dfrm = elem.get('dfrm')
        if dfrm is not None:
            entr.dfrm = int(dfrm)

        # A missing or empty 'status' attribute falls back to the
        # 'A' status keyword.
        stat = elem.get('status') or jdb.KW.STAT['A'].id
        try:
            stat = XKW.STAT[stat].id
        except KeyError:
            raise ParseError("Invalid <status> element value, '%s'" % stat)
        entr.stat = stat
        entr.unap = elem.get('appr') == 'n'

        corpname = elem.findtext('ent_corp')
        # NOTE(review): an <ent_corp> name missing from 'corp_dict'
        # (or corp_dict=None) surfaces as KeyError/TypeError here,
        # not as ParseError -- confirm that callers expect this.
        if corpname is not None:
            entr.src = corp_dict[corpname].id

        # The kanji and reading handlers record freq tags in 'fmap';
        # Freq objects are only built when at least one was recorded.
        fmap = defaultdict(lambda: ([], []))
        self.do_kanjs(elem.findall('k_ele'), entr, fmap)
        self.do_rdngs(elem.findall('r_ele'), entr, fmap)
        if fmap:
            freq_errs = jdb.make_freq_objs(fmap, entr)
            # Freq problems are reported through freq_warn() and do
            # not abort processing of the entry.
            for typ, r, k, kw, val in freq_errs:
                kwstr = XKW.FREQ[kw].kw + str(val)
                self.freq_warn(typ, r, k, kwstr)

        # Both <sense> and <trans> elements go through do_senss().
        self.do_senss(elem.findall('sense'), entr, xlit, xlang)
        self.do_senss(elem.findall('trans'), entr, xlit, xlang)
        self.do_info(elem.findall("info"), entr)
        self.do_audio(elem.findall("audio"), entr, jdb.Entrsnd)
        self.do_groups(elem.findall("group"), entr, grpdefs)
        return entr