Esempio n. 1
0
def mkxrslv(idxlist):
    # Turn the $@indexlist produced by bparse() into a list of xrslv
    # table records.  Each item of 'idxlist' is a 5-tuple
    # (ktxt, rtxt, senslst, note, prio).  The foreign-key fields "entr"
    # and "sens" are left unset here; setids() fills them in just
    # before the records are written to the database.
    #
    # Returns the list of records, each numbered consecutively from 1
    # in its "ord" attribute.

    records = []
    for ktxt, rtxt, senslst, note, prio in idxlist:
        if not senslst:
            # No explicit senses were listed, so the cross-ref applies
            # to every sense of the target.  Leaving "tsens" unset
            # yields a NULL in the database record.
            records.append(
                jdb.Obj(ktxt=ktxt,
                        rtxt=rtxt,
                        typ=KW.XREF_uses,
                        notes=note,
                        prio=prio))
            continue
        # Explicit senses were given in the B line: one xrslv record
        # per listed sense.
        for sens in senslst:
            records.append(
                jdb.Obj(ktxt=ktxt,
                        rtxt=rtxt,
                        tsens=sens,
                        typ=KW.XREF_uses,
                        notes=note,
                        prio=prio))
    for ordinal, rec in enumerate(records, start=1):
        rec.ord = ordinal
    return records
Esempio n. 2
0
def main(args, opts):
    # Program entry point: parse the kanjidic XML file named by args[0]
    # and write the result as a .pgi file.
    #
    # 'opts' attributes read here: l (log file name, falsy means
    # stderr), e (log file encoding), o (output file name; falsy means
    # derive it from the input name, "-" is turned into None),
    # g (comma-separated ISO 639-1 language codes), t (passed to
    # pgi.initialize), b and c (passed through to parse_xmlfile),
    # k (its negation is passed to pgi.finalize).
    global Opts, Char, Lineno, KW
    Opts, Char, Lineno = opts, '', 1

    KW = jdb.Kwds(jdb.std_csv_dir())
    jdb.KW = KW

    # Replace the log file name with an open stream.
    opts.l = open(opts.l, "w", encoding=opts.e) if opts.l else sys.stderr

    if not opts.o:
        # Default output name: input file's base name with its
        # extension swapped for ".pgi".
        stem, _ext = os.path.splitext(os.path.basename(args[0]))
        opts.o = stem + ".pgi"
    elif opts.o == "-":
        opts.o = None

    langs = ([KW.LANG[iso639_1_to_2[code]].id for code in opts.g.split(',')]
             if opts.g else None)

    workfiles = pgi.initialize(opts.t)
    srcdate = parse_xmlfile(args[0], 4, workfiles, opts.b, opts.c, langs)
    # Corpus record for kanjidic; the id (4) matches the value handed
    # to parse_xmlfile above.
    srcrec = jdb.Obj(id=4,
                     kw='kanjidic',
                     descr='kanjidic2.xml',
                     dt=srcdate,
                     seq='seq_kanjidic',
                     srct=KW.SRCT['kanjidic'].id)
    pgi.wrcorp(srcrec, workfiles)
    pgi.finalize(workfiles, opts.o, not opts.k)
    print("\nDone!", file=sys.stderr)
Esempio n. 3
0
 def test001(_):
     # Two attributes that reference the *same* list must compare equal
     # and still share a single object after a serialize/unserialize
     # round trip.
     shared = [3, 4, 5]
     original = jdb.Obj(x=shared, y=shared)
     encoded = serialize.serialize(original)
     restored = serialize.unserialize(encoded)
     _.assertEqual(shared, restored.x)
     _.assertEqual(restored.x, restored.y)
     _.assertEqual(id(restored.x), id(restored.y))
Esempio n. 4
0
def mkxref(v):
    # Build an xref record from the resolved cross-reference 'v'.
    # Returns the new record, or None (after logging an error) when the
    # requested target sense number exceeds the target's sense count.
    #
    # When 'v' has no tsens the xref is made to the first sense only.
    # Rationale: revisions prior to ~2018-06-07 generated xrefs to all
    # senses in that case.  With many reverse xrefs to a word from the
    # Example sentences, every sense of the target word ended up with
    # the whole set repeated.  Unless the target has a single sense we
    # are bound to be wrong whichever way we choose (senses similar
    # enough to be interchangeable would not be separate senses), and
    # someone will have to correct it manually later.  So pick the
    # option that clutters the entry least; in many cases the first
    # sense *will* be the right one anyway.
    defaulted = False
    if not v.tsens:
        defaulted = v.nsens != 1
        if defaulted:
            L('multiple senses').warning("using sense 1: %s" % (fs(v)))
        v.tsens = 1
    if v.tsens > v.nsens:
        L('sense number too big').error(fs(v))
        return None
    return jdb.Obj(entr=v.entr,
                   sens=v.sens,
                   xref=v.ord,
                   typ=v.typ,
                   xentr=v.targ,
                   xsens=v.tsens,
                   rdng=v.rdng,
                   kanj=v.kanj,
                   notes=v.notes,
                   nosens=defaulted,
                   lowpri=not v.prio)
Esempio n. 5
0
 def test002(_):
     # Two attributes that reference *distinct but equal* lists must
     # compare equal yet remain separate objects after a
     # serialize/unserialize round trip.
     first = [3, 4, 5]
     second = [3, 4, 5]
     original = jdb.Obj(x=first, y=second)
     encoded = serialize.serialize(original)
     restored = serialize.unserialize(encoded)
     _.assertEqual(first, restored.x)
     _.assertEqual(restored.x, restored.y)
     _.assertNotEqual(id(restored.x), id(restored.y))
Esempio n. 6
0
def js2so (soj):
    # Rebuild an object from 'soj', a JSON-serialized SearchItems
    # object.  For convenience the result is a plain Obj rather than a
    # SearchItems: SearchItems exists to reject unexpected attributes,
    # which is not a concern for data that was already checked before
    # it was serialized.
    #
    # Each element of the decoded "txts" list (if any) is likewise
    # wrapped in an Obj.

    def as_obj (attrs):
        # Wrap a decoded JSON mapping so its keys become attributes.
        wrapper = jdb.Obj()
        wrapper.__dict__ = attrs
        return wrapper

    decoded = json.loads (soj)
    result = as_obj (decoded)
    items = [as_obj (raw) for raw in decoded.get ('txts', [])]
    if items:
        result.txts = items
    return result
Esempio n. 7
0
def labels_from_file(fname):
    # Read a label file and return a list of Obj records, one per line.
    #
    # 'fname' names a UTF-8 text file (a leading BOM is accepted) in
    # which every line holds exactly three tab-separated fields:
    # start, end and a transcription text.  Start and end are parsed
    # as floats, multiplied by 100 and truncated to ints.  Each
    # returned record has attributes 'strt' (scaled start), 'leng'
    # (scaled end minus scaled start) and 'trns' (the text with
    # surrounding whitespace removed).
    #
    # Raises ValueError if a line does not have exactly three fields
    # or if a start/end field is not a valid number.
    rs = []
    # The 'with' block guarantees the file is closed even when a
    # malformed line raises part way through.
    with open(fname, 'r', encoding='utf_8_sig') as f:
        for line in f:
            s, e, trns = line.split('\t')
            # NOTE(review): int() truncates, so a value such as "0.29"
            # becomes 28 because of binary float representation.
            # Confirm whether round() is wanted before changing it.
            strt = int(float(s) * 100)
            end = int(float(e) * 100)
            rs.append(jdb.Obj(strt=strt, leng=end - strt, trns=trns.strip()))
    return rs
Esempio n. 8
0
def initialize(tmpdir):
    # Build the table of work files, one per database table.
    #
    # Returns a dict keyed by table name.  Each value is an Obj with:
    #   ord  -- position of the table in the list below,
    #   tbl  -- the table name,
    #   file -- None (no file is opened here),
    #   fn   -- path "<tmpdir>/_jm_<table>.tmp",
    #   cols -- list of the table's column names.
    tables = (
        ('kwsrc', ['id', 'kw', 'descr', 'dt', 'notes', 'seq', 'sinc',
                   'smin', 'smax', 'srct']),
        ('kwgrp', ['id', 'kw', 'descr']),
        ('entr', ['id', 'src', 'stat', 'seq', 'dfrm', 'unap', 'srcnote',
                  'notes']),
        ('kanj', ['entr', 'kanj', 'txt']),
        ('kinf', ['entr', 'kanj', 'ord', 'kw']),
        ('rdng', ['entr', 'rdng', 'txt']),
        ('rinf', ['entr', 'rdng', 'ord', 'kw']),
        ('restr', ['entr', 'rdng', 'kanj']),
        ('freq', ['entr', 'rdng', 'kanj', 'kw', 'value']),
        ('sens', ['entr', 'sens', 'notes']),
        ('gloss', ['entr', 'sens', 'gloss', 'lang', 'ginf', 'txt']),
        ('pos', ['entr', 'sens', 'ord', 'kw']),
        ('misc', ['entr', 'sens', 'ord', 'kw']),
        ('fld', ['entr', 'sens', 'ord', 'kw']),
        ('dial', ['entr', 'sens', 'ord', 'kw']),
        ('lsrc', ['entr', 'sens', 'ord', 'lang', 'txt', 'part', 'wasei']),
        ('stagr', ['entr', 'sens', 'rdng']),
        ('stagk', ['entr', 'sens', 'kanj']),
        ('xref', ['entr', 'sens', 'xentr', 'xsens', 'typ', 'notes']),
        ('xresolv', ['entr', 'sens', 'typ', 'ord', 'rtxt', 'ktxt', 'tsens',
                     'notes', 'prio']),
        ('hist', ['entr', 'hist', 'stat', 'unap', 'dt', 'userid', 'name',
                  'email', 'diff', 'refs', 'notes']),
        ('grp', ['entr', 'kw', 'ord', 'notes']),
        ('chr', ['entr', 'chr', 'bushu', 'strokes', 'freq', 'grade', 'jlpt',
                 'radname']),
        ('cinf', ['entr', 'kw', 'value']),
        ('kresolv', ['entr', 'kw', 'value']),
        ('sndvol', ['id', 'title', 'loc', 'type', 'idstr', 'corp', 'notes']),
        ('sndfile', ['id', 'vol', 'title', 'loc', 'type', 'notes']),
        ('snd', ['id', 'file', 'strt', 'leng', 'trns', 'notes']),
        ('entrsnd', ['entr', 'ord', 'snd']),
        ('rdngsnd', ['entr', 'rdng', 'ord', 'snd']),
    )

    return {
        name: jdb.Obj(ord=position,
                      tbl=name,
                      file=None,
                      fn="%s/_jm_%s.tmp" % (tmpdir, name),
                      cols=columns)
        for position, (name, columns) in enumerate(tables)
    }
Esempio n. 9
0
def d2o(dict_):
    # Return a new Obj whose attributes mirror the key/value items of
    # 'dict_'.  Any value that int() accepts is stored as an int; all
    # other values are stored unchanged.
    # FIXME: What about floats, bools, datetimes, lists, ...?
    #  Should we consider JSON as an ini file format?
    result = jdb.Obj()
    for name, raw in dict_.items():
        try:
            value = int(raw)
        except (ValueError, TypeError):
            value = raw
        setattr(result, name, value)
    return result
Esempio n. 10
0
 def do_corpus(self, elem):
     # Convert a corpus XML element into an Obj.  'id' (from the
     # element's "id" attribute, as an int) and 'kw' (text of the
     # <co_name> child) are always set; each remaining attribute is
     # set only when its child element has non-empty text.
     corpus = jdb.Obj(id=int(elem.get('id')), kw=elem.findtext('co_name'))
     # (child element tag, attribute name on the result)
     optional_fields = (
         ('co_descr', 'descr'),
         ('co_date', 'dt'),
         ('co_notes', 'notes'),
         ('co_sname', 'seq'),
         ('co_sinc', 'sinc'),
         ('co_smin', 'smin'),
         ('co_smax', 'smax'),
     )
     for tag, attr in optional_fields:
         text = elem.findtext(tag)
         if text:
             setattr(corpus, attr, text)
     return corpus
Esempio n. 11
0
def mkxrefs(v, e):
    # Generate the xref records for one xresolv record 'v' against one
    # target entry 'e'.
    #
    # 'v' supplies entr, sens, typ and (optionally) tsens.  'e' is an
    # indexed sequence: e[0] is used as the target entry id, e[2] and
    # e[3] as the rdng and kanj values, and e[6] as the number of
    # senses in the target (presumably a database row -- verify
    # against the caller).
    #
    # Returns the list of xref Objs created.  If none were created and
    # 'v.tsens' was given, a "Sense not found" message is emitted via
    # msg() and an empty list is returned; if none were created and no
    # tsens was given, ValueError is raised.
    #
    # Side effect: the module-global 'Prev' is updated to the last xref
    # generated, so numbering can continue across calls.
    global Prev
    # Continue numbering after the previously generated xref, if any.
    cntr = 1 + (Prev.xref if Prev else 0)
    xrefs = []
    for s in range(1, e[6] + 1):

        # If there was a sense number given in the xresolv
        # record (field "tsens") then step through the
        # senses until we get to that one and generate
        # an xref only for it.  If there is no tsens,
        # generate an xref for every sense.
        if v.tsens and v.tsens != s: continue

        # The db xref records use column "xref" as an order
        # number and to distinguish between multiple xrefs
        # in the same entr/sens.  We use cntr to maintain
        # its value, and it is reset to 1 here whenever we
        # see an xref record with a new entr or sens value.
        if not Prev or Prev.entr != v.entr \
                    or Prev.sens != v.sens:
            cntr = 1
        xref = jdb.Obj(entr=v.entr,
                       sens=v.sens,
                       xref=cntr,
                       typ=v.typ,
                       xentr=e[0],
                       xsens=s,
                       rdng=e[2],
                       kanj=e[3])
        cntr += 1
        Prev = xref
        xrefs.append(xref)

    if not xrefs:
        # Nothing generated: either the requested sense number is not
        # in 1..e[6], or the target reported no senses at all.
        if v.tsens: msg(fs(v), "Sense not found", kr(v))
        else: raise ValueError("No senses in retrieved entry!")

    return xrefs
Esempio n. 12
0
 def do_grpdef(self, elem):
     # Convert a group-definition XML element into an Obj with 'id'
     # (the element's "id" attribute as an int), 'kw' (text of the
     # <gd_name> child) and, when <gd_descr> has non-empty text,
     # 'descr'.
     group_id = int(elem.get('id'))
     name = elem.findtext('gd_name')
     group = jdb.Obj(id=group_id, kw=name)
     description = elem.findtext('gd_descr')
     if description:
         group.descr = description
     return group
Esempio n. 13
0
def parse_corpus_opt(sopt, roottag, datestamp, srctarg=None, kw=None):
    """
        Return a corpus id number to use in entr.src and, possibly,
        an object describing a corpus (aka kwsrc) record.

        The return value is a 2-tuple (corpid, rec).  'rec' is either
        None or an Obj with the attributes 'id' (id number), 'kw'
        (keyword), 'dt' (datetime stamp), 'seq' (sequence name),
        'sinc' (sequence increment), 'smin' (minimum sequence value),
        'smax' (maximum sequence value) and 'srct' (id number of a
        row in kwsrct).

        [N.B. the kwsrc table also has two other columns, 'descr' and
        'notes' but this function has no provision for setting their
        values.  They can be set explicitly outside this function, or
        updated in the database table after kwsrc is loaded.]

        Parameters:
          sopt -- Comma separated string of one to seven fields, in
            the order: id, kw, dt, sinc, smin, smax, srct (see the
            help message for the (-s, --corpus) option).  Empty
            fields after the first are treated as not given.  May be
            false, in which case everything is derived from the other
            parameters.
          roottag -- String; its lowercased value is used as 'kw' when
            'sopt' does not supply one.
          datestamp -- Value used for 'dt' when 'sopt' does not supply
            one.
          srctarg -- Value used for 'srct' when 'sopt' does not supply
            one.
          kw -- Optional object with a SRCT mapping.  When given, the
            'srct' value chosen so far is used as a key into kw.SRCT
            and replaced by the .id of the item found.

        The procedure is:
         - If 'sopt' has exactly one field, return (int(field), None).
           No record is produced; presumably a kwsrc row with that id
           is expected to already exist in the database.
         - Otherwise take id, kw, dt, sinc, smin, smax and srct from
           the corresponding 'sopt' fields that are present and
           non-empty.
         - If 'kw' is still missing use roottag.lower().
         - If the id is missing (or 0), look it up from 'kw':
           "jmdict" -> 1, "jmnedict" -> 2, "examples" -> 3,
           "kanjidic" -> 4.
         - If 'dt' is missing use 'datestamp'.
         - 'seq' is always "seq_" followed by 'kw'; it cannot be set
           through 'sopt'.
         - 'sinc' defaults to 10.
         - For corpus id 1 only, 'smin' defaults to 1000000 and 'smax'
           to 8999999; for other ids they default to None.
         - If 'srct' is missing use 'srctarg'; if that is also false
           use 'kw' (the corpus keyword string).  Then, if the 'kw'
           parameter was given, resolve it through kw.SRCT as
           described above.

        Raises:
          ValueError -- a numeric 'sopt' field is not an integer, or
            the final 'srct' value is not an int.
          KeyError -- the id had to be looked up and the corpus
            keyword is not one of the four known names.
        """
    corpid = corpnm = corpdt = corpseq = None
    sinc = 10
    smin = None
    smax = None
    srct = None
    if sopt:
        a = sopt.split(',')
        # FIXME: an empty or non-int a[0] raises ValueError.  (a[0]
        #   always exists because str.split returns at least one item.)
        #   Should we raise something more informative and specific?
        corpid = int(a[0])
        if len(a) == 1:
            return corpid, None
        if len(a) > 1 and a[1]: corpnm = a[1]
        if len(a) > 2 and a[2]: corpdt = a[2]
        if len(a) > 3 and a[3]: sinc = int(a[3])
        if len(a) > 4 and a[4]: smin = int(a[4])
        if len(a) > 5 and a[5]: smax = int(a[5])
        if len(a) > 6 and a[6]: srct = int(a[6])

    if not corpnm: corpnm = roottag.lower()
    if not corpid:
        # FIXME: unknown roottag raises KeyError.  Should we raise something
        #   more informative and specific?
        # FIXME: we really shouldn't assign fixed values to the id numbers.
        corpid = {
            'jmdict': 1,
            'jmnedict': 2,
            'examples': 3,
            'kanjidic': 4
        }[corpnm]
    if not corpdt: corpdt = datestamp
    corpseq = "seq_" + corpnm
    if corpid == 1:
        # Sequence range defaults that apply to corpus id 1 only.
        if not smin: smin = 1000000
        if not smax: smax = 8999999
    if not srct: srct = srctarg
    if not srct: srct = corpnm
    # With a keyword table available, 'srct' is treated as a lookup key
    # and replaced by the id of the matching kwsrct row.
    if kw: srct = kw.SRCT[srct].id
    if not isinstance(srct, int):
        raise ValueError("'srct' must be a int, is '%r'" % srct)
    return corpid, jdb.Obj(id=corpid,
                           kw=corpnm,
                           dt=corpdt,
                           seq=corpseq,
                           sinc=sinc,
                           smin=smin,
                           smax=smax,
                           srct=srct)
Esempio n. 14
0
# warning written to the log file but parsing continues.  If the
# version number is greatly older or newer than KANJIDIC_VERSION
# parsing may fail but generally it is not a problem and the different
# versions will still be parsed successfully.
KANJIDIC_VERSION = '2017-123'

# Remap the keywords used in the kanjidic2.xml file to
# the keywords used in the kw* tables.  Those keywords
# not mentioned below have the same text in both places.
# Xml2db has one mapping (XML keyword -> database keyword) per
# keyword domain:
#   RINF -- presumably reading-info keywords (verify against the
#           kwrinf table),
#   LANG -- the iso639_1_to_2 language-code mapping, used as-is,
#   CINF -- presumably character-info keywords (verify against the
#           kwcinf table).
Xml2db = jdb.Obj(RINF={
    'nanori': 'name',
    'jy': 'jouyou',
    'ja_kun': 'kun',
    'ja_on': 'on',
    'kan\'you': 'kanyou'
},
                 LANG=iso639_1_to_2,
                 CINF={
                     'kanji_in_context': 'kanji_in_ctx',
                     'kodansha_compact': 'kodansha_comp',
                     'skip_misclass': 'skip_mis',
                     'stroke_count': 'strokes'
                 })


def main(args, opts):
    global Opts
    Opts = opts
    global Char
    Char = ''
    global Lineno
    Lineno = 1