Esempio n. 1
0
def main(args, opts):
    """Print a JMaudio XML document describing the database's sound data.

    The DTD text is printed first, then a <JMaudio> root element.  Inside
    it, every row of table "sndvol" is written, each followed by the
    "sndfile" rows that refer to it, and each of those in turn by the
    "snd" rows that refer to that file.

    args -- Not used.
    opts -- Options object; supplies 'encoding', 'database' and whatever
      connection options jdb.dbopts() extracts from it.
    """
    dtd_name = "dtd-audio.xml"

    def emit(pieces):
        # Print the pieces returned by a fmtxml formatter, one per line.
        print("\n".join(pieces))

    jdb.reset_encoding(sys.stdout, opts.encoding)
    dtd_dir = jdb.find_in_syspath(dtd_name)
    print(jdb.get_dtd(dtd_dir + "/" + dtd_name, "JMaudio", opts.encoding))
    print("<JMaudio>")

    cur = jdb.dbOpen(opts.database, **jdb.dbopts(opts))
    for vol in jdb.dbread(cur, "SELECT * FROM sndvol"):
        emit(fmtxml.sndvols([vol]))
        sel_rows = jdb.dbread(
            cur, "SELECT * FROM sndfile s WHERE s.vol=%s", [vol.id])
        for sel in sel_rows:
            emit(fmtxml.sndsels([sel]))
            clip_rows = jdb.dbread(
                cur, "SELECT * FROM snd c WHERE c.file=%s", [sel.id])
            for clip in clip_rows:
                emit(fmtxml.sndclips([clip]))
    print('</JMaudio>')
Esempio n. 2
0
def entr(entr, xslfile=None, xslt=[], want_utf8=False):
    # A slow but simple way to get an Edict2 formatted text for an entry.
    # entr -- A jmdictdb Entr object, or a string containing the xml
    #   of an Entr object, or None.
    # xslfile -- Name of an xslt file.  If the name contains any path
    #   separator characters, it will be used as is.  Otherwise it
    #   will be taken as a plain filename and searched for on the Python
    #   search path (sys.path).  Either way, the resulting file
    #   will be converted to a lxml.etree.XSLT transform object and
    #   applied to the xml from 'entr' (if 'entr' was not None.)
    #   Defaults to 'edict2.xsl'.  Only consulted when no transform
    #   is supplied via 'xslt'.
    # xslt -- May be None, an empty list, or a list of one item which
    #   is a lxml.etree.XSLT transform object that will be applied to
    #   the 'entr' xml.  If an empty list, the xslt file given by
    #   'xslfile' will be converted to a transform and saved in it (for
    #   use in subsequent calls).  If None, 'xslfile' will be converted
    #   to a transform and not saved.
    #   The mutable default value is deliberate: it is the cache that
    #   is shared between calls that do not pass 'xslt'.
    # want_utf8 -- If false, a unicode text string is returned.  If
    #   true, the text is returned utf-8 encoded (bytes under Python 3).
    #
    # Returns None if 'entr' is None (or otherwise false).

    if not xslt:
        if not xslfile: xslfile = 'edict2.xsl'
        # Locate the xsl file on sys.path unless a path was given.
        if '/' not in xslfile and '\\' not in xslfile:
            xsldir = jdb.find_in_syspath(xslfile)
            xslfile = xsldir + '/' + xslfile
        transform = lxml.etree.XSLT(lxml.etree.parse(xslfile))
        if xslt is None:
            # The caller asked for no caching: use a private list.
            # (Slice-assigning into None, as was done previously,
            # raises a TypeError.)
            xslt = [transform]
        else:
            # Save the transform in the caller's list, or in the
            # default value of the 'xslt' parameter, to cache it.
            xslt[:] = [transform]

    if not entr:
        return None

    if isinstance(entr, str):
        xml = entr
    else:
        xml = fmtxml.entr(entr, compat='jmdict')
    # Replace entity references with their bare names ("&xx;" -> "xx").
    # NOTE(review): this also rewrites the predefined XML entities
    # ("&amp;" -> "amp", "&lt;" -> "lt"); confirm that is intended.
    xml = re.sub(r'&([a-zA-Z0-9-]+);', r'\1', xml)
    xml = "<JMdict>%s</JMdict>" % xml
    # Apply the xsl to the xml; str() of the result is a text string.
    edicttxt = str(xslt[0](etree.parse(StringIO(xml)))).rstrip('\n\r')
    if want_utf8:  # Encode the text as utf-8.
        edicttxt = edicttxt.encode('utf-8')
    return edicttxt
Esempio n. 3
0
def main(args, opts):
    """Write database entries as XML to a file or to stdout.

    Entries are read from the database in blocks of 'opts.blocksize' and
    handed to write_entrs().  Which entries are read is controlled by
    either a file of sequence numbers ("--seqfile") or by a starting
    point ("--begin", or the first entry of the selected corpora) and an
    optional maximum ("--count").  Unless "--nodtd" was given, a DTD and
    an enclosing root element are written around the entries.

    args -- Command line arguments.  If non-empty, args[0] is the name
      of the output file; otherwise output goes to sys.stdout.
    opts -- Command line options object.  Attributes read directly
      here: debug, database, root, compat, nodtd, encoding, seqfile,
      corpus, begin, count, blocksize.  'opts.root' is set here if it
      was not supplied.  'opts' is also passed on to jdb.dbopts() and
      write_entrs().

    Progress and diagnostics go to sys.stderr.  Calls sys.exit(1) if no
    starting entry can be found.
    """
    global Debug
    Debug = opts.debug
    # Open the database.  jdb.dbopts() extracts the db-related
    # options from the command line options in 'opts'.
    cur = jdb.dbOpen(opts.database, **jdb.dbopts(opts))

    # If no "--root" option was supplied, choose a default based
    # on the value of the "--compat" option.
    if not opts.root:
        if opts.compat in ('jmnedict', 'jmneold'): opts.root = 'JMnedict'
        else: opts.root = 'JMdict'

    outf = None
    # True only when 'outf' is a file that was opened here and that
    # therefore must be closed here (i.e. it is not sys.stdout).
    outf_opened = False
    if not opts.nodtd:
        # Choose a dtd to use based on the "--compat" option.
        # The dtd file is expected to be located somewhere in the
        # pythonpath (sys.path) directories.
        if opts.compat == 'jmdict': dtd = "dtd-jmdict.xml"
        elif opts.compat == 'jmdicthist': dtd = "dtd-jmdict.xml"
        elif opts.compat == 'jmnedict': dtd = "dtd-jmnedict.xml"
        elif opts.compat == 'jmneold': dtd = "dtd-jmneold.xml"
        else: dtd = "dtd-jmdict-ex.xml"
        dtddir = jdb.find_in_syspath(dtd)
        dtdfn = dtddir + "/" + dtd  # Fully qualified dtd file name.

        # jdb.get_dtd() reads the dtd text, and replaces the root
        # element name and encoding with the values supplied
        # in the arguments.
        dtdtxt = jdb.get_dtd(dtdfn, opts.root, opts.encoding)
        if len(args) == 0: outf = sys.stdout
        else: outf, outf_opened = open(args[0], "w"), True
        jdb.reset_encoding(outf, opts.encoding)
        outf.write(dtdtxt)

    if opts.seqfile:
        #FIXME: we should read these incrementally.
        if opts.seqfile == '-':
            seqtxt = sys.stdin.read()
        else:
            # 'with' closes the file even if the read fails.
            with open(opts.seqfile) as f:
                seqtxt = f.read()
        # seq# separated by sp or nl.
        entrlist = [int(x) for x in seqtxt.split()]

    # Turn the "--corpus" option value into a string that can be
    # and'ed into a SQL WHERE clause to restrict the results to
    # the specified corpora.
    corp_terms = parse_corpus_opt(opts.corpus, 'e.src')

    # If the output file was not opened in the dtd section
    # above, open it now.  We postpone opening it until the
    # last possible moment to avoid creating it and then
    # bombing because there was a typo in the input or dtd
    # filename, etc.
    # FIXME: Should do a "write" function that opens the
    #  file just before writing.
    # NOTE(review): jdb.reset_encoding() is only applied to the output
    #  file in the dtd section above, not here -- confirm that the
    #  "--nodtd" output is meant to use the default encoding.
    if outf is None:
        if len(args) == 0: outf = sys.stdout
        else: outf, outf_opened = open(args[0], "w"), True

    # If compat (jmdict or jmnedict), restrict the xml to Active
    # entries only.
    whr_act = " AND NOT unap AND stat=" + str(
        jdb.KW.STAT['A'].id) if opts.compat else ""
    if opts.begin:
        # If a "--begin" sequence number was given, we need to read
        # the entr record so we can get the src id number.  Complain
        # and exit if not found.  Complain if more than one entry
        # with the requested seq number exists.  More than one may be
        # found since the same sequence number may exist in different
        # corpora, or in the same corpus if an entry was edited.
        #
        #FIXME: no way to select from multiple entries with same seq
        # number.  Might want just the stat="A" entries for example.
        sql = "SELECT id,seq,src FROM entr e WHERE seq=%s%s%s ORDER BY src" \
                % (int(opts.begin), corp_terms, whr_act)
        if Debug: print(sql, file=sys.stderr)
        start = time.time()
        rs = jdb.dbread(cur, sql)
        if Debug:
            print("Time: %s (init read)" % (time.time() - start),
                  file=sys.stderr)
        if not rs:
            print ("No entry with seq '%s' found" \
                                 % opts.begin, file=sys.stderr)
            sys.exit(1)
        if len(rs) > 1:
            print ("Multiple entries having seq '%s' found, results " \
                   "may not be as expected.  Consider using -s to " \
                   "restrict to a single corpus." % (opts.begin), file=sys.stderr)
        lastsrc, lastseq, lastid = rs[0].src, rs[0].seq, rs[0].id
    if not opts.begin and not opts.seqfile:
        # If no "--begin" option, remove the " AND" from the front of
        # the 'corp_terms' string.  Read the first entry (by seq number)
        # in the requested corpora.
        cc = corp_terms[4:] if corp_terms else 'True'
        sql = "SELECT id,seq,src FROM entr e WHERE %s%s ORDER BY src,seq LIMIT 1" % (
            cc, whr_act)
        start = time.time()
        if Debug: print(sql, file=sys.stderr)
        rs = jdb.dbread(cur, sql)
        if Debug:
            print("Time: %s (init read)" % (time.time() - start),
                  file=sys.stderr)
        if not rs:
            # Nothing matches the selection criteria; exit cleanly as
            # in the "--begin" case rather than failing with an
            # IndexError on rs[0] below.
            print("No entries found", file=sys.stderr)
            sys.exit(1)
        lastsrc, lastseq, lastid = rs[0].src, rs[0].seq, rs[0].id

    # Add an enclosing root element only if we are also including
    # a DTD (ie, producing a full XML file).  Otherwise, the file
    # generated will just be a list of <entr> elements.
    if not opts.nodtd:
        if opts.compat:  # Add a date comment...
            today = time.strftime("%Y-%m-%d", time.localtime())
            outf.write("<!-- %s created: %s -->\n" % (opts.root, today))
        outf.write('<%s>\n' % opts.root)

    entrlist_loc = 0
    count = opts.count
    done = 0
    blksize = opts.blocksize
    corpora = set()

    while count is None or count > 0:

        if opts.seqfile:
            # Take the next 'blksize' sequence numbers from the list.
            seqnums = tuple(entrlist[entrlist_loc:entrlist_loc + blksize])
            if not seqnums: break
            entrlist_loc += blksize
            #FIXME: need detection of non-existent seq#s.
            sql = "SELECT id FROM entr e WHERE seq IN %s" + corp_terms + whr_act
            sql_args = [seqnums]
            if Debug: print(sql, sql_args, file=sys.stderr)
            start = time.time()
            tmptbl = jdb.entrFind(cur, sql, sql_args)
        else:
            # In this loop we read blocks of 'blksize' entries.  Each
            # block read is ordered by entr src (i.e. corpus), seq, and
            # id.  The block to read is specified in WHERE clause which
            # is effectively:
            #   WHERE ((e.src=lastsrc AND e.seq=lastseq AND e.id>=lastid)
            #           OR (e.src=lastsrc AND e.seq>lastseq)
            #           OR e.src>lastsrc)
            # For the first block (lastsrc, lastseq, lastid) identify
            # the starting entry found above (which is thus included).
            # For later blocks lastsrc and lastseq are from the last
            # entry in the previous block and lastid is that entry's
            # id plus one (see the bottom of the loop).

            whr = "WHERE ((e.src=%%s AND e.seq=%%s AND e.id>=%%s) " \
                          "OR (e.src=%%s AND e.seq>%%s) " \
                          "OR e.src>%%s) %s%s" % (corp_terms, whr_act)
            sql = "SELECT e.id FROM entr e" \
                  " %s ORDER BY src,seq,id LIMIT %d" \
                   % (whr, blksize if count is None else min (blksize, count))

            # The following args will be substituted for the "%%s" in
            # the sql above, in jdb.entrFind().
            sql_args = [lastsrc, lastseq, lastid, lastsrc, lastseq, lastsrc]

            # Create a temporary table of id numbers and give that to
            # jdb.entrList().  This is an order of magnitude faster than
            # giving the above sql directly to entrList().
            if Debug: print(sql, sql_args, file=sys.stderr)
            start = time.time()
            tmptbl = jdb.entrFind(cur, sql, sql_args)
        mid = time.time()
        entrs, raw = jdb.entrList(cur,
                                  tmptbl,
                                  None,
                                  ord="src,seq,id",
                                  ret_tuple=True)
        end = time.time()
        if Debug:
            print("read %d entries" % len(entrs), file=sys.stderr)
            print("Time: %s (entrFind), %s (entrList)" %
                  (mid - start, end - mid),
                  file=sys.stderr)
        if not entrs: break
        write_entrs(cur, entrs, raw, corpora, opts, outf)

        # Update the 'last*' variables for the next time through
        # the loop.  Also, decrement 'count', if we are counting.
        lastsrc = entrs[-1].src
        lastseq = entrs[-1].seq
        lastid = entrs[-1].id + 1
        if count is not None: count -= blksize
        done += len(entrs)
        if not Debug: sys.stderr.write('.')
        else: print("%d entries written" % done, file=sys.stderr)
    # write(), not writelines(): the latter would iterate over the
    # string and write it one character at a time.
    if not opts.nodtd: outf.write('</%s>\n' % opts.root)
    # Close (and thereby flush) the output file if we opened it.
    if outf_opened: outf.close()
    if not Debug: sys.stderr.write('\n')
    print("Wrote %d entries" % done, file=sys.stderr)