Esempio n. 1
0
def main (args, opts):
        """Parse an examples file and write its entries out through pgi.

        args -- Sequence whose first item is the path of the input file,
            which is read as utf-8 by ABPairReader.
        opts -- Options object.  The attributes read here are: logfile,
            encoding, verbose, corpus, tempdir, noaction, begin, count,
            output and keep.

        Side effects: rebinds the module globals 'msg', 'Opts' and 'KW'
        (as well as jdb.KW) and writes progress dots to stdout.  When
        opts.noaction is true the input is still parsed but no pgi
        work files are created or written.
        """
        global msg
        global Opts; Opts = opts
        global KW; jdb.KW = KW = jdb.Kwds (jdb.std_csv_dir())

          # Create a globally accessible function, msg(), that has
          # 'logfile' and 'opts.verbose' already bound and which will
          # be called elsewhere when there is a need to write a message
          # to the logfile.  The logfile is intentionally left open
          # here because msg() keeps a reference to it.
        logfile = sys.stderr
        if opts.logfile:
            logfile = open (opts.logfile, "w", encoding=opts.encoding)
        def msg (message): _msg (logfile, opts.verbose, message)

        fin = ABPairReader (args[0], encoding='utf-8')
          # Index 8 of the os.stat() result is st_mtime.
          # FIXME: following gives localtime, change to utc or lt+tz.
        mtime = datetime.date.fromtimestamp(os.stat(args[0])[8])
        corpid, corprec \
            = pgi.parse_corpus_opt (opts.corpus, "examples", mtime, KW.SRCT['examples'].id)

          # Only set up the work files when output will really be
          # written.  (Previously pgi.initialize() was also called
          # unconditionally just before this, so it ran twice on a
          # normal run and once, needlessly, in noaction mode.)
        tmpfiles = None
        if not opts.noaction:
            tmpfiles = pgi.initialize (opts.tempdir)
            if corprec: pgi.wrcorp (corprec, tmpfiles)

        for eid, entr in enumerate (parse_ex (fin, opts.begin)):
            if not opts.noaction:
                entr.src = corpid
                  # 'eid' is 0-based; keys are assigned starting at 1.
                jdb.setkeys (entr, eid+1)
                pgi.wrentr (entr, tmpfiles)
              # Progress indicator: one dot for every 2000 entries.
            if not (eid % 2000):
                sys.stdout.write ('.'); sys.stdout.flush()
              # Stop once the requested number of entries was processed.
            if opts.count and eid+1 >= opts.count: break
        sys.stdout.write ('\n')
        if not opts.noaction: pgi.finalize (tmpfiles, opts.output, not opts.keep)
Esempio n. 2
0
def main(args, opts):
    """Parse a JMdict-style XML file and write its contents out through pgi.

    args -- Sequence whose first item is the path of the XML input file.
    opts -- Options object.  The attributes read here are: database,
        lang, tempdir, logfile, encoding, begin, count, extract,
        sequence (indexable; items 0 and 1 are the sequence number
        initial value and increment), corpus, output and keep.

    Side effects: rebinds the module global 'KW' (and, when no database
    is given, jdb.KW), and writes progress dots to stdout.  The input
    file, and the log file if one was opened here, are closed before
    returning, including when parsing raises.
    """
    global KW

    if opts.database:
        jdb.dbOpen(opts.database, **jdb.dbopts(opts))
        KW = jdb.KW
    else:
        jdb.KW = KW = jdb.Kwds(jdb.std_csv_dir())

    xlang = None
    if opts.lang:
        xlang = [KW.LANG[x].id for x in opts.lang.split(',')]

    #FIXME: we open the xml file with utf-8 encoding even though
    # its encoding may be given within the file and may be different.
    with open(args[0], encoding='utf-8') as xmlfile:
        inpf = jmxml.JmdictFile(xmlfile)
        tmpfiles = pgi.initialize(opts.tempdir)
        if opts.logfile:
            logfile = open(opts.logfile, "w", encoding=opts.encoding)
        else:
            logfile = sys.stderr
        try:
            eid = 0
            jmparser = jmxml.Jmparser(KW, logfile=logfile)
            for typ, entr in jmparser.parse_xmlfile(inpf,
                                                    opts.begin,
                                                    opts.count,
                                                    opts.extract,
                                                    xlang,
                                                    toptag=True,
                                                    seqnum_init=opts.sequence[0],
                                                    seqnum_incr=opts.sequence[1]):
                if typ == 'entry':
                    eid += 1
                    # Progress indicator: one dot for every 1800 entries.
                    if not ((eid - 1) % 1800):
                        sys.stdout.write('.')
                        sys.stdout.flush()
                        logfile.flush()
                    # NOTE(review): 'corpid' is only bound by the 'root'
                    # branch below.  If an entry without a 'src' arrives
                    # before that, or parse_corpus_opt() raised KeyError,
                    # this line raises UnboundLocalError.
                    if not getattr(entr, 'src', None): entr.src = corpid
                    jdb.setkeys(entr, eid)
                    pgi.wrentr(entr, tmpfiles)
                elif typ == 'corpus':
                    pgi.wrcorp(entr, tmpfiles)
                elif typ == 'grpdef':
                    pgi.wrgrpdef(entr, tmpfiles)
                elif typ == 'root':
                    # Note that 'entr' here is actually the tag name of the
                    # top-level element in the xml file, typically either
                    # "JMdict" or "JMnedict".
                    try:
                        corpid, corprec = pgi.parse_corpus_opt(
                            opts.corpus, entr, inpf.created, kw=KW)
                    except KeyError:
                        pass
                    else:
                        if corprec: pgi.wrcorp(corprec, tmpfiles)
        finally:
            # Close only a log file we opened ourselves, never sys.stderr.
            if logfile is not sys.stderr:
                logfile.close()

    sys.stdout.write('\n')
    pgi.finalize(tmpfiles, opts.output, not opts.keep)
Esempio n. 3
0
def parse_xmlfile(infn, srcid, workfiles, start, count, langs):
    """Parse a kanjidic XML file and write each <character> entry out.

    infn -- Path of the XML file; it is opened with utf-8 encoding.
    srcid -- Passed through to do_chr() for every entry.
    workfiles -- Passed to pgi.wrentr() as the destination for entries.
    start -- Entries whose <character> start tag was seen at a line
        number below this value are skipped.
    count -- Parsing stops once this many entries have been processed.
    langs -- Passed through to do_chr() for every entry.

    Returns the text of the <date_of_creation> element found in the
    file's <header>, or None if no <header> element was seen.

    Side effects: updates the module global 'Lineno' (used for warning
    messages created by warn()) and writes progress output to stderr.
    """

    global Lineno

    # Use the ElementTree module to parse the kanjidic
    # xml file.  This function keeps track of where
    # we are and for each parsed <character> element, calls
    # do_chr() to actually build a runtime representation
    # of the entry, and then pgi.wrentr() to do the actual
    # writing.

    # Default so that a file with no <header> yields None rather
    # than an UnboundLocalError at the final return.
    xmldate = None

    with open(infn, encoding='utf-8') as rawfile:
        inpf = LnFile(rawfile)
        context = iter(ElementTree.iterparse(inpf, ("start", "end")))
        event, root = next(context)
        if start and start > 1:
            print("Skipping initial entries...", file=sys.stderr)
        cntr = 0
        for event, elem in context:

            # We get here every time a tag is opened (event
            # will be "start") or closed (event will be "end").
            # "elem" is an object containing the element which
            # will be empty when event is "start" and will contain
            # all the element's attributes and child elements
            # when event is "end".  elem.tag is the name of the
            # tag.

            if elem.tag == "character" and event == "start":

                # When we encounter a <character> tag, save the line
                # number.

                Lineno = inpf.lineno  # For warning messages created by warn().

                # If we are skipping entries, cntr will be 0.
                # Otherwise, break if we have processed the
                # number of entries requested in the -c
                # option.

                if cntr >= count: break

            if elem.tag == 'header' and event == 'end':
                xmldate = (elem.find('date_of_creation')).text
                if (elem.find ('file_version')).text != '4' or \
                   (elem.find ('database_version')).text != KANJIDIC_VERSION:
                    warn('Kanjidic XML version is %s but we expected %s.'
                         '\nThis program may or may not work on this file.' %
                         (elem.find('database_version').text, KANJIDIC_VERSION))

            # Otherwise we are processing characters so we want
            # to handle the <character> "end" events but we are
            # not interested in anything else.

            if elem.tag != "character" or event != "end": continue

            # If we haven't reached the starting line number
            # (given by the -b option) yet, then don't process
            # this entry, but we still need to clear the parsed
            # entry before continuing in order to avoid excessive
            # memory consumption.

            if Lineno >= start:

                # Count this entry and announce the start of
                # parsing when it is the first one processed.

                cntr += 1
                if cntr == 1: print("Parsing...", file=sys.stderr)

                # Process and write this entry.

                entr = do_chr(elem, srcid, langs)
                jdb.setkeys(entr, cntr)
                pgi.wrentr(entr, workfiles)

                # A progress bar.  The modulo number is picked
                # to provide slightly less than 80 dots for a full
                # kanjidic2 file.

                if (cntr - 1) % 166 == 0: sys.stderr.write(".")

            # We no longer need the parsed xml info for this
            # item so dump it to reduce memory consumption.

            root.clear()

    return xmldate