Example #1
def parse_kradfile(krfile, encoding='euc-jp'):
    """
    Parse KRADFILE & KRADFILE2
    These are typically EUC-JP (JIS X 0212) encoded files, but a different
    encoding can be specified with the 'encoding' parameter
    """
    global krdex
    logthis("Parsing file", suffix=krfile, loglevel=LL.INFO)

    # convert file from EUC-JP to Unicode
    with codecs.open(krfile, 'r', encoding) as f:
        krraw = f.read()

    # parse line-by-line
    kc = 0
    for tline in krraw.splitlines():
        # skip empty lines & comments
        if not tline or tline[0] == '#' or tline[0] == ' ':
            continue
        # split left/right
        try:
            rkan, rrads = tline.split(':')
        except ValueError:
            continue
        tkanji = rkan.strip()
        trads = rrads.strip().split()
        krdex[tkanji] = trads
        kc += 1

    logthis("** KRADFILE Kanji parsed:", suffix=kc, loglevel=LL.INFO)
Example #2
def fetch(url, outfile, encoding='utf-8'):
    """
    fetch a file from a URL and save it; gzip'd files will be decoded automagically
    """
    ofrp = os.path.realpath(os.path.expanduser(outfile))
    logthis("Fetching:",suffix=url,loglevel=LL.INFO)

    # retrieve file and decompress it on-the-fly (as long as it has a Content-Encoding gzip header)
    try:
        r = requests.get(url)
        r.raise_for_status()
    except Exception as e:
        logexc(e, "Failed to retrieve file")
        return False

    # coerce to the correct encoding (either UTF-8 or EUC-JP... usually)
    r.encoding = encoding

    try:
        with codecs.open(ofrp, 'w', encoding) as f:
            f.write(r.text)
    except Exception as e:
        logexc(e, "Failed to write output to %s" % (outfile))
        return False

    logthis(">> Wrote output to",suffix=ofrp,loglevel=LL.INFO)
    return True
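
A hedged usage sketch for the function above: the URL and output path are hypothetical, and the call assumes fetch() and its logging helpers are available as defined here. The explicit encoding matters because requests decodes r.text with r.encoding, and its guess can be wrong for EUC-JP dictionary files.

# hypothetical source URL and destination; encoding forced to EUC-JP as for KRADFILE
# (assumes the server sends Content-Encoding: gzip, per the note in fetch())
ok = fetch("http://example.org/kradfile.gz", "~/kanji-data/kradfile", encoding="euc-jp")
if not ok:
    print("download failed, see log for details")
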
Example #3
def update_set(mdx, indata, setname):
    """
    merge each existing entry with new entry
    """
    updated = 0
    created = 0

    logthis(">> Updating collection:", suffix=setname, loglevel=LL.INFO)

    for tk, tv in indata.iteritems():
        xkan = mdx.findOne(setname, {'_id': tk})
        if xkan:
            # modify existing object with new data
            iobj = xkan
            iobj.update(tv)
        else:
            iobj = tv

        if mdx.upsert(setname, tk, iobj)['updatedExisting']:
            updated += 1
        else:
            created += 1

    logthis("update complete - updated: %d / created: %d / total:" %
            (updated, created),
            prefix=setname,
            suffix=(updated + created),
            loglevel=LL.INFO)
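
The merge above is a shallow dict update: keys present in the incoming entry overwrite the stored document's values, and keys only present in the stored document survive. A small standalone illustration with plain dicts (field values are illustrative; no MongoDB involved):

existing = {"_id": "4e9c", "kanji": u"\u4e9c", "grade": 8, "freq": 1509}
incoming = {"_id": "4e9c", "kanji": u"\u4e9c", "freq": 1509, "jlpt": 1}

merged = dict(existing)   # stand-in for the document fetched via findOne()
merged.update(incoming)   # incoming fields win; 'grade' is kept from existing
print("grade=%s jlpt=%s" % (merged["grade"], merged["jlpt"]))
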
Example #4
def getRelated(neo, klist, inkanji, limit=20):
    """
    get related kanji for input (inkanji)
    pass in a dict of all kanji (klist), and a Neo4j Graph object (neo)
    returns a dict of top (limit) related kanji
    """
    cto = KCounter()
    cto.clear()
    logthis("** Kanji:", suffix=inkanji, loglevel=LL.VERBOSE)

    # match kanji with radicals in the same position
    radmat = neo.cypher.execute(
        u"MATCH (k:Kanji)-[e:CONTAINS]-(r:Radical)-[e2:CONTAINS]-(k2:Kanji) WHERE k.kanji = '%s' AND e.position = e2.position RETURN k2.kanji AS kanji, r.radical AS radical, e.position AS position"
        % (inkanji))
    cto.absorb(radmat, score=5)

    # match kanji with radicals in any position
    radmat = neo.cypher.execute(
        u"MATCH (k:Kanji)-[e:CONTAINS]-(r:Radical)-[:CONTAINS]-(k2:Kanji) WHERE k.kanji = '%s' RETURN k2.kanji AS kanji, r.radical AS radical, e.position AS position"
        % (inkanji))
    cto.absorb(radmat, score=2)

    # match kanji with same SKIP code
    radmat = neo.cypher.execute(
        u"MATCH (k:Kanji)-[:WRITTEN]-(r:Skip)-[:WRITTEN]-(k2:Kanji) WHERE k.kanji = '%s' RETURN k2.kanji AS kanji, r.skip AS skip"
        % (inkanji))
    cto.absorb(radmat, score=3)

    # match kanji with same meaning/sense keywords
    radmat = neo.cypher.execute(
        u"MATCH (k:Kanji)-[:MEANS]-(r:Sense)-[:MEANS]-(k2:Kanji) WHERE k.kanji = '%s' RETURN k2.kanji AS kanji, r.sense AS sense"
        % (inkanji))
    cto.absorb(radmat, score=4)

    # match kanji with same readings
    radmat = neo.cypher.execute(
        u"MATCH (k:Kanji)-[:READS]-(r:Reading)-[:READS]-(k2:Kanji) WHERE k.kanji = '%s' RETURN k2.kanji AS kanji, r.reading AS reading"
        % (inkanji))
    cto.absorb(radmat, score=1)

    # get top related
    rtop = cto.sorted()[:limit]
    okan = {}
    for tt in rtop:
        # get crossref data for this kanji
        ttid = u'%x' % (ord(tt[0]))
        tto = klist.get(ttid)
        if tto:
            tjoyo = tto.get('grade', None)
            tjdex = tto.get('jindex', None)
        else:
            tjoyo = None
            tjdex = None
        okan[tt[0]] = {'score': tt[1], 'joyo': tjoyo, 'jindex': tjdex}
        logthis("-- %s -> %s" % (tt[0], json.dumps(okan[tt[0]])),
                loglevel=LL.DEBUG)

    return okan
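
The Cypher statements above splice inkanji straight into the query string. As a hedged alternative, the same first query can bind it as a parameter instead; this sketch assumes the py2neo 2.x cypher.execute(statement, parameters) interface and the Neo4j 2.x {param} placeholder syntax that the surrounding code appears to target. The helper name is illustrative.

def related_same_position(neo, inkanji):
    """Same radical-in-same-position lookup as getRelated(), with a bound parameter."""
    query = (
        u"MATCH (k:Kanji)-[e:CONTAINS]-(r:Radical)-[e2:CONTAINS]-(k2:Kanji) "
        u"WHERE k.kanji = {inkanji} AND e.position = e2.position "
        u"RETURN k2.kanji AS kanji, r.radical AS radical, e.position AS position")
    # the parameter dict travels alongside the statement, so no string escaping is needed
    return neo.cypher.execute(query, {"inkanji": inkanji})
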
Example #5
def update_mongo(mongo_uri, kdex, jmdict, nedict):
    """
    Insert, upsert, or update entries in MongoDB
    """
    # connect to mongo
    logthis("Connecting to", suffix=mongo_uri, loglevel=LL.INFO)
    mdx = mongo(mongo_uri)

    # Kanji
    update_set(mdx, kdex, 'kanji')

    # JMDict
    update_set(mdx, jmdict, 'jmdict')

    # JMnedict
    update_set(mdx, nedict, 'jmnedict')
Example #6
def run(xconfig):
    """entry point"""
    # get output directory
    if xconfig.run.output:
        outpath = os.path.realpath(xconfig.run.output)
        if not os.path.exists(outpath):
            failwith(ER.NOTFOUND, "Specified directory not found")
        elif not os.path.isdir(outpath):
            failwith(ER.CONF_BAD, "Must specify a directory, not a file")
    else:
        failwith(ER.CONF_BAD, "No output directory specified")

    logthis("Output directory:", suffix=outpath, loglevel=LL.INFO)

    # fetch files
    fails = 0
    for tf in fmanifest:
        tfo = re.match(r'^https?://.+/([^/]+)\.gz$', tf['url']).group(1)
        if not fetch(tf['url'], outpath + '/' + tfo, tf.get('encoding', 'utf-8')):
            fails += 1

    return fails
Example #7
def run(xconfig):
    """
    compile lists of related Kanji using Neo4j
    """
    # check for extra options
    margs = xconfig.run.modargs

    # connect to Mongo
    mgx = mongo(xconfig.mongo.uri)

    # get all kanji from Mongo
    kset = mgx.find("kanji", {}, rdict=True)
    logthis("** Kanji objects:", suffix=len(kset), loglevel=LL.INFO)

    # connect to Neo4j
    try:
        neo = py2neo.Graph(xconfig.neo4j.uri)
        ncount = neo.cypher.execute(
            'MATCH (n) RETURN count(*) AS ncount')[0]['ncount']
    except Exception as e:
        logexc(e, "Failed to connect to Neo4j dataset")
        failwith(ER.PROCFAIL, "Unable to continue. Aborting.")

    logthis("** Nodes in Neo4j dataset:", suffix=ncount, loglevel=LL.INFO)

    # check through all kanji
    hasRelated = 0
    for tkan in kset:
        # get the top 20 highest-scoring related nodes
        trel = getRelated(neo, kset, kset[tkan]['kanji'])
        # update kanji entry in Mongo
        mgx.update_set("kanji", tkan, {'krelated': trel})
        if len(trel) > 0:
            hasRelated += 1

    logthis("** Complete. Kanji with krelated data:",
            suffix=hasRelated,
            loglevel=LL.INFO)
Example #8
def run(xconfig):
    """
    build graph in Neo4j based on the Kanji dataset
    """
    # check for extra options
    margs = xconfig.run.modargs

    # connect to Mongo
    mgx = mongo(xconfig.mongo.uri)

    # get all kanji from Mongo
    kset = mgx.find("kanji", {})
    logthis("** Kanji objects:", suffix=len(kset), loglevel=LL.INFO)

    # connect to Neo4j
    try:
        neo = py2neo.Graph(xconfig.neo4j.uri)
        ncount = neo.cypher.execute('MATCH (n) RETURN count(*)')
    except Exception as e:
        logexc(e, "Failed to connect to Neo4j dataset")
        failwith(ER.PROCFAIL, "Unable to continue. Aborting.")

    # if 'clear' is passed as an extra parg, then drop all existing nodes/rels
    if 'clear' in margs:
        logthis("Deleting existing data...", loglevel=LL.INFO)
        neo.cypher.execute("MATCH (n) DETACH DELETE n")

    # create node constraints
    logthis("Creating constraints...", loglevel=LL.VERBOSE)
    neo.cypher.execute(
        "CREATE CONSTRAINT ON (k:Kanji) ASSERT k.kanji IS UNIQUE")
    neo.cypher.execute(
        "CREATE CONSTRAINT ON (r:Radical) ASSERT r.radical IS UNIQUE")
    neo.cypher.execute(
        "CREATE CONSTRAINT ON (s:Sense) ASSERT s.sense IS UNIQUE")
    neo.cypher.execute(
        "CREATE CONSTRAINT ON (r:Reading) ASSERT r.reading IS UNIQUE")
    neo.cypher.execute("CREATE CONSTRAINT ON (g:Joyo) ASSERT g.joyo IS UNIQUE")
    neo.cypher.execute("CREATE CONSTRAINT ON (g:Jlpt) ASSERT g.jlpt IS UNIQUE")
    neo.cypher.execute("CREATE CONSTRAINT ON (k:Skip) ASSERT k.skip IS UNIQUE")

    # Build nodes & relationships
    logthis("** Building graph...", loglevel=LL.INFO)
    for kk, tk in kset.iteritems():
        logthis(">>>------[ %5d ] Kanji node <%s> -----" % (kk, tk['kanji']),
                loglevel=LL.DEBUG)

        # Kanji
        try:
            freq = int(tk['freq'])
        except (KeyError, TypeError, ValueError):
            freq = 0
        knode = py2neo.Node("Kanji",
                            kanji=tk['kanji'],
                            ucs=tk['_id'],
                            freq=freq)

        # Radicals
        xnodes = []
        xrels = []
        if tk.has_key('xrad') and len(tk['xrad']) > 0:
            for tr, tv in tk['xrad'].iteritems():
                # check if a radical exists in db.radical
                rrad = mgx.findOne("radical", {"radical": tr})
                xrad = {}
                if rrad:
                    xrad = {
                        "rad_id": rrad['_id'],
                        "alt": rrad['alt'],
                        "radname": rrad['radname']['ja'],
                        "radname_en": rrad['radname']['en']
                    }
                else:
                    rrad = mgx.findOne("kanji", {"kanji": tr})
                    if rrad:
                        # Create Kanji-Kanji relationship
                        xrad = False
                        try:
                            freq = int(rrad['freq'])
                        except (KeyError, TypeError, ValueError):
                            freq = 0
                        xnodes.append(
                            py2neo.Node("Kanji",
                                        kanji=rrad['kanji'],
                                        ucs=rrad['_id'],
                                        freq=freq))
                        xrels.append(
                            py2neo.Relationship(knode,
                                                "CONTAINS",
                                                xnodes[-1],
                                                position=tv.get(
                                                    'position', None)))
                    else:
                        xrad = {"non_standard": True}
                if xrad:
                    xnodes.append(py2neo.Node("Radical", radical=tr, **xrad))
                    xrels.append(
                        py2neo.Relationship(knode,
                                            "CONTAINS",
                                            xnodes[-1],
                                            position=tv.get('position', None)))

        elif tk.has_key('krad'):
            for tr in tk['krad']:
                # check if a radical exists in db.radical
                rrad = mgx.findOne("radical", {"radical": tr})
                xrad = {}
                if rrad:
                    xrad = {
                        "rad_id": rrad['_id'],
                        "alt": rrad['alt'],
                        "radname": rrad['radname']['ja'],
                        "radname_en": rrad['radname']['en']
                    }
                else:
                    rrad = mgx.findOne("kanji", {"kanji": tr})
                    if rrad:
                        # Create Kanji-Kanji relationship
                        xrad = False
                        try:
                            freq = int(rrad['freq'])
                        except (KeyError, TypeError, ValueError):
                            freq = 0
                        xnodes.append(
                            py2neo.Node("Kanji",
                                        kanji=rrad['kanji'],
                                        ucs=rrad['_id'],
                                        freq=freq))
                        xrels.append(
                            py2neo.Relationship(knode, "CONTAINS", xnodes[-1]))
                    else:
                        xrad = {"non_standard": True}
                if xrad:
                    xnodes.append(py2neo.Node("Radical", radical=tr, **xrad))
                    xrels.append(
                        py2neo.Relationship(knode, "CONTAINS", xnodes[-1]))

        # Senses
        if tk.has_key('meaning') and tk['meaning'].get('en'):
            for ts in tk['meaning']['en']:
                xnodes.append(py2neo.Node("Sense", sense=ts, lang="en"))
                xrels.append(py2neo.Relationship(knode, "MEANS", xnodes[-1]))

        # Readings (on-yomi, kun-yomi, nanori)
        if tk.has_key('reading'):
            if tk['reading'].has_key('ja_on'):
                for tr in tk['reading']['ja_on']:
                    xnodes.append(py2neo.Node("Reading", reading=tr))
                    xrels.append(
                        py2neo.Relationship(knode,
                                            "READS",
                                            xnodes[-1],
                                            yomi="on"))

            if tk['reading'].has_key('ja_kun'):
                for tr in tk['reading']['ja_kun']:
                    xnodes.append(py2neo.Node("Reading", reading=tr))
                    xrels.append(
                        py2neo.Relationship(knode,
                                            "READS",
                                            xnodes[-1],
                                            yomi="kun"))

            if tk['reading'].has_key('nanori'):
                for tr in tk['reading']['nanori']:
                    xnodes.append(py2neo.Node("Reading", reading=tr))
                    xrels.append(
                        py2neo.Relationship(knode,
                                            "READS",
                                            xnodes[-1],
                                            yomi="nanori"))

        # Joyo
        if tk.has_key('grade') and tk.has_key('jindex'):
            xnodes.append(py2neo.Node("Joyo", joyo=int(tk['grade'])))
            xrels.append(
                py2neo.Relationship(xnodes[-1],
                                    "SUBSET",
                                    knode,
                                    jindex=tk['jindex']))

        # JLPT
        if tk.has_key('jlpt') and isinstance(tk['jlpt'], int):
            xnodes.append(py2neo.Node("Jlpt", jlpt=int(tk['jlpt'])))
            xrels.append(py2neo.Relationship(xnodes[-1], "SUBSET", knode))

        # SKIP
        if tk.has_key('qcode') and tk['qcode'].has_key('skip'):
            xnodes.append(py2neo.Node("Skip", skip=tk['qcode']['skip']))
            xrels.append(py2neo.Relationship(knode, "WRITTEN", xnodes[-1]))

        # Create Kanji node
        try:
            neo.create(knode)
        except Exception as e:
            logexc(e, u'Failed to create Kanji node')

        # Create nodes
        for tnode in xnodes:
            try:
                neo.create(tnode)
            except Exception as e:
                logexc(e, u'Failed to create aux node')

        # Build relations
        for trel in xrels:
            # Check if Nodes are bound
            sn = trel.start_node
            en = trel.end_node
            # if start node is not bound, then attempt a lookup
            if not sn.bound:
                nlab = list(sn.labels)[0]
                nsn = neo.find_one(nlab, nlab.lower(), sn[nlab.lower()])
                if nsn:
                    logthis(">>> Xref OK: %s '%s'" % (nlab, sn[nlab.lower()]),
                            loglevel=LL.DEBUG)
                    sn = nsn
            # if end node is not bound, then attempt a lookup
            if not en.bound:
                elab = list(en.labels)[0]
                nen = neo.find_one(elab, elab.lower(), en[elab.lower()])
                if nen:
                    logthis(">>> Xref OK: %s '%s'" % (elab, en[elab.lower()]),
                            loglevel=LL.DEBUG)
                    en = nen
            # Rebuild relationship
            rrel = py2neo.Relationship(sn, trel.type, en, **trel.properties)
            try:
                neo.create_unique(rrel)
            except Exception as e:
                logexc(e, "Failed to build relationship")
Example #9
def run(xconfig):
    """kvgbuild entry point"""
    # get input directory
    if xconfig.run.infile:
        inpath = os.path.realpath(xconfig.run.infile)
        if not os.path.exists(inpath):
            failwith(ER.NOTFOUND, "Specified directory not found")
        elif not os.path.isdir(inpath):
            failwith(ER.CONF_BAD, "Must specify a directory, not a file")
    else:
        failwith(ER.CONF_BAD, "No input directory specified")

    # get output directory
    if xconfig.run.output:
        outpath = os.path.realpath(xconfig.run.output)
        if not os.path.exists(outpath):
            failwith(ER.NOTFOUND, "Specified directory not found")
        elif not os.path.isdir(outpath):
            failwith(ER.CONF_BAD, "Must specify a directory, not a file")
    else:
        failwith(ER.CONF_BAD, "No output directory specified")

    # check for extra options
    margs = xconfig.run.modargs
    if "norender" in margs:
        render = False
    else:
        render = True

    kflist = os.listdir(inpath)
    ftots = len(kflist)
    fmatchy = re.compile(r'^[0-9a-f]{5}\.svg$')
    ftit = 0

    # build PNG images
    if render:
        logthis("Rendering stroke-order diagrams...",loglevel=LL.INFO)
        for tf in kflist:
            ftit += 1
            if fmatchy.match(tf):
                logthis("[ %i / %i ] Processing" % (ftit,ftots),suffix=tf,loglevel=LL.VERBOSE)
                thiskvg = openKvg(inpath+'/'+tf)
                colorGroups(thiskvg)
                thispng = inkscapeKvg(thiskvg)

                pf = open(outpath + '/' + tf.split('.')[0] + '.png','w')
                pf.write(thispng)
                pf.close()

                # destroy XML DOM object
                thiskvg.unlink()

    # connect to mongo
    logthis("Connecting to",suffix=xconfig.mongo.uri,loglevel=LL.INFO)
    mdx = mongo(xconfig.mongo.uri)

    # parse radical data
    logthis("Parsing kanji radical data...",loglevel=LL.INFO)
    for tf in kflist:
        ftit += 1
        if fmatchy.match(tf):
            radlist = {}
            kid = os.path.splitext(os.path.split(tf)[1])[0]
            logthis("[ %i / %i ] Processing" % (ftit,ftots),suffix="%s [%s]" % (tf,kid),loglevel=LL.VERBOSE)

            # read
            with codecs.open(inpath + '/' + tf, 'r', 'utf-8') as f:
                thiskvg = f.read()

            # parse with BS4 & lxml
            bs = BeautifulSoup(thiskvg, 'xml')
            kvg = bs.find('g', { 'id': "kvg:"+kid })

            # weed out the nasties
            if not kvg:
                logthis("XPath /svg/g/g[id:%s] not found" % (kid), loglevel=LL.ERROR)
                continue
            kanji = kvg.attrs.get('element', None)
            if not kanji:
                continue

            # parse attributes
            for tg in kvg.find_all('g'):
                trad = tg.attrs
                if trad.has_key('id'): del(trad['id'])

                if not trad.has_key('original'):
                    if trad.has_key('element'):
                        trad['original'] = tg.attrs['element']
                    else:
                        logthis("Skipping:",suffix=trad,loglevel=LL.DEBUG)
                        continue

                # get position from parent if not specified
                if not trad.has_key('position'):
                    if tg.parent.attrs.get('position', None):
                        trad['position'] = tg.parent.attrs.get('position', None)
                    elif tg.parent.parent.attrs.has_key('position'):
                        trad['position'] = tg.parent.parent.attrs.get('position', None)

                # handle kanji with multiple instances of the same radical
                if radlist.has_key(trad['original']):
                    logthis("---- MERGE:\n%s" % (print_r(radlist[trad['original']])),ccode=C.WHT,loglevel=LL.DEBUG)
                    # merge sets
                    radlist[trad['original']] = setMerge(radlist[trad['original']], trad)
                    # create a 'count' key and increment it
                    if radlist[trad['original']].has_key('count'):
                        radlist[trad['original']]['count'] = int(radlist[trad['original']]['count']) + 1
                    else:
                        radlist[trad['original']]['count'] = 2
                else:
                    radlist[trad['original']] = trad

            mdx.update_set('kanji', "%x" % ord(kanji), {'xrad': radlist})
            logthis("** Committed entry:\n", suffix=print_r(radlist), loglevel=LL.DEBUG)
Example #10
def parse_jmdict(kdfile, seqbase=3000000):
    """
    Parse JMDict/JMnedict XML files
    """
    global krdex
    logthis("Parsing JMDict/JMnedict XML file",
            suffix=kdfile,
            loglevel=LL.INFO)

    # parse XML as a stream using lxml etree parser
    elist = {}
    curEntry = {}
    entries = 0
    entList = {}
    revEntList = {}
    for event, elem in etree.iterparse(kdfile, events=('end', 'start-ns')):
        if event == "end" and elem.tag == "entry":
            # resolve entities
            if not entList:
                entList, revEntList = resolveEntities(
                    elem.getroottree().docinfo.internalDTD.entities())

            # ent_seq
            curEntry['ent_seq'] = elem.find('ent_seq').text

            # set _id
            curEntry['_id'] = curEntry['ent_seq']

            ## k_ele
            kf_pmax = 0
            if elem.find('k_ele') is not None:
                curEntry['k_ele'] = []
                for sv in elem.findall('k_ele'):
                    kele = {}

                    # k_ele.keb
                    kele['keb'] = sv.find('keb').text

                    # k_ele.ke_inf
                    for ssv in sv.findall('ke_inf'):
                        if not kele.has_key('ke_inf'):
                            kele['ke_inf'] = {}
                        kele['ke_inf'][revEntList[ssv.text]] = ssv.text

                    # k_ele.ke_pri
                    for ssv in sv.findall('ke_pri'):
                        if not kele.has_key('ke_pri'):
                            kele['ke_pri'] = []
                        kele['ke_pri'].append(ssv.text)
                        kf_pmax += priodex[ssv.text]

                    curEntry['k_ele'].append(kele)

            curEntry['kf_pmax'] = kf_pmax

            ## r_ele
            rf_pmax = 0
            if elem.find('r_ele') is not None:
                curEntry['r_ele'] = []
                for sv in elem.findall('r_ele'):
                    rele = {}

                    # r_ele.reb
                    rele['reb'] = sv.find('reb').text

                    # r_ele.re_nokanji
                    if sv.find('re_nokanji') is not None:
                        rele['re_nokanji'] = True

                    # r_ele.restr
                    for ssv in sv.findall('re_restr'):
                        if not rele.has_key('re_restr'):
                            rele['re_restr'] = []
                        rele['re_restr'].append(ssv.text)

                    # r_ele.re_inf
                    for ssv in sv.findall('re_inf'):
                        if not rele.has_key('re_inf'):
                            rele['re_inf'] = {}
                        rele['re_inf'][revEntList[ssv.text]] = ssv.text

                    # r_ele.re_pri
                    for ssv in sv.findall('re_pri'):
                        if not rele.has_key('re_pri'):
                            rele['re_pri'] = []
                        rele['re_pri'].append(ssv.text)
                        rf_pmax += priodex[ssv.text]

                    curEntry['r_ele'].append(rele)

            curEntry['rf_pmax'] = rf_pmax

            ## sense (JMDict)
            if elem.find('sense') is not None:
                curEntry['sense'] = []
                for sv in elem.findall('sense'):
                    sen = {}

                    # sense.stagk
                    for ssv in sv.findall('stagk'):
                        if not sen.has_key('stagk'):
                            sen['stagk'] = []
                        sen['stagk'].append(ssv.text)

                    # sense.stagr
                    for ssv in sv.findall('stagr'):
                        if not sen.has_key('stagr'):
                            sen['stagr'] = []
                        sen['stagr'].append(ssv.text)

                    # sense.xref
                    for ssv in sv.findall('xref'):
                        if not sen.has_key('xref'):
                            sen['xref'] = []
                        sen['xref'].append(ssv.text)

                    # sense.ant
                    for ssv in sv.findall('ant'):
                        if not sen.has_key('ant'):
                            sen['ant'] = []
                        sen['ant'].append(ssv.text)

                    # sense.pos
                    for ssv in sv.findall('pos'):
                        if not sen.has_key('pos'):
                            sen['pos'] = {}
                        sen['pos'][revEntList[ssv.text]] = ssv.text

                    # sense.field
                    for ssv in sv.findall('field'):
                        if not sen.has_key('field'):
                            sen['field'] = {}
                        sen['field'][revEntList[ssv.text]] = ssv.text

                    # sense.misc
                    for ssv in sv.findall('misc'):
                        if not sen.has_key('misc'):
                            sen['misc'] = {}
                        sen['misc'][revEntList[ssv.text]] = ssv.text

                    # sense.lsource
                    for ssv in sv.findall('lsource'):
                        if not sen.has_key('lsource'):
                            sen['lsource'] = []
                        sen['lsource'].append(ssv.text)

                    # sense.dial
                    for ssv in sv.findall('dial'):
                        if not sen.has_key('dial'):
                            sen['dial'] = []
                        sen['dial'].append(ssv.text)

                    # sense.gloss
                    if sv.find('gloss') is not None:
                        sen['gloss'] = {}
                        for ssv in sv.findall('gloss'):
                            if len(ssv.attrib):
                                mlang = ssv.attrib.values()[0]
                            else:
                                mlang = "eng"
                            if not sen['gloss'].has_key(mlang):
                                sen['gloss'][mlang] = []
                            sen['gloss'][mlang].append(ssv.text)

                    # sense.example
                    for ssv in sv.findall('example'):
                        if not sen.has_key('example'):
                            sen['example'] = []
                        sen['example'].append(ssv.text)

                    # sense.s_inf
                    for ssv in sv.findall('s_inf'):
                        if not sen.has_key('s_inf'):
                            sen['s_inf'] = []
                        sen['s_inf'].append(ssv.text)

                    # sense.pri
                    for ssv in sv.findall('pri'):
                        if not sen.has_key('pri'):
                            sen['pri'] = []
                        sen['pri'].append(ssv.text)

                    curEntry['sense'].append(sen)

            ## trans (JMnedict)
            if elem.find('trans') is not None:
                curEntry['trans'] = []
                for sv in elem.findall('trans'):
                    tran = {}

                    # trans.name_type
                    for ssv in sv.findall('name_type'):
                        if not tran.has_key('name_type'):
                            tran['name_type'] = []
                        tran['name_type'].append(ssv.text)

                    # trans.xref
                    for ssv in sv.findall('xref'):
                        if not tran.has_key('xref'):
                            tran['xref'] = []
                        tran['xref'].append(ssv.text)

                    # trans.trans_det
                    if sv.find('trans_det') is not None:
                        tran['trans_det'] = {}
                        for ssv in sv.findall('trans_det'):
                            if len(ssv.attrib):
                                mlang = ssv.attrib.values()[0]
                            else:
                                mlang = "eng"
                            if not tran['trans_det'].has_key(mlang):
                                tran['trans_det'][mlang] = []
                            tran['trans_det'][mlang].append(ssv.text)

                    curEntry['trans'].append(tran)

            elist[curEntry['_id']] = copy.deepcopy(curEntry)
            logthis("Commited entry:\n",
                    suffix=print_r(curEntry),
                    loglevel=LL.DEBUG)
            curEntry.clear()
            elem.clear()
            entries += 1

    logthis("** Entries parsed:", suffix=entries, loglevel=LL.INFO)
    return elist
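
The parser above leans on lxml's streaming iterparse: it handles each <entry> on its end event and then calls elem.clear() so the in-memory tree never grows to the size of the full JMdict file. A minimal standalone sketch of that pattern (the sample XML is inlined and purely illustrative):

from io import BytesIO
from lxml import etree

sample = BytesIO(b"<root><entry><ent_seq>1000000</ent_seq></entry>"
                 b"<entry><ent_seq>1000010</ent_seq></entry></root>")

count = 0
for event, elem in etree.iterparse(sample, events=('end',)):
    if elem.tag == 'entry':
        count += 1
        print(elem.find('ent_seq').text)
        # free the finished element so memory stays flat on huge files
        elem.clear()
print("entries parsed: %d" % count)
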
Example #11
def parse_kanjidic(kdfile):
    """
    Parse KanjiDic2 XML file
    """
    global krdex
    logthis("Parsing KanjiDic2 XML file", suffix=kdfile, loglevel=LL.INFO)

    # parse XML as a stream using lxml etree parser
    elist = {}
    curEntry = {}
    entries = 0
    for event, elem in etree.iterparse(kdfile, events=('end', 'start-ns')):
        if event == "end" and elem.tag == "character":
            # kanji
            curEntry['kanji'] = elem.find('literal').text

            # code points
            curEntry['codepoint'] = {}
            for sv in elem.find('codepoint').findall('cp_value'):
                curEntry['codepoint'][sv.attrib['cp_type']] = sv.text

            # radicals
            curEntry['radical'] = {}
            for sv in elem.find('radical').findall('rad_value'):
                curEntry['radical'][sv.attrib['rad_type']] = sv.text

            ## misc
            misc = elem.find('misc')

            # grade/joyo level
            # NEW: now returns an integer; for hyougai kanji, this will be zero
            if misc.find('grade') is not None:
                curEntry['grade'] = int(misc.find('grade').text)
            else:
                curEntry['grade'] = 0

            # stroke_count
            # NEW: this will now *always* be an array of ints, whereas before it was
            # a regular string *most* of the time, and an array of strings *sometimes*
            curEntry['stroke_count'] = []
            for sv in misc.findall('stroke_count'):
                curEntry['stroke_count'].append(int(sv.text))

            # freq
            # NEW: this is now converted to an int;
            # if it does not exist, it is null instead of an empty string
            if misc.find('freq') is not None:
                curEntry['freq'] = int(misc.find('freq').text)
            else:
                curEntry['freq'] = None

            # jlpt
            # NEW: if it does not exist, it is null instead of an empty string
            if misc.find('jlpt') is not None:
                curEntry['jlpt'] = int(misc.find('jlpt').text)
            else:
                curEntry['jlpt'] = None

            # variant
            # NEW: this field was not previously parsed by edparse
            curEntry['variant'] = {}
            for sv in misc.findall('variant'):
                if curEntry['variant'].has_key(sv.attrib['var_type']):
                    curEntry['variant'][sv.attrib['var_type']].append(sv.text)
                else:
                    curEntry['variant'][sv.attrib['var_type']] = [sv.text]

            # xref
            curEntry['xref'] = {}
            if elem.find('dic_number') is not None:
                for sv in elem.find('dic_number').findall('dic_ref'):
                    if sv.attrib['dr_type'] == "moro":
                        curEntry['xref'][
                            sv.attrib['dr_type']] = "%02d:%04d/%s" % (
                                int(sv.attrib.get('m_vol', 0)),
                                int(sv.attrib.get('m_page', 0)), sv.text)
                    else:
                        curEntry['xref'][sv.attrib['dr_type']] = sv.text

            # qcode
            # NEW: now handles skip_misclass; types with multiple entries are coerced to lists
            curEntry['qcode'] = {}
            if elem.find('query_code') is not None:
                for sv in elem.find('query_code').findall('q_code'):
                    if sv.attrib.has_key('skip_misclass'):
                        if not curEntry['qcode'].has_key('skip_misclass'):
                            curEntry['qcode']['skip_misclass'] = []
                        curEntry['qcode']['skip_misclass'].append({
                            'misclass':
                            sv.attrib['skip_misclass'],
                            'skip':
                            sv.text
                        })
                    else:
                        if curEntry['qcode'].has_key(sv.attrib['qc_type']):
                            # convert to list if we encounter another entry
                            if not isinstance(
                                    curEntry['qcode'][sv.attrib['qc_type']],
                                    list):
                                curEntry['qcode'][sv.attrib['qc_type']] = [
                                    curEntry['qcode'][sv.attrib['qc_type']]
                                ]
                            curEntry['qcode'][sv.attrib['qc_type']].append(
                                sv.text)
                        else:
                            curEntry['qcode'][sv.attrib['qc_type']] = sv.text

            ## reading & meaning & nanori
            curEntry['reading'] = {}
            curEntry['meaning'] = {}
            if elem.find('reading_meaning') is not None:
                # nanori
                curEntry['reading']['nanori'] = []
                for sv in elem.find('reading_meaning').findall('nanori'):
                    curEntry['reading']['nanori'].append(sv.text)

                if elem.find('reading_meaning').find('rmgroup') is not None:
                    # reading
                    for sv in elem.find('reading_meaning').find(
                            'rmgroup').findall('reading'):
                        if not curEntry['reading'].has_key(
                                sv.attrib['r_type']):
                            curEntry['reading'][sv.attrib['r_type']] = []
                        curEntry['reading'][sv.attrib['r_type']].append(
                            sv.text)

                    # meaning
                    for sv in elem.find('reading_meaning').find(
                            'rmgroup').findall('meaning'):
                        if sv.attrib.has_key('m_lang'):
                            mlang = sv.attrib['m_lang']
                        else:
                            mlang = 'en'
                        if not curEntry['meaning'].has_key(mlang):
                            curEntry['meaning'][mlang] = []
                        curEntry['meaning'][mlang].append(sv.text)

            # krad: crossref radicals
            if krdex.has_key(curEntry['kanji']):
                curEntry['krad'] = krdex[curEntry['kanji']]

            # set _id for Mongo
            curEntry['_id'] = curEntry['codepoint']['ucs']

            elist[curEntry['_id']] = copy.deepcopy(curEntry)
            logthis("Commited entry:\n",
                    suffix=print_r(curEntry),
                    loglevel=LL.DEBUG)
            curEntry.clear()
            elem.clear()
            entries += 1

    logthis("** Kanji parsed:", suffix=entries, loglevel=LL.INFO)
    return elist
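
Throughout these examples, kanji documents are keyed by the lowercase hex of the character's Unicode code point: the _id here comes from KanjiDic2's ucs codepoint, and other modules rebuild it with "%x" % ord(kanji). A tiny helper sketch of that convention (the function name is illustrative):

def kanji_to_ucs_id(kanji):
    """Return the lowercase-hex Unicode code point used as the document _id."""
    return "%x" % ord(kanji)

print(kanji_to_ucs_id(u"\u4e9c"))   # -> 4e9c
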
Example #12
def run(xconfig):
    """edparser entry point"""
    # get input directory
    if xconfig.run.infile:
        indir = os.path.realpath(xconfig.run.infile)
        if not os.path.exists(indir):
            failwith(ER.NOTFOUND, "Specified directory not found")
        elif not os.path.isdir(indir):
            failwith(ER.CONF_BAD, "Must specify a directory, not a file")
    else:
        failwith(ER.CONF_BAD, "No input directory specified")

    # check for extra options
    margs = xconfig.run.modargs

    # find files for conversion
    logthis(">> Using directory:", suffix=indir, loglevel=LL.VERBOSE)
    tmap = {}
    for tf in os.listdir(indir):
        matched = False
        for tm in targets:
            if tmap.has_key(tm):
                continue
            if re.match("^" + tm + ".*$", tf, re.I):
                tmap[tm] = os.path.realpath(indir + '/' + tf)
                matched = True
                logthis("Found match for %s:" % (tm),
                        suffix=tmap[tm],
                        loglevel=LL.VERBOSE)
                break
        if not matched:
            logthis("File skipped:", suffix=tf, loglevel=LL.DEBUG)

    # ensure everybody is here
    if len(set(targets) - set(tmap)) > 0:
        logthis("!! Missing required files:",
                suffix=', '.join(set(targets) - set(tmap)),
                loglevel=LL.ERROR)
        failwith(
            ER.NOTFOUND,
            "All files must be present to build crossrefs. Unable to continue."
        )

    ### Parse input files

    # parse kradfile & kradfile2
    parse_kradfile(tmap['kradfile'])
    parse_kradfile(tmap['kradfile2'])

    # parse kanjidic
    kdex = parse_kanjidic(tmap['kanjidic'])

    # parse jmdict
    jmdict = parse_jmdict(tmap['jmdict'])

    # parse jmnedict
    nedict = parse_jmdict(tmap['jmnedict'])

    ## write output
    if xconfig.run.json:
        # Dump output to JSON file if --json/-j option is used
        logthis(">> Dumping output as JSON to",
                suffix=xconfig.run.json,
                loglevel=LL.INFO)
        try:
            with codecs.open(xconfig.run.json, "w", "utf-8") as f:
                json.dump({
                    'kanji': kdex,
                    'jmdict': jmdict,
                    'nedict': nedict
                },
                          f,
                          indent=4,
                          separators=(',', ': '))
        except Exception as e:
            logexc(e, "Failed to dump output to JSON file")
            failwith(ER.PROCFAIL, "File operation failed. Aborting.")
    else:
        # MongoDB
        update_mongo(xconfig.mongo.uri, kdex, jmdict, nedict)

    return 0