def parse_kradfile(krfile, encoding='euc-jp'):
    """
    Parse KRADFILE & KRADFILE2.
    These are typically EUC-JP (JIS X 0212) encoded files, but a different
    encoding can be specified with the 'encoding' parameter
    """
    global krdex
    logthis("Parsing file", suffix=krfile, loglevel=LL.INFO)

    # convert file from EUC-JP (or the specified encoding) to Unicode
    with codecs.open(krfile, 'r', encoding) as f:
        krraw = f.read()

    # parse line-by-line
    kc = 0
    for tline in krraw.splitlines():
        # skip empty lines & comments
        if not tline or tline[0] == '#' or tline[0] == ' ':
            continue
        # split left/right
        try:
            rkan, rrads = tline.split(':')
        except ValueError:
            continue
        tkanji = rkan.strip()
        trads = rrads.strip().split()
        krdex[tkanji] = trads
        kc += 1

    logthis("** KRADFILE Kanji parsed:", suffix=kc, loglevel=LL.INFO)

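# Usage sketch (hypothetical file names; the decomposition shown is the
# standard KRADFILE entry for 明). parse_kradfile() mutates the module-level
# krdex dict, mapping each kanji to its list of component radicals:
#
#   parse_kradfile('KRADFILE')
#   parse_kradfile('KRADFILE2')
#   print(krdex[u'明'])     # [u'日', u'月']
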
def fetch(url, outfile, encoding='utf-8'):
    """
    Fetch a file from a URL and save it; gzip'd files will be
    decoded automagically
    """
    ofrp = os.path.realpath(os.path.expanduser(outfile))
    logthis("Fetching:", suffix=url, loglevel=LL.INFO)

    # retrieve file and decompress it on-the-fly
    # (as long as it has a Content-Encoding: gzip header)
    try:
        r = requests.get(url)
        r.raise_for_status()
    except Exception as e:
        logexc(e, "Failed to retrieve file")
        return False

    # coerce to the correct encoding (either UTF-8 or EUC-JP... usually)
    r.encoding = encoding
    try:
        with codecs.open(ofrp, 'w', encoding) as f:
            f.write(r.text)
    except Exception as e:
        logexc(e, "Failed to write output to %s" % (outfile))
        return False

    logthis(">> Wrote output to", suffix=ofrp, loglevel=LL.INFO)
    return True

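# Usage sketch (hypothetical URL and output path): requests transparently
# decompresses the response body when the server sends a
# Content-Encoding: gzip header, so the text written out is already plain:
#
#   if fetch('http://example.org/dicts/kanjidic2.xml.gz',
#            '~/edparse/kanjidic2.xml', encoding='utf-8'):
#       print("fetched OK")
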
def update_set(mdx, indata, setname):
    """ Merge each existing entry with the corresponding new entry """
    updated = 0
    created = 0
    logthis(">> Updating collection:", suffix=setname, loglevel=LL.INFO)

    for tk, tv in indata.iteritems():
        xkan = mdx.findOne(setname, {'_id': tk})
        if xkan:
            # modify existing object with new data
            iobj = xkan
            iobj.update(tv)
        else:
            iobj = tv
        if mdx.upsert(setname, tk, iobj)['updatedExisting']:
            updated += 1
        else:
            created += 1

    logthis("update complete - updated: %d / created: %d / total:" % (updated, created),
            prefix=setname, suffix=(updated + created), loglevel=LL.INFO)

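# Merge-semantics sketch (hypothetical documents): an existing Mongo doc is
# shallow-merged via dict.update(), so incoming keys overwrite and keys
# missing from the incoming entry survive:
#
#   existing: {'_id': '4e00', 'grade': 1, 'krelated': {u'二': {'score': 9}}}
#   incoming: {'_id': '4e00', 'grade': 1, 'freq': 2}
#   stored:   {'_id': '4e00', 'grade': 1, 'freq': 2,
#              'krelated': {u'二': {'score': 9}}}
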
def getRelated(neo, klist, inkanji, limit=20):
    """
    Get related kanji for input (inkanji);
    pass in a dict of all kanji (klist) and a Neo4j Graph object (neo).
    Returns a dict of the top (limit) related kanji
    """
    cto = KCounter()
    cto.clear()
    logthis("** Kanji:", suffix=inkanji, loglevel=LL.VERBOSE)

    # match kanji with radicals in the same position
    radmat = neo.cypher.execute(
        u"MATCH (k:Kanji)-[e:CONTAINS]-(r:Radical)-[e2:CONTAINS]-(k2:Kanji) "
        u"WHERE k.kanji = '%s' AND e.position = e2.position "
        u"RETURN k2.kanji AS kanji, r.radical AS radical, e.position AS position" % (inkanji))
    cto.absorb(radmat, score=5)

    # match kanji with radicals in any position
    radmat = neo.cypher.execute(
        u"MATCH (k:Kanji)-[e:CONTAINS]-(r:Radical)-[:CONTAINS]-(k2:Kanji) "
        u"WHERE k.kanji = '%s' "
        u"RETURN k2.kanji AS kanji, r.radical AS radical, e.position AS position" % (inkanji))
    cto.absorb(radmat, score=2)

    # match kanji with the same SKIP code
    radmat = neo.cypher.execute(
        u"MATCH (k:Kanji)-[:WRITTEN]-(r:Skip)-[:WRITTEN]-(k2:Kanji) "
        u"WHERE k.kanji = '%s' "
        u"RETURN k2.kanji AS kanji, r.skip AS skip" % (inkanji))
    cto.absorb(radmat, score=3)

    # match kanji with the same meaning/sense keywords
    radmat = neo.cypher.execute(
        u"MATCH (k:Kanji)-[:MEANS]-(r:Sense)-[:MEANS]-(k2:Kanji) "
        u"WHERE k.kanji = '%s' "
        u"RETURN k2.kanji AS kanji, r.sense AS sense" % (inkanji))
    cto.absorb(radmat, score=4)

    # match kanji with the same readings
    radmat = neo.cypher.execute(
        u"MATCH (k:Kanji)-[:READS]-(r:Reading)-[:READS]-(k2:Kanji) "
        u"WHERE k.kanji = '%s' "
        u"RETURN k2.kanji AS kanji, r.reading AS reading" % (inkanji))
    cto.absorb(radmat, score=1)

    # get top related
    rtop = cto.sorted()[:limit]
    okan = {}
    for tt in rtop:
        # get crossref data for this kanji
        ttid = u'%x' % (ord(tt[0]))
        tto = klist.get(ttid)
        if tto:
            tjoyo = tto.get('grade', None)
            tjdex = tto.get('jindex', None)
        else:
            tjoyo = None
            tjdex = None
        okan[tt[0]] = {'score': tt[1], 'joyo': tjoyo, 'jindex': tjdex}
        logthis("-- %s -> %s" % (tt[0], json.dumps(okan[tt[0]])), loglevel=LL.DEBUG)

    return okan

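# Return-shape sketch (hypothetical scores and jindex values): each related
# kanji maps to its accumulated weighted score plus joyo/jindex crossref
# data pulled from klist:
#
#   getRelated(neo, kset, u'休', limit=2)
#   # {u'体': {'score': 14, 'joyo': 2, 'jindex': 105},
#   #  u'何': {'score': 9, 'joyo': 2, 'jindex': 87}}
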
def update_mongo(mongo_uri, kdex, jmdict, nedict):
    """ Insert, upsert, or update entries in MongoDB """
    # connect to mongo
    logthis("Connecting to", suffix=mongo_uri, loglevel=LL.INFO)
    mdx = mongo(mongo_uri)

    # Kanji
    update_set(mdx, kdex, 'kanji')
    # JMdict
    update_set(mdx, jmdict, 'jmdict')
    # JMnedict
    update_set(mdx, nedict, 'jmnedict')

def run(xconfig):
    """entry point"""
    # get output directory
    if xconfig.run.output:
        outpath = os.path.realpath(xconfig.run.output)
        if not os.path.exists(outpath):
            failwith(ER.NOTFOUND, "Specified directory not found")
        elif not os.path.isdir(outpath):
            failwith(ER.CONF_BAD, "Must specify a directory, not a file")
    else:
        failwith(ER.CONF_BAD, "No output directory specified")
    logthis("Output directory:", suffix=outpath, loglevel=LL.INFO)

    # fetch files
    fails = 0
    for tf in fmanifest:
        tmatch = re.match(r'^https?://.+/([^/]+)\.gz$', tf['url'])
        if not tmatch:
            logthis("Manifest URL does not match expected pattern:", suffix=tf['url'], loglevel=LL.ERROR)
            fails += 1
            continue
        tfo = tmatch.group(1)
        if not fetch(tf['url'], outpath + '/' + tfo, tf.get('encoding', 'utf-8')):
            fails += 1

    return fails

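# fmanifest shape sketch (assumed; inferred from the URL regex and the
# fetch() call above -- each entry needs a 'url' ending in .gz and may carry
# an optional 'encoding'; URLs here are hypothetical):
#
#   fmanifest = [
#       {'url': 'http://example.org/dicts/kanjidic2.xml.gz'},
#       {'url': 'http://example.org/dicts/kradfile.gz', 'encoding': 'euc-jp'},
#   ]
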
def run(xconfig):
    """ Compile lists of related kanji using Neo4j """
    # check for extra options
    margs = xconfig.run.modargs

    # connect to Mongo
    mgx = mongo(xconfig.mongo.uri)

    # get all kanji from Mongo
    kset = mgx.find("kanji", {}, rdict=True)
    logthis("** Kanji objects:", suffix=len(kset), loglevel=LL.INFO)

    # connect to Neo4j
    try:
        neo = py2neo.Graph(xconfig.neo4j.uri)
        ncount = neo.cypher.execute(
            'MATCH (n) RETURN count(*) AS ncount')[0]['ncount']
    except Exception as e:
        logexc(e, "Failed to connect to Neo4j dataset")
        failwith(ER.PROCFAIL, "Unable to continue. Aborting.")
    logthis("** Nodes in Neo4j dataset:", suffix=ncount, loglevel=LL.INFO)

    # check through all kanji
    hasRelated = 0
    for tkan in kset:
        # get the top 20 highest-scoring related kanji
        trel = getRelated(neo, kset, kset[tkan]['kanji'])
        # update kanji entry in Mongo
        mgx.update_set("kanji", tkan, {'krelated': trel})
        if len(trel) > 0:
            hasRelated += 1

    logthis("** Complete. Kanji with krelated data:", suffix=hasRelated, loglevel=LL.INFO)

def run(xconfig):
    """ Build graph in Neo4j based on the Kanji dataset """
    # check for extra options
    margs = xconfig.run.modargs

    # connect to Mongo
    mgx = mongo(xconfig.mongo.uri)

    # get all kanji from Mongo
    kset = mgx.find("kanji", {})
    logthis("** Kanji objects:", suffix=len(kset), loglevel=LL.INFO)

    # connect to Neo4j
    try:
        neo = py2neo.Graph(xconfig.neo4j.uri)
        ncount = neo.cypher.execute('MATCH (n) RETURN count(*)')
    except Exception as e:
        logexc(e, "Failed to connect to Neo4j dataset")
        failwith(ER.PROCFAIL, "Unable to continue. Aborting.")

    # if 'clear' is passed as an extra parg, then drop all existing nodes/rels
    if 'clear' in margs:
        logthis("Deleting existing data...", loglevel=LL.INFO)
        neo.cypher.execute("MATCH (n) DETACH DELETE n")

    # create node constraints
    logthis("Creating constraints...", loglevel=LL.VERBOSE)
    neo.cypher.execute("CREATE CONSTRAINT ON (k:Kanji) ASSERT k.kanji IS UNIQUE")
    neo.cypher.execute("CREATE CONSTRAINT ON (r:Radical) ASSERT r.radical IS UNIQUE")
    neo.cypher.execute("CREATE CONSTRAINT ON (s:Sense) ASSERT s.sense IS UNIQUE")
    neo.cypher.execute("CREATE CONSTRAINT ON (r:Reading) ASSERT r.reading IS UNIQUE")
    neo.cypher.execute("CREATE CONSTRAINT ON (g:Joyo) ASSERT g.joyo IS UNIQUE")
    neo.cypher.execute("CREATE CONSTRAINT ON (g:Jlpt) ASSERT g.jlpt IS UNIQUE")
    neo.cypher.execute("CREATE CONSTRAINT ON (k:Skip) ASSERT k.skip IS UNIQUE")

    # build nodes & relationships
    logthis("** Building graph...", loglevel=LL.INFO)
    for kk, tk in kset.iteritems():
        # kk is a UCS hex-string _id, so format with %s
        logthis(">>>------[ %5s ] Kanji node <%s> -----" % (kk, tk['kanji']), loglevel=LL.DEBUG)

        # Kanji
        try:
            freq = int(tk['freq'])
        except (KeyError, TypeError, ValueError):
            freq = 0
        knode = py2neo.Node("Kanji", kanji=tk['kanji'], ucs=tk['_id'], freq=freq)

        # Radicals
        xnodes = []
        xrels = []
        if 'xrad' in tk and len(tk['xrad']) > 0:
            for tr, tv in tk['xrad'].iteritems():
                # check if the radical exists in db.radical
                rrad = mgx.findOne("radical", {"radical": tr})
                xrad = {}
                if rrad:
                    xrad = {
                        "rad_id": rrad['_id'],
                        "alt": rrad['alt'],
                        "radname": rrad['radname']['ja'],
                        "radname_en": rrad['radname']['en']
                    }
                else:
                    rrad = mgx.findOne("kanji", {"kanji": tr})
                    if rrad:
                        # create a Kanji-Kanji relationship instead
                        xrad = False
                        try:
                            freq = int(rrad['freq'])
                        except (KeyError, TypeError, ValueError):
                            freq = 0
                        xnodes.append(py2neo.Node("Kanji", kanji=rrad['kanji'],
                                                  ucs=rrad['_id'], freq=freq))
                        xrels.append(py2neo.Relationship(knode, "CONTAINS", xnodes[-1],
                                                         position=tv.get('position', None)))
                    else:
                        xrad = {"non_standard": True}
                if xrad:
                    xnodes.append(py2neo.Node("Radical", radical=tr, **xrad))
                    xrels.append(py2neo.Relationship(knode, "CONTAINS", xnodes[-1],
                                                     position=tv.get('position', None)))
        elif 'krad' in tk:
            for tr in tk['krad']:
                # check if the radical exists in db.radical
                rrad = mgx.findOne("radical", {"radical": tr})
                xrad = {}
                if rrad:
                    xrad = {
                        "rad_id": rrad['_id'],
                        "alt": rrad['alt'],
                        "radname": rrad['radname']['ja'],
                        "radname_en": rrad['radname']['en']
                    }
                else:
                    rrad = mgx.findOne("kanji", {"kanji": tr})
                    if rrad:
                        # create a Kanji-Kanji relationship instead
                        xrad = False
                        try:
                            freq = int(rrad['freq'])
                        except (KeyError, TypeError, ValueError):
                            freq = 0
                        xnodes.append(py2neo.Node("Kanji", kanji=rrad['kanji'],
                                                  ucs=rrad['_id'], freq=freq))
                        xrels.append(py2neo.Relationship(knode, "CONTAINS", xnodes[-1]))
                    else:
                        xrad = {"non_standard": True}
                if xrad:
                    xnodes.append(py2neo.Node("Radical", radical=tr, **xrad))
                    xrels.append(py2neo.Relationship(knode, "CONTAINS", xnodes[-1]))

        # Senses
        if 'meaning' in tk and tk['meaning'].get('en'):
            for ts in tk['meaning']['en']:
                xnodes.append(py2neo.Node("Sense", sense=ts, lang="en"))
                xrels.append(py2neo.Relationship(knode, "MEANS", xnodes[-1]))

        # Readings (on-yomi, kun-yomi, nanori)
        if 'reading' in tk:
            if 'ja_on' in tk['reading']:
                for tr in tk['reading']['ja_on']:
                    xnodes.append(py2neo.Node("Reading", reading=tr))
                    xrels.append(py2neo.Relationship(knode, "READS", xnodes[-1], yomi="on"))
            if 'ja_kun' in tk['reading']:
                for tr in tk['reading']['ja_kun']:
                    xnodes.append(py2neo.Node("Reading", reading=tr))
                    xrels.append(py2neo.Relationship(knode, "READS", xnodes[-1], yomi="kun"))
            if 'nanori' in tk['reading']:
                for tr in tk['reading']['nanori']:
                    xnodes.append(py2neo.Node("Reading", reading=tr))
                    xrels.append(py2neo.Relationship(knode, "READS", xnodes[-1], yomi="nanori"))

        # Joyo
        if 'grade' in tk and 'jindex' in tk:
            xnodes.append(py2neo.Node("Joyo", joyo=int(tk['grade'])))
            xrels.append(py2neo.Relationship(xnodes[-1], "SUBSET", knode, jindex=tk['jindex']))

        # JLPT
        if 'jlpt' in tk and isinstance(tk['jlpt'], int):
            xnodes.append(py2neo.Node("Jlpt", jlpt=int(tk['jlpt'])))
            xrels.append(py2neo.Relationship(xnodes[-1], "SUBSET", knode))

        # SKIP
        if 'qcode' in tk and 'skip' in tk['qcode']:
            xnodes.append(py2neo.Node("Skip", skip=tk['qcode']['skip']))
            xrels.append(py2neo.Relationship(knode, "WRITTEN", xnodes[-1]))

        # create Kanji node
        try:
            neo.create(knode)
        except Exception as e:
            logexc(e, u'Failed to create Kanji node')

        # create aux nodes
        for tnode in xnodes:
            try:
                neo.create(tnode)
            except Exception as e:
                logexc(e, u'Failed to create aux node')

        # build relationships
        for trel in xrels:
            # check if nodes are bound
            sn = trel.start_node
            en = trel.end_node
            # if the start node is not bound, attempt a lookup
            if not sn.bound:
                nlab = list(sn.labels)[0]
                nsn = neo.find_one(nlab, nlab.lower(), sn[nlab.lower()])
                if nsn:
                    logthis(">>> Xref OK: %s '%s'" % (nlab, sn[nlab.lower()]), loglevel=LL.DEBUG)
                    sn = nsn
            # if the end node is not bound, attempt a lookup
            if not en.bound:
                elab = list(en.labels)[0]
                nen = neo.find_one(elab, elab.lower(), en[elab.lower()])
                if nen:
                    logthis(">>> Xref OK: %s '%s'" % (elab, en[elab.lower()]), loglevel=LL.DEBUG)
                    en = nen
            # rebuild the relationship against the bound nodes
            rrel = py2neo.Relationship(sn, trel.type, en, **trel.properties)
            try:
                neo.create_unique(rrel)
            except Exception as e:
                logexc(e, "Failed to build relationship")

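# Graph-inspection sketch (py2neo 2.x cypher API, matching the schema built
# above): list the radicals a given kanji CONTAINS, with their positions.
#
#   for row in neo.cypher.execute(
#           u"MATCH (k:Kanji)-[c:CONTAINS]->(r:Radical) WHERE k.kanji = '明' "
#           u"RETURN r.radical AS radical, c.position AS position"):
#       print(row['radical'], row['position'])
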
def run(xconfig):
    """kvgbuild entry point"""
    # get input directory
    if xconfig.run.infile:
        inpath = os.path.realpath(xconfig.run.infile)
        if not os.path.exists(inpath):
            failwith(ER.NOTFOUND, "Specified directory not found")
        elif not os.path.isdir(inpath):
            failwith(ER.CONF_BAD, "Must specify a directory, not a file")

    # get output directory
    if xconfig.run.output:
        outpath = os.path.realpath(xconfig.run.output)
        if not os.path.exists(outpath):
            failwith(ER.NOTFOUND, "Specified directory not found")
        elif not os.path.isdir(outpath):
            failwith(ER.CONF_BAD, "Must specify a directory, not a file")

    # check for extra options
    margs = xconfig.run.modargs
    render = "norender" not in margs

    kflist = os.listdir(inpath)
    ftots = len(kflist)
    fmatchy = re.compile(r'^[0-9a-f]{5}\.svg$')
    ftit = 0

    # build PNG images
    if render:
        logthis("Rendering stroke-order diagrams...", loglevel=LL.INFO)
        for tf in kflist:
            ftit += 1
            if fmatchy.match(tf):
                logthis("[ %i / %i ] Processing" % (ftit, ftots), suffix=tf, loglevel=LL.VERBOSE)
                thiskvg = openKvg(inpath + '/' + tf)
                colorGroups(thiskvg)
                thispng = inkscapeKvg(thiskvg)
                # PNG data is binary; write with 'wb'
                pf = open(outpath + '/' + tf.split('.')[0] + '.png', 'wb')
                pf.write(thispng)
                pf.close()
                # destroy XML DOM object
                thiskvg.unlink()

    # connect to mongo
    logthis("Connecting to", suffix=xconfig.mongo.uri, loglevel=LL.INFO)
    mdx = mongo(xconfig.mongo.uri)

    # parse radical data
    logthis("Parsing kanji radical data...", loglevel=LL.INFO)
    ftit = 0
    for tf in kflist:
        ftit += 1
        if fmatchy.match(tf):
            radlist = {}
            kid = os.path.splitext(os.path.split(tf)[1])[0]
            logthis("[ %i / %i ] Processing" % (ftit, ftots),
                    suffix="%s [%s]" % (tf, kid), loglevel=LL.VERBOSE)

            # read
            with codecs.open(inpath + '/' + tf, 'r', 'utf-8') as f:
                thiskvg = f.read()

            # parse with BS4 & lxml
            bs = BeautifulSoup(thiskvg, 'xml')
            kvg = bs.find('g', {'id': "kvg:" + kid})

            # weed out the nasties
            if not kvg:
                logthis("XPath /svg/g/g[id:%s] not found" % (kid), loglevel=LL.ERROR)
                continue
            kanji = kvg.attrs.get('element', None)
            if not kanji:
                continue

            # parse attributes
            for tg in kvg.find_all('g'):
                trad = tg.attrs
                if 'id' in trad:
                    del trad['id']
                if 'original' not in trad:
                    if 'element' in trad:
                        trad['original'] = tg.attrs['element']
                    else:
                        logthis("Skipping:", suffix=trad, loglevel=LL.DEBUG)
                        continue
                # get position from parent if not specified
                if 'position' not in trad:
                    if tg.parent.attrs.get('position', None):
                        trad['position'] = tg.parent.attrs.get('position', None)
                    elif 'position' in tg.parent.parent.attrs:
                        trad['position'] = tg.parent.parent.attrs.get('position', None)
                # handle kanji with multiple instances of the same radical
                if trad['original'] in radlist:
                    logthis("---- MERGE:\n%s" % (print_r(radlist[trad['original']])),
                            ccode=C.WHT, loglevel=LL.DEBUG)
                    # merge sets
                    radlist[trad['original']] = setMerge(radlist[trad['original']], trad)
                    # create a 'count' key and increment it
                    if 'count' in radlist[trad['original']]:
                        radlist[trad['original']]['count'] = int(radlist[trad['original']]['count']) + 1
                    else:
                        radlist[trad['original']]['count'] = 2
                else:
                    radlist[trad['original']] = trad

            mdx.update_set('kanji', "%x" % ord(kanji), {'xrad': radlist})
            logthis("** Committed entry:\n", suffix=print_r(radlist), loglevel=LL.DEBUG)

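# xrad document sketch (hypothetical values; attribute names come from the
# KanjiVG <g> elements parsed above): each radical maps to its merged
# attribute dict, keyed by the 'original' glyph.
#
#   {u'日': {'element': u'日', 'original': u'日', 'position': u'left'},
#    u'月': {'element': u'月', 'original': u'月', 'position': u'right'}}
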
def parse_jmdict(kdfile, seqbase=3000000):
    """ Parse JMdict/JMnedict XML files """
    global krdex
    logthis("Parsing JMdict/JMnedict XML file", suffix=kdfile, loglevel=LL.INFO)

    # parse XML as a stream using the lxml etree parser
    elist = {}
    curEntry = {}
    entries = 0
    entList = {}
    revEntList = {}
    for event, elem in etree.iterparse(kdfile, events=('end', 'start-ns')):
        if event == "end" and elem.tag == "entry":
            # resolve entities
            if not entList:
                entList, revEntList = resolveEntities(
                    elem.getroottree().docinfo.internalDTD.entities())

            # ent_seq
            curEntry['ent_seq'] = elem.find('ent_seq').text
            # set _id
            curEntry['_id'] = curEntry['ent_seq']

            ## k_ele
            kf_pmax = 0
            if elem.find('k_ele') is not None:
                curEntry['k_ele'] = []
                for sv in elem.findall('k_ele'):
                    kele = {}
                    # k_ele.keb
                    kele['keb'] = sv.find('keb').text
                    # k_ele.ke_inf
                    for ssv in sv.findall('ke_inf'):
                        if 'ke_inf' not in kele:
                            kele['ke_inf'] = {}
                        kele['ke_inf'][revEntList[ssv.text]] = ssv.text
                    # k_ele.ke_pri
                    for ssv in sv.findall('ke_pri'):
                        if 'ke_pri' not in kele:
                            kele['ke_pri'] = []
                        kele['ke_pri'].append(ssv.text)
                        kf_pmax += priodex[ssv.text]
                    curEntry['k_ele'].append(kele)
                curEntry['kf_pmax'] = kf_pmax

            ## r_ele
            rf_pmax = 0
            if elem.find('r_ele') is not None:
                curEntry['r_ele'] = []
                for sv in elem.findall('r_ele'):
                    rele = {}
                    # r_ele.reb
                    rele['reb'] = sv.find('reb').text
                    # r_ele.re_nokanji
                    if sv.find('re_nokanji') is not None:
                        rele['re_nokanji'] = True
                    # r_ele.re_restr
                    for ssv in sv.findall('re_restr'):
                        if 're_restr' not in rele:
                            rele['re_restr'] = []
                        rele['re_restr'].append(ssv.text)
                    # r_ele.re_inf
                    for ssv in sv.findall('re_inf'):
                        if 're_inf' not in rele:
                            rele['re_inf'] = {}
                        rele['re_inf'][revEntList[ssv.text]] = ssv.text
                    # r_ele.re_pri
                    for ssv in sv.findall('re_pri'):
                        if 're_pri' not in rele:
                            rele['re_pri'] = []
                        rele['re_pri'].append(ssv.text)
                        rf_pmax += priodex[ssv.text]
                    curEntry['r_ele'].append(rele)
                curEntry['rf_pmax'] = rf_pmax

            ## sense (JMdict)
            if elem.find('sense') is not None:
                curEntry['sense'] = []
                for sv in elem.findall('sense'):
                    sen = {}
                    # sense.stagk
                    for ssv in sv.findall('stagk'):
                        if 'stagk' not in sen:
                            sen['stagk'] = []
                        sen['stagk'].append(ssv.text)
                    # sense.stagr
                    for ssv in sv.findall('stagr'):
                        if 'stagr' not in sen:
                            sen['stagr'] = []
                        sen['stagr'].append(ssv.text)
                    # sense.xref
                    for ssv in sv.findall('xref'):
                        if 'xref' not in sen:
                            sen['xref'] = []
                        sen['xref'].append(ssv.text)
                    # sense.ant
                    for ssv in sv.findall('ant'):
                        if 'ant' not in sen:
                            sen['ant'] = []
                        sen['ant'].append(ssv.text)
                    # sense.pos
                    for ssv in sv.findall('pos'):
                        if 'pos' not in sen:
                            sen['pos'] = {}
                        sen['pos'][revEntList[ssv.text]] = ssv.text
                    # sense.field
                    for ssv in sv.findall('field'):
                        if 'field' not in sen:
                            sen['field'] = {}
                        sen['field'][revEntList[ssv.text]] = ssv.text
                    # sense.misc
                    for ssv in sv.findall('misc'):
                        if 'misc' not in sen:
                            sen['misc'] = {}
                        sen['misc'][revEntList[ssv.text]] = ssv.text
                    # sense.lsource
                    for ssv in sv.findall('lsource'):
                        if 'lsource' not in sen:
                            sen['lsource'] = []
                        sen['lsource'].append(ssv.text)
                    # sense.dial
                    for ssv in sv.findall('dial'):
                        if 'dial' not in sen:
                            sen['dial'] = []
                        sen['dial'].append(ssv.text)
                    # sense.gloss
                    if sv.find('gloss') is not None:
                        sen['gloss'] = {}
                        for ssv in sv.findall('gloss'):
                            if len(ssv.attrib):
                                mlang = ssv.attrib.values()[0]
                            else:
                                mlang = "eng"
                            if mlang not in sen['gloss']:
                                sen['gloss'][mlang] = []
                            sen['gloss'][mlang].append(ssv.text)
                    # sense.example
                    for ssv in sv.findall('example'):
                        if 'example' not in sen:
                            sen['example'] = []
                        sen['example'].append(ssv.text)
                    # sense.s_inf
                    for ssv in sv.findall('s_inf'):
                        if 's_inf' not in sen:
                            sen['s_inf'] = []
                        sen['s_inf'].append(ssv.text)
                    # sense.pri
                    for ssv in sv.findall('pri'):
                        if 'pri' not in sen:
                            sen['pri'] = []
                        sen['pri'].append(ssv.text)
                    curEntry['sense'].append(sen)

            ## trans (JMnedict)
            if elem.find('trans') is not None:
                curEntry['trans'] = []
                for sv in elem.findall('trans'):
                    tran = {}
                    # trans.name_type
                    for ssv in sv.findall('name_type'):
                        if 'name_type' not in tran:
                            tran['name_type'] = []
                        tran['name_type'].append(ssv.text)
                    # trans.xref
                    for ssv in sv.findall('xref'):
                        if 'xref' not in tran:
                            tran['xref'] = []
                        tran['xref'].append(ssv.text)
                    # trans.trans_det
                    if sv.find('trans_det') is not None:
                        tran['trans_det'] = {}
                        for ssv in sv.findall('trans_det'):
                            if len(ssv.attrib):
                                mlang = ssv.attrib.values()[0]
                            else:
                                mlang = "eng"
                            if mlang not in tran['trans_det']:
                                tran['trans_det'][mlang] = []
                            tran['trans_det'][mlang].append(ssv.text)
                    curEntry['trans'].append(tran)

            elist[curEntry['_id']] = copy.deepcopy(curEntry)
            logthis("Committed entry:\n", suffix=print_r(curEntry), loglevel=LL.DEBUG)
            curEntry.clear()
            elem.clear()
            entries += 1

    logthis("** Entries parsed:", suffix=entries, loglevel=LL.INFO)
    return elist

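# Entry sketch (abridged, hypothetical values; ent_seq and priority tags are
# illustrative only), keyed by ent_seq: one elist value for a JMdict entry.
#
#   {'_id': '1000000', 'ent_seq': '1000000', 'kf_pmax': 24, 'rf_pmax': 24,
#    'k_ele': [{'keb': u'明日', 'ke_pri': ['ichi1', 'news1']}],
#    'r_ele': [{'reb': u'あした', 're_pri': ['ichi1']}],
#    'sense': [{'pos': {'n': 'noun (common) (futsuumeishi)'},
#               'gloss': {'eng': ['tomorrow']}}]}
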
def parse_kanjidic(kdfile):
    """ Parse KanjiDic2 XML file """
    global krdex
    logthis("Parsing KanjiDic2 XML file", suffix=kdfile, loglevel=LL.INFO)

    # parse XML as a stream using the lxml etree parser
    elist = {}
    curEntry = {}
    entries = 0
    for event, elem in etree.iterparse(kdfile, events=('end', 'start-ns')):
        if event == "end" and elem.tag == "character":
            # kanji
            curEntry['kanji'] = elem.find('literal').text

            # code points
            curEntry['codepoint'] = {}
            for sv in elem.find('codepoint').findall('cp_value'):
                curEntry['codepoint'][sv.attrib['cp_type']] = sv.text

            # radicals
            curEntry['radical'] = {}
            for sv in elem.find('radical').findall('rad_value'):
                curEntry['radical'][sv.attrib['rad_type']] = sv.text

            ## misc
            misc = elem.find('misc')

            # grade/joyo level
            # NEW: now returns an integer; for hyougai kanji, this will be zero
            if misc.find('grade') is not None:
                curEntry['grade'] = int(misc.find('grade').text)
            else:
                curEntry['grade'] = 0

            # stroke_count
            # NEW: this will now *always* be an array of ints, whereas before it was
            # a regular string *most* of the time, and an array of strings *sometimes*
            curEntry['stroke_count'] = []
            for sv in misc.findall('stroke_count'):
                curEntry['stroke_count'].append(int(sv.text))

            # freq
            # NEW: this is now converted to an int;
            # if it does not exist, it is null instead of an empty string
            if misc.find('freq') is not None:
                curEntry['freq'] = int(misc.find('freq').text)
            else:
                curEntry['freq'] = None

            # jlpt
            # NEW: if it does not exist, it is null instead of an empty string
            if misc.find('jlpt') is not None:
                curEntry['jlpt'] = int(misc.find('jlpt').text)
            else:
                curEntry['jlpt'] = None

            # variant
            # NEW: this field was not previously parsed by edparse
            curEntry['variant'] = {}
            for sv in misc.findall('variant'):
                if sv.attrib['var_type'] in curEntry['variant']:
                    curEntry['variant'][sv.attrib['var_type']].append(sv.text)
                else:
                    curEntry['variant'][sv.attrib['var_type']] = [sv.text]

            # xref
            curEntry['xref'] = {}
            if elem.find('dic_number') is not None:
                for sv in elem.find('dic_number').findall('dic_ref'):
                    if sv.attrib['dr_type'] == "moro":
                        curEntry['xref'][sv.attrib['dr_type']] = "%02d:%04d/%s" % (
                            int(sv.attrib.get('m_vol', 0)),
                            int(sv.attrib.get('m_page', 0)), sv.text)
                    else:
                        curEntry['xref'][sv.attrib['dr_type']] = sv.text

            # qcode
            # NEW: now handles skip_misclass; types with multiple entries are coerced to lists
            curEntry['qcode'] = {}
            if elem.find('query_code') is not None:
                for sv in elem.find('query_code').findall('q_code'):
                    if 'skip_misclass' in sv.attrib:
                        if 'skip_misclass' not in curEntry['qcode']:
                            curEntry['qcode']['skip_misclass'] = []
                        curEntry['qcode']['skip_misclass'].append({
                            'misclass': sv.attrib['skip_misclass'],
                            'skip': sv.text
                        })
                    else:
                        if sv.attrib['qc_type'] in curEntry['qcode']:
                            # convert to list if we encounter another entry
                            if not isinstance(curEntry['qcode'][sv.attrib['qc_type']], list):
                                curEntry['qcode'][sv.attrib['qc_type']] = [
                                    curEntry['qcode'][sv.attrib['qc_type']]
                                ]
                            curEntry['qcode'][sv.attrib['qc_type']].append(sv.text)
                        else:
                            curEntry['qcode'][sv.attrib['qc_type']] = sv.text

            ## reading & meaning & nanori
            curEntry['reading'] = {}
            curEntry['meaning'] = {}
            if elem.find('reading_meaning') is not None:
                # nanori
                curEntry['reading']['nanori'] = []
                for sv in elem.find('reading_meaning').findall('nanori'):
                    curEntry['reading']['nanori'].append(sv.text)
                if elem.find('reading_meaning').find('rmgroup') is not None:
                    # reading
                    for sv in elem.find('reading_meaning').find('rmgroup').findall('reading'):
                        if sv.attrib['r_type'] not in curEntry['reading']:
                            curEntry['reading'][sv.attrib['r_type']] = []
                        curEntry['reading'][sv.attrib['r_type']].append(sv.text)
                    # meaning
                    for sv in elem.find('reading_meaning').find('rmgroup').findall('meaning'):
                        if 'm_lang' in sv.attrib:
                            mlang = sv.attrib['m_lang']
                        else:
                            mlang = 'en'
                        if mlang not in curEntry['meaning']:
                            curEntry['meaning'][mlang] = []
                        curEntry['meaning'][mlang].append(sv.text)

            # krad: crossref radicals
            if curEntry['kanji'] in krdex:
                curEntry['krad'] = krdex[curEntry['kanji']]

            # set _id for Mongo
            curEntry['_id'] = curEntry['codepoint']['ucs']
            elist[curEntry['_id']] = copy.deepcopy(curEntry)
            logthis("Committed entry:\n", suffix=print_r(curEntry), loglevel=LL.DEBUG)
            curEntry.clear()
            elem.clear()
            entries += 1

    logthis("** Kanji parsed:", suffix=entries, loglevel=LL.INFO)
    return elist

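# Entry sketch (abridged; values shown are illustrative, not verified against
# a live KANJIDIC2 dump), keyed by the UCS codepoint of the kanji:
#
#   {'_id': '660e', 'kanji': u'明', 'grade': 2, 'jlpt': 3, 'freq': 67,
#    'stroke_count': [8],
#    'reading': {'ja_on': [u'メイ'], 'ja_kun': [u'あか.るい'], 'nanori': []},
#    'meaning': {'en': ['bright', 'light']},
#    'krad': [u'日', u'月'],
#    'qcode': {'skip': '1-4-4'}}
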
def run(xconfig):
    """edparser entry point"""
    # get input directory
    if xconfig.run.infile:
        indir = os.path.realpath(xconfig.run.infile)
        if not os.path.exists(indir):
            failwith(ER.NOTFOUND, "Specified directory not found")
        elif not os.path.isdir(indir):
            failwith(ER.CONF_BAD, "Must specify a directory, not a file")

    # check for extra options
    margs = xconfig.run.modargs

    # find files for conversion
    logthis(">> Using directory:", suffix=indir, loglevel=LL.VERBOSE)
    tmap = {}
    for tf in os.listdir(indir):
        matched = False
        for tm in targets:
            if tm in tmap:
                continue
            if re.match("^" + tm + ".*$", tf, re.I):
                tmap[tm] = os.path.realpath(indir + '/' + tf)
                matched = True
                logthis("Found match for %s:" % (tm), suffix=tmap[tm], loglevel=LL.VERBOSE)
                break
        if not matched:
            logthis("File skipped:", suffix=tf, loglevel=LL.DEBUG)

    # ensure everybody is here
    if len(set(targets) - set(tmap)) > 0:
        logthis("!! Missing required files:",
                suffix=', '.join(set(targets) - set(tmap)), loglevel=LL.ERROR)
        failwith(ER.NOTFOUND,
                 "All files must be present to build crossrefs. Unable to continue.")

    ### Parse input files
    # parse kradfile & kradfile2
    parse_kradfile(tmap['kradfile'])
    parse_kradfile(tmap['kradfile2'])
    # parse kanjidic
    kdex = parse_kanjidic(tmap['kanjidic'])
    # parse jmdict
    jmdict = parse_jmdict(tmap['jmdict'])
    # parse jmnedict
    nedict = parse_jmdict(tmap['jmnedict'])

    ## write output
    if xconfig.run.json:
        # dump output to a JSON file if the --json/-j option is used
        logthis(">> Dumping output as JSON to", suffix=xconfig.run.json, loglevel=LL.INFO)
        try:
            with codecs.open(xconfig.run.json, "w", "utf-8") as f:
                json.dump({'kanji': kdex, 'jmdict': jmdict, 'nedict': nedict},
                          f, indent=4, separators=(',', ': '))
        except Exception as e:
            logexc(e, "Failed to dump output to JSON file")
            failwith(ER.PROCFAIL, "File operation failed. Aborting.")
    else:
        # MongoDB
        update_mongo(xconfig.mongo.uri, kdex, jmdict, nedict)

    return 0