def translateInstanceType(self):
    """Copy the instance-type triples whose subject has an ``en_zh`` mapping.

    Reads
    ``<dataDirectory>/<inputLangaugeTag>/instance_types_<inputLangaugeTag>.ttl``
    line by line and writes to
    ``<dataDirectory>/<outputLangaugeTag>/instance_types_<outputLangaugeTag>.ttl``
    every matching triple whose subject resource name is a key of the
    ``en_zh`` mapping.  The mapping acts only as a filter here: the resource
    name is written unchanged.  Lines that do not match the triple pattern
    are ignored.

    ``self.saveMapping("en_zh")`` is called first when ``en_zh.pickle`` is
    not an existing file (relative to the current working directory).
    """
    picklename = "en_zh"
    if not os.path.isfile(picklename + '.pickle'):
        self.saveMapping(picklename)
    mapping = fio.loadPickle(picklename)

    source_path = (self.dataDirectory + "/" + self.inputLangaugeTag +
                   "/instance_types_" + self.inputLangaugeTag + ".ttl")
    target_path = (self.dataDirectory + "/" + self.outputLangaugeTag +
                   "/instance_types_" + self.outputLangaugeTag + ".ttl")

    prefix = '<http://dbpedia.org/resource/'
    # Compiled once instead of on every line.
    # NOTE(review): the trailing "." is unescaped and matches any character,
    # not only the literal triple terminator - confirm this is intended.
    triple = re.compile(
        r"(<http://dbpedia\.org/resource/)(.*)(>\s*<.*>\s*<.*>\s*.)")

    # Write through an explicit handle rather than redirecting sys.stdout,
    # so the output file is always closed and stdout is never left pointing
    # at the file when an error occurs.
    with codecs.open(target_path, 'wb', 'utf8') as out:
        with open(source_path, 'r') as src:
            for line in src:
                found = triple.search(line)
                if found is None:
                    continue
                try:
                    name = found.group(2)
                    if name not in mapping:
                        continue
                    out.write(prefix + name + found.group(3) + "\n")
                except Exception:
                    # Best effort: a line that cannot be looked up or
                    # encoded is skipped instead of aborting the whole run.
                    continue
def translateLabels(self):
    """Rewrite the zh label triples so their subjects use ``zh_en`` names.

    Reads
    ``<dataDirectory>/<outputLangaugeTag>/labels_<outputLangaugeTag>.ttl.old``
    and writes
    ``<dataDirectory>/<outputLangaugeTag>/labels_<outputLangaugeTag>.ttl``.
    For every line matching the zh label pattern, the subject becomes
    ``<http://dbpedia.org/resource/NAME>`` where NAME is the ``zh_en``
    mapping of the original resource name, or the original name itself when
    it has no mapping.  The label literal written is always the original
    resource name followed by ``"@zh .`` (the label text found in the input
    line is not copied).  Non-matching lines are ignored.

    ``self.saveMapping("zh_en")`` is called first when ``zh_en.pickle`` is
    not an existing file (relative to the current working directory).
    """
    picklename = "zh_en"
    if not os.path.isfile(picklename + '.pickle'):
        self.saveMapping(picklename)
    mapping = fio.loadPickle(picklename)

    source_path = (self.dataDirectory + "/" + self.outputLangaugeTag +
                   "/labels_" + self.outputLangaugeTag + ".ttl.old")
    target_path = (self.dataDirectory + "/" + self.outputLangaugeTag +
                   "/labels_" + self.outputLangaugeTag + ".ttl")

    prefix = '<http://dbpedia.org/resource/'
    # Compiled once instead of on every line.
    # NOTE(review): the "." after "zh" and the trailing "." are unescaped
    # and match any character - confirm this is intended.
    label = re.compile(
        r'(<http://zh.dbpedia\.org/resource/)(.*)(>\s*<.*>\s*")(.*"@zh\s*.)')

    # Write through an explicit handle rather than redirecting sys.stdout,
    # so the output file is always closed and stdout is never left pointing
    # at the file when an error occurs.
    with codecs.open(target_path, 'wb', 'utf8') as out:
        with open(source_path, 'r') as src:
            for line in src:
                found = label.search(line)
                if found is None:
                    continue
                try:
                    name = found.group(2)
                    # Only the subject is translated; the label keeps the
                    # original resource name in both cases.
                    subject = mapping[name] if name in mapping else name
                    out.write(prefix + subject + found.group(3) + name +
                              '"@zh .' + "\n")
                except Exception:
                    # Best effort: a line that cannot be looked up or
                    # encoded is skipped instead of aborting the whole run.
                    continue
def translateRedirectsTransitive(self):
    """Rewrite zh transitive-redirect triples onto the dbpedia.org namespace.

    Reads
    ``<dataDirectory>/<outputLangaugeTag>/redirects_transitive_<outputLangaugeTag>.ttl.old``
    and writes
    ``<dataDirectory>/<outputLangaugeTag>/redirects_transitive_<outputLangaugeTag>.ttl``.
    For every line matching the redirect pattern, both resource URIs are
    moved from ``http://zh.dbpedia.org/resource/`` to
    ``http://dbpedia.org/resource/``.  The subject name is kept as is; the
    object name is replaced by its ``zh_en`` mapping when one exists.
    Non-matching lines are ignored.

    ``self.saveMapping("zh_en")`` is called first when ``zh_en.pickle`` is
    not an existing file (relative to the current working directory).
    """
    source_path = (self.dataDirectory + "/" + self.outputLangaugeTag +
                   "/redirects_transitive_" + self.outputLangaugeTag +
                   ".ttl.old")
    target_path = (self.dataDirectory + "/" + self.outputLangaugeTag +
                   "/redirects_transitive_" + self.outputLangaugeTag +
                   ".ttl")

    picklename = "zh_en"
    if not os.path.isfile(picklename + '.pickle'):
        self.saveMapping(picklename)
    mapping = fio.loadPickle(picklename)

    subject_prefix = '<http://dbpedia.org/resource/'
    object_prefix = "http://dbpedia.org/resource/"
    # Compiled once instead of on every line.
    # NOTE(review): "dbpedia.org" in the object part and the trailing "."
    # are unescaped and match any character - confirm this is intended.
    redirect = re.compile(
        r"(<http://zh\.dbpedia\.org/resource/)(.*)(>\s*<)"
        r"(http://zh\.dbpedia.org/resource/)(.*)(>\s*.)")

    # Write through an explicit handle rather than redirecting sys.stdout,
    # so the output file is always closed and stdout is never left pointing
    # at the file when an error occurs.
    with codecs.open(target_path, 'wb', 'utf8') as out:
        with open(source_path, 'r') as src:
            for line in src:
                found = redirect.search(line)
                if found is None:
                    continue
                try:
                    target = found.group(5)
                    if target in mapping:
                        target = mapping[target]
                    out.write(subject_prefix + found.group(2) +
                              found.group(3) + object_prefix + target +
                              found.group(6) + "\n")
                except Exception:
                    # Best effort: a line that cannot be looked up or
                    # encoded is skipped instead of aborting the whole run.
                    continue
def translateInstanceType(self):
    """Copy the instance-type triples whose subject has an ``en_zh`` mapping.

    Reads
    ``<dataDirectory>/<inputLangaugeTag>/instance_types_<inputLangaugeTag>.ttl``
    line by line and writes to
    ``<dataDirectory>/<outputLangaugeTag>/instance_types_<outputLangaugeTag>.ttl``
    every matching triple whose subject resource name is a key of the
    ``en_zh`` mapping.  The mapping acts only as a filter here: the resource
    name is written unchanged.  Lines that do not match the triple pattern
    are ignored.

    ``self.saveMapping("en_zh")`` is called first when ``en_zh.pickle`` is
    not an existing file (relative to the current working directory).
    """
    picklename = "en_zh"
    if not os.path.isfile(picklename + '.pickle'):
        self.saveMapping(picklename)
    mapping = fio.loadPickle(picklename)

    source_path = (self.dataDirectory + "/" + self.inputLangaugeTag +
                   "/instance_types_" + self.inputLangaugeTag + ".ttl")
    target_path = (self.dataDirectory + "/" + self.outputLangaugeTag +
                   "/instance_types_" + self.outputLangaugeTag + ".ttl")

    prefix = '<http://dbpedia.org/resource/'
    # Compiled once instead of on every line.
    # NOTE(review): the trailing "." is unescaped and matches any character,
    # not only the literal triple terminator - confirm this is intended.
    triple = re.compile(
        r"(<http://dbpedia\.org/resource/)(.*)(>\s*<.*>\s*<.*>\s*.)")

    # Write through an explicit handle rather than redirecting sys.stdout,
    # so the output file is always closed and stdout is never left pointing
    # at the file when an error occurs.
    with codecs.open(target_path, 'wb', 'utf8') as out:
        with open(source_path, 'r') as src:
            for line in src:
                found = triple.search(line)
                if found is None:
                    continue
                try:
                    name = found.group(2)
                    if name not in mapping:
                        continue
                    out.write(prefix + name + found.group(3) + "\n")
                except Exception:
                    # Best effort: a line that cannot be looked up or
                    # encoded is skipped instead of aborting the whole run.
                    continue
def translateLabels(self):
    """Rewrite the zh label triples so their subjects use ``zh_en`` names.

    Reads
    ``<dataDirectory>/<outputLangaugeTag>/labels_<outputLangaugeTag>.ttl.old``
    and writes
    ``<dataDirectory>/<outputLangaugeTag>/labels_<outputLangaugeTag>.ttl``.
    For every line matching the zh label pattern, the subject becomes
    ``<http://dbpedia.org/resource/NAME>`` where NAME is the ``zh_en``
    mapping of the original resource name, or the original name itself when
    it has no mapping.  The label literal written is always the original
    resource name followed by ``"@zh .`` (the label text found in the input
    line is not copied).  Non-matching lines are ignored.

    ``self.saveMapping("zh_en")`` is called first when ``zh_en.pickle`` is
    not an existing file (relative to the current working directory).
    """
    picklename = "zh_en"
    if not os.path.isfile(picklename + '.pickle'):
        self.saveMapping(picklename)
    mapping = fio.loadPickle(picklename)

    source_path = (self.dataDirectory + "/" + self.outputLangaugeTag +
                   "/labels_" + self.outputLangaugeTag + ".ttl.old")
    target_path = (self.dataDirectory + "/" + self.outputLangaugeTag +
                   "/labels_" + self.outputLangaugeTag + ".ttl")

    prefix = '<http://dbpedia.org/resource/'
    # Compiled once instead of on every line.
    # NOTE(review): the "." after "zh" and the trailing "." are unescaped
    # and match any character - confirm this is intended.
    label = re.compile(
        r'(<http://zh.dbpedia\.org/resource/)(.*)(>\s*<.*>\s*")(.*"@zh\s*.)')

    # Write through an explicit handle rather than redirecting sys.stdout,
    # so the output file is always closed and stdout is never left pointing
    # at the file when an error occurs.
    with codecs.open(target_path, 'wb', 'utf8') as out:
        with open(source_path, 'r') as src:
            for line in src:
                found = label.search(line)
                if found is None:
                    continue
                try:
                    name = found.group(2)
                    # Only the subject is translated; the label keeps the
                    # original resource name in both cases.
                    subject = mapping[name] if name in mapping else name
                    out.write(prefix + subject + found.group(3) + name +
                              '"@zh .' + "\n")
                except Exception:
                    # Best effort: a line that cannot be looked up or
                    # encoded is skipped instead of aborting the whole run.
                    continue
def getNELCorrectLabel(input, output):
    """Annotate an XML file with English and Chinese ``link`` elements.

    Parses the XML document ``input`` as UTF-8.  For each top-level element
    and each of its ``query`` children, ``GetLink(query.text)`` is called;
    its result is consumed through ``len()`` and ``.items()`` (presumably a
    dict of entity -> URL; verify against ``GetLink``).  For every pair an
    English ``link`` element (``lang="en"``) is inserted at the front of the
    top-level element, followed by a Chinese one (``lang="zh"``) whose
    entity is looked up in the ``en_zh`` pickle: when the entity is known,
    the translated name is used with a ``http://dbpedia.org/resource/`` URL,
    otherwise the original entity is kept with a single-space text.  The
    modified tree is written to ``output``.

    Side effects: prints the tag and attributes of every top-level element
    and switches the interpreter's default encoding to UTF-8.
    """
    # Python 2 only: re-import sys to regain access to setdefaultencoding,
    # then make UTF-8 the process-wide default encoding.
    reload(sys)
    sys.setdefaultencoding('utf8')

    en_to_zh = fio.loadPickle("en_zh")
    tree = ET.parse(input, parser=ET.XMLParser(encoding="utf-8"))

    def make_link(lang, entity, text):
        # Build one <link lang=... entity=...>text</link> node followed by
        # a newline.
        node = ET.Element(tag='link', attrib={'lang': lang, 'entity': entity})
        node.text = text
        node.tail = '\n'
        return node

    for child in tree.getroot():
        print("%s %s" % (child.tag, child.attrib))

        for query in child.findall('query'):
            links = GetLink(query.text)
            if not len(links) > 0:
                continue

            # English links first; each insert goes to position 0, so later
            # nodes end up in front of earlier ones.
            for entity, url in links.items():
                child.insert(0, make_link('en', entity, url))

            # Then the Chinese counterparts, again pushed to the front.
            for entity, _unused in links.items():
                if entity in en_to_zh:
                    zh_entity = en_to_zh[entity]
                    zh_node = make_link(
                        'zh', zh_entity,
                        'http://dbpedia.org/resource/' + zh_entity)
                else:
                    zh_node = make_link('zh', entity, ' ')
                child.insert(0, zh_node)

    tree.write(output, encoding='utf8')
def getNELCorrectLabel(input, output):
    """Annotate an XML file with English and Chinese ``link`` elements.

    Parses the XML document ``input`` as UTF-8.  For each top-level element
    and each of its ``query`` children, ``GetLink(query.text)`` is called;
    its result is consumed through ``len()`` and ``.items()`` (presumably a
    dict of entity -> URL; verify against ``GetLink``).  For every pair an
    English ``link`` element (``lang="en"``) is inserted at the front of the
    top-level element, followed by a Chinese one (``lang="zh"``) whose
    entity is looked up in the ``en_zh`` pickle: when the entity is known,
    the translated name is used with a ``http://dbpedia.org/resource/`` URL,
    otherwise the original entity is kept with a single-space text.  The
    modified tree is written to ``output``.

    Side effects: prints the tag and attributes of every top-level element
    and switches the interpreter's default encoding to UTF-8.
    """
    # Python 2 only: re-import sys to regain access to setdefaultencoding,
    # then make UTF-8 the process-wide default encoding.
    reload(sys)
    sys.setdefaultencoding('utf8')

    en_to_zh = fio.loadPickle("en_zh")
    tree = ET.parse(input, parser=ET.XMLParser(encoding="utf-8"))

    def make_link(lang, entity, text):
        # Build one <link lang=... entity=...>text</link> node followed by
        # a newline.
        node = ET.Element(tag='link', attrib={'lang': lang, 'entity': entity})
        node.text = text
        node.tail = '\n'
        return node

    for child in tree.getroot():
        print("%s %s" % (child.tag, child.attrib))

        for query in child.findall('query'):
            links = GetLink(query.text)
            if not len(links) > 0:
                continue

            # English links first; each insert goes to position 0, so later
            # nodes end up in front of earlier ones.
            for entity, url in links.items():
                child.insert(0, make_link('en', entity, url))

            # Then the Chinese counterparts, again pushed to the front.
            for entity, _unused in links.items():
                if entity in en_to_zh:
                    zh_entity = en_to_zh[entity]
                    zh_node = make_link(
                        'zh', zh_entity,
                        'http://dbpedia.org/resource/' + zh_entity)
                else:
                    zh_node = make_link('zh', entity, ' ')
                child.insert(0, zh_node)

    tree.write(output, encoding='utf8')
def translateRedirectsTransitive(self):
    """Rewrite zh transitive-redirect triples onto the dbpedia.org namespace.

    Reads
    ``<dataDirectory>/<outputLangaugeTag>/redirects_transitive_<outputLangaugeTag>.ttl.old``
    and writes
    ``<dataDirectory>/<outputLangaugeTag>/redirects_transitive_<outputLangaugeTag>.ttl``.
    For every line matching the redirect pattern, both resource URIs are
    moved from ``http://zh.dbpedia.org/resource/`` to
    ``http://dbpedia.org/resource/``.  The subject name is kept as is; the
    object name is replaced by its ``zh_en`` mapping when one exists.
    Non-matching lines are ignored.

    ``self.saveMapping("zh_en")`` is called first when ``zh_en.pickle`` is
    not an existing file (relative to the current working directory).
    """
    source_path = (self.dataDirectory + "/" + self.outputLangaugeTag +
                   "/redirects_transitive_" + self.outputLangaugeTag +
                   ".ttl.old")
    target_path = (self.dataDirectory + "/" + self.outputLangaugeTag +
                   "/redirects_transitive_" + self.outputLangaugeTag +
                   ".ttl")

    picklename = "zh_en"
    if not os.path.isfile(picklename + '.pickle'):
        self.saveMapping(picklename)
    mapping = fio.loadPickle(picklename)

    subject_prefix = '<http://dbpedia.org/resource/'
    object_prefix = "http://dbpedia.org/resource/"
    # Compiled once instead of on every line.
    # NOTE(review): "dbpedia.org" in the object part and the trailing "."
    # are unescaped and match any character - confirm this is intended.
    redirect = re.compile(
        r"(<http://zh\.dbpedia\.org/resource/)(.*)(>\s*<)"
        r"(http://zh\.dbpedia.org/resource/)(.*)(>\s*.)")

    # Write through an explicit handle rather than redirecting sys.stdout,
    # so the output file is always closed and stdout is never left pointing
    # at the file when an error occurs.
    with codecs.open(target_path, 'wb', 'utf8') as out:
        with open(source_path, 'r') as src:
            for line in src:
                found = redirect.search(line)
                if found is None:
                    continue
                try:
                    target = found.group(5)
                    if target in mapping:
                        target = mapping[target]
                    out.write(subject_prefix + found.group(2) +
                              found.group(3) + object_prefix + target +
                              found.group(6) + "\n")
                except Exception:
                    # Best effort: a line that cannot be looked up or
                    # encoded is skipped instead of aborting the whole run.
                    continue