class FindTranslations(ContentHandler):
    """SAX-style handler that turns Wiktionary pages into graph assertions.

    The handler collects the text of every ``<title>`` and ``<page>``
    element, then scans each page line by line.  Depending on the kind of
    line it emits, through ``self.graph``:

    * ``TranslationOf`` assertions from definition lines (``# ...``) of
      non-English entries (``output_translation``),
    * ``DefinedAs`` assertions for the senses named by translation-table
      headers of English entries (``output_sense``),
    * ``TranslationOf`` assertions from the entries of those translation
      tables (``output_sense_translation``),
    * monolingual relations (synonyms, antonyms, related and derived terms)
      from bulleted lists under the matching headings
      (``output_monolingual``).

    The regular expressions (``LANGUAGE_HEADER``, ``TRANS_TOP``,
    ``TRANS_TAG``, ``CHINESE_TAG``, ``WIKILINK``), the lookup tables
    (``LANGUAGES``, ``PARTS_OF_SPEECH``) and the helpers (``filter_line``,
    ``ascii_enough``, ``english_normalize``) are defined elsewhere in the
    module.
    """

    # Section headings that introduce a bulleted list of related terms,
    # mapped to the relation asserted for every item of that list.  The
    # plural 'Antonyms' mirrors 'Synonyms'; the singular spelling that the
    # code used to test for is kept so nothing that matched before is lost.
    _SECTION_RELATIONS = {
        'Synonyms': 'Synonym',
        'Antonyms': 'Antonym',
        'Antonym': 'Antonym',
        'Related terms': 'ConceptuallyRelatedTo',
        'Derived terms': 'DerivedFrom',
    }

    # Class-level default so ``self.pos`` is always readable, even on an
    # instance whose ``__init__`` has not run.
    pos = None

    def __init__(self):
        """Set up the parser state and the provenance nodes of the graph."""
        ContentHandler.__init__(self)

        # Parser state.
        self.lang = None          # language name of the current entry
        self.langcode = None      # LANGUAGES code for self.lang, if any
        self.pos = None           # current part of speech
        self.inArticle = False    # inside a <page> element
        self.inTitle = False      # inside a <title> element
        self.curSense = None      # 'pos/sense' of the open translation table
        self.curTitle = ''
        self.curText = []         # text chunks of the current page
        self.locales = []         # '_CN' / '_TW' suffixes for Chinese entries
        self.curRelation = None   # relation implied by the current heading

        self.graph = JSONWriterGraph('../json_data/wiktionary_all')

        # Provenance: one node for the web source and one per extraction rule.
        source = self.graph.get_or_create_node(
            '/source/web/en.wiktionary.org')
        rule = self.graph.get_or_create_node(
            '/source/rule/wiktionary_interlingual_definitions')
        monolingual_rule = self.graph.get_or_create_node(
            '/source/rule/wiktionary_monolingual_definitions')
        wordsense_rule = self.graph.get_or_create_node(
            '/source/rule/wiktionary_translation_tables')
        sense_define_rule = self.graph.get_or_create_node(
            '/source/rule/wiktionary_define_senses')

        for node in (source, rule, monolingual_rule, wordsense_rule,
                     sense_define_rule):
            self.graph.justify('/', node)

        # Each kind of assertion is justified by "source AND rule".
        self.conjunction = self.graph.get_or_create_conjunction(
            [source, rule])
        self.monolingual_conjunction = self.graph.get_or_create_conjunction(
            [source, monolingual_rule])
        self.wordsense_conjunction = self.graph.get_or_create_conjunction(
            [source, wordsense_rule])
        self.defn_conjunction = self.graph.get_or_create_conjunction(
            [source, sense_define_rule])

    def startElement(self, name, attrs):
        """Start collecting text when a ``page`` or ``title`` element opens."""
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        """Process the collected page text when a ``page`` element closes."""
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False

    def characters(self, text):
        """Append character data to the title or to the page text."""
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # Bail out: stop collecting once a page has produced more
                # than 10000 text chunks.  What was collected so far is
                # still processed when the page element closes.
                self.inArticle = False

    def handleArticle(self, title, text):
        """Feed every stripped line of one page to ``handleLine``."""
        # A part of speech must not leak in from the previous page.
        self.pos = None
        for line in text.split('\n'):
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        """Classify one line of wikitext and update state or emit assertions.

        The branches are tried in order and only the first match is used:
        heading, language header, Chinese script tag, definition line,
        ``----`` separator, translation-table start, translation entry,
        translation-table end, bulleted related term.
        """
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)

        if line.startswith('===') and line.endswith('==='):
            pos = line.strip('= ')
            if pos == 'Derived terms' and not line.startswith('===='):
                # this is at the same level as the part of speech;
                # now we don't know what POS these apply to
                self.pos = None
            # Any heading that is not a known relation section clears the
            # current relation.
            self.curRelation = self._SECTION_RELATIONS.get(pos)
            if pos in PARTS_OF_SPEECH:
                self.pos = PARTS_OF_SPEECH[pos]
        elif language_match:
            self.lang = language_match.group(1)
            self.langcode = LANGUAGES.get(self.lang)
        elif chinese_match:
            # 's' and 't' in the tag select the _CN and _TW locales
            # (presumably simplified / traditional script).
            scripttag = chinese_match.group(2)
            self.locales = []
            if 's' in scripttag:
                self.locales.append('_CN')
            if 't' in scripttag:
                self.locales.append('_TW')
        elif (line[0:1] == '#' and self.lang != 'English'
              and self.lang is not None):
            # Definition line of a non-English entry.
            defn = line[1:].strip()
            # Skip empty definitions and '#:', '#*', '##' sub-items.
            if defn[0:1] not in ':*#':
                for defn2 in filter_line(defn):
                    if not ascii_enough(defn2):
                        continue
                    if 'Index:' in title:
                        continue
                    if self.langcode == 'zh':
                        for locale in self.locales:
                            self.output_translation(title, defn2, locale)
                    elif self.langcode:
                        self.output_translation(title, defn2)
        elif line[0:4] == '----':
            # Separator line: forget everything about the previous entry.
            self.pos = None
            self.lang = None
            self.langcode = None
            self.curRelation = None
        elif trans_top_match:
            pos = self.pos or 'n'
            sense = trans_top_match.group(1).split(';')[0].strip('.')
            if 'translations' in sense.lower():
                self.curSense = None
            else:
                self.curSense = pos + '/' + sense
                if self.lang == 'English':
                    self.output_sense(title, self.curSense)
        elif trans_tag_match:
            lang = trans_tag_match.group(1)
            translation = trans_tag_match.group(2)
            if self.curSense is not None and self.lang == 'English':
                # handle Chinese separately
                if lang not in ('cmn', 'yue', 'zh-yue', 'zh'):
                    self.output_sense_translation(lang, translation, title,
                                                  self.curSense)
        elif '{{trans-bottom}}' in line:
            self.curSense = None
        elif line.startswith('* ') and self.curRelation and self.langcode:
            relatedmatch = WIKILINK.search(line)
            if relatedmatch:
                related = relatedmatch.group(1)
                self.output_monolingual(self.langcode, self.curRelation,
                                        related, title)

    def output_monolingual(self, lang, relation, term1, term2):
        """Assert ``relation(term1, term2)`` between two terms of ``lang``.

        Nothing is emitted when either term contains ``'Wik'``.  The second
        term is tagged with the current part of speech when one is known.
        The resulting assertion is also printed.
        """
        if 'Wik' in term1 or 'Wik' in term2:
            return
        source = self.graph.get_or_create_concept(lang, term1)
        if self.pos:
            target = self.graph.get_or_create_concept(lang, term2, self.pos)
        else:
            target = self.graph.get_or_create_concept(lang, term2)
        relation = self.graph.get_or_create_relation(relation)
        assertion = self.graph.get_or_create_assertion(
            relation, [source, target],
            {'dataset': 'wiktionary/en/%s' % lang,
             'license': 'CC-By-SA',
             'normalized': False}
        )
        self.graph.justify(self.monolingual_conjunction, assertion)
        # Parenthesised single-argument form: same output as the print
        # statement on Python 2, and also parses on Python 3.
        print(unicode(assertion).encode('utf-8'))

    def output_sense_translation(self, lang, foreign, english,
                                 disambiguation):
        """Assert that ``foreign`` (in ``lang``) translates a sense of ``english``.

        ``disambiguation`` is the ``'pos/sense'`` string identifying the
        English word sense.  Nothing is emitted when either term contains
        ``'Wik'``.
        """
        if 'Wik' in foreign or 'Wik' in english:
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = self.graph.get_or_create_concept(
            lang, unicodedata.normalize('NFKC', foreign)
        )
        target = self.graph.get_or_create_concept(
            'en', english, disambiguation
        )
        relation = self.graph.get_or_create_relation('TranslationOf')
        assertion = self.graph.get_or_create_assertion(
            relation, [source, target],
            {'dataset': 'wiktionary/en/%s' % lang,
             'license': 'CC-By-SA',
             'normalized': False}
        )
        # NOTE(review): self.wordsense_conjunction (the translation-table
        # rule) is built in __init__ but never used; it may have been
        # intended here instead of self.conjunction -- confirm.
        self.graph.justify(self.conjunction, assertion)

    def output_sense(self, english, disambiguation):
        """Assert that a sense of ``english`` is ``DefinedAs`` its gloss.

        ``disambiguation`` has the form ``'p/gloss'``; the first two
        characters are dropped to obtain the gloss.  A raw assertion and one
        against the ``english_normalize``-d gloss are created, and the
        latter is recorded as derived from the former.
        """
        gloss = disambiguation[2:]
        source = self.graph.get_or_create_concept(
            'en', english, disambiguation
        )
        definition = self.graph.get_or_create_concept('en', gloss)
        definition_norm = self.graph.get_or_create_concept(
            'en', english_normalize(gloss)
        )
        relation = self.graph.get_or_create_relation('DefinedAs')
        assertion = self.graph.get_or_create_assertion(
            relation, [source, definition],
            {'dataset': 'wiktionary/en/en',
             'license': 'CC-By-SA',
             'normalized': False}
        )
        norm_assertion = self.graph.get_or_create_assertion(
            relation, [source, definition_norm],
            {'dataset': 'wiktionary/en/en',
             'license': 'CC-By-SA',
             'normalized': True}
        )
        self.graph.justify(self.defn_conjunction, assertion)
        self.graph.derive_normalized(assertion, norm_assertion)

    def output_translation(self, foreign, english, locale=''):
        """Assert that ``foreign`` (current language) translates ``english``.

        ``locale`` is appended to the current language code (for example
        ``'_CN'``).  A raw assertion and one against the normalized English
        text are created, and the latter is recorded as derived from the
        former.
        """
        # Both assertions belong to the same dataset.  The normalized one
        # used to be written as 'wiktionary/<lang>', dropping the '/en'
        # segment that the raw assertion and every other method use.
        dataset = 'wiktionary/en/%s' % self.langcode
        source = self.graph.get_or_create_concept(
            self.langcode + locale,
            unicodedata.normalize('NFKC', foreign)
        )
        target = self.graph.get_or_create_concept('en', english)
        relation = self.graph.get_or_create_relation('TranslationOf')
        assertion = self.graph.get_or_create_assertion(
            relation, [source, target],
            {'dataset': dataset,
             'license': 'CC-By-SA',
             'normalized': False}
        )
        target_normal = self.graph.get_or_create_concept(
            'en', english_normalize(english)
        )
        assertion_normal = self.graph.get_or_create_assertion(
            relation, [source, target_normal],
            {'dataset': dataset,
             'license': 'CC-By-SA',
             'normalized': True}
        )
        self.graph.justify(self.conjunction, assertion)
        self.graph.derive_normalized(assertion, assertion_normal)
continue score = (freq * 2 - 1) * (1000 - orderscore) * (1 - sls) / 1000 if score <= 0: counts["low score"] += 1 weak_out.write(line) continue count += 1 counts["success"] += 1 good_out.write(line) if count % 100 == 0: print (rel, left, right, score) if make_json: left_concept = GRAPH.get_or_create_concept("en", left) right_concept = GRAPH.get_or_create_concept("en", right) relation = GRAPH.get_or_create_node(rel) assertion = GRAPH.get_or_create_assertion( relation, [left_concept, right_concept], {"dataset": "verbosity", "license": "CC-By"} ) GRAPH.justify(source, assertion, weight=min(score, 1000) / 1000.0) GRAPH.add_context(assertion, context) print counts flag_out.close() good_out.close() weak_out.close() similar_out.close()
web_rel = resolve_prefix(parts[1]) web_obj = resolve_prefix(parts[2]) subj = mapping[web_subj] obj = mapping[web_obj] pred_label = parts[1].split(':')[-1] if pred_label in rel_mapping: mapped = rel_mapping[pred_label] if mapped.startswith('~'): subj, obj = obj, subj web_subj, web_obj = web_obj, web_subj web_rel = web_rel.replace('meronym', 'holonym') mapped = mapped[1:] pred = mapped else: pred = pred_label raw = GRAPH.get_or_create_assertion( GRAPH.get_or_create_web_concept(web_rel), [GRAPH.get_or_create_web_concept(web_subj), GRAPH.get_or_create_web_concept(web_obj)], {'dataset': 'wordnet/en/3.0', 'license': 'CC-By', 'normalized': False} ) assertion = GRAPH.get_or_create_assertion( GRAPH.get_or_create_relation(pred), [GRAPH.get_or_create_concept(*subj), GRAPH.get_or_create_concept(*obj)], {'dataset': 'wordnet/en/3.0', 'license': 'CC-By', 'normalized': True} ) GRAPH.justify(source, raw) GRAPH.derive_normalized(raw, assertion) print assertion
continue score = (freq*2-1) * (1000-orderscore) * (1-sls) / 1000 if score <= 0: counts['low score'] += 1 weak_out.write(line) continue count += 1 counts['success'] += 1 good_out.write(line) if count % 100 == 0: print (rel, left, right, score) if make_json: left_concept = GRAPH.get_or_create_concept('en', left) right_concept = GRAPH.get_or_create_concept('en', right) relation = GRAPH.get_or_create_node(rel) assertion = GRAPH.get_or_create_assertion( relation, [left_concept, right_concept], {'dataset': 'verbosity', 'license': 'CC-By'} ) GRAPH.justify(source, assertion, weight=score/10.0) GRAPH.add_context(assertion, context) print counts flag_out.close() good_out.close() weak_out.close()
class FindTranslations(ContentHandler):
    """SAX-style handler that turns Wiktionary pages into graph assertions.

    The handler collects the text of every ``<title>`` and ``<page>``
    element, then scans each page line by line.  Depending on the kind of
    line it emits, through ``self.graph``:

    * ``TranslationOf`` assertions from definition lines (``# ...``) of
      non-English entries (``output_translation``),
    * ``DefinedAs`` assertions for the senses named by translation-table
      headers of English entries (``output_sense``),
    * ``TranslationOf`` assertions from the entries of those translation
      tables (``output_sense_translation``),
    * monolingual relations (synonyms, antonyms, related and derived terms)
      from bulleted lists under the matching headings
      (``output_monolingual``).

    The regular expressions (``LANGUAGE_HEADER``, ``TRANS_TOP``,
    ``TRANS_TAG``, ``CHINESE_TAG``, ``WIKILINK``), the lookup tables
    (``LANGUAGES``, ``PARTS_OF_SPEECH``) and the helpers (``filter_line``,
    ``ascii_enough``, ``english_normalize``) are defined elsewhere in the
    module.
    """

    # Section headings that introduce a bulleted list of related terms,
    # mapped to the relation asserted for every item of that list.  The
    # plural "Antonyms" mirrors "Synonyms"; the singular spelling that the
    # code used to test for is kept so nothing that matched before is lost.
    _SECTION_RELATIONS = {
        "Synonyms": "Synonym",
        "Antonyms": "Antonym",
        "Antonym": "Antonym",
        "Related terms": "ConceptuallyRelatedTo",
        "Derived terms": "DerivedFrom",
    }

    # Class-level default so ``self.pos`` is always readable.  It used to be
    # assigned only by a part-of-speech heading or a "----" separator, so a
    # translation table seen before either raised AttributeError.
    pos = None

    def __init__(self):
        """Set up the parser state and the provenance nodes of the graph."""
        ContentHandler.__init__(self)

        # Parser state.
        self.lang = None          # language name of the current entry
        self.langcode = None      # LANGUAGES code for self.lang, if any
        self.pos = None           # current part of speech
        self.inArticle = False    # inside a <page> element
        self.inTitle = False      # inside a <title> element
        self.curSense = None      # "pos/sense" of the open translation table
        self.curTitle = ""
        self.curText = []         # text chunks of the current page
        self.locales = []         # "_CN" / "_TW" suffixes for Chinese entries
        self.curRelation = None   # relation implied by the current heading

        self.graph = JSONWriterGraph("../json_data/wiktionary_all")

        # Provenance: one node for the web source and one per extraction rule.
        source = self.graph.get_or_create_node(
            "/source/web/en.wiktionary.org")
        rule = self.graph.get_or_create_node(
            "/source/rule/wiktionary_interlingual_definitions")
        monolingual_rule = self.graph.get_or_create_node(
            "/source/rule/wiktionary_monolingual_definitions")
        wordsense_rule = self.graph.get_or_create_node(
            "/source/rule/wiktionary_translation_tables")
        sense_define_rule = self.graph.get_or_create_node(
            "/source/rule/wiktionary_define_senses")

        for node in (source, rule, monolingual_rule, wordsense_rule,
                     sense_define_rule):
            self.graph.justify("/", node)

        # Each kind of assertion is justified by "source AND rule".
        self.conjunction = self.graph.get_or_create_conjunction(
            [source, rule])
        self.monolingual_conjunction = self.graph.get_or_create_conjunction(
            [source, monolingual_rule])
        self.wordsense_conjunction = self.graph.get_or_create_conjunction(
            [source, wordsense_rule])
        self.defn_conjunction = self.graph.get_or_create_conjunction(
            [source, sense_define_rule])

    def startElement(self, name, attrs):
        """Start collecting text when a ``page`` or ``title`` element opens."""
        if name == "page":
            self.inArticle = True
            self.curText = []
        elif name == "title":
            self.inTitle = True
            self.curTitle = ""

    def endElement(self, name):
        """Process the collected page text when a ``page`` element closes."""
        if name == "page":
            self.inArticle = False
            self.handleArticle(self.curTitle, "".join(self.curText))
        elif name == "title":
            self.inTitle = False

    def characters(self, text):
        """Append character data to the title or to the page text."""
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # Bail out: stop collecting once a page has produced more
                # than 10000 text chunks.  What was collected so far is
                # still processed when the page element closes.
                self.inArticle = False

    def handleArticle(self, title, text):
        """Feed every stripped line of one page to ``handleLine``."""
        # A part of speech must not leak in from the previous page.
        self.pos = None
        for line in text.split("\n"):
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        """Classify one line of wikitext and update state or emit assertions.

        The branches are tried in order and only the first match is used:
        heading, language header, Chinese script tag, definition line,
        ``----`` separator, translation-table start, translation entry,
        translation-table end, bulleted related term.
        """
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)

        if line.startswith("===") and line.endswith("==="):
            pos = line.strip("= ")
            # Any heading that is not a known relation section clears the
            # current relation.
            self.curRelation = self._SECTION_RELATIONS.get(pos)
            if pos in PARTS_OF_SPEECH:
                self.pos = PARTS_OF_SPEECH[pos]
        elif language_match:
            self.lang = language_match.group(1)
            self.langcode = LANGUAGES.get(self.lang)
        elif chinese_match:
            # "s" and "t" in the tag select the _CN and _TW locales
            # (presumably simplified / traditional script).
            scripttag = chinese_match.group(2)
            self.locales = []
            if "s" in scripttag:
                self.locales.append("_CN")
            if "t" in scripttag:
                self.locales.append("_TW")
        elif (line[0:1] == "#" and self.lang != "English"
              and self.lang is not None):
            # Definition line of a non-English entry.
            defn = line[1:].strip()
            # Skip empty definitions and "#:", "#*", "##" sub-items.
            if defn[0:1] not in ":*#":
                for defn2 in filter_line(defn):
                    if not ascii_enough(defn2):
                        continue
                    if "Index:" in title:
                        continue
                    if self.langcode == "zh":
                        for locale in self.locales:
                            self.output_translation(title, defn2, locale)
                    elif self.langcode:
                        self.output_translation(title, defn2)
        elif line[0:4] == "----":
            # Separator line: forget everything about the previous entry.
            self.pos = None
            self.lang = None
            self.langcode = None
            self.curRelation = None
        elif trans_top_match:
            pos = self.pos or "n"
            sense = trans_top_match.group(1).split(";")[0].strip(".")
            if "translations" in sense.lower():
                self.curSense = None
            else:
                self.curSense = pos + "/" + sense
                if self.lang == "English":
                    self.output_sense(title, self.curSense)
        elif trans_tag_match:
            lang = trans_tag_match.group(1)
            translation = trans_tag_match.group(2)
            if self.curSense is not None and self.lang == "English":
                # handle Chinese separately
                if lang not in ("cmn", "yue", "zh-yue", "zh"):
                    self.output_sense_translation(lang, translation, title,
                                                  self.curSense)
        elif "{{trans-bottom}}" in line:
            self.curSense = None
        elif line.startswith("* ") and self.curRelation and self.langcode:
            relatedmatch = WIKILINK.search(line)
            if relatedmatch:
                related = relatedmatch.group(1)
                self.output_monolingual(self.langcode, self.curRelation,
                                        related, title)

    def output_monolingual(self, lang, relation, term1, term2):
        """Assert ``relation(term1, term2)`` between two terms of ``lang``.

        Nothing is emitted when either term contains ``"Wik"``.
        """
        if "Wik" in term1 or "Wik" in term2:
            return
        source = self.graph.get_or_create_concept(lang, term1)
        target = self.graph.get_or_create_concept(lang, term2)
        relation = self.graph.get_or_create_relation(relation)
        assertion = self.graph.get_or_create_assertion(
            relation,
            [source, target],
            {"dataset": "wiktionary/en/%s" % lang,
             "license": "CC-By-SA",
             "normalized": False},
        )
        self.graph.justify(self.monolingual_conjunction, assertion)

    def output_sense_translation(self, lang, foreign, english,
                                 disambiguation):
        """Assert that ``foreign`` (in ``lang``) translates a sense of ``english``.

        ``disambiguation`` is the ``"pos/sense"`` string identifying the
        English word sense.
        """
        if lang == "zh-cn":
            lang = "zh_CN"
        elif lang == "zh-tw":
            lang = "zh_TW"
        source = self.graph.get_or_create_concept(
            lang, unicodedata.normalize("NFKC", foreign))
        target = self.graph.get_or_create_concept(
            "en", english, disambiguation)
        relation = self.graph.get_or_create_relation("TranslationOf")
        assertion = self.graph.get_or_create_assertion(
            relation,
            [source, target],
            {"dataset": "wiktionary/en/%s" % lang,
             "license": "CC-By-SA",
             "normalized": False},
        )
        # NOTE(review): self.wordsense_conjunction (the translation-table
        # rule) is built in __init__ but never used; it may have been
        # intended here instead of self.conjunction -- confirm.
        self.graph.justify(self.conjunction, assertion)

    def output_sense(self, english, disambiguation):
        """Assert that a sense of ``english`` is ``DefinedAs`` its gloss.

        ``disambiguation`` has the form ``"p/gloss"``; the first two
        characters are dropped to obtain the gloss.  A raw assertion and one
        against the ``english_normalize``-d gloss are created, and the
        latter is recorded as derived from the former.  The raw assertion
        is also printed.
        """
        gloss = disambiguation[2:]
        source = self.graph.get_or_create_concept(
            "en", english, disambiguation)
        definition = self.graph.get_or_create_concept("en", gloss)
        definition_norm = self.graph.get_or_create_concept(
            "en", english_normalize(gloss))
        relation = self.graph.get_or_create_relation("DefinedAs")
        assertion = self.graph.get_or_create_assertion(
            relation,
            [source, definition],
            {"dataset": "wiktionary/en/en",
             "license": "CC-By-SA",
             "normalized": False},
        )
        norm_assertion = self.graph.get_or_create_assertion(
            relation,
            [source, definition_norm],
            {"dataset": "wiktionary/en/en",
             "license": "CC-By-SA",
             "normalized": True},
        )
        self.graph.justify(self.defn_conjunction, assertion)
        self.graph.derive_normalized(assertion, norm_assertion)
        # Parenthesised single-argument form: same output as the print
        # statement on Python 2, and also parses on Python 3.
        # NOTE(review): this relies on the assertion object providing
        # ``encode`` -- confirm against JSONWriterGraph.
        print(assertion.encode("utf-8"))

    def output_translation(self, foreign, english, locale=""):
        """Assert that ``foreign`` (current language) translates ``english``.

        ``locale`` is appended to the current language code (for example
        ``"_CN"``).  A raw assertion and one against the normalized English
        text are created, and the latter is recorded as derived from the
        former.
        """
        # Both assertions belong to the same dataset.  The normalized one
        # used to be written as "wiktionary/<lang>", dropping the "/en"
        # segment that the raw assertion and every other method use.
        dataset = "wiktionary/en/%s" % self.langcode
        source = self.graph.get_or_create_concept(
            self.langcode + locale,
            unicodedata.normalize("NFKC", foreign))
        target = self.graph.get_or_create_concept("en", english)
        relation = self.graph.get_or_create_relation("TranslationOf")
        assertion = self.graph.get_or_create_assertion(
            relation,
            [source, target],
            {"dataset": dataset,
             "license": "CC-By-SA",
             "normalized": False},
        )
        target_normal = self.graph.get_or_create_concept(
            "en", english_normalize(english))
        assertion_normal = self.graph.get_or_create_assertion(
            relation,
            [source, target_normal],
            {"dataset": dataset,
             "license": "CC-By-SA",
             "normalized": True},
        )
        self.graph.justify(self.conjunction, assertion)
        self.graph.derive_normalized(assertion, assertion_normal)