Example 1
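    # Excerpt from a WordNet 3.0 import script (Python 2); the enclosing loop
    # and the definitions of `parts`, `web_subj`, `mapping`, `rel_mapping`,
    # `resolve_prefix`, `source`, and `GRAPH` are elided. Each triple is stored
    # twice: as a raw web-namespace assertion and as a normalized /relation/
    # assertion derived from it.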
    web_rel = resolve_prefix(parts[1])
    web_obj = resolve_prefix(parts[2])
    subj = mapping[web_subj]
    obj = mapping[web_obj]
    pred_label = parts[1].split(':')[-1]
    if pred_label in rel_mapping:
        mapped = rel_mapping[pred_label]
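        # a mapped relation starting with '~' points in the opposite direction:
        # swap subject and object (and meronym/holonym in the web relation)
        # before stripping the '~' prefix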
        if mapped.startswith('~'):
            subj, obj = obj, subj
            web_subj, web_obj = web_obj, web_subj
            web_rel = web_rel.replace('meronym', 'holonym')
            mapped = mapped[1:]
        pred = '/relation/'+mapped
    else:
        pred = '/relation/'+pred_label

    raw = GRAPH.get_or_create_assertion(
        GRAPH.get_or_create_web_concept(web_rel),
        [GRAPH.get_or_create_web_concept(web_subj), GRAPH.get_or_create_web_concept(web_obj)],
        {'dataset': 'wordnet/en/3.0', 'license': 'CC-By', 'normalized': False}
    )
    assertion = GRAPH.get_or_create_assertion(
        GRAPH.get_or_create_node(pred),
        [GRAPH.get_or_create_node(subj), GRAPH.get_or_create_node(obj)],
        {'dataset': 'wordnet/en/3.0', 'license': 'CC-By', 'normalized': True}
    )
    GRAPH.justify(source, raw)
    GRAPH.derive_normalized(raw, assertion)
    print assertion

Example 2
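# SAX handler (Python 2) that walks an English Wiktionary XML dump and writes
# translation, definition, and related-term assertions into a JSONWriterGraph.
# Besides `from xml.sax.handler import ContentHandler` and `import unicodedata`,
# it relies on project-specific regexes and helpers defined elsewhere
# (LANGUAGE_HEADER, TRANS_TOP, TRANS_TAG, CHINESE_TAG, WIKILINK, LANGUAGES,
# PARTS_OF_SPEECH, filter_line, ascii_enough, english_normalize).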
class FindTranslations(ContentHandler):
    def __init__(self):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ''
        self.curText = ''
        self.locales = []
        self.curRelation = None

        self.graph = JSONWriterGraph('../json_data/wiktionary_all')

        source = self.graph.get_or_create_node('/source/web/en.wiktionary.org')
        rule = self.graph.get_or_create_node('/source/rule/wiktionary_interlingual_definitions')
        monolingual_rule = self.graph.get_or_create_node('/source/rule/wiktionary_monolingual_definitions')
        wordsense_rule = self.graph.get_or_create_node('/source/rule/wiktionary_translation_tables')
        sense_define_rule = self.graph.get_or_create_node('/source/rule/wiktionary_define_senses')
        self.graph.justify('/', source)
        self.graph.justify('/', rule)
        self.graph.justify('/', monolingual_rule)
        self.graph.justify('/', wordsense_rule)
        self.graph.justify('/', sense_define_rule)

        self.conjunction = self.graph.get_or_create_conjunction([source, rule])
        self.monolingual_conjunction = self.graph.get_or_create_conjunction([source, monolingual_rule])
        self.wordsense_conjunction = self.graph.get_or_create_conjunction([source, wordsense_rule])
        self.defn_conjunction = self.graph.get_or_create_conjunction([source, sense_define_rule])

    def startElement(self, name, attrs):
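        # SAX callbacks: only <page> and <title> elements are tracked; page text
        # accumulates in self.curText and is passed to handleArticle when the
        # page element closes.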
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False
    
    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
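        # One line of wiki markup at a time: '===' headings set the part of
        # speech or the current relation, language headings switch self.lang,
        # '#' lines are definitions, translation-table templates set the current
        # sense, '----' resets per-language state, and '* ' lines containing a
        # wiki link feed output_monolingual.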
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)
        if line.startswith('===') and line.endswith('==='):
            pos = line.strip('= ')
            if pos == 'Synonyms':
                self.curRelation = 'Synonym'
            elif pos == 'Antonyms':
                self.curRelation = 'Antonym'
            elif pos == 'Related terms':
                self.curRelation = 'ConceptuallyRelatedTo'
            elif pos == 'Derived terms':
                if not line.startswith('===='):
                    # this is at the same level as the part of speech;
                    # now we don't know what POS these apply to
                    self.pos = None
                self.curRelation = 'DerivedFrom'
            else:
                self.curRelation = None
                if pos in PARTS_OF_SPEECH:
                    self.pos = PARTS_OF_SPEECH[pos]
        elif language_match:
            self.lang = language_match.group(1)
            self.langcode = LANGUAGES.get(self.lang)
        elif chinese_match:
            scripttag = chinese_match.group(2)
            self.locales = []
            if 's' in scripttag:
                self.locales.append('_CN')
            if 't' in scripttag:
                self.locales.append('_TW')
        elif line[0:1] == '#' and self.lang != 'English' and self.lang is not None:
            defn = line[1:].strip()
            if defn[0:1] not in ':*#':
                for defn2 in filter_line(defn):
                    if not ascii_enough(defn2): continue
                    if 'Index:' in title: continue
                    if self.langcode == 'zh':
                        for locale in self.locales:
                            self.output_translation(title, defn2, locale)
                    elif self.langcode:
                        self.output_translation(title, defn2)
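        # '----' is the horizontal rule that separates language sections on a
        # Wiktionary page, so reset the per-language state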
        elif line[0:4] == '----':
            self.pos = None
            self.lang = None
            self.langcode = None
            self.curRelation = None
        elif trans_top_match:
            pos = self.pos or 'n'
            sense = trans_top_match.group(1).split(';')[0].strip('.')
            if 'translations' in sense.lower():
                self.curSense = None
            else:
                self.curSense = pos+'/'+sense
                if self.lang == 'English':
                    self.output_sense(title, self.curSense)
        elif trans_tag_match:
            lang = trans_tag_match.group(1)
            translation = trans_tag_match.group(2)
            if self.curSense is not None and self.lang == 'English':
                # handle Chinese separately
                if lang not in ('cmn', 'yue', 'zh-yue', 'zh'):
                    self.output_sense_translation(lang, translation, title,
                                                  self.curSense)
        elif '{{trans-bottom}}' in line:
            self.curSense = None
        elif line.startswith('* ') and self.curRelation and self.langcode:
            relatedmatch = WIKILINK.search(line)
            if relatedmatch:
                related = relatedmatch.group(1)
                self.output_monolingual(self.langcode, self.curRelation,
                                        related, title)
    
    def output_monolingual(self, lang, relation, term1, term2):
        if 'Wik' in term1 or 'Wik' in term2:
            return
        source = self.graph.get_or_create_concept(lang, term1)
        if self.pos:
            target = self.graph.get_or_create_concept(lang, term2, self.pos)
        else:
            target = self.graph.get_or_create_concept(lang, term2)
        relation = self.graph.get_or_create_relation(relation)
        assertion = self.graph.get_or_create_assertion(
          relation, [source, target],
          {'dataset': 'wiktionary/en/%s' % lang,
           'license': 'CC-By-SA', 'normalized': False}
        )
        self.graph.justify(self.monolingual_conjunction, assertion)
        print unicode(assertion).encode('utf-8')

    def output_sense_translation(self, lang, foreign, english, disambiguation):
        if 'Wik' in foreign or 'Wik' in english:
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = self.graph.get_or_create_concept(
          lang,
          unicodedata.normalize('NFKC', foreign)
        )
        target = self.graph.get_or_create_concept(
          'en', english, disambiguation
        )
        relation = self.graph.get_or_create_relation(
          'TranslationOf'
        )
        assertion = self.graph.get_or_create_assertion(
          relation, [source, target],
          {'dataset': 'wiktionary/en/%s' % lang,
           'license': 'CC-By-SA', 'normalized': False}
        )
        self.graph.justify(self.conjunction, assertion)
        
    def output_sense(self, english, disambiguation):
        source = self.graph.get_or_create_concept(
          'en', english, disambiguation
        )
        definition = self.graph.get_or_create_concept(
          'en', disambiguation[2:]
        )
        definition_norm = self.graph.get_or_create_concept(
          'en', english_normalize(disambiguation[2:])
        )
        relation = self.graph.get_or_create_relation(
          'DefinedAs'
        )
        assertion = self.graph.get_or_create_assertion(
          relation, [source, definition],
          {'dataset': 'wiktionary/en/en',
           'license': 'CC-By-SA', 'normalized': False}
        )
        norm_assertion = self.graph.get_or_create_assertion(
          relation, [source, definition_norm],
          {'dataset': 'wiktionary/en/en',
           'license': 'CC-By-SA', 'normalized': True}
        )

        self.graph.justify(self.defn_conjunction, assertion)
        self.graph.derive_normalized(assertion, norm_assertion)

    def output_translation(self, foreign, english, locale=''):
        source = self.graph.get_or_create_concept(
          self.langcode+locale,
          unicodedata.normalize('NFKC', foreign)
        )
        target = self.graph.get_or_create_concept(
          'en', english
        )
        relation = self.graph.get_or_create_relation(
          'TranslationOf'
        )
        assertion = self.graph.get_or_create_assertion(
          relation, [source, target],
          {'dataset': 'wiktionary/en/%s' % self.langcode,
           'license': 'CC-By-SA', 'normalized': False}
        )
        target_normal = self.graph.get_or_create_concept(
          'en', english_normalize(english)
        )
        assertion_normal = self.graph.get_or_create_assertion(
          relation, [source, target_normal],
          {'dataset': 'wiktionary/%s' % self.langcode,
           'license': 'CC-By-SA', 'normalized': True}
        )
        self.graph.justify(self.conjunction, assertion)
        self.graph.derive_normalized(assertion, assertion_normal)
Example 3
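    # Excerpt from the Verbosity import script; the enclosing loop and the
    # definitions of `line`, `rel`, `left`, `right`, `score`, `count`, `counts`,
    # `context`, `source`, `make_json`, `GRAPH`, and the output files are
    # elided. Accepted lines go to good_out and, when make_json is set, are
    # also added to GRAPH with a weight derived from the score.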
        weak_out.write(line)
        continue

    count += 1
    counts['success'] += 1
    good_out.write(line)
    if count % 100 == 0:
        print (rel, left, right, score)
    
    if make_json:
        left_concept = GRAPH.get_or_create_concept('en', left)
        right_concept = GRAPH.get_or_create_concept('en', right)
        relation = GRAPH.get_or_create_node(rel)
        assertion = GRAPH.get_or_create_assertion(
            relation,
            [left_concept, right_concept],
            {'dataset': 'verbosity', 'license': 'CC-By'}
        )
        GRAPH.justify(source, assertion, weight=score/10.0)
        GRAPH.add_context(assertion, context)

print counts

flag_out.close()
good_out.close()
weak_out.close()
similar_out.close()

simout = open('similarity-scores.txt', 'w')
for sim in text_similarities:
    print >> simout, sim
Example 4
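# Another version of the same code: the Verbosity loop from Example 3 followed
# by the FindTranslations handler from Example 2, reformatted and with small
# differences (score weighting, closing simout, and the output_* methods).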
        counts["low score"] += 1
        weak_out.write(line)
        continue

    count += 1
    counts["success"] += 1
    good_out.write(line)
    if count % 100 == 0:
        print (rel, left, right, score)

    if make_json:
        left_concept = GRAPH.get_or_create_concept("en", left)
        right_concept = GRAPH.get_or_create_concept("en", right)
        relation = GRAPH.get_or_create_node(rel)
        assertion = GRAPH.get_or_create_assertion(
            relation, [left_concept, right_concept], {"dataset": "verbosity", "license": "CC-By"}
        )
        GRAPH.justify(source, assertion, weight=min(score, 1000) / 1000.0)
        GRAPH.add_context(assertion, context)

print counts

flag_out.close()
good_out.close()
weak_out.close()
similar_out.close()

simout = open("similarity-scores.txt", "w")
for sim in text_similarities:
    print >> simout, sim
simout.close()
class FindTranslations(ContentHandler):
    def __init__(self):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ""
        self.curText = ""
        self.locales = []
        self.curRelation = None

        self.graph = JSONWriterGraph("../json_data/wiktionary_all")

        source = self.graph.get_or_create_node("/source/web/en.wiktionary.org")
        rule = self.graph.get_or_create_node("/source/rule/wiktionary_interlingual_definitions")
        monolingual_rule = self.graph.get_or_create_node("/source/rule/wiktionary_monolingual_definitions")
        wordsense_rule = self.graph.get_or_create_node("/source/rule/wiktionary_translation_tables")
        sense_define_rule = self.graph.get_or_create_node("/source/rule/wiktionary_define_senses")
        self.graph.justify("/", source)
        self.graph.justify("/", rule)
        self.graph.justify("/", monolingual_rule)
        self.graph.justify("/", wordsense_rule)
        self.graph.justify("/", sense_define_rule)

        self.conjunction = self.graph.get_or_create_conjunction([source, rule])
        self.monolingual_conjunction = self.graph.get_or_create_conjunction([source, monolingual_rule])
        self.wordsense_conjunction = self.graph.get_or_create_conjunction([source, wordsense_rule])
        self.defn_conjunction = self.graph.get_or_create_conjunction([source, sense_define_rule])

    def startElement(self, name, attrs):
        if name == "page":
            self.inArticle = True
            self.curText = []
        elif name == "title":
            self.inTitle = True
            self.curTitle = ""

    def endElement(self, name):
        if name == "page":
            self.inArticle = False
            self.handleArticle(self.curTitle, "".join(self.curText))
        elif name == "title":
            self.inTitle = False

    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        lines = text.split("\n")
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)
        if line.startswith("===") and line.endswith("==="):
            pos = line.strip("= ")
            if pos == "Synonyms":
                self.curRelation = "Synonym"
            elif pos == "Antonym":
                self.curRelation = "Antonym"
            elif pos == "Related terms":
                self.curRelation = "ConceptuallyRelatedTo"
            elif pos == "Derived terms":
                self.curRelation = "DerivedFrom"
            else:
                self.curRelation = None
                if pos in PARTS_OF_SPEECH:
                    self.pos = PARTS_OF_SPEECH[pos]
        elif language_match:
            self.lang = language_match.group(1)
            self.langcode = LANGUAGES.get(self.lang)
        elif chinese_match:
            scripttag = chinese_match.group(2)
            self.locales = []
            if "s" in scripttag:
                self.locales.append("_CN")
            if "t" in scripttag:
                self.locales.append("_TW")
        elif line[0:1] == "#" and self.lang != "English" and self.lang is not None:
            defn = line[1:].strip()
            if defn[0:1] not in ":*#":
                for defn2 in filter_line(defn):
                    if not ascii_enough(defn2):
                        continue
                    if "Index:" in title:
                        continue
                    if self.langcode == "zh":
                        for locale in self.locales:
                            self.output_translation(title, defn2, locale)
                    elif self.langcode:
                        self.output_translation(title, defn2)
        elif line[0:4] == "----":
            self.pos = None
            self.lang = None
            self.langcode = None
            self.curRelation = None
        elif trans_top_match:
            pos = self.pos or "n"
            sense = trans_top_match.group(1).split(";")[0].strip(".")
            if "translations" in sense.lower():
                self.curSense = None
            else:
                self.curSense = pos + "/" + sense
                if self.lang == "English":
                    self.output_sense(title, self.curSense)
        elif trans_tag_match:
            lang = trans_tag_match.group(1)
            translation = trans_tag_match.group(2)
            if self.curSense is not None and self.lang == "English":
                # handle Chinese separately
                if lang not in ("cmn", "yue", "zh-yue", "zh"):
                    self.output_sense_translation(lang, translation, title, self.curSense)
        elif "{{trans-bottom}}" in line:
            self.curSense = None
        elif line.startswith("* ") and self.curRelation and self.langcode:
            relatedmatch = WIKILINK.search(line)
            if relatedmatch:
                related = relatedmatch.group(1)
                self.output_monolingual(self.langcode, self.curRelation, related, title)

    def output_monolingual(self, lang, relation, term1, term2):
        if "Wik" in term1 or "Wik" in term2:
            return
        source = self.graph.get_or_create_concept(lang, term1)
        target = self.graph.get_or_create_concept(lang, term2)
        relation = self.graph.get_or_create_relation(relation)
        assertion = self.graph.get_or_create_assertion(
            relation,
            [source, target],
            {"dataset": "wiktionary/en/%s" % lang, "license": "CC-By-SA", "normalized": False},
        )
        self.graph.justify(self.monolingual_conjunction, assertion)

    def output_sense_translation(self, lang, foreign, english, disambiguation):
        if lang == "zh-cn":
            lang = "zh_CN"
        elif lang == "zh-tw":
            lang = "zh_TW"
        source = self.graph.get_or_create_concept(lang, unicodedata.normalize("NFKC", foreign))
        target = self.graph.get_or_create_concept("en", english, disambiguation)
        relation = self.graph.get_or_create_relation("TranslationOf")
        assertion = self.graph.get_or_create_assertion(
            relation,
            [source, target],
            {"dataset": "wiktionary/en/%s" % lang, "license": "CC-By-SA", "normalized": False},
        )
        self.graph.justify(self.conjunction, assertion)

    def output_sense(self, english, disambiguation):
        source = self.graph.get_or_create_concept("en", english, disambiguation)
        definition = self.graph.get_or_create_concept("en", disambiguation[2:])
        definition_norm = self.graph.get_or_create_concept("en", english_normalize(disambiguation[2:]))
        relation = self.graph.get_or_create_relation("DefinedAs")
        assertion = self.graph.get_or_create_assertion(
            relation, [source, definition], {"dataset": "wiktionary/en/en", "license": "CC-By-SA", "normalized": False}
        )
        norm_assertion = self.graph.get_or_create_assertion(
            relation,
            [source, definition_norm],
            {"dataset": "wiktionary/en/en", "license": "CC-By-SA", "normalized": True},
        )

        self.graph.justify(self.defn_conjunction, assertion)
        self.graph.derive_normalized(assertion, norm_assertion)
        print unicode(assertion).encode("utf-8")

    def output_translation(self, foreign, english, locale=""):
        source = self.graph.get_or_create_concept(self.langcode + locale, unicodedata.normalize("NFKC", foreign))
        target = self.graph.get_or_create_concept("en", english)
        relation = self.graph.get_or_create_relation("TranslationOf")
        assertion = self.graph.get_or_create_assertion(
            relation,
            [source, target],
            {"dataset": "wiktionary/en/%s" % self.langcode, "license": "CC-By-SA", "normalized": False},
        )
        target_normal = self.graph.get_or_create_concept("en", english_normalize(english))
        assertion_normal = self.graph.get_or_create_assertion(
            relation,
            [source, target_normal],
            {"dataset": "wiktionary/%s" % self.langcode, "license": "CC-By-SA", "normalized": True},
        )
        self.graph.justify(self.conjunction, assertion)
        self.graph.derive_normalized(assertion, assertion_normal)