Example 1
class FindTranslations(ContentHandler):
    def __init__(self):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ''
        self.curText = ''
        self.locales = []
        self.curRelation = None

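        # Stream the graph to disk as JSON; the nodes below record the data
        # source and the extraction rules used to justify each assertion.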
        self.graph = JSONWriterGraph('../json_data/wiktionary_all')

        source = self.graph.get_or_create_node('/source/web/en.wiktionary.org')
        rule = self.graph.get_or_create_node('/source/rule/wiktionary_interlingual_definitions')
        monolingual_rule = self.graph.get_or_create_node('/source/rule/wiktionary_monolingual_definitions')
        wordsense_rule = self.graph.get_or_create_node('/source/rule/wiktionary_translation_tables')
        sense_define_rule = self.graph.get_or_create_node('/source/rule/wiktionary_define_senses')
        self.graph.justify('/', source)
        self.graph.justify('/', rule)
        self.graph.justify('/', monolingual_rule)
        self.graph.justify('/', wordsense_rule)
        self.graph.justify('/', sense_define_rule)

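        # Pair the Wiktionary source with each extraction rule, so that
        # assertions can be justified by source and rule together.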
        self.conjunction = self.graph.get_or_create_conjunction([source, rule])
        self.monolingual_conjunction = self.graph.get_or_create_conjunction([source, monolingual_rule])
        self.wordsense_conjunction = self.graph.get_or_create_conjunction([source, wordsense_rule])
        self.defn_conjunction = self.graph.get_or_create_conjunction([source, sense_define_rule])
Example 2
# Assign every synset a disambiguation name, which is its gloss.
for synset in synset_senses:
    senses = sorted(synset_senses[synset])
    synset_name = labels[synset]
    synset_pos = synset.split('-')[-2]
    pos = parts_of_speech[synset_pos]
    disambig = glossary[synset]
    node = "/concept/en/%s/%s/%s" % (synset_name, pos, disambig)
    if synset not in mapping:
        mapping[synset] = node

# Map senses to the same nodes.
for sense, synset in sense_synsets.items():
    mapping[sense] = mapping[synset]

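# Write the WordNet graph as a JSON stream, justified directly by the
# WordNet 3.0 source.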
GRAPH = JSONWriterGraph('../json_data/wordnet')
source = GRAPH.get_or_create_node('/source/wordnet/3.0')
GRAPH.justify('/', source, 10)

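# Read every WordNet relation file (Turtle format) as one stream of lines.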
for line in chain(
    open('wordnet-attribute.ttl'),
    open('wordnet-causes.ttl'),
    open('wordnet-classifiedby.ttl'),
    open('wordnet-entailment.ttl'),
    open('wordnet-hyponym.ttl'),
    open('wordnet-instances.ttl'),
    open('wordnet-membermeronym.ttl'),
    open('wordnet-partmeronym.ttl'),
    open('wordnet-sameverbgroupas.ttl'),
    open('wordnet-similarity.ttl'),
    open('wordnet-substancemeronym.ttl'),
Example 3
class FindTranslations(ContentHandler):
    def __init__(self):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ''
        self.curText = ''
        self.locales = []
        self.curRelation = None

        self.graph = JSONWriterGraph('../json_data/wiktionary_all')

        source = self.graph.get_or_create_node('/source/web/en.wiktionary.org')
        rule = self.graph.get_or_create_node('/source/rule/wiktionary_interlingual_definitions')
        monolingual_rule = self.graph.get_or_create_node('/source/rule/wiktionary_monolingual_definitions')
        wordsense_rule = self.graph.get_or_create_node('/source/rule/wiktionary_translation_tables')
        sense_define_rule = self.graph.get_or_create_node('/source/rule/wiktionary_define_senses')
        self.graph.justify('/', source)
        self.graph.justify('/', rule)
        self.graph.justify('/', monolingual_rule)
        self.graph.justify('/', wordsense_rule)
        self.graph.justify('/', sense_define_rule)

        self.conjunction = self.graph.get_or_create_conjunction([source, rule])
        self.monolingual_conjunction = self.graph.get_or_create_conjunction([source, monolingual_rule])
        self.wordsense_conjunction = self.graph.get_or_create_conjunction([source, wordsense_rule])
        self.defn_conjunction = self.graph.get_or_create_conjunction([source, sense_define_rule])

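    # SAX callbacks: track whether we are inside a <page> or <title> element,
    # accumulating the article title and text as they stream in.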
    def startElement(self, name, attrs):
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False
    
    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

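    # Dispatch on the shape of each wikitext line: section headings set the
    # part of speech or relation, language headers set the current language,
    # and translation-table markup opens or closes a word sense.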
    def handleLine(self, title, line):
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)
        if line.startswith('===') and line.endswith('==='):
            pos = line.strip('= ')
            if pos == 'Synonyms':
                self.curRelation = 'Synonym'
            elif pos == 'Antonyms':
                self.curRelation = 'Antonym'
            elif pos == 'Related terms':
                self.curRelation = 'ConceptuallyRelatedTo'
            elif pos == 'Derived terms':
                if not line.startswith('===='):
                    # this is at the same level as the part of speech;
                    # now we don't know what POS these apply to
                    self.pos = None
                self.curRelation = 'DerivedFrom'
            else:
                self.curRelation = None
                if pos in PARTS_OF_SPEECH:
                    self.pos = PARTS_OF_SPEECH[pos]
        elif language_match:
            self.lang = language_match.group(1)
            self.langcode = LANGUAGES.get(self.lang)
        elif chinese_match:
            scripttag = chinese_match.group(2)
            self.locales = []
            if 's' in scripttag:
                self.locales.append('_CN')
            if 't' in scripttag:
                self.locales.append('_TW')
        elif line[0:1] == '#' and self.lang != 'English' and self.lang is not None:
            defn = line[1:].strip()
            if defn[0:1] not in ':*#':
                for defn2 in filter_line(defn):
                    if not ascii_enough(defn2): continue
                    if 'Index:' in title: continue
                    if self.langcode == 'zh':
                        for locale in self.locales:
                            self.output_translation(title, defn2, locale)
                    elif self.langcode:
                        self.output_translation(title, defn2)
        elif line[0:4] == '----':
            self.pos = None
            self.lang = None
            self.langcode = None
            self.curRelation = None
        elif trans_top_match:
            pos = self.pos or 'n'
            sense = trans_top_match.group(1).split(';')[0].strip('.')
            if 'translations' in sense.lower():
                self.curSense = None
            else:
                self.curSense = pos+'/'+sense
                if self.lang == 'English':
                    self.output_sense(title, self.curSense)
        elif trans_tag_match:
            lang = trans_tag_match.group(1)
            translation = trans_tag_match.group(2)
            if self.curSense is not None and self.lang == 'English':
                # handle Chinese separately
                if lang not in ('cmn', 'yue', 'zh-yue', 'zh'):
                    self.output_sense_translation(lang, translation, title,
                                                  self.curSense)
        elif '{{trans-bottom}}' in line:
            self.curSense = None
        elif line.startswith('* ') and self.curRelation and self.langcode:
            relatedmatch = WIKILINK.search(line)
            if relatedmatch:
                related = relatedmatch.group(1)
                self.output_monolingual(self.langcode, self.curRelation,
                                        related, title)
    
    def output_monolingual(self, lang, relation, term1, term2):
        if 'Wik' in term1 or 'Wik' in term2:
            return
        source = self.graph.get_or_create_concept(lang, term1)
        if self.pos:
            target = self.graph.get_or_create_concept(lang, term2, self.pos)
        else:
            target = self.graph.get_or_create_concept(lang, term2)
        relation = self.graph.get_or_create_relation(relation)
        assertion = self.graph.get_or_create_assertion(
          relation, [source, target],
          {'dataset': 'wiktionary/en/%s' % lang,
           'license': 'CC-By-SA', 'normalized': False}
        )
        self.graph.justify(self.monolingual_conjunction, assertion)
        print unicode(assertion).encode('utf-8')

    def output_sense_translation(self, lang, foreign, english, disambiguation):
        if 'Wik' in foreign or 'Wik' in english:
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = self.graph.get_or_create_concept(
          lang,
          unicodedata.normalize('NFKC', foreign)
        )
        target = self.graph.get_or_create_concept(
          'en', english, disambiguation
        )
        relation = self.graph.get_or_create_relation(
          'TranslationOf'
        )
        assertion = self.graph.get_or_create_assertion(
          relation, [source, target],
          {'dataset': 'wiktionary/en/%s' % lang,
           'license': 'CC-By-SA', 'normalized': False}
        )
        self.graph.justify(self.conjunction, assertion)
        
    def output_sense(self, english, disambiguation):
        source = self.graph.get_or_create_concept(
          'en', english, disambiguation
        )
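        # curSense is assumed to have the form '<pos>/<sense>' with a
        # one-letter POS (see handleLine), so [2:] strips the 'x/' prefix.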
        definition = self.graph.get_or_create_concept(
          'en', disambiguation[2:]
        )
        definition_norm = self.graph.get_or_create_concept(
          'en', english_normalize(disambiguation[2:])
        )
        relation = self.graph.get_or_create_relation(
          'DefinedAs'
        )
        assertion = self.graph.get_or_create_assertion(
          relation, [source, definition],
          {'dataset': 'wiktionary/en/en',
           'license': 'CC-By-SA', 'normalized': False}
        )
        norm_assertion = self.graph.get_or_create_assertion(
          relation, [source, definition_norm],
          {'dataset': 'wiktionary/en/en',
           'license': 'CC-By-SA', 'normalized': True}
        )

        self.graph.justify(self.defn_conjunction, assertion)
        self.graph.derive_normalized(assertion, norm_assertion)

    def output_translation(self, foreign, english, locale=''):
        source = self.graph.get_or_create_concept(
          self.langcode+locale,
          unicodedata.normalize('NFKC', foreign)
        )
        target = self.graph.get_or_create_concept(
          'en', english
        )
        relation = self.graph.get_or_create_relation(
          'TranslationOf'
        )
        assertion = self.graph.get_or_create_assertion(
          relation, [source, target],
          {'dataset': 'wiktionary/en/%s' % self.langcode,
           'license': 'CC-By-SA', 'normalized': False}
        )
        target_normal = self.graph.get_or_create_concept(
          'en', english_normalize(english)
        )
        assertion_normal = self.graph.get_or_create_assertion(
          relation, [source, target_normal],
          {'dataset': 'wiktionary/%s' % self.langcode,
           'license': 'CC-By-SA', 'normalized': True}
        )
        self.graph.justify(self.conjunction, assertion)
        self.graph.derive_normalized(assertion, assertion_normal)
Example 4
import re
from conceptnet5.graph import JSONWriterGraph
from conceptnet5.english_nlp import normalize
from pymongo import Connection
from types import *

GRAPH = JSONWriterGraph('json_data/goalnet')

goalnet = GRAPH.get_or_create_node(u'/source/rule/goalnet')
GRAPH.justify(0, goalnet)
wikihow = GRAPH.get_or_create_node(u'/source/web/www.wikihow.com')
omics = GRAPH.get_or_create_node(u'/source/activity/omics')
GRAPH.justify(0, wikihow)
GRAPH.justify(0, omics)

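# Emit a single HasSteps assertion linking a normalized goal to its steps,
# justified by the conjunction of the data source and the goalnet rule.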
def output_steps(goal, steps, source):
    goal = normalize(goal).strip()
    steps = map(lambda x: normalize(x).strip(), steps)
    args = [GRAPH.get_or_create_concept('en', goal)]
    for step in steps:
        args.append(GRAPH.get_or_create_concept('en', step))
    assertion = GRAPH.get_or_create_assertion(
        '/relation/HasSteps', args,
        {'dataset': 'goalnet/en', 'license': 'CC-By-SA'}
    )
    if source == 'wikihow':
        conjunction = GRAPH.get_or_create_conjunction([wikihow, goalnet])
        GRAPH.justify(conjunction, assertion, 0.8)
    elif source == 'omics':
        conjunction = GRAPH.get_or_create_conjunction([omics, goalnet])
        GRAPH.justify(conjunction, assertion)
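
# Hypothetical usage (goal and steps are illustrative; real input would come
# from parsed wikiHow or OMICS records):
#   output_steps('make tea', ['boil water', 'steep the tea'], 'wikihow')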
Example 5
from csc_utils.batch import queryset_foreach
from conceptnet.models import Sentence, Assertion, RawAssertion
from conceptnet5.graph import JSONWriterGraph
from conceptnet5.english_nlp import normalize as en_normalize
import simplenlp

GRAPH = JSONWriterGraph('json_data/conceptnet')

OMCS = GRAPH.get_or_create_node('/source/site/omcs')
GRAPH.justify('/', OMCS)

JA = simplenlp.get('ja')
# Monkey-patch the Japanese NLP utilities so that no token is treated as a stopword.
def answer_false(*args):
    return False
JA.is_stopword_record = answer_false

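# Convert one ConceptNet 4 RawAssertion into graph nodes, skipping low-quality
# frames, Chinese entries, and sentences from the 'rubycommons' activity.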
def put_raw_assertion_in_graph(raw):
    try:
        lang = raw.language_id
        if raw.frame.goodness < 1: return
        if lang.startswith('zh'): return
        polarity = raw.frame.frequency.value
        activity = raw.sentence.activity.name
        if 'rubycommons' in activity: return

        # build the assertion
        raw_arg1 = GRAPH.get_or_create_concept(lang, raw.text1)
        raw_arg2 = GRAPH.get_or_create_concept(lang, raw.text2)
        frame_text = raw.frame.text
        if polarity > 0:
Example 6
maxscore = 0
count = 0
skipcount = 0
counts = defaultdict(int)
text_similarities = []

flag_out = open("output/flagged_assertions.txt", "w")
similar_out = open("output/text_similarity.txt", "w")
weak_out = open("output/weak_assertions.txt", "w")
good_out = open("output/ok_assertions.txt", "w")

GRAPH = None
context = source = None
if make_json:
    GRAPH = JSONWriterGraph("../json_data/verbosity")
    source = GRAPH.get_or_create_node("/source/site/verbosity")
    context = GRAPH.get_or_create_node("/context/General")
    GRAPH.justify(0, source)

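# Each line of verbosity.txt is expected to be tab-separated:
# left term, relation, right term, frequency, and an order score.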
for line in open("verbosity.txt"):
    if skipcount > 0:
        skipcount -= 1
        continue
    parts = line.strip().split("\t")
    if not parts:
        counts["blank"] += 1
        continue
    left, relation, right, freq, orderscore = parts[:5]

    # catch bad stuff
Example 7
"""
Get data from DBPedia.
"""

__author__ = 'Justin Venezuela ([email protected]), Rob Speer ([email protected])'

from conceptnet5.graph import JSONWriterGraph
from conceptnet5.english_nlp import normalize_topic, un_camel_case
import urllib
import urllib2

GRAPH = JSONWriterGraph('json_data/dbpedia_data')

DBPEDIA_SOURCE = GRAPH.get_or_create_node('/source/web/dbpedia.org')
GRAPH.justify('/', DBPEDIA_SOURCE)

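# Properties attached to every DBPedia assertion; the 'normalized' flag
# distinguishes raw assertions from their normalized counterparts.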
TYPE_ASSERTION_PROPERTIES = {
    'dataset': u'dbpedia',
    'license': u'CC-By-SA',
    'normalized': False
}
NORM_ASSERTION_PROPERTIES = {
    'dataset': u'dbpedia',
    'license': u'CC-By-SA',
    'normalized': True
}

VERBOSE = True
def show_message(message):
  if VERBOSE:
    print message
Example 8
import re
from conceptnet5.graph import JSONWriterGraph
from conceptnet5.english_nlp import normalize
from pymongo import Connection
from types import *

GRAPH = JSONWriterGraph('json_data/goalnet')

goalnet = GRAPH.get_or_create_node(u'/source/rule/goalnet')
GRAPH.justify(0, goalnet)
omics = GRAPH.get_or_create_node(u'/source/activity/omics')
GRAPH.justify(0, omics)

def output_steps(goal, steps, source):
    # add raw assertions
    args = []
    for step in steps:
        args.append(GRAPH.get_or_create_concept('en', step))
    raw_sequence = GRAPH.get_or_create_assertion(
        '/relation/Sequence', args,
        {'dataset': 'goalnet/en', 'license': 'CC-By-SA'}
    )
    args = [GRAPH.get_or_create_concept('en', goal)]
    args.append(raw_sequence)
    raw_assertion = GRAPH.get_or_create_assertion(
        '/relation/HasSteps', args,
        {'dataset': 'goalnet/en', 'license': 'CC-By-SA'}
    )
    # add assertions
    args = []
    goal = normalize(goal).strip().lower()
Example 9
"""
Parse the ReVerb dataset and put assertions to ConceptNet 5
"""
from conceptnet5.graph import JSONWriterGraph
from conceptnet5.english_nlp import normalize, normalize_topic, tokenize, untokenize
from urlparse import urlparse
import urllib
import codecs
import nltk
import os
import re

GRAPH = JSONWriterGraph('json_data/reverb')

reverb = GRAPH.get_or_create_node(u'/source/rule/reverb')
GRAPH.justify('/', reverb, 0.5)
reverb_object = GRAPH.get_or_create_node(u'/source/rule/extract_reverb_objects')
#reverb_prep = GRAPH.get_or_create_node(u'/source/rule/extract_reverb_prepositions')
reverb_triple = GRAPH.get_or_create_node(u'/source/rule/reverb_present_tense_triples')
wikipedia = GRAPH.get_or_create_node(u'/source/web/en.wikipedia.org')
GRAPH.justify('/', reverb_object, 0.2)
GRAPH.justify('/', reverb_triple, 0.5)
GRAPH.justify('/', wikipedia)

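# Generic head nouns ('a type of X', 'a kind of X') that likely signal an
# IsA pattern rather than a meaningful object.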
TYPE_WORDS = ('type', 'kind', 'sort', 'variety', 'one')

# Search for non-namespaced Wikipedia sources.
WIKIPEDIA_SOURCE = re.compile(r'(http://en.wikipedia.org/wiki/([^:]|:_)+)(\||$)')

def normalize_rel(text):
    parts = normalize(text).split()
Example 10
maxscore = 0
count = 0
skipcount = 0
counts = defaultdict(int)
text_similarities = []

flag_out = open('output/flagged_assertions.txt', 'w')
similar_out = open('output/text_similarity.txt', 'w')
weak_out = open('output/weak_assertions.txt', 'w')
good_out = open('output/ok_assertions.txt', 'w')

GRAPH = None
context = source = None
if make_json:
    GRAPH = JSONWriterGraph('../json_data/verbosity')
    source = GRAPH.get_or_create_node('/source/site/verbosity')
    context = GRAPH.get_or_create_node('/context/General')
    GRAPH.justify(0, source)

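# As in the earlier Verbosity example: each line of verbosity.txt is expected
# to be tab-separated: left term, relation, right term, frequency, order score.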
for line in open('verbosity.txt'):
    if skipcount > 0:
        skipcount -= 1
        continue
    parts = line.strip().split('\t')
    if not parts:
        counts['blank'] += 1
        continue
    left, relation, right, freq, orderscore = parts[:5]

    # catch bad stuff
Example 11
class FindTranslations(ContentHandler):
    def __init__(self):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ""
        self.curText = ""
        self.locales = []
        self.curRelation = None

        self.graph = JSONWriterGraph("../json_data/wiktionary_all")

        source = self.graph.get_or_create_node("/source/web/en.wiktionary.org")
        rule = self.graph.get_or_create_node("/source/rule/wiktionary_interlingual_definitions")
        monolingual_rule = self.graph.get_or_create_node("/source/rule/wiktionary_monolingual_definitions")
        wordsense_rule = self.graph.get_or_create_node("/source/rule/wiktionary_translation_tables")
        sense_define_rule = self.graph.get_or_create_node("/source/rule/wiktionary_define_senses")
        self.graph.justify("/", source)
        self.graph.justify("/", rule)
        self.graph.justify("/", monolingual_rule)
        self.graph.justify("/", wordsense_rule)
        self.graph.justify("/", sense_define_rule)

        self.conjunction = self.graph.get_or_create_conjunction([source, rule])
        self.monolingual_conjunction = self.graph.get_or_create_conjunction([source, monolingual_rule])
        self.wordsense_conjunction = self.graph.get_or_create_conjunction([source, wordsense_rule])
        self.defn_conjunction = self.graph.get_or_create_conjunction([source, sense_define_rule])

    def startElement(self, name, attrs):
        if name == "page":
            self.inArticle = True
            self.curText = []
        elif name == "title":
            self.inTitle = True
            self.curTitle = ""

    def endElement(self, name):
        if name == "page":
            self.inArticle = False
            self.handleArticle(self.curTitle, "".join(self.curText))
        elif name == "title":
            self.inTitle = False

    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        lines = text.split("\n")
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)
        if line.startswith("===") and line.endswith("==="):
            pos = line.strip("= ")
            if pos == "Synonyms":
                self.curRelation = "Synonym"
            elif pos == "Antonyms":
                self.curRelation = "Antonym"
            elif pos == "Related terms":
                self.curRelation = "ConceptuallyRelatedTo"
            elif pos == "Derived terms":
                self.curRelation = "DerivedFrom"
            else:
                self.curRelation = None
                if pos in PARTS_OF_SPEECH:
                    self.pos = PARTS_OF_SPEECH[pos]
        elif language_match:
            self.lang = language_match.group(1)
            self.langcode = LANGUAGES.get(self.lang)
        elif chinese_match:
            scripttag = chinese_match.group(2)
            self.locales = []
            if "s" in scripttag:
                self.locales.append("_CN")
            if "t" in scripttag:
                self.locales.append("_TW")
        elif line[0:1] == "#" and self.lang != "English" and self.lang is not None:
            defn = line[1:].strip()
            if defn[0:1] not in ":*#":
                for defn2 in filter_line(defn):
                    if not ascii_enough(defn2):
                        continue
                    if "Index:" in title:
                        continue
                    if self.langcode == "zh":
                        for locale in self.locales:
                            self.output_translation(title, defn2, locale)
                    elif self.langcode:
                        self.output_translation(title, defn2)
        elif line[0:4] == "----":
            self.pos = None
            self.lang = None
            self.langcode = None
            self.curRelation = None
        elif trans_top_match:
            pos = self.pos or "n"
            sense = trans_top_match.group(1).split(";")[0].strip(".")
            if "translations" in sense.lower():
                self.curSense = None
            else:
                self.curSense = pos + "/" + sense
                if self.lang == "English":
                    self.output_sense(title, self.curSense)
        elif trans_tag_match:
            lang = trans_tag_match.group(1)
            translation = trans_tag_match.group(2)
            if self.curSense is not None and self.lang == "English":
                # handle Chinese separately
                if lang not in ("cmn", "yue", "zh-yue", "zh"):
                    self.output_sense_translation(lang, translation, title, self.curSense)
        elif "{{trans-bottom}}" in line:
            self.curSense = None
        elif line.startswith("* ") and self.curRelation and self.langcode:
            relatedmatch = WIKILINK.search(line)
            if relatedmatch:
                related = relatedmatch.group(1)
                self.output_monolingual(self.langcode, self.curRelation, related, title)

    def output_monolingual(self, lang, relation, term1, term2):
        if "Wik" in term1 or "Wik" in term2:
            return
        source = self.graph.get_or_create_concept(lang, term1)
        target = self.graph.get_or_create_concept(lang, term2)
        relation = self.graph.get_or_create_relation(relation)
        assertion = self.graph.get_or_create_assertion(
            relation,
            [source, target],
            {"dataset": "wiktionary/en/%s" % lang, "license": "CC-By-SA", "normalized": False},
        )
        self.graph.justify(self.monolingual_conjunction, assertion)

    def output_sense_translation(self, lang, foreign, english, disambiguation):
        if lang == "zh-cn":
            lang = "zh_CN"
        elif lang == "zh-tw":
            lang = "zh_TW"
        source = self.graph.get_or_create_concept(lang, unicodedata.normalize("NFKC", foreign))
        target = self.graph.get_or_create_concept("en", english, disambiguation)
        relation = self.graph.get_or_create_relation("TranslationOf")
        assertion = self.graph.get_or_create_assertion(
            relation,
            [source, target],
            {"dataset": "wiktionary/en/%s" % lang, "license": "CC-By-SA", "normalized": False},
        )
        self.graph.justify(self.conjunction, assertion)

    def output_sense(self, english, disambiguation):
        source = self.graph.get_or_create_concept("en", english, disambiguation)
        definition = self.graph.get_or_create_concept("en", disambiguation[2:])
        definition_norm = self.graph.get_or_create_concept("en", english_normalize(disambiguation[2:]))
        relation = self.graph.get_or_create_relation("DefinedAs")
        assertion = self.graph.get_or_create_assertion(
            relation, [source, definition], {"dataset": "wiktionary/en/en", "license": "CC-By-SA", "normalized": False}
        )
        norm_assertion = self.graph.get_or_create_assertion(
            relation,
            [source, definition_norm],
            {"dataset": "wiktionary/en/en", "license": "CC-By-SA", "normalized": True},
        )

        self.graph.justify(self.defn_conjunction, assertion)
        self.graph.derive_normalized(assertion, norm_assertion)
        print unicode(assertion).encode("utf-8")

    def output_translation(self, foreign, english, locale=""):
        source = self.graph.get_or_create_concept(self.langcode + locale, unicodedata.normalize("NFKC", foreign))
        target = self.graph.get_or_create_concept("en", english)
        relation = self.graph.get_or_create_relation("TranslationOf")
        assertion = self.graph.get_or_create_assertion(
            relation,
            [source, target],
            {"dataset": "wiktionary/en/%s" % self.langcode, "license": "CC-By-SA", "normalized": False},
        )
        target_normal = self.graph.get_or_create_concept("en", english_normalize(english))
        assertion_normal = self.graph.get_or_create_assertion(
            relation,
            [source, target_normal],
            {"dataset": "wiktionary/%s" % self.langcode, "license": "CC-By-SA", "normalized": True},
        )
        self.graph.justify(self.conjunction, assertion)
        self.graph.derive_normalized(assertion, assertion_normal)
Example 12
"""
Parse the ReVerb dataset and put assertions to ConceptNet 5
"""
from conceptnet5.graph import JSONWriterGraph
from conceptnet5.english_nlp import normalize, normalize_topic, tokenize, untokenize
from urlparse import urlparse
import urllib
import codecs
import nltk
import os
import re

GRAPH = JSONWriterGraph('json_data/reverb')

reverb = GRAPH.get_or_create_node(u'/source/rule/reverb')
GRAPH.justify('/', reverb, 0.5)
reverb_object = GRAPH.get_or_create_node(
    u'/source/rule/extract_reverb_objects')
#reverb_prep = GRAPH.get_or_create_node(u'/source/rule/extract_reverb_prepositions')
reverb_triple = GRAPH.get_or_create_node(
    u'/source/rule/reverb_present_tense_triples')
wikipedia = GRAPH.get_or_create_node(u'/source/web/en.wikipedia.org')
GRAPH.justify('/', reverb_object, 0.2)
GRAPH.justify('/', reverb_triple, 0.5)
GRAPH.justify('/', wikipedia)

TYPE_WORDS = ('type', 'kind', 'sort', 'variety', 'one')

# Search for non-namespaced Wikipedia sources.
WIKIPEDIA_SOURCE = re.compile(
    r'(http://en.wikipedia.org/wiki/([^:]|:_)+)(\||$)')