# Build a ConceptNet node URI for every WordNet synset.
for synset in synset_senses:
    # NOTE(review): `senses` is not used in this visible chunk; presumably
    # consumed further down in the original file — confirm before removing.
    senses = sorted(synset_senses[synset])
    synset_name = labels[synset]
    # Synset IDs look like "...-<pos>-<something>"; the POS code is the
    # second-to-last dash-separated field.
    synset_pos = synset.split('-')[-2]
    pos = parts_of_speech[synset_pos]
    disambig = glossary[synset]
    node = "/concept/en/%s/%s/%s" % (synset_name, pos, disambig)
    if synset not in mapping:
        mapping[synset] = node

# Map senses to the same nodes.
for sense, synset in sense_synsets.items():
    mapping[sense] = mapping[synset]

GRAPH = JSONWriterGraph('../json_data/wordnet')

source = GRAPH.get_or_create_node('/source/wordnet/3.0')
GRAPH.justify('/', source, 10)

# Stream every WordNet relation file as one sequence of Turtle lines.
for line in chain(
    open('wordnet-attribute.ttl'),
    open('wordnet-causes.ttl'),
    open('wordnet-classifiedby.ttl'),
    open('wordnet-entailment.ttl'),
    open('wordnet-hyponym.ttl'),
    open('wordnet-instances.ttl'),
    open('wordnet-membermeronym.ttl'),
    open('wordnet-partmeronym.ttl'),
    open('wordnet-sameverbgroupas.ttl'),
    open('wordnet-similarity.ttl'),
    open('wordnet-substancemeronym.ttl'),
    open('full/wordnet-antonym.ttl'),
    # NOTE(review): the argument list and the loop body continue beyond
    # this chunk of the file.
from csc_utils.batch import queryset_foreach
from conceptnet.models import Sentence, Assertion, RawAssertion
from conceptnet5.graph import JSONWriterGraph
from conceptnet5.english_nlp import normalize as en_normalize
import simplenlp

GRAPH = JSONWriterGraph('json_data/conceptnet')

# Provenance node for the Open Mind Common Sense site.
OMCS = GRAPH.get_or_create_node('/source/site/omcs')
GRAPH.justify('/', OMCS)

JA = simplenlp.get('ja')

# monkey-patch: make the Japanese NLP object treat nothing as a stopword.
def answer_false(*args):
    return False
JA.is_stopword_record = answer_false

def put_raw_assertion_in_graph(raw):
    """Convert one ConceptNet-4 RawAssertion into ConceptNet-5 graph nodes.

    Skips low-quality frames (goodness < 1), Chinese-language input, and
    assertions from the 'rubycommons' activity.
    """
    # NOTE(review): the matching `except` clause for this `try` lies
    # beyond this chunk of the file.
    try:
        lang = raw.language_id
        if raw.frame.goodness < 1:
            return
        if lang.startswith('zh'):
            return
        polarity = raw.frame.frequency.value
        activity = raw.sentence.activity.name
        if 'rubycommons' in activity:
            return

        # build the assertion
        raw_arg1 = GRAPH.get_or_create_concept(lang, raw.text1)
        raw_arg2 = GRAPH.get_or_create_concept(lang, raw.text2)
        frame_text = raw.frame.text
        if polarity > 0:
            # NOTE(review): function body continues beyond this chunk.
class FindTranslations(ContentHandler):
    """SAX handler that scans a Wiktionary XML dump and writes ConceptNet
    assertions for translations, word-sense definitions, and monolingual
    relations (synonyms, antonyms, etc.).

    State machine: self.lang/self.langcode track the language section we
    are inside; self.curSense tracks the open translation table's sense;
    self.curRelation tracks the open relation section; self.pos tracks
    the current part of speech.
    """

    def __init__(self):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ''
        self.curText = ''
        self.locales = []
        self.curRelation = None
        self.graph = JSONWriterGraph('../json_data/wiktionary_all')

        # Provenance: one web source plus one rule node per extraction
        # strategy; each assertion is justified by the conjunction of the
        # source and the rule that produced it.
        source = self.graph.get_or_create_node('/source/web/en.wiktionary.org')
        rule = self.graph.get_or_create_node('/source/rule/wiktionary_interlingual_definitions')
        monolingual_rule = self.graph.get_or_create_node('/source/rule/wiktionary_monolingual_definitions')
        wordsense_rule = self.graph.get_or_create_node('/source/rule/wiktionary_translation_tables')
        sense_define_rule = self.graph.get_or_create_node('/source/rule/wiktionary_define_senses')
        self.graph.justify('/', source)
        self.graph.justify('/', rule)
        self.graph.justify('/', monolingual_rule)
        self.graph.justify('/', wordsense_rule)
        self.graph.justify('/', sense_define_rule)
        self.conjunction = self.graph.get_or_create_conjunction([source, rule])
        self.monolingual_conjunction = self.graph.get_or_create_conjunction([source, monolingual_rule])
        self.wordsense_conjunction = self.graph.get_or_create_conjunction([source, wordsense_rule])
        self.defn_conjunction = self.graph.get_or_create_conjunction([source, sense_define_rule])

    def startElement(self, name, attrs):
        # A <page> starts a new article; a <title> starts its name.
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        # Closing a page hands the accumulated wikitext to handleArticle.
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False

    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        # Parse the article line by line; the part of speech resets at
        # the start of each article.
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)
        if line.startswith('===') and line.endswith('==='):
            # A section heading: either a relation section or a POS.
            pos = line.strip('= ')
            if pos == 'Synonyms':
                self.curRelation = 'Synonym'
            elif pos == 'Antonym':
                self.curRelation = 'Antonym'
            elif pos == 'Related terms':
                self.curRelation = 'ConceptuallyRelatedTo'
            elif pos == 'Derived terms':
                if not line.startswith('===='):
                    # this is at the same level as the part of speech;
                    # now we don't know what POS these apply to
                    self.pos = None
                self.curRelation = 'DerivedFrom'
            else:
                self.curRelation = None
                if pos in PARTS_OF_SPEECH:
                    self.pos = PARTS_OF_SPEECH[pos]
        elif language_match:
            self.lang = language_match.group(1)
            self.langcode = LANGUAGES.get(self.lang)
        elif chinese_match:
            # Chinese script tag: 's' = simplified (_CN), 't' = traditional (_TW).
            scripttag = chinese_match.group(2)
            self.locales = []
            if 's' in scripttag:
                self.locales.append('_CN')
            if 't' in scripttag:
                self.locales.append('_TW')
        elif line[0:1] == '#' and self.lang != 'English' and self.lang is not None:
            # A definition line inside a foreign-language section: its
            # English gloss is treated as a translation of the title.
            defn = line[1:].strip()
            if defn[0:1] not in ':*#':
                for defn2 in filter_line(defn):
                    if not ascii_enough(defn2):
                        continue
                    if 'Index:' in title:
                        continue
                    if self.langcode == 'zh':
                        for locale in self.locales:
                            self.output_translation(title, defn2, locale)
                    elif self.langcode:
                        self.output_translation(title, defn2)
        elif line[0:4] == '----':
            # Horizontal rule separates language sections; reset state.
            self.pos = None
            self.lang = None
            self.langcode = None
            self.curRelation = None
        elif trans_top_match:
            # {{trans-top|...}} opens a translation table for one sense.
            pos = self.pos or 'n'
            sense = trans_top_match.group(1).split(';')[0].strip('.')
            if 'translations' in sense.lower():
                self.curSense = None
            else:
                self.curSense = pos+'/'+sense
                if self.lang == 'English':
                    self.output_sense(title, self.curSense)
        elif trans_tag_match:
            lang = trans_tag_match.group(1)
            translation = trans_tag_match.group(2)
            if self.curSense is not None and self.lang == 'English':
                # handle Chinese separately
                if lang not in ('cmn', 'yue', 'zh-yue', 'zh'):
                    self.output_sense_translation(lang, translation, title,
                                                  self.curSense)
        elif '{{trans-bottom}}' in line:
            self.curSense = None
        elif line.startswith('* ') and self.curRelation and self.langcode:
            relatedmatch = WIKILINK.search(line)
            if relatedmatch:
                related = relatedmatch.group(1)
                self.output_monolingual(self.langcode, self.curRelation,
                                        related, title)

    def output_monolingual(self, lang, relation, term1, term2):
        """Write a same-language assertion such as Synonym or Antonym."""
        if 'Wik' in term1 or 'Wik' in term2:
            # skip terms that look like wiki cross-references
            return
        source = self.graph.get_or_create_concept(lang, term1)
        if self.pos:
            target = self.graph.get_or_create_concept(lang, term2, self.pos)
        else:
            target = self.graph.get_or_create_concept(lang, term2)
        relation = self.graph.get_or_create_relation(relation)
        assertion = self.graph.get_or_create_assertion(
            relation, [source, target],
            {'dataset': 'wiktionary/en/%s' % lang,
             'license': 'CC-By-SA', 'normalized': False}
        )
        self.graph.justify(self.monolingual_conjunction, assertion)
        print unicode(assertion).encode('utf-8')

    def output_sense_translation(self, lang, foreign, english, disambiguation):
        """Write a TranslationOf assertion from a translation table entry."""
        if 'Wik' in foreign or 'Wik' in english:
            # skip terms that look like wiki cross-references
            return
        # map the wiki-style Chinese codes onto locale-qualified codes
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = self.graph.get_or_create_concept(
            lang, unicodedata.normalize('NFKC', foreign)
        )
        target = self.graph.get_or_create_concept(
            'en', english, disambiguation
        )
        relation = self.graph.get_or_create_relation(
            'TranslationOf'
        )
        assertion = self.graph.get_or_create_assertion(
            relation, [source, target],
            {'dataset': 'wiktionary/en/%s' % lang,
             'license': 'CC-By-SA', 'normalized': False}
        )
        self.graph.justify(self.conjunction, assertion)

    def output_sense(self, english, disambiguation):
        """Define an English word sense by its disambiguation gloss."""
        source = self.graph.get_or_create_concept(
            'en', english, disambiguation
        )
        # disambiguation is "<pos>/<gloss>"; [2:] drops the "<pos>/"
        # prefix — assumes a single-character POS code; TODO confirm.
        definition = self.graph.get_or_create_concept(
            'en', disambiguation[2:]
        )
        definition_norm = self.graph.get_or_create_concept(
            'en', english_normalize(disambiguation[2:])
        )
        relation = self.graph.get_or_create_relation(
            'DefinedAs'
        )
        assertion = self.graph.get_or_create_assertion(
            relation, [source, definition],
            {'dataset': 'wiktionary/en/en',
             'license': 'CC-By-SA', 'normalized': False}
        )
        norm_assertion = self.graph.get_or_create_assertion(
            relation, [source, definition_norm],
            {'dataset': 'wiktionary/en/en',
             'license': 'CC-By-SA', 'normalized': True}
        )
        self.graph.justify(self.defn_conjunction, assertion)
        self.graph.derive_normalized(assertion, norm_assertion)

    def output_translation(self, foreign, english, locale=''):
        """Write a TranslationOf assertion from a definition line, plus a
        normalized-English variant derived from it."""
        source = self.graph.get_or_create_concept(
            self.langcode+locale,
            unicodedata.normalize('NFKC', foreign)
        )
        target = self.graph.get_or_create_concept(
            'en', english
        )
        relation = self.graph.get_or_create_relation(
            'TranslationOf'
        )
        assertion = self.graph.get_or_create_assertion(
            relation, [source, target],
            {'dataset': 'wiktionary/en/%s' % self.langcode,
             'license': 'CC-By-SA', 'normalized': False}
        )
        target_normal = self.graph.get_or_create_concept(
            'en', english_normalize(english)
        )
        # NOTE(review): dataset here is 'wiktionary/%s' while the raw
        # assertion above uses 'wiktionary/en/%s' — possibly intentional,
        # possibly a typo; confirm against the dataset naming convention.
        assertion_normal = self.graph.get_or_create_assertion(
            relation, [source, target_normal],
            {'dataset': 'wiktionary/%s' % self.langcode,
             'license': 'CC-By-SA', 'normalized': True}
        )
        self.graph.justify(self.conjunction, assertion)
        self.graph.derive_normalized(assertion, assertion_normal)
""" Get data from DBPedia. """ __author__ = 'Justin Venezuela ([email protected]), Rob Speer ([email protected])' from conceptnet5.graph import JSONWriterGraph from conceptnet5.english_nlp import normalize_topic, un_camel_case import urllib import urllib2 GRAPH = JSONWriterGraph('json_data/dbpedia_data') DBPEDIA_SOURCE = GRAPH.get_or_create_node('/source/web/dbpedia.org') GRAPH.justify('/', DBPEDIA_SOURCE) TYPE_ASSERTION_PROPERTIES = { 'dataset':u'dbpedia', 'license':u'CC-By-SA', 'normalized':'False' } NORM_ASSERTION_PROPERTIES = { 'dataset':u'dbpedia', 'license':u'CC-By-SA', 'normalized':'False' } VERBOSE = True def show_message(message): if VERBOSE: print message
import re
from conceptnet5.graph import JSONWriterGraph
from conceptnet5.english_nlp import normalize
from pymongo import Connection
from types import *

# Graph writer plus the provenance nodes that justify GoalNet assertions.
GRAPH = JSONWriterGraph('json_data/goalnet')
goalnet = GRAPH.get_or_create_node(u'/source/rule/goalnet')
GRAPH.justify(0, goalnet)
wikihow = GRAPH.get_or_create_node(u'/source/web/www.wikihow.com')
omics = GRAPH.get_or_create_node(u'/source/activity/omics')
GRAPH.justify(0, wikihow)
GRAPH.justify(0, omics)

def output_steps(goal, steps, source):
    """Record a HasSteps assertion linking *goal* to its ordered *steps*.

    *source* selects the provenance conjunction: 'wikihow' assertions are
    justified with weight 0.8, 'omics' ones with the default weight.
    """
    cleaned_goal = normalize(goal).strip()
    cleaned_steps = [normalize(s).strip() for s in steps]
    # First argument is the goal concept, followed by one concept per step.
    concepts = [GRAPH.get_or_create_concept('en', cleaned_goal)]
    concepts.extend(GRAPH.get_or_create_concept('en', s) for s in cleaned_steps)
    assertion = GRAPH.get_or_create_assertion(
        '/relation/HasSteps', concepts,
        {'dataset': 'goalnet/en', 'license': 'CC-By-SA'}
    )
    if source == 'wikihow':
        provenance = GRAPH.get_or_create_conjunction([wikihow, goalnet])
        GRAPH.justify(provenance, assertion, 0.8)
    elif source == 'omics':
        provenance = GRAPH.get_or_create_conjunction([omics, goalnet])
        GRAPH.justify(provenance, assertion)
import re
from conceptnet5.graph import JSONWriterGraph
from conceptnet5.english_nlp import normalize
from pymongo import Connection
from types import *

# Graph writer plus the provenance nodes that justify GoalNet assertions.
GRAPH = JSONWriterGraph('json_data/goalnet')
goalnet = GRAPH.get_or_create_node(u'/source/rule/goalnet')
GRAPH.justify(0, goalnet)
omics = GRAPH.get_or_create_node(u'/source/activity/omics')
GRAPH.justify(0, omics)

def output_steps(goal, steps, source):
    """Record raw and normalized assertions linking *goal* to *steps*.

    The raw form wraps the unnormalized steps in a /relation/Sequence
    assertion, then asserts goal HasSteps that sequence.
    """
    # add raw assertions
    args = []
    for step in steps:
        args.append(GRAPH.get_or_create_concept('en', step))
    raw_sequence = GRAPH.get_or_create_assertion(
        '/relation/Sequence', args,
        {'dataset': 'goalnet/en', 'license': 'CC-By-SA'}
    )
    # The HasSteps assertion takes the goal concept and the whole
    # sequence assertion as its two arguments.
    args = [GRAPH.get_or_create_concept('en', goal)]
    args.append(raw_sequence)
    raw_assertion = GRAPH.get_or_create_assertion(
        '/relation/HasSteps', args,
        {'dataset': 'goalnet/en', 'license': 'CC-By-SA'}
    )
    # add assertions
    args = []
    goal = normalize(goal).strip().lower()
    # NOTE(review): the normalized-assertion half of this function
    # continues beyond this chunk of the file.
# Running statistics for the Verbosity filtering pass.
maxscore = 0
count = 0
skipcount = 0
counts = defaultdict(int)
text_similarities = []

# One output stream per disposition an assertion can receive.
flag_out = open("output/flagged_assertions.txt", "w")
similar_out = open("output/text_similarity.txt", "w")
weak_out = open("output/weak_assertions.txt", "w")
good_out = open("output/ok_assertions.txt", "w")

# Graph output is optional; it is only created when make_json is set
# (make_json is defined elsewhere in this file).
GRAPH = None
context = source = None
if make_json:
    GRAPH = JSONWriterGraph("../json_data/verbosity")
    source = GRAPH.get_or_create_node("/source/site/verbosity")
    context = GRAPH.get_or_create_node("/context/General")
    GRAPH.justify(0, source)

# Each line of verbosity.txt is tab-separated:
# left, relation, right, frequency, order score.
for line in open("verbosity.txt"):
    if skipcount > 0:
        skipcount -= 1
        continue
    parts = line.strip().split("\t")
    if not parts:
        counts["blank"] += 1
        continue
    left, relation, right, freq, orderscore = parts[:5]

    # catch bad stuff
    flagged = False
    # NOTE(review): the loop body continues beyond this chunk of the file.
""" Parse the ReVerb dataset and put assertions to ConceptNet 5 """ from conceptnet5.graph import JSONWriterGraph from conceptnet5.english_nlp import normalize, normalize_topic, tokenize, untokenize from urlparse import urlparse import urllib import codecs import nltk import os import re GRAPH = JSONWriterGraph('json_data/reverb') reverb = GRAPH.get_or_create_node(u'/source/rule/reverb') GRAPH.justify('/', reverb, 0.5) reverb_object = GRAPH.get_or_create_node(u'/source/rule/extract_reverb_objects') #reverb_prep = GRAPH.get_or_create_node(u'/source/rule/extract_reverb_prepositions') reverb_triple = GRAPH.get_or_create_node(u'/source/rule/reverb_present_tense_triples') wikipedia = GRAPH.get_or_create_node(u'/source/web/en.wikipedia.org') GRAPH.justify('/', reverb_object, 0.2) GRAPH.justify('/', reverb_triple, 0.5) GRAPH.justify('/', wikipedia) TYPE_WORDS = ('type', 'kind', 'sort', 'variety', 'one') # Search for non-namespaced Wikipedia sources. WIKIPEDIA_SOURCE = re.compile(r'(http://en.wikipedia.org/wiki/([^:]|:_)+)(\||$)') def normalize_rel(text): parts = normalize(text).split()
# Running statistics for the Verbosity filtering pass.
maxscore = 0
count = 0
skipcount = 0
counts = defaultdict(int)
text_similarities = []

# One output stream per disposition an assertion can receive.
flag_out = open('output/flagged_assertions.txt', 'w')
similar_out = open('output/text_similarity.txt', 'w')
weak_out = open('output/weak_assertions.txt', 'w')
good_out = open('output/ok_assertions.txt', 'w')

# Graph output is optional; it is only created when make_json is set
# (make_json is defined elsewhere in this file).
GRAPH = None
context = source = None
if make_json:
    GRAPH = JSONWriterGraph('../json_data/verbosity')
    source = GRAPH.get_or_create_node('/source/site/verbosity')
    context = GRAPH.get_or_create_node('/context/General')
    GRAPH.justify(0, source)

# Each line of verbosity.txt is tab-separated:
# left, relation, right, frequency, order score.
for line in open('verbosity.txt'):
    if skipcount > 0:
        skipcount -= 1
        continue
    parts = line.strip().split('\t')
    if not parts:
        counts['blank'] += 1
        continue
    left, relation, right, freq, orderscore = parts[:5]

    # catch bad stuff
    flagged = False
    # NOTE(review): the loop body continues beyond this chunk of the file.
# break # if disambig is None: # disambig = glossary[synset] #if disambig is None: # disambig = '*' node = ('en', synset_name, pos+'/'+disambig) if synset not in mapping: mapping[synset] = node #print "%s -> %s" % (synset_name, node) # Map senses to the same nodes. for sense, synset in sense_synsets.items(): mapping[sense] = mapping[synset] GRAPH = JSONWriterGraph('../json_data/wordnet') source = GRAPH.get_or_create_node('/source/wordnet/3.0') GRAPH.justify('/', source, 10) for line in chain( open('wordnet-attribute.ttl'), open('wordnet-causes.ttl'), open('wordnet-classifiedby.ttl'), open('wordnet-entailment.ttl'), open('wordnet-hyponym.ttl'), open('wordnet-instances.ttl'), open('wordnet-membermeronym.ttl'), open('wordnet-partmeronym.ttl'), open('wordnet-sameverbgroupas.ttl'), open('wordnet-similarity.ttl'), open('wordnet-substancemeronym.ttl'), open('full/wordnet-antonym.ttl'),
class FindTranslations(ContentHandler): def __init__(self): self.lang = None self.langcode = None self.inArticle = False self.inTitle = False self.curSense = None self.curTitle = "" self.curText = "" self.locales = [] self.curRelation = None self.graph = JSONWriterGraph("../json_data/wiktionary_all") source = self.graph.get_or_create_node("/source/web/en.wiktionary.org") rule = self.graph.get_or_create_node("/source/rule/wiktionary_interlingual_definitions") monolingual_rule = self.graph.get_or_create_node("/source/rule/wiktionary_monolingual_definitions") wordsense_rule = self.graph.get_or_create_node("/source/rule/wiktionary_translation_tables") sense_define_rule = self.graph.get_or_create_node("/source/rule/wiktionary_define_senses") self.graph.justify("/", source) self.graph.justify("/", rule) self.graph.justify("/", monolingual_rule) self.graph.justify("/", wordsense_rule) self.graph.justify("/", sense_define_rule) self.conjunction = self.graph.get_or_create_conjunction([source, rule]) self.monolingual_conjunction = self.graph.get_or_create_conjunction([source, monolingual_rule]) self.wordsense_conjunction = self.graph.get_or_create_conjunction([source, wordsense_rule]) self.defn_conjunction = self.graph.get_or_create_conjunction([source, sense_define_rule]) def startElement(self, name, attrs): if name == "page": self.inArticle = True self.curText = [] elif name == "title": self.inTitle = True self.curTitle = "" def endElement(self, name): if name == "page": self.inArticle = False self.handleArticle(self.curTitle, "".join(self.curText)) elif name == "title": self.inTitle = False def characters(self, text): if self.inTitle: self.curTitle += text elif self.inArticle: self.curText.append(text) if len(self.curText) > 10000: # bail out self.inArticle = False def handleArticle(self, title, text): lines = text.split("\n") for line in lines: self.handleLine(title, line.strip()) def handleLine(self, title, line): language_match = LANGUAGE_HEADER.match(line) 
trans_top_match = TRANS_TOP.match(line) trans_tag_match = TRANS_TAG.search(line) chinese_match = CHINESE_TAG.search(line) if line.startswith("===") and line.endswith("==="): pos = line.strip("= ") if pos == "Synonyms": self.curRelation = "Synonym" elif pos == "Antonym": self.curRelation = "Antonym" elif pos == "Related terms": self.curRelation = "ConceptuallyRelatedTo" elif pos == "Derived terms": self.curRelation = "DerivedFrom" else: self.curRelation = None if pos in PARTS_OF_SPEECH: self.pos = PARTS_OF_SPEECH[pos] elif language_match: self.lang = language_match.group(1) self.langcode = LANGUAGES.get(self.lang) elif chinese_match: scripttag = chinese_match.group(2) self.locales = [] if "s" in scripttag: self.locales.append("_CN") if "t" in scripttag: self.locales.append("_TW") elif line[0:1] == "#" and self.lang != "English" and self.lang is not None: defn = line[1:].strip() if defn[0:1] not in ":*#": for defn2 in filter_line(defn): if not ascii_enough(defn2): continue if "Index:" in title: continue if self.langcode == "zh": for locale in self.locales: self.output_translation(title, defn2, locale) elif self.langcode: self.output_translation(title, defn2) elif line[0:4] == "----": self.pos = None self.lang = None self.langcode = None self.curRelation = None elif trans_top_match: pos = self.pos or "n" sense = trans_top_match.group(1).split(";")[0].strip(".") if "translations" in sense.lower(): self.curSense = None else: self.curSense = pos + "/" + sense if self.lang == "English": self.output_sense(title, self.curSense) elif trans_tag_match: lang = trans_tag_match.group(1) translation = trans_tag_match.group(2) if self.curSense is not None and self.lang == "English": # handle Chinese separately if lang not in ("cmn", "yue", "zh-yue", "zh"): self.output_sense_translation(lang, translation, title, self.curSense) elif "{{trans-bottom}}" in line: self.curSense = None elif line.startswith("* ") and self.curRelation and self.langcode: relatedmatch = WIKILINK.search(line) 
if relatedmatch: related = relatedmatch.group(1) self.output_monolingual(self.langcode, self.curRelation, related, title) def output_monolingual(self, lang, relation, term1, term2): if "Wik" in term1 or "Wik" in term2: return source = self.graph.get_or_create_concept(lang, term1) target = self.graph.get_or_create_concept(lang, term2) relation = self.graph.get_or_create_relation(relation) assertion = self.graph.get_or_create_assertion( relation, [source, target], {"dataset": "wiktionary/en/%s" % lang, "license": "CC-By-SA", "normalized": False}, ) self.graph.justify(self.monolingual_conjunction, assertion) def output_sense_translation(self, lang, foreign, english, disambiguation): if lang == "zh-cn": lang = "zh_CN" elif lang == "zh-tw": lang = "zh_TW" source = self.graph.get_or_create_concept(lang, unicodedata.normalize("NFKC", foreign)) target = self.graph.get_or_create_concept("en", english, disambiguation) relation = self.graph.get_or_create_relation("TranslationOf") assertion = self.graph.get_or_create_assertion( relation, [source, target], {"dataset": "wiktionary/en/%s" % lang, "license": "CC-By-SA", "normalized": False}, ) self.graph.justify(self.conjunction, assertion) def output_sense(self, english, disambiguation): source = self.graph.get_or_create_concept("en", english, disambiguation) definition = self.graph.get_or_create_concept("en", disambiguation[2:]) definition_norm = self.graph.get_or_create_concept("en", english_normalize(disambiguation[2:])) relation = self.graph.get_or_create_relation("DefinedAs") assertion = self.graph.get_or_create_assertion( relation, [source, definition], {"dataset": "wiktionary/en/en", "license": "CC-By-SA", "normalized": False} ) norm_assertion = self.graph.get_or_create_assertion( relation, [source, definition_norm], {"dataset": "wiktionary/en/en", "license": "CC-By-SA", "normalized": True}, ) self.graph.justify(self.defn_conjunction, assertion) self.graph.derive_normalized(assertion, norm_assertion) print 
assertion.encode("utf-8") def output_translation(self, foreign, english, locale=""): source = self.graph.get_or_create_concept(self.langcode + locale, unicodedata.normalize("NFKC", foreign)) target = self.graph.get_or_create_concept("en", english) relation = self.graph.get_or_create_relation("TranslationOf") assertion = self.graph.get_or_create_assertion( relation, [source, target], {"dataset": "wiktionary/en/%s" % self.langcode, "license": "CC-By-SA", "normalized": False}, ) target_normal = self.graph.get_or_create_concept("en", english_normalize(english)) assertion_normal = self.graph.get_or_create_assertion( relation, [source, target_normal], {"dataset": "wiktionary/%s" % self.langcode, "license": "CC-By-SA", "normalized": True}, ) self.graph.justify(self.conjunction, assertion) self.graph.derive_normalized(assertion, assertion_normal)
""" Parse the ReVerb dataset and put assertions to ConceptNet 5 """ from conceptnet5.graph import JSONWriterGraph from conceptnet5.english_nlp import normalize, normalize_topic, tokenize, untokenize from urlparse import urlparse import urllib import codecs import nltk import os import re GRAPH = JSONWriterGraph('json_data/reverb') reverb = GRAPH.get_or_create_node(u'/source/rule/reverb') GRAPH.justify('/', reverb, 0.5) reverb_object = GRAPH.get_or_create_node( u'/source/rule/extract_reverb_objects') #reverb_prep = GRAPH.get_or_create_node(u'/source/rule/extract_reverb_prepositions') reverb_triple = GRAPH.get_or_create_node( u'/source/rule/reverb_present_tense_triples') wikipedia = GRAPH.get_or_create_node(u'/source/web/en.wikipedia.org') GRAPH.justify('/', reverb_object, 0.2) GRAPH.justify('/', reverb_triple, 0.5) GRAPH.justify('/', wikipedia) TYPE_WORDS = ('type', 'kind', 'sort', 'variety', 'one') # Search for non-namespaced Wikipedia sources. WIKIPEDIA_SOURCE = re.compile( r'(http://en.wikipedia.org/wiki/([^:]|:_)+)(\||$)')