def handle_triple(line):
    items = line.split()
    for i in xrange(3):
        if not (items[i].startswith('<') and items[i].endswith('>')):
            return
        items[i] = items[i][1:-1]
    subj, pred, obj = items[:3]

    # Skip triples that are known to produce junk edges.
    if ('foaf/0.1/homepage' in pred or obj == 'work' or '_Feature' in obj
            or '#Thing' in obj or '__' in subj or '__' in obj
            or 'List_of' in subj or 'List_of' in obj):
        return

    subj_concept = make_concept_uri(translate_wp_url(subj), 'en')
    obj_concept = make_concept_uri(translate_wp_url(obj), 'en')
    webrel = map_web_relation(pred)
    if webrel is None:
        return
    rel = normalize_uri('/r/' + webrel)

    # Record each Semantic Web mapping only once.
    if (pred, rel) not in sw_map_used:
        sw_map_used.add((pred, rel))
        sw_map.write({'from': pred, 'to': rel})
    if (subj, subj_concept) not in sw_map_used:
        sw_map_used.add((subj, subj_concept))
        sw_map.write({'from': subj, 'to': subj_concept})
    if (obj, obj_concept) not in sw_map_used:
        sw_map_used.add((obj, obj_concept))
        sw_map.write({'from': obj, 'to': obj_concept})

    edge = make_edge(rel, subj_concept, obj_concept,
                     dataset='/d/dbpedia/en',
                     license='/l/CC/By-SA',
                     sources=['/s/dbpedia/3.7'],
                     context='/ctx/all',
                     weight=0.5)
    writer.write(edge)

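# translate_wp_url and map_web_relation are defined elsewhere. Minimal
# sketches of what handle_triple assumes: turning a DBPedia resource URL into
# readable text, and mapping a predicate URL onto a ConceptNet relation name
# or None. The names are real, but these bodies (and the one mapping entry)
# are illustrative assumptions, not the original implementations.

import urllib

def translate_wp_url(url):
    """Convert a DBPedia/Wikipedia resource URL into a readable title."""
    title = url.split('/')[-1]
    return urllib.unquote(title).replace('_', ' ')

def map_web_relation(pred):
    """Map a Semantic Web predicate URL to a ConceptNet relation name,
    or return None for predicates this reader does not handle."""
    name = pred.split('#')[-1].split('/')[-1]
    if name == 'type':          # e.g. rdf-syntax-ns#type
        return 'IsA'
    return None
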
def output_edge(obj, writer):
    objsource = obj['sources'][0]
    # If an argument begins or ends with the source title, collapse it to
    # the title itself.
    if obj['arg1'].startswith(objsource):
        obj['arg1'] = objsource
    if obj['arg2'].startswith(objsource):
        obj['arg2'] = objsource
    if obj['arg1'].endswith(objsource):
        obj['arg1'] = objsource
    if obj['arg2'].endswith(objsource):
        obj['arg2'] = objsource
    start = make_concept_uri(obj['arg1'], 'en')
    end = make_concept_uri(obj['arg2'], 'en')
    if obj['rel'][0] in string.uppercase:
        # Capitalized relations are already ConceptNet relation names.
        rel = '/r/' + obj['rel']
    else:
        rel = make_concept_uri(obj['rel'], 'en')
    # Skip deictic arguments such as "this house" or "these people".
    if (start.startswith('/c/en/this_') or start.startswith('/c/en/these_')
            or end.startswith('/c/en/this_') or end.startswith('/c/en/these_')):
        return
    context = make_concept_uri(objsource, 'en')
    source = "/s/web/en.wikipedia.org/wiki/%s" % (objsource.replace(' ', '_'))
    rules = ['/s/rule/reverb', '/s/rule/reverb_filter_apr2012']
    surfaceText = u"[[%s]] %s [[%s]]" % (obj['arg1'],
                                         obj.get('surfaceRel', obj['rel']),
                                         obj['arg2'])
    weight = float(obj['weight']) ** 3 / 2
    edge = make_edge(rel, start, end,
                     dataset='/d/reverb/wp_frontpage',
                     license='/l/CC/By-SA',
                     sources=[source] + rules,
                     context=context,
                     surfaceText=surfaceText,
                     weight=weight)
    writer.write(edge)

def output_sense_translation(self, lang, foreign, german, disambiguation):
    if 'Wik' in foreign or 'Wik' in german:
        return
    if lang == 'zh-cn':
        lang = 'zh_CN'
    elif lang == 'zh-tw':
        lang = 'zh_TW'
    source = make_concept_uri(unicodedata.normalize('NFKC', foreign), lang)
    target = make_concept_uri(german, 'de', disambiguation)
    relation = '/r/TranslationOf'
    try:
        surfaceRel = "is %s for" % (langs.english_name(lang))
    except KeyError:
        surfaceRel = "is [language %s] for" % lang
    # The original referenced an undefined `english` here; the German
    # headword is the translation target in this variant.
    surfaceText = "[[%s]] %s [[%s (%s)]]" % (
        foreign, surfaceRel, german,
        disambiguation.split('/')[-1].replace('_', ' '))
    # print surfaceText
    edge = make_edge(relation, source, target,
                     '/d/wiktionary/en/%s' % lang,
                     license='/l/CC/By-SA',
                     sources=[SOURCE, TRANSLATE],
                     context='/ctx/all',
                     weight=1.5,
                     surfaceText=surfaceText)
    self.writer.write(edge)

def output_sense_translation(self, lang, foreign, english, disambiguation):
    if 'Wik' in foreign or 'Wik' in english:
        return
    if lang == 'zh-cn':
        lang = 'zh_CN'
    elif lang == 'zh-tw':
        lang = 'zh_TW'
    source = make_concept_uri(unicodedata.normalize('NFKC', foreign), lang)
    target = make_concept_uri(english, 'en', disambiguation)
    relation = '/r/TranslationOf'
    try:
        surfaceRel = "is %s for" % (langs.english_name(lang))
    except KeyError:
        surfaceRel = "is [language %s] for" % lang
    surfaceText = "[[%s]] %s [[%s (%s)]]" % (
        foreign, surfaceRel, english,
        disambiguation.split('/')[-1].replace('_', ' '))
    print surfaceText
    edge = make_edge(relation, source, target,
                     '/d/wiktionary/en/%s' % lang,
                     license='/l/CC/By-SA',
                     sources=[SOURCE, TRANSLATE],
                     context='/ctx/all',
                     weight=1.5,
                     surfaceText=surfaceText)
    self.writer.write(edge)

def read_jmdict(filename, outfilename):
    file = open(filename)
    outfile = codecs.open(outfilename, 'w', encoding='utf-8')
    data = file.read().decode('utf-8')
    file.close()
    xml = xmltodict.parse(data)
    entries = xml['JMdict']['entry']
    for entry in entries:
        # Prefer kanji headwords; fall back to kana readings.
        headwords = [word['keb'] for word in get_list(entry, 'k_ele')]
        if not headwords:
            headwords = [word['reb'] for word in get_list(entry, 'r_ele')]
        for sense in get_list(entry, 'sense'):
            pos = get_one(sense, 'pos')
            glosses = get_list(sense, 'gloss') + get_list(sense, 'lsource')
            for gloss in glosses:
                if '#text' in gloss:
                    text = parse_gloss(gloss['#text'])
                    # Skip glosses that are full sentences or abbreviations.
                    if '.' not in text:
                        lang = convert_lang_code(gloss['@xml:lang'])
                        for head in headwords:
                            ja_concept = make_concept_uri(head, 'ja')
                            other_concept = make_concept_uri(text, lang)
                            if len(other_concept.split('_')) <= 5:
                                output_edge(outfile, ja_concept, other_concept)
    outfile.close()

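# `get_list` and `get_one` are not defined in this file. A minimal sketch of
# what read_jmdict assumes, given that xmltodict returns a single dict when
# an element appears once and a list when it repeats. The names match the
# calls above, but these bodies are assumptions, not the original helpers.

def get_list(node, tag):
    """Return node[tag] as a list, whether it held one item or many."""
    stuff = node.get(tag, [])
    if isinstance(stuff, list):
        return stuff
    return [stuff]

def get_one(node, tag):
    """Return the first value under `tag`, or None if there is none."""
    stuff = get_list(node, tag)
    if stuff:
        return stuff[0]
    return None
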
def handle_raw_assertion(raw_assertion):
    edges = []
    assertion, users = raw_assertion
    frame_id, concept1, concept2 = assertion
    frame = Frame.objects.get(id=int(frame_id))
    ftext = frame.text
    relation = frame.relation.name
    rel = '/r/' + relation
    surfaceText = ftext.replace(u'{1}', u'[[' + concept1 + u']]').replace(
        u'{2}', u'[[' + concept2 + u']]')
    start = make_concept_uri(concept1, 'zh_TW')
    end = make_concept_uri(concept2, 'zh_TW')
    sources = ['/s/activity/ptt/petgame']
    for user in users:
        sources.append('/s/contributor/petgame/' + user)
    edge = make_edge(rel, start, end,
                     dataset='/d/conceptnet/4/zh',
                     license='/l/CC/By',
                     sources=sources,
                     surfaceText=surfaceText,
                     weight=len(users))
    edges.append(edge)
    return edges

def handle_raw_assertion(raw_assertion):
    line = raw_assertion.strip()
    edges = []
    if line:
        parts = line.split(', ')
        user, frame_id, concept1, concept2 = parts
        frame = Frame.objects.get(id=int(frame_id))
        ftext = frame.text
        relation = frame.relation.name
        rel = '/r/' + relation
        surfaceText = ftext.replace(u'{1}', u'[[' + concept1 + u']]').replace(
            u'{2}', u'[[' + concept2 + u']]')
        start = make_concept_uri(concept1, 'zh_TW')
        end = make_concept_uri(concept2, 'zh_TW')
        sources = ['/s/contributor/petgame/' + user,
                   '/s/activity/ntt/petgame']
        edge = make_edge(rel, start, end,
                         dataset='/d/conceptnet/4/zh',
                         license='/l/CC/By',
                         sources=sources,
                         surfaceText=surfaceText,
                         weight=1)
        edges.append(edge)
    return edges

def handle_raw_assertion(raw, writer):
    try:
        lang = raw.language_id
        assert lang == 'ja'
        if raw.frame.goodness < 1:
            return
        polarity = raw.frame.frequency.value
        activity = raw.sentence.activity.name
        if 'rubycommons' in activity:
            return

        # Build the assertion.
        frame_text = raw.frame.text
        frame_text = frame_text.replace('{1}', '[[%s]]' % raw.text1).replace(
            '{2}', '[[%s]]' % raw.text2)
        activity_node = normalize_uri(u'/s/site/nadya.jp')
        startText = ' '.join(JA.normalize_list(raw.text1))
        endText = ' '.join(JA.normalize_list(raw.text2))
        if startText != raw.text1:
            print raw.text1.encode('utf-8'), '=>', startText.encode('utf-8')
        relname = raw.frame.relation.name
        if relname == 'ConceptuallyRelatedTo':
            relname = 'RelatedTo'
        if polarity > 0:
            relation = normalize_uri('/r/' + relname)
        else:
            relation = normalize_uri('/r/Not' + relname)
        dataset = normalize_uri('/d/nadya.jp')
        score = raw.score
        sources = [([activity_node], score / 5.)]
        for source_list, weight in sources:
            if 'commons2_reject' in ' '.join(source_list):
                weight = -1
            start = make_concept_uri(startText, lang)
            end = make_concept_uri(endText, lang)
            edge = make_edge(relation, start, end, dataset, LICENSE,
                             source_list, '/ctx/all', frame_text,
                             weight=weight)
            writer.write(edge)
    except Exception:
        import traceback
        traceback.print_exc()

def handle_raw_assertion(line):
    if not line:
        return
    parts = line.split(', ')
    user, frame_id, concept1, concept2 = parts
    fdata = FRAME_DATA[frame_id]
    ftext = fdata['text']
    rel = fdata['relation']
    surfaceText = ftext.replace(u'{1}', u'[[' + concept1 + u']]').replace(
        u'{2}', u'[[' + concept2 + u']]')
    start = make_concept_uri(concept1, 'zh_TW')
    end = make_concept_uri(concept2, 'zh_TW')
    sources = ['/s/activity/ptt/petgame', '/s/contributor/petgame/' + user]
    edge = make_edge(rel, start, end,
                     dataset='/d/conceptnet/4/zh',
                     license='/l/CC/By',
                     sources=sources,
                     surfaceText=surfaceText,
                     weight=1)
    yield json.dumps(edge, ensure_ascii=False)

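# FRAME_DATA is loaded elsewhere. A hypothetical illustration of the shape
# this function expects, with the text and relation holding the full frame
# template and relation URI (the entry below is made up):
#
# FRAME_DATA = {
#     '21': {'text': u'{1} \u662f {2}', 'relation': '/r/IsA'},
# }
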
def output_monolingual(self, lang, relation, term1, term2):
    if 'Wik' in term1 or 'Wik' in term2:
        return
    source = make_concept_uri(term1, lang)
    if self.pos:
        target = make_concept_uri(term2, lang, self.pos)
    else:
        target = make_concept_uri(term2, lang)
    surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
    # print surfaceText
    edge = make_edge('/r/' + relation, source, target,
                     '/d/wiktionary/%s/%s' % (lang, lang),
                     license='/l/CC/By-SA',
                     sources=[SOURCE, MONOLINGUAL],
                     context='/ctx/all',
                     weight=1.5,
                     surfaceText=surfaceText)
    self.writer.write(edge)

def make_concept_uri_safe(term, lang, disambiguation=None):
    if term is None:
        raise ValueError('term must not be None')
    if lang is None:
        raise ValueError('lang must not be None')
    # Strip wikitext link aliases and section anchors before building a URI.
    if '|' in term:
        term = term.split('|')[0]
    if '#' in term:
        term = term.split('#')[0]
    return make_concept_uri(term, lang, disambiguation)

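# Illustrative use of make_concept_uri_safe, assuming make_concept_uri builds
# ConceptNet-style /c/<lang>/<term> URIs; the wikitext-flavored inputs below
# are made-up examples:
#
# make_concept_uri_safe('dog|hound', 'en')     # -> URI for 'dog'
# make_concept_uri_safe('dog#Etymology', 'en') # -> URI for 'dog'
# make_concept_uri_safe(None, 'en')            # raises ValueError
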
def output_translation(self, foreign, english, locale=''):
    source = make_concept_uri(unicodedata.normalize('NFKC', foreign),
                              self.langcode + locale)
    target = make_concept_uri(english, 'en')
    relation = '/r/TranslationOf'
    try:
        surfaceRel = "is %s for" % (langs.english_name(self.langcode))
    except KeyError:
        surfaceRel = "is [language %s] for" % self.langcode
    surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
    edge = make_edge(relation, source, target,
                     '/d/wiktionary/en/%s' % self.langcode,
                     license='/l/CC/By-SA',
                     sources=[SOURCE, INTERLINGUAL],
                     context='/ctx/all',
                     weight=1.5,
                     surfaceText=surfaceText)
    self.writer.write(edge)

def output_monolingual(self, lang, relation, term1, term2):
    # Skip Wiktionary links and templates.
    if u'ウィク' in term1 or u'ウィク' in term2:
        return
    if u'テンプレート' in term1 or u'テンプレート' in term2:
        return
    if lang in LANGUAGES_3_TO_2:
        # Convert a 3-letter language code to a 2-letter code.
        lang = LANGUAGES_3_TO_2[lang]
    source = make_concept_uri(term1, lang)
    if self.pos:
        target = make_concept_uri(term2, lang, self.pos)
    else:
        target = make_concept_uri(term2, lang)
    surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
    # print surfaceText
    edge = make_edge('/r/' + relation, source, target,
                     '/d/wiktionary/%s/%s' % (lang, lang),
                     license='/l/CC/By-SA',
                     sources=[SOURCE, MONOLINGUAL],
                     context='/ctx/all',
                     weight=1.5,
                     surfaceText=surfaceText)
    self.writer.write(edge)

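# LANGUAGES_3_TO_2 is defined elsewhere. A few entries in the shape this
# function assumes, using standard ISO 639 code pairs:
#
# LANGUAGES_3_TO_2 = {
#     'eng': 'en',
#     'jpn': 'ja',
#     'fra': 'fr',
# }
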
# Fragment: body of the per-line loop in a Verbosity reader variant.
if right.startswith('not '):
    right = right[4:]
    relation = 'it is not'
if relation == 'it is the opposite of':
    relation = 'it is not'
freq = int(freq)
orderscore = int(orderscore)
if relation == 'about the same size as':
    relation = 'it is about the same size as'
elif relation == 'it looks like':
    relation = 'it is related to'
rel = mapping.get(relation)
reltext = relation[3:]
if rel is None:
    rel = make_concept_uri(unicode(reltext), 'en')
text = '[[%s]] %s [[%s]]' % (left, reltext, right)
if relation == 'it is' and (right.startswith('a ')
        or right.startswith('an ') or right.startswith('the ')):
    rel = '/r/IsA'
sls = sounds_like_score(left, right)
text_similarities.append(sls)
if sls > 0.35:
    # print "* %s sounds like %s (%4.4f)" % (left, right, sls)
    counts['text similarity'] += 1
    # sls is a float in [0, 1]; the original '%4.4d' would always print 0000.
    similar_out.write('%4.4f\t%s' % (sls, line))
    continue

def run_wordnet(input_dir, output_file, sw_map_file):
    mapping = {}
    labels = {}
    prefixes = {}
    glossary = {}
    synset_senses = defaultdict(list)
    synset_sense_names = defaultdict(list)
    sense_name_synsets = defaultdict(list)
    sense_synsets = defaultdict(list)

    parts_of_speech = {
        'noun': 'n',
        'verb': 'v',
        'adjective': 'a',
        'adjectivesatellite': 'a',
        'adverb': 'r',
    }

    rel_mapping = {
        'attribute': 'Attribute',
        'causes': 'Causes',
        'classifiedByRegion': 'HasContext',
        'classifiedByUsage': 'HasContext',
        'classifiedByTopic': 'HasContext',
        'entails': 'Entails',
        'hyponymOf': 'IsA',
        'instanceOf': 'InstanceOf',
        'memberMeronymOf': 'MemberOf',
        'partMeronymOf': 'PartOf',
        'sameVerbGroupAs': '******',
        'similarTo': 'SimilarTo',
        'substanceMeronymOf': '~MadeOf',
        'antonymOf': 'Antonym',
        'derivationallyRelated': '~DerivedFrom',
        'pertainsTo': 'PertainsTo',
        'seeAlso': 'RelatedTo',
    }

    def resolve_prefix(entry):
        prefix, name = entry.split(':')
        return prefixes[prefix] + name

    def handle_line(line):
        """
        Get the (subj, pred, obj) parts of a line, unless it's a blank line
        or a prefix definition, in which case return None.
        """
        line = line.decode('utf-8').strip()
        if not line:
            return None
        parts = line.split(None, 2)
        if parts[0] == '@prefix':
            prefix = parts[1].strip(': ')
            value = parts[2].strip('<>. ')
            prefixes[prefix] = value
            return None
        return parts[0], parts[1], parts[2].strip('. ')

    # First, get the human-readable label and gloss for every synset.
    for line in chain(
        open(input_dir + '/wordnet-synset.ttl'),
        open(input_dir + '/full/wordnet-wordsensesandwords.ttl'),
        open(input_dir + '/wordnet-glossary.ttl')
    ):
        parts = handle_line(line)
        if parts is None:
            continue
        if parts[1] == 'rdfs:label':
            subj = resolve_prefix(parts[0])
            obj = parts[2].split('"')[1]
            labels[subj] = obj
        elif parts[1] == 'wn20schema:gloss':
            subj = resolve_prefix(parts[0])
            obj = parts[2].split('"')[1]
            glossary[subj] = obj.split(';')[0]
            while '(' in glossary[subj] and ')' in glossary[subj]:
                glossary[subj] = re.sub(r"\([^)]+\) ?", r"", glossary[subj])

    # Get the list of word senses in each synset, and make a bidirectional
    # mapping.
    for line in open(input_dir + '/full/wordnet-wordsense-synset-relations.ttl'):
        parts = handle_line(line)
        if parts is None:
            continue
        if parts[1] == 'wn20schema:containsWordSense':
            subj = resolve_prefix(parts[0])
            obj = resolve_prefix(parts[2].strip('. '))
            synset_senses[subj].append(obj)
            sense_synsets[obj] = subj
            sense_name = labels[obj]
            synset_sense_names[subj].append(sense_name)
            sense_name_synsets[sense_name].append(subj)

    # Assign every synset a disambiguation name.
    for synset in synset_senses:
        senses = sorted(synset_senses[synset])
        synset_name = labels[synset]
        synset_pos = synset.split('-')[-2]
        pos = parts_of_speech[synset_pos]
        disambig = glossary[synset].replace('/', '_')
        # TODO: take into account domains, etc.
        #
        # if len(sense_name_synsets[synset_name]) > 1:
        #     for sense in senses:
        #         sense_name = labels[sense]
        #         more_synsets = sense_name_synsets[sense_name]
        #         if len(more_synsets) == 1:
        #             disambig = sense_name
        #             break
        #     if disambig is None:
        #         disambig = glossary[synset]
        # if disambig is None:
        #     disambig = '*'
        node = make_concept_uri(synset_name, 'en', pos + '/' + disambig)
        if synset not in mapping:
            mapping[synset] = node
    # Map senses to the same nodes.
    for sense, synset in sense_synsets.items():
        mapping[sense] = mapping[synset]

    sources = ['/s/wordnet/3.0']
    writer = FlatEdgeWriter(output_file)
    sw_map = FlatEdgeWriter(sw_map_file)
    sw_map_used = set()

    for line in chain(
        open(input_dir + '/wordnet-attribute.ttl'),
        open(input_dir + '/wordnet-causes.ttl'),
        open(input_dir + '/wordnet-classifiedby.ttl'),
        open(input_dir + '/wordnet-entailment.ttl'),
        open(input_dir + '/wordnet-hyponym.ttl'),
        open(input_dir + '/wordnet-instances.ttl'),
        open(input_dir + '/wordnet-membermeronym.ttl'),
        open(input_dir + '/wordnet-partmeronym.ttl'),
        open(input_dir + '/wordnet-sameverbgroupas.ttl'),
        open(input_dir + '/wordnet-similarity.ttl'),
        open(input_dir + '/wordnet-substancemeronym.ttl'),
        open(input_dir + '/full/wordnet-antonym.ttl'),
        open(input_dir + '/full/wordnet-derivationallyrelated.ttl'),
        open(input_dir + '/full/wordnet-participleof.ttl'),
        open(input_dir + '/full/wordnet-pertainsto.ttl'),
        open(input_dir + '/full/wordnet-seealso.ttl'),
    ):
        parts = handle_line(line)
        if parts is None:
            continue
        web_subj = resolve_prefix(parts[0])
        web_rel = resolve_prefix(parts[1])
        web_obj = resolve_prefix(parts[2])
        subj = mapping[web_subj]
        obj = mapping[web_obj]
        pred_label = parts[1].split(':')[-1]
        if pred_label in rel_mapping:
            mapped = rel_mapping[pred_label]
            if mapped.startswith('~'):
                # A '~' prefix means the relation runs in the other direction.
                subj, obj = obj, subj
                web_subj, web_obj = web_obj, web_subj
                web_rel = web_rel.replace('meronym', 'holonym')
                mapped = mapped[1:]
            pred = '/r/' + mapped
        else:
            pred = '/r/wordnet/' + pred_label
        if (web_rel, pred) not in sw_map_used:
            sw_map.write({'from': web_rel, 'to': pred})
            sw_map_used.add((web_rel, pred))
        if (web_subj, subj) not in sw_map_used:
            sw_map.write({'from': web_subj, 'to': subj})
            sw_map_used.add((web_subj, subj))
        if (web_obj, obj) not in sw_map_used:
            sw_map.write({'from': web_obj, 'to': obj})
            sw_map_used.add((web_obj, obj))
        edge = make_edge(
            pred, subj, obj, '/d/wordnet/3.0',
            license='/l/CC/By', sources=sources,
            context='/ctx/all', weight=2.0
        )
        writer.write(edge)

    writer.close()
    sw_map.close()

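# FlatEdgeWriter is imported from elsewhere in the codebase. A minimal sketch
# of the interface these readers rely on — write() a dict as one JSON object
# per line, then close() — assuming that behavior rather than quoting the
# real class:

import codecs
import json

class FlatEdgeWriter(object):
    """Write each edge dictionary as one JSON object per line."""

    def __init__(self, filename):
        self.out = codecs.open(filename, 'w', encoding='utf-8')

    def write(self, edge):
        self.out.write(json.dumps(edge, ensure_ascii=False))
        self.out.write(u'\n')

    def close(self):
        self.out.close()
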
def build_start(parts_dict):
    lang = parts_dict['lang']
    startText = parts_dict['startText']
    start = make_concept_uri(startText, lang)
    return start

def build_end(raw_assertion):
    lang = raw_assertion.language_id
    endText = ' '.join(JA.normalize_list(raw_assertion.text2))
    end = make_concept_uri(endText, lang)
    return end

def build_start(raw_assertion):
    lang = raw_assertion.language_id
    startText = raw_assertion.text1
    start = make_concept_uri(startText, lang)
    return start

def build_end(raw_assertion):
    lang = raw_assertion.language_id
    endText = raw_assertion.text2
    end = make_concept_uri(endText, lang)
    return end

def build_start(parts_dict):
    lang = parts_dict['lang']
    startText = ' '.join(JA.normalize_list(parts_dict['startText']))
    start = make_concept_uri(startText, lang)
    return start

def handle_raw_assertion(raw, writer):
    try:
        lang = raw.language_id
        if raw.frame.goodness < 1:
            return
        if lang.startswith('zh'):
            return
        polarity = raw.frame.frequency.value
        activity = raw.sentence.activity.name
        if 'rubycommons' in activity:
            return

        # Build the assertion.
        frame_text = raw.frame.text
        if polarity > 0:
            frame_text = frame_text.replace('{%}', '')
        else:
            frame_text = frame_text.replace('{%}', 'not')
        frame_text = frame_text.replace('{1}', '[[%s]]' % raw.text1).replace(
            '{2}', '[[%s]]' % raw.text2)

        creator_node = normalize_uri(u'/s/contributor/omcs/' + raw.creator.username)
        activity_node = normalize_uri(u'/s/activity/omcs/' + activity)
        startText = raw.text1
        endText = raw.text2
        relname = raw.frame.relation.name
        if relname == 'ConceptuallyRelatedTo':
            relname = 'RelatedTo'
        if polarity > 0:
            relation = normalize_uri('/r/' + relname)
        else:
            relation = normalize_uri('/r/Not' + relname)
        dataset = normalize_uri('/d/conceptnet/4/' + lang)

        sources = [([creator_node, activity_node], 1)]
        for vote in raw.votes.all():
            sources.append(([normalize_uri('/s/contributor/omcs/' + vote.user.username),
                             normalize_uri(u'/s/activity/omcs/vote')],
                            vote.vote))

        for source_list, weight in sources:
            bad = False
            if 'commons2_reject' in ' '.join(source_list):
                weight = -1
            start = make_concept_uri(startText, lang)
            end = make_concept_uri(endText, lang)
            if 'bedume' in ' '.join(source_list):
                for flagged in BEDUME_FLAGGED_CONCEPTS + BEDUME_FLAGGED_PLACES:
                    check = '/' + flagged.replace(' ', '_')
                    if start.endswith(check) or end.endswith(check):
                        bad = True
                        print "flagged:", str(raw)
                        break
            if not bad:
                edge = make_edge(relation, start, end, dataset, LICENSE,
                                 source_list, '/ctx/all', frame_text,
                                 weight=weight)
                writer.write(edge)
    except Exception:
        import traceback
        traceback.print_exc()

def run_verbosity(infile, outfile):
    maxscore = 0
    count = 0
    counts = defaultdict(int)
    text_similarities = []
    sources = ['/s/site/verbosity']
    writer = FlatEdgeWriter(outfile)

    for line in open(infile):
        parts = line.strip().split('\t')
        if not parts:
            counts['blank'] += 1
            continue
        left, relation, right, freq, orderscore = parts[:5]

        # Catch bad stuff.
        flagged = False
        for rword in right.split():
            if bad_regex_no_biscuit.match(rword):
                flagged = True
                break
        if flagged:
            # print "FLAGGED:", right
            counts['flag word'] += 1
            continue
        if len(right) < 3:
            counts['clue too short'] += 1
            continue
        if len(right.split()[-1]) == 1:
            counts['letter'] += 1
            continue
        if (right.startswith('add') or right.startswith('delete')
                or right.startswith('remove')):
            counts['flag word'] += 1
            continue

        freq = int(freq)
        orderscore = int(orderscore)
        rel = '/r/RelatedTo'
        reltext = 'is related to'
        if right.startswith('not '):
            rel = '/r/Antonym'
            right = right[4:]
            reltext = 'is not'
        if relation == 'it is the opposite of':
            rel = '/r/Antonym'
            reltext = 'is the opposite of'

        rightwords = [right]
        if ' ' in right:
            rightwords.extend(right.split(' '))

        sls = sounds_like_score(left, right)
        text_similarities.append(sls)
        if sls > 0.35:
            counts['text similarity'] += 1
            continue

        for i, rightword in enumerate(rightwords):
            edge_sources = list(sources)
            if i > 0:
                edge_sources.append('/s/rule/split_words')
            text = '[[%s]] %s [[%s]]' % (left, reltext, rightword)
            sls = sounds_like_score(left, rightword)
            text_similarities.append(sls)
            if sls > 0.35:
                counts['text similarity'] += 1
                continue
            score = (freq * 2 - 1) * (1000 - orderscore) * (1 - sls) / 1000
            if score <= 0:
                counts['low score'] += 1
                continue
            # weight = math.log(1 + score/10.0) / math.log(2)
            weight = score / 100.0
            count += 1
            counts['success'] += 1
            leftc = make_concept_uri(unicode(left), 'en')
            rightc = make_concept_uri(unicode(rightword), 'en')
            # The original passed `sources` here, leaving edge_sources unused;
            # the split_words rule only takes effect with edge_sources.
            edge = make_edge(rel, leftc, rightc, '/d/verbosity', '/l/CC/By',
                             edge_sources, surfaceText=text, weight=weight)
            writer.write(edge)

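# sounds_like_score is defined elsewhere. A minimal stand-in that captures
# the intent — score how much a clue spells or sounds like the target word,
# so near-echoes can be filtered out — using a simple longest-common-substring
# ratio. This is an assumption, not the original implementation.

def sounds_like_score(target, clue):
    """Return a 0-1 score for how much `clue` looks like `target`."""
    target = target.lower()
    best = 0.0
    for word in clue.lower().split():
        length = min(len(word), len(target))
        if length == 0:
            continue
        # Longest common substring between the clue word and the target.
        longest = 0
        for start in xrange(len(word)):
            for end in xrange(start + longest + 1, len(word) + 1):
                if word[start:end] in target:
                    longest = end - start
                else:
                    break
        best = max(best, float(longest) / length)
    return best
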
import codecs
import os

from conceptnet5.nodes import make_concept_uri
from conceptnet5.edges import make_edge, MultiWriter
# `Frame` is assumed to come from the ConceptNet 4 Django models; this
# snippet does not import it explicitly.

path = "./raw_data/"
sparse_pieces = []
for filename in os.listdir(path):
    if filename.startswith('conceptnet_zh_'):
        print filename
        writer = MultiWriter(filename.split('.')[0])
        for line in codecs.open(path + filename, encoding='utf-8', errors='replace'):
            line = line.strip()
            if line:
                parts = line.split(', ')
                user, frame_id, concept1, concept2 = parts
                frame = Frame.objects.get(id=int(frame_id))
                ftext = frame.text
                relation = frame.relation.name
                rel = '/r/' + relation
                surfaceText = ftext.replace(u'{1}', u'[[' + concept1 + u']]').replace(
                    u'{2}', u'[[' + concept2 + u']]')
                start = make_concept_uri(concept1, 'zh_TW')
                end = make_concept_uri(concept2, 'zh_TW')
                sources = ['/s/contributor/petgame/' + user,
                           '/s/activity/ntt/petgame']
                edge = make_edge(rel, start, end,
                                 dataset='/d/conceptnet/4/zh',
                                 license='/l/CC/By',
                                 sources=sources,
                                 surfaceText=surfaceText,
                                 weight=1)
                writer.write(edge)
        writer.close()

def build_end(parts_dict):
    lang = parts_dict['lang']
    endText = parts_dict['endText']
    end = make_concept_uri(endText, lang)
    return end

def search():
    keyword = request.form.get('keyword')
    lang = request.form.get('language')
    return redirect(site + web_root + make_concept_uri(keyword, lang))

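# A hypothetical Flask wiring for the handler above, assuming `site` and
# `web_root` are configured elsewhere. The form field names come from the
# handler itself, but the route path is a guess for illustration:
#
# from flask import Flask, request, redirect
# app = Flask(__name__)
# app.route('/search', methods=['POST'])(search)
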
def build_end(parts_dict):
    lang = parts_dict['lang']
    endText = ' '.join(JA.normalize_list(parts_dict['endText']))
    end = make_concept_uri(endText, lang)
    return end

def build_start(raw_assertion):
    lang = raw_assertion.language_id
    startText = ' '.join(JA.normalize_list(raw_assertion.text1))
    start = make_concept_uri(startText, lang)
    return start

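# Illustrative call of the parts_dict-based builders above, assuming
# JA.normalize_list tokenizes Japanese text into a list of normalized words
# (the sample strings below are made up):
#
# parts_dict = {'lang': 'ja', 'startText': u'ねこ', 'endText': u'どうぶつ'}
# start = build_start(parts_dict)   # -> a /c/ja/... concept URI
# end = build_end(parts_dict)       # -> a /c/ja/... concept URI
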
def build_from_dir(dirname):
    userdata = yaml.load_all(open(dirname + '/GMUser.yaml'))
    users = {}
    for userinfo in userdata:
        users[userinfo['pk']] = userinfo

    frame_data = yaml.load_all(open(dirname + '/GMFrame.yaml'))
    frames = {}
    for frame in frame_data:
        frames[frame['pk']] = frame['fields']

    assertiondata = yaml.load_all(open(dirname + '/GMAssertion.yaml'))
    assertions = {}
    for assertion in assertiondata:
        obj = assertion['fields']
        frame = frames[obj['frame']]
        frametext = frame['text']
        userinfo = users[obj['author']]
        username = userinfo['fields']['username']
        userlocale = userinfo['fields']['ccode'].lower()
        if userlocale:
            userlocale += '/'
        sources = [
            "/s/contributor/globalmind/%s%s" % (userlocale, username),
            "/s/activity/globalmind/assert"
        ]
        lang = lang_codes[obj['lcode']]
        obj['node1'] = unicode(obj['node1'])
        obj['node2'] = unicode(obj['node2'])
        start = make_concept_uri(obj['node1'], lang)
        end = make_concept_uri(obj['node2'], lang)
        rel = '/r/' + rel_change.get(frame['relation'], frame['relation'])

        # fix messy english "around in"
        if ' around ' in frametext:
            if obj['node2'].startswith('in '):
                frametext = frametext.replace(' around ', ' in ')
                obj['node2'] = obj['node2'][3:]
            else:
                frametext = frametext.replace(' around ', ' near ')
            rel = '/r/LocatedNear'

        # fix more awkward English. I wonder how bad the other languages are.
        frametext = frametext.replace('hits your head', 'comes to mind')
        frametext = frametext.replace(': [node1], [node2]',
                                      ' [node1] and [node2]')

        node1 = u'[[' + obj['node1'] + u']]'
        node2 = u'[[' + obj['node2'] + u']]'
        surfaceText = frametext.replace('//', '').replace(
            '[node1]', node1).replace('[node2]', node2)
        edge = make_edge(rel, start, end,
                         dataset='/d/globalmind',
                         license='/l/CC/By',
                         sources=sources,
                         surfaceText=surfaceText,
                         weight=1)
        yield json.dumps(edge, ensure_ascii=False)
        assertions[assertion['pk']] = edge

    translationdata = yaml.load_all(open(dirname + '/GMTranslation.yaml'))
    for translation in translationdata:
        obj = translation['fields']
        assertion1 = assertions[obj['assertion1']]
        assertion2 = assertions[obj['assertion2']]
        start = assertion1['uri']
        end = assertion2['uri']
        rel = '/r/TranslationOf'
        text1 = assertion1['surfaceText'].replace('[[', '').replace(']]', '')
        text2 = assertion2['surfaceText'].replace('[[', '').replace(']]', '')
        lang1 = lang_names[get_lang(assertion1)]
        lang2 = lang_names[get_lang(assertion2)]
        surfaceText = u"[[%s]] in %s means [[%s]] in %s." % (
            text1, lang1, text2, lang2)
        userinfo = users[obj['author']]
        username = userinfo['fields']['username']
        userlocale = userinfo['fields']['ccode'].lower()
        if userlocale:
            userlocale += '/'
        sources = [
            "/s/contributor/globalmind/%s%s" % (userlocale, username),
            "/s/activity/globalmind/translate"
        ]
        edge = make_edge(rel, start, end,
                         dataset='/d/globalmind',
                         license='/l/CC/By',
                         sources=sources,
                         surfaceText=surfaceText,
                         weight=1)
        yield json.dumps(edge, ensure_ascii=False)

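# `get_lang` is defined elsewhere. A minimal sketch assuming ConceptNet-style
# start URIs of the form /c/<lang>/<term>, so the language code is the second
# path component (an assumption about the edge layout, not the original
# helper):

def get_lang(assertion):
    """Extract the language code from an edge's start URI."""
    return assertion['start'].split('/')[2]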