def handle_file(filename, output_file, sw_map_file):
    """
    Feed every non-comment line of `filename` to `handle_triple`.

    The input is read as bytes; lines beginning with `#` are skipped, and
    every other line is decoded as UTF-8 and stripped of surrounding
    whitespace before being passed on. Edges go to a MsgpackStreamWriter
    opened on `output_file`, and Semantic Web mappings go to an
    NTriplesWriter opened on `sw_map_file`.

    Raises UnicodeDecodeError if a non-comment line is not valid UTF-8.
    """
    reader = NTriplesReader()
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)

    # Use a context manager so the input handle is closed deterministically,
    # even if handle_triple raises partway through the file.
    with open(filename, 'rb') as infile:
        for line in infile:
            if not line.startswith(b'#'):
                handle_triple(line.decode('utf-8').strip(), reader, out, map_out)
def _simplify_gloss(gloss):
    """
    Reduce a WordNet gloss to the short text used to disambiguate a concept.
    """
    # Take the definition up to the first semicolon
    text = gloss.split(';')[0]
    # Remove introductory phrases with a colon
    text = text.split(': ', 1)[-1]
    # Remove parenthesized expressions, repeating until a pass changes
    # nothing. If a pass would leave an empty string, keep the text from
    # before that pass instead.
    while True:
        newtext = re.sub(r'\(.+?\) ?', '', text).strip()
        if newtext == text or newtext == '':
            break
        text = newtext
    # Slashes are replaced, presumably because the result becomes a URI
    # component -- confirm against standardized_concept_uri.
    return text.replace('/', '_')


def run_wordnet(input_dir, output_file, sw_map_file):
    """
    Read the WordNet RDF files in `input_dir` and write ConceptNet edges to
    `output_file` and Semantic Web mappings to `sw_map_file`.

    The function works in four steps:

    1. Collect the label of every synset from 'wordnet-synset.ttl'.
    2. Collect a simplified gloss for every synset from
       'wordnet-glossary.ttl'.
    3. Read 'full/wordnet-wordsense-synset-relations.ttl' to learn which
       word senses belong to which synset, and build a concept URI for each
       synset from its label, part of speech, and simplified gloss.
    4. Read each relation file that exists in `input_dir`, and for every
       triple write three mapping links and one edge.

    Raises AssertionError if a label or gloss is not tagged 'en', and
    KeyError if a synset has no label or gloss, or if a relation refers to
    a node that was never assigned a concept.
    """
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    synset_senses = defaultdict(list)
    sense_synsets = {}
    labels = {}
    glossary = {}
    concept_map = {}

    # Parse lines such as:
    #   wn30:synset-Aeolian-noun-2 rdfs:label "Aeolian"@en-us .
    synset_path = os.path.join(input_dir, 'wordnet-synset.ttl')
    for subj, rel, obj, objtag in reader.parse_file(synset_path):
        if resource_name(rel) == 'label':
            # Everything in WordNet is in English
            assert objtag == 'en'
            labels[subj] = obj

    glossary_path = os.path.join(input_dir, 'wordnet-glossary.ttl')
    for subj, rel, obj, objtag in reader.parse_file(glossary_path):
        if resource_name(rel) == 'gloss':
            assert objtag == 'en'
            glossary[subj] = _simplify_gloss(obj)

    # Get the list of word senses in each synset, and make a bidirectional
    # mapping.
    #
    # Example line:
    #   wn30:synset-Aeolian-noun-2 wn20schema:containsWordSense wn30:wordsense-Aeolian-noun-2 .
    senses_path = os.path.join(
        input_dir, 'full/wordnet-wordsense-synset-relations.ttl'
    )
    for subj, rel, obj, _objtag in reader.parse_file(senses_path):
        if resource_name(rel) == 'containsWordSense':
            synset_senses[subj].append(obj)
            sense_synsets[obj] = subj

    # Assign every synset to a disambiguated concept.
    for synset in synset_senses:
        synset_name = labels[synset]
        # The part of speech is the second-to-last dash-separated piece of
        # the synset identifier, as in 'synset-Aeolian-noun-2'.
        synset_pos = synset.split('-')[-2]
        pos = PARTS_OF_SPEECH[synset_pos]
        disambig = glossary[synset]
        concept_map[synset] = standardized_concept_uri(
            'en', synset_name, pos, disambig
        )

    relation_files = (
        'wordnet-attribute.ttl',
        'wordnet-causes.ttl',
        'wordnet-classifiedby.ttl',
        'wordnet-entailment.ttl',
        'wordnet-hyponym.ttl',
        'wordnet-instances.ttl',
        'wordnet-membermeronym.ttl',
        'wordnet-partmeronym.ttl',
        'wordnet-sameverbgroupas.ttl',
        'wordnet-similarity.ttl',
        'wordnet-substancemeronym.ttl',
        'full/wordnet-antonym.ttl',
        'full/wordnet-derivationallyrelated.ttl',
        'full/wordnet-participleof.ttl',
        'full/wordnet-pertainsto.ttl',
        'full/wordnet-seealso.ttl',
    )
    for filename in relation_files:
        filepath = os.path.join(input_dir, filename)
        # Relation files that are not present are skipped.
        if not os.path.exists(filepath):
            continue

        for web_subj, web_rel, web_obj, _objtag in reader.parse_file(filepath):
            # If this relation involves word senses, map them to their
            # synsets first. `sense_synsets` is used directly; nodes that
            # are not word senses pass through unchanged.
            web_subj = sense_synsets.get(web_subj, web_subj)
            web_obj = sense_synsets.get(web_obj, web_obj)
            subj = concept_map[web_subj]
            obj = concept_map[web_obj]
            pred_label = resource_name(web_rel)
            if pred_label in REL_MAPPING:
                mapped_rel = REL_MAPPING[pred_label]

                # Handle WordNet relations that are the reverse of
                # ConceptNet relations. Change the word 'meronym' to
                # 'holonym' if necessary.
                if mapped_rel.startswith('~'):
                    subj, obj = obj, subj
                    web_subj, web_obj = web_obj, web_subj
                    web_rel = web_rel.replace('meronym', 'holonym')
                    mapped_rel = mapped_rel[1:]
                rel = join_uri('r', mapped_rel)
            else:
                rel = join_uri('r', 'wordnet', pred_label)

            map_out.write_link(web_rel, full_conceptnet_url(rel))
            map_out.write_link(web_subj, full_conceptnet_url(subj))
            map_out.write_link(web_obj, full_conceptnet_url(obj))
            edge = make_edge(
                rel, subj, obj, dataset='/d/wordnet/3.0',
                license='/l/CC/By', sources=SOURCE, weight=2.0
            )
            out.write(edge)
def handle_file(filename, output_file, sw_map_file):
    """
    Feed every line of `filename` to `handle_triple`.

    The input is read as bytes; each line is decoded as UTF-8 and stripped
    of surrounding whitespace before being passed on. Edges go to a
    JSONStreamWriter opened on `output_file`, and Semantic Web mappings go
    to an NTriplesWriter opened on `sw_map_file`.

    Raises UnicodeDecodeError if a line is not valid UTF-8.
    """
    reader = NTriplesReader()
    out = JSONStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)

    # Use a context manager so the input handle is closed deterministically,
    # even if handle_triple raises partway through the file.
    with open(filename, 'rb') as infile:
        for line in infile:
            handle_triple(line.decode('utf-8').strip(), reader, out, map_out)
def _simplify_gloss(gloss):
    """
    Reduce a WordNet gloss to the short text used to disambiguate a concept.
    """
    # Take the definition up to the first semicolon
    text = gloss.split(';')[0]
    # Remove introductory phrases with a colon
    text = text.split(': ', 1)[-1]
    # Remove parenthesized expressions, repeating until a pass changes
    # nothing. If a pass would leave an empty string, keep the text from
    # before that pass instead.
    while True:
        newtext = re.sub(r'\(.+?\) ?', '', text).strip()
        if newtext == text or newtext == '':
            break
        text = newtext
    # Slashes are replaced, presumably because the result becomes a URI
    # component -- confirm against standardized_concept_uri.
    return text.replace('/', '_')


def run_wordnet(input_dir, output_file, sw_map_file):
    """
    Read the WordNet RDF files in `input_dir` and write ConceptNet edges to
    `output_file` and Semantic Web mappings to `sw_map_file`.

    The function works in four steps:

    1. Collect the label of every synset from 'wordnet-synset.ttl'.
    2. Collect a simplified gloss for every synset from
       'wordnet-glossary.ttl'.
    3. Read 'full/wordnet-wordsense-synset-relations.ttl' to learn which
       word senses belong to which synset, and build a concept URI for each
       synset from its label, part of speech, and simplified gloss.
    4. Read each relation file that exists in `input_dir`, and for every
       triple write three mapping links and one edge.

    Raises AssertionError if a label or gloss is not tagged 'en', and
    KeyError if a synset has no label or gloss, or if a relation refers to
    a node that was never assigned a concept.
    """
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    synset_senses = defaultdict(list)
    sense_synsets = {}
    labels = {}
    glossary = {}
    concept_map = {}

    # Parse lines such as:
    #   wn30:synset-Aeolian-noun-2 rdfs:label "Aeolian"@en-us .
    synset_path = os.path.join(input_dir, 'wordnet-synset.ttl')
    for subj, rel, obj, objtag in reader.parse_file(synset_path):
        if resource_name(rel) == 'label':
            # Everything in WordNet is in English
            assert objtag == 'en'
            labels[subj] = obj

    glossary_path = os.path.join(input_dir, 'wordnet-glossary.ttl')
    for subj, rel, obj, objtag in reader.parse_file(glossary_path):
        if resource_name(rel) == 'gloss':
            assert objtag == 'en'
            glossary[subj] = _simplify_gloss(obj)

    # Get the list of word senses in each synset, and make a bidirectional
    # mapping.
    #
    # Example line:
    #   wn30:synset-Aeolian-noun-2 wn20schema:containsWordSense wn30:wordsense-Aeolian-noun-2 .
    senses_path = os.path.join(
        input_dir, 'full/wordnet-wordsense-synset-relations.ttl'
    )
    for subj, rel, obj, _objtag in reader.parse_file(senses_path):
        if resource_name(rel) == 'containsWordSense':
            synset_senses[subj].append(obj)
            sense_synsets[obj] = subj

    # Assign every synset to a disambiguated concept.
    for synset in synset_senses:
        synset_name = labels[synset]
        # The part of speech is the second-to-last dash-separated piece of
        # the synset identifier, as in 'synset-Aeolian-noun-2'.
        synset_pos = synset.split('-')[-2]
        pos = PARTS_OF_SPEECH[synset_pos]
        disambig = glossary[synset]
        concept_map[synset] = standardized_concept_uri(
            'en', synset_name, pos, disambig
        )

    relation_files = (
        'wordnet-attribute.ttl',
        'wordnet-causes.ttl',
        'wordnet-classifiedby.ttl',
        'wordnet-entailment.ttl',
        'wordnet-hyponym.ttl',
        'wordnet-instances.ttl',
        'wordnet-membermeronym.ttl',
        'wordnet-partmeronym.ttl',
        'wordnet-sameverbgroupas.ttl',
        'wordnet-similarity.ttl',
        'wordnet-substancemeronym.ttl',
        'full/wordnet-antonym.ttl',
        'full/wordnet-derivationallyrelated.ttl',
        'full/wordnet-participleof.ttl',
        'full/wordnet-pertainsto.ttl',
        'full/wordnet-seealso.ttl',
    )
    for filename in relation_files:
        filepath = os.path.join(input_dir, filename)
        # Relation files that are not present are skipped.
        if not os.path.exists(filepath):
            continue

        for web_subj, web_rel, web_obj, _objtag in reader.parse_file(filepath):
            # If this relation involves word senses, map them to their
            # synsets first. `sense_synsets` is used directly; nodes that
            # are not word senses pass through unchanged.
            web_subj = sense_synsets.get(web_subj, web_subj)
            web_obj = sense_synsets.get(web_obj, web_obj)
            subj = concept_map[web_subj]
            obj = concept_map[web_obj]
            pred_label = resource_name(web_rel)
            if pred_label in REL_MAPPING:
                mapped_rel = REL_MAPPING[pred_label]

                # Handle WordNet relations that are the reverse of
                # ConceptNet relations. Change the word 'meronym' to
                # 'holonym' if necessary.
                if mapped_rel.startswith('~'):
                    subj, obj = obj, subj
                    web_subj, web_obj = web_obj, web_subj
                    web_rel = web_rel.replace('meronym', 'holonym')
                    mapped_rel = mapped_rel[1:]
                rel = join_uri('r', mapped_rel)
            else:
                rel = join_uri('r', 'wordnet', pred_label)

            map_out.write_link(web_rel, full_conceptnet_url(rel))
            map_out.write_link(web_subj, full_conceptnet_url(subj))
            map_out.write_link(web_obj, full_conceptnet_url(obj))
            edge = make_edge(
                rel, subj, obj, dataset='/d/wordnet/3.0',
                license='/l/CC/By', sources=SOURCE, weight=2.0
            )
            out.write(edge)
def run_umbel(input_dir, output_file, sw_map_file):
    """
    Convert Umbel N-Triples data found in `input_dir` into ConceptNet edges
    (written to `output_file`) and Semantic Web mapping links (written to
    `sw_map_file`).

    Two input files are read:

    - umbel.nt, a transformation of umbel.n3, which is available from
      https://github.com/structureddynamics/UMBEL/.
    - umbel_links.nt, distributed with DBPedia 3.9.

    Both are handled here so that umbel_links.nt can reuse the concept names
    collected from umbel.nt.
    """
    edge_writer = MsgpackStreamWriter(output_file)
    link_writer = NTriplesWriter(sw_map_file)
    triples = NTriplesReader()

    umbel_path = os.path.join(input_dir, 'umbel.nt')
    links_path = os.path.join(input_dir, 'umbel_links.nt')

    pref_labels = {}
    nodes_by_text = defaultdict(set)

    def record(web_triple, rel_uri, subj_uri, obj_uri, surface, source):
        # Write one edge, followed by the three links that tie the Semantic
        # Web URLs of the triple to their ConceptNet counterparts.
        node_url, predicate_url, value_url = web_triple
        edge_writer.write(
            umbel_edge(rel_uri, subj_uri, obj_uri, surface, source)
        )
        link_writer.write_link(predicate_url, full_conceptnet_url(rel_uri))
        link_writer.write_link(node_url, full_conceptnet_url(subj_uri))
        link_writer.write_link(value_url, full_conceptnet_url(obj_uri))

    # First pass over umbel.nt: remember each node's "preferred label",
    # which later serves as the surface text of the node, and index every
    # kind of label by its standardized text.
    for node, predicate, value, _tag in triples.parse_file(umbel_path):
        predicate_name = resource_name(predicate)
        if predicate_name == 'prefLabel':
            # 'CW' and 'PCW' are Cyc jargon for 'conceptual works'. A node
            # that can only be described as a CW is probably uninteresting.
            tokens = value.split()
            if not ('CW' in tokens or 'PCW' in tokens):
                pref_labels[node] = value
        if predicate_name.endswith('Label'):
            nodes_by_text[standardize_text(value)].add(node)

    # Second pass over umbel.nt: extract the ConceptNet edges.
    for node, predicate, value, tag in triples.parse_file(umbel_path):
        if tag == 'URL' and acceptable_node(value) and acceptable_node(node):
            # Only nodes with a known preferred label are used. (This skips
            # some anonymous OWL-cruft nodes.)
            if node not in pref_labels or value not in pref_labels:
                continue
            subj_uri = standardized_concept_uri('en', pref_labels[node])
            obj_uri = standardized_concept_uri('en', pref_labels[value])
            rel_name = resource_name(predicate)
            # Only relations listed in REL_MAPPING are handled.
            if rel_name in REL_MAPPING:
                rel_uri, frame = REL_MAPPING[rel_name]
                surface = frame % (pref_labels[node], pref_labels[value])
                record(
                    (node, predicate, value),
                    rel_uri, subj_uri, obj_uri, surface, SOURCE
                )
            continue

        # altLabel relations assign different texts to the same node. They
        # are represented in ConceptNet with Synonym relations, provided we
        # know what is being labeled.
        if not predicate.endswith('altLabel') or node not in pref_labels:
            continue

        alt_text = value
        preferred = pref_labels[node]
        same_concept = (
            standardized_concept_name('en', alt_text)
            == standardized_concept_name('en', preferred)
        )
        if same_concept:
            continue
        if set(alt_text.split(' ')) & IGNORED_WORDS:
            continue

        main_uri = standardized_concept_uri('en', preferred)
        alt_key = standardize_text(alt_text)
        if len(nodes_by_text[alt_key]) >= 2 or len(alt_key) <= 3:
            # Cyc does not distinguish texts by their part of speech, so
            # '_' is used as the part of speech symbol.
            disambig = un_camel_case(resource_name(node))
            alt_uri = standardized_concept_uri('en', alt_text, '_', disambig)
        else:
            alt_uri = standardized_concept_uri('en', alt_text)
        surface = SYN_FRAME % (alt_text, preferred)
        edge_writer.write(
            umbel_edge('/r/Synonym', alt_uri, main_uri, surface, SOURCE)
        )

    # Finally, read the DBPedia link file, whose objects are Umbel nodes.
    for node, predicate, value, tag in triples.parse_file(links_path):
        if not (tag == 'URL' and acceptable_node(value)
                and acceptable_node(node)):
            continue
        if value not in pref_labels:
            continue

        subj_label = resource_name(node).replace('_', ' ')
        subj_uri = translate_dbpedia_url(node)
        obj_label = pref_labels[value]
        obj_uri = standardized_concept_uri('en', obj_label)
        rel_name = resource_name(predicate)
        if rel_name in REL_MAPPING:
            rel_uri, frame = REL_MAPPING[rel_name]
            surface = frame % (subj_label, obj_label)
            record(
                (node, predicate, value),
                rel_uri, subj_uri, obj_uri, surface, LINK_SOURCE
            )
def run_umbel(input_dir, output_file, sw_map_file):
    """
    Convert Umbel N-Triples data found in `input_dir` into ConceptNet edges
    (written to `output_file`) and Semantic Web mapping links (written to
    `sw_map_file`).

    Two input files are read:

    - umbel.nt, a transformation of umbel.n3, which is available from
      https://github.com/structureddynamics/UMBEL/.
    - umbel_links.nt, distributed with DBPedia 3.9.

    Both are handled here so that umbel_links.nt can reuse the concept names
    collected from umbel.nt.
    """
    edge_writer = MsgpackStreamWriter(output_file)
    link_writer = NTriplesWriter(sw_map_file)
    triples = NTriplesReader()

    umbel_path = os.path.join(input_dir, 'umbel.nt')
    links_path = os.path.join(input_dir, 'umbel_links.nt')

    pref_labels = {}
    nodes_by_text = defaultdict(set)

    def record(web_triple, rel_uri, subj_uri, obj_uri, surface, source):
        # Write one edge, followed by the three links that tie the Semantic
        # Web URLs of the triple to their ConceptNet counterparts.
        node_url, predicate_url, value_url = web_triple
        edge_writer.write(
            umbel_edge(rel_uri, subj_uri, obj_uri, surface, source)
        )
        link_writer.write_link(predicate_url, full_conceptnet_url(rel_uri))
        link_writer.write_link(node_url, full_conceptnet_url(subj_uri))
        link_writer.write_link(value_url, full_conceptnet_url(obj_uri))

    # First pass over umbel.nt: remember each node's "preferred label",
    # which later serves as the surface text of the node, and index every
    # kind of label by its standardized text.
    for node, predicate, value, _tag in triples.parse_file(umbel_path):
        predicate_name = resource_name(predicate)
        if predicate_name == 'prefLabel':
            # 'CW' and 'PCW' are Cyc jargon for 'conceptual works'. A node
            # that can only be described as a CW is probably uninteresting.
            tokens = value.split()
            if not ('CW' in tokens or 'PCW' in tokens):
                pref_labels[node] = value
        if predicate_name.endswith('Label'):
            nodes_by_text[standardize_text(value)].add(node)

    # Second pass over umbel.nt: extract the ConceptNet edges.
    for node, predicate, value, tag in triples.parse_file(umbel_path):
        if tag == 'URL' and acceptable_node(value) and acceptable_node(node):
            # Only nodes with a known preferred label are used. (This skips
            # some anonymous OWL-cruft nodes.)
            if node not in pref_labels or value not in pref_labels:
                continue
            subj_uri = standardized_concept_uri('en', pref_labels[node])
            obj_uri = standardized_concept_uri('en', pref_labels[value])
            rel_name = resource_name(predicate)
            # Only relations listed in REL_MAPPING are handled.
            if rel_name in REL_MAPPING:
                rel_uri, frame = REL_MAPPING[rel_name]
                surface = frame % (pref_labels[node], pref_labels[value])
                record(
                    (node, predicate, value),
                    rel_uri, subj_uri, obj_uri, surface, SOURCE
                )
            continue

        # altLabel relations assign different texts to the same node. They
        # are represented in ConceptNet with Synonym relations, provided we
        # know what is being labeled.
        if not predicate.endswith('altLabel') or node not in pref_labels:
            continue

        alt_text = value
        preferred = pref_labels[node]
        same_concept = (
            standardized_concept_name('en', alt_text)
            == standardized_concept_name('en', preferred)
        )
        if same_concept:
            continue
        if set(alt_text.split(' ')) & IGNORED_WORDS:
            continue

        main_uri = standardized_concept_uri('en', preferred)
        alt_key = standardize_text(alt_text)
        if len(nodes_by_text[alt_key]) >= 2 or len(alt_key) <= 3:
            # Cyc does not distinguish texts by their part of speech, so
            # '_' is used as the part of speech symbol.
            disambig = un_camel_case(resource_name(node))
            alt_uri = standardized_concept_uri('en', alt_text, '_', disambig)
        else:
            alt_uri = standardized_concept_uri('en', alt_text)
        surface = SYN_FRAME % (alt_text, preferred)
        edge_writer.write(
            umbel_edge('/r/Synonym', alt_uri, main_uri, surface, SOURCE)
        )

    # Finally, read the DBPedia link file, whose objects are Umbel nodes.
    for node, predicate, value, tag in triples.parse_file(links_path):
        if not (tag == 'URL' and acceptable_node(value)
                and acceptable_node(node)):
            continue
        if value not in pref_labels:
            continue

        subj_label = resource_name(node).replace('_', ' ')
        subj_uri = translate_dbpedia_url(node)
        obj_label = pref_labels[value]
        obj_uri = standardized_concept_uri('en', obj_label)
        rel_name = resource_name(predicate)
        if rel_name in REL_MAPPING:
            rel_uri, frame = REL_MAPPING[rel_name]
            surface = frame % (subj_label, obj_label)
            record(
                (node, predicate, value),
                rel_uri, subj_uri, obj_uri, surface, LINK_SOURCE
            )