def build_sources(parts_dict, preposition_fix=False):
    """
    Create the 'source' information for an assertion.

    The output is a list of (conjunction, weight) tuples, where 'conjunction'
    is a list of sources that combined to produce this assertion. Later,
    inside the 'make_edge' function, these will be combined into an '/and'
    node.

    `parts_dict` must contain the keys "creator", "activity", and "votes"
    (a sequence of (username, vote) pairs).
    """
    activity = parts_dict["activity"]
    # Fix: run usernames through standardize_username rather than plain
    # standardize_text, so e-mail addresses are truncated at the '@' sign
    # instead of the full address ending up in the contributor URI. This
    # matches the other build_sources implementation in this file.
    creator_node = join_uri('/s/contributor/omcs',
                            standardize_username(parts_dict["creator"]))
    activity_node = join_uri('/s/activity/omcs', standardize_text(activity))
    if preposition_fix:
        conjunction = [creator_node, activity_node, '/s/rule/preposition_fix']
    else:
        conjunction = [creator_node, activity_node]
    # The creator's own statement counts with weight 1.
    weighted_sources = [(conjunction, 1)]
    for vote in parts_dict["votes"]:
        username = vote[0]
        # The creator's implicit vote for their own assertion is already
        # represented by the entry above.
        if username == parts_dict["creator"]:
            continue
        vote_int = vote[1]
        conjunction = [
            join_uri('/s/contributor/omcs', standardize_username(username)),
            '/s/activity/omcs/vote'
        ]
        weighted_sources.append((conjunction, vote_int))
    return weighted_sources
def build_sources(parts_dict, preposition_fix=False):
    """
    Create the 'source' information for an assertion.

    The output is a list of (conjunction, weight) tuples, where 'conjunction'
    is a list of sources that combined to produce this assertion. Later,
    inside the 'make_edge' function, these will be combined into an '/and'
    node.
    """
    creator = parts_dict["creator"]
    contributor_uri = join_uri('/s/contributor/omcs', standardize_text(creator))
    activity_uri = join_uri(
        '/s/activity/omcs', standardize_text(parts_dict["activity"])
    )

    # The creator's statement counts with weight 1, optionally tagged with
    # the preposition-fix rule.
    creator_conjunction = [contributor_uri, activity_uri]
    if preposition_fix:
        creator_conjunction.append('/s/rule/preposition_fix')
    weighted_sources = [(creator_conjunction, 1)]

    # Every vote by someone other than the creator becomes its own source.
    for vote in parts_dict["votes"]:
        voter = vote[0]
        if voter == creator:
            continue
        voter_conjunction = [
            join_uri('/s/contributor/omcs', standardize_text(voter)),
            '/s/activity/omcs/vote'
        ]
        weighted_sources.append((voter_conjunction, vote[1]))
    return weighted_sources
def build_sources(parts_dict, preposition_fix=False):
    """
    Create the 'source' information for an assertion.

    The output is a list of (conjunction, weight) tuples, where 'conjunction'
    is a list of sources that combined to produce this assertion. Later,
    inside the 'make_edge' function, these will be combined into an '/and'
    node.
    """
    creator = parts_dict["creator"]
    # Build the creator's source record, keeping the key order
    # contributor / activity / (process) / weight.
    creator_source = {
        "contributor": join_uri(
            "/s/contributor/omcs", standardize_username(creator)
        ),
        "activity": join_uri(
            "/s/activity/omcs", standardize_text(parts_dict["activity"])
        ),
    }
    if preposition_fix:
        creator_source["process"] = "/s/process/preposition_fix"
    creator_source["weight"] = 1.0
    sources = [creator_source]

    # Each vote by someone other than the creator is a separate source.
    for vote in parts_dict["votes"]:
        voter = vote[0]
        if voter == creator:
            continue
        sources.append({
            "contributor": join_uri(
                "/s/contributor/omcs", standardize_username(voter)
            ),
            "activity": "/s/activity/omcs/vote",
            "weight": float(vote[1]),
        })
    return sources
def standardize_username(username):
    """
    Convert usernames into a canonical form that can be used in URIs.

    If the username is an e-mail address, just keep the part before the
    @ sign.
    """
    # Drop surrounding '@' characters, then keep everything before the first
    # remaining '@' (the local part of an e-mail address).
    local_part, _, _ = username.strip('@').partition('@')
    return standardize_text(local_part)
def standardize_username(username):
    """
    Convert usernames into a canonical form that can be used in URIs.

    If the username is an e-mail address, just keep the part before the
    @ sign.
    """
    trimmed = username.strip("@")
    # One split is enough: only the text before the first "@" is kept.
    name = trimmed.split("@", 1)[0]
    return standardize_text(name)
def build_sources(parts_dict, preposition_fix=False):
    """
    Create the 'source' information for an assertion.

    The output is a list of (conjunction, weight) tuples, where 'conjunction'
    is a list of sources that combined to produce this assertion. Later,
    inside the 'make_edge' function, these will be combined into an '/and'
    node.
    """
    def contributor_uri(name):
        # Local helper: the /s/contributor node for a given username.
        return join_uri('/s/contributor/omcs', standardize_username(name))

    creator = parts_dict["creator"]
    creator_source = {}
    creator_source['contributor'] = contributor_uri(creator)
    creator_source['activity'] = join_uri(
        '/s/activity/omcs', standardize_text(parts_dict["activity"])
    )
    if preposition_fix:
        creator_source['process'] = '/s/process/preposition_fix'
    creator_source['weight'] = 1.0
    sources = [creator_source]

    # Votes from anyone other than the creator become additional sources.
    for vote in parts_dict["votes"]:
        if vote[0] == creator:
            continue
        sources.append({
            'contributor': contributor_uri(vote[0]),
            'activity': '/s/activity/omcs/vote',
            'weight': float(vote[1])
        })
    return sources
def filtered_uri(lang, text):
    """Build a concept URI, filtering stopwords out of English text first."""
    # Non-English text takes the standard pathway unchanged.
    if lang != 'en':
        return standardized_concept_uri(lang, text)
    filtered = filter_stopwords(text)
    return concept_uri('en', standardize_text(filtered, english_filter))
def run_umbel(input_dir, output_file, sw_map_file):
    """
    Read N-Triples files containing Umbel data, outputting a file of
    ConceptNet edges and a file of mappings between the Semantic Web and
    ConceptNet.

    `input_dir` is expected to contain 'umbel.nt' and 'umbel_links.nt'.
    Edges are written (msgpack stream) to `output_file`; links between
    Semantic Web URLs and ConceptNet URIs are written (N-Triples) to
    `sw_map_file`.
    """
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()
    # labels: preferred label text for each subject we keep.
    labels = {}
    # label_sets: standardized label text -> set of subjects carrying that
    # label; used later to decide when an altLabel needs disambiguation.
    label_sets = defaultdict(set)

    # There are two files we want to parse:
    # - umbel.nt, a transformation of umbel.n3, which is available from
    #   https://github.com/structureddynamics/UMBEL/.
    # - umbel_links.nt, distributed with DBPedia 3.9.
    #
    # We parse them both in this file so that umbel_links can reuse the
    # concept names extracted from umbel.nt.
    main_file = os.path.join(input_dir, 'umbel.nt')
    dbpedia_link_file = os.path.join(input_dir, 'umbel_links.nt')

    # Read through umbel.nt once, finding all the "preferred labels". We will
    # use these as the surface texts for the nodes.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(main_file):
        if resource_name(web_rel) == 'prefLabel':
            # 'CW' and 'PCW' are Cyc jargon for 'conceptual works'. If a node
            # cannot be described except as a CW, we're probably not
            # interested in it.
            if 'CW' not in web_obj.split() and 'PCW' not in web_obj.split():
                labels[web_subj] = web_obj
        if resource_name(web_rel).endswith('Label'):
            text = standardize_text(web_obj)
            label_sets[text].add(web_subj)

    # Read through umbel.nt again and extract ConceptNet edges.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(main_file):
        if objtag == 'URL' and acceptable_node(web_obj) and acceptable_node(
                web_subj):
            # Only use nodes for which we've seen preferred labels.
            # (This skips some anonymous OWL-cruft nodes.)
            if web_subj in labels and web_obj in labels:
                subj_uri = standardized_concept_uri('en', labels[web_subj])
                obj_uri = standardized_concept_uri('en', labels[web_obj])
                rel_name = resource_name(web_rel)
                # Check if this is a relation we want to handle.
                if rel_name in REL_MAPPING:
                    # Write the ConceptNet edges and the mappings to Semantic Web URLs.
                    rel_uri, frame = REL_MAPPING[rel_name]
                    surface = frame % (labels[web_subj], labels[web_obj])
                    out.write(
                        umbel_edge(rel_uri, subj_uri, obj_uri, surface, SOURCE))
                    map_out.write_link(web_rel, full_conceptnet_url(rel_uri))
                    map_out.write_link(web_subj, full_conceptnet_url(subj_uri))
                    map_out.write_link(web_obj, full_conceptnet_url(obj_uri))
        # altLabel relations assign different texts to the same node. We'll
        # represent those in ConceptNet with Synonym relations.
        elif web_rel.endswith('altLabel'):
            # Make sure we know what's being labeled.
            if web_subj in labels:
                name = web_obj
                words = name.split(' ')
                # Skip alternate labels that standardize to the same concept
                # name as the preferred label, and labels containing words
                # we ignore.
                if standardized_concept_name(
                        'en', name) != standardized_concept_name(
                        'en', labels[web_subj]):
                    if not set(words) & IGNORED_WORDS:
                        main_label = standardized_concept_uri(
                            'en', labels[web_subj])
                        name_text = standardize_text(name)
                        # A label shared by multiple subjects, or a very short
                        # one, is ambiguous: attach a disambiguation derived
                        # from the subject's resource name.
                        if len(label_sets[name_text]) >= 2 or len(
                                name_text) <= 3:
                            disambig = un_camel_case(resource_name(web_subj))
                            # Cyc does not distinguish texts by their part of speech, so use
                            # '_' as the part of speech symbol.
                            alt_label = standardized_concept_uri(
                                'en', name, '_', disambig)
                        else:
                            alt_label = standardized_concept_uri('en', name)
                        surface = SYN_FRAME % (name, labels[web_subj])
                        out.write(
                            umbel_edge('/r/Synonym', alt_label, main_label,
                                       surface, SOURCE))

    # Parse umbel_links.nt, mapping DBPedia URLs onto the Umbel concepts
    # whose labels we collected above.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(
            dbpedia_link_file):
        if objtag == 'URL' and acceptable_node(web_obj) and acceptable_node(
                web_subj):
            if web_obj in labels:
                subj_label = resource_name(web_subj).replace('_', ' ')
                subj_uri = translate_dbpedia_url(web_subj)
                obj_label = labels[web_obj]
                obj_uri = standardized_concept_uri('en', obj_label)
                rel_name = resource_name(web_rel)
                if rel_name in REL_MAPPING:
                    rel_uri, frame = REL_MAPPING[rel_name]
                    surface = frame % (subj_label, obj_label)
                    out.write(
                        umbel_edge(rel_uri, subj_uri, obj_uri, surface,
                                   LINK_SOURCE))
                    map_out.write_link(web_rel, full_conceptnet_url(rel_uri))
                    map_out.write_link(web_subj, full_conceptnet_url(subj_uri))
                    map_out.write_link(web_obj, full_conceptnet_url(obj_uri))
def filtered_uri(lang, text):
    """Build a concept URI; English text has stopwords filtered out first."""
    if lang == "en":
        filtered_text = filter_stopwords(text)
        return concept_uri("en", standardize_text(filtered_text, english_filter))
    # All other languages use the standard concept-URI pathway.
    return standardized_concept_uri(lang, text)
def run_umbel(input_dir, output_file, sw_map_file):
    """
    Read N-Triples files containing Umbel data, outputting a file of
    ConceptNet edges and a file of mappings between the Semantic Web and
    ConceptNet.
    """
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    # Preferred label text for each subject we keep.
    labels = {}
    # Standardized label text -> set of subjects that carry it; consulted
    # later to decide when an altLabel needs disambiguation.
    label_sets = defaultdict(set)

    # Two input files are parsed here:
    # - umbel.nt, a transformation of umbel.n3, available from
    #   https://github.com/structureddynamics/UMBEL/.
    # - umbel_links.nt, distributed with DBPedia 3.9.
    # Parsing both in one place lets umbel_links reuse the concept names
    # extracted from umbel.nt.
    main_file = os.path.join(input_dir, 'umbel.nt')
    dbpedia_link_file = os.path.join(input_dir, 'umbel_links.nt')

    # Pass 1 over umbel.nt: collect the "preferred labels", which become the
    # surface texts for the nodes.
    for subj, rel, obj, objtag in reader.parse_file(main_file):
        rel_name = resource_name(rel)
        if rel_name == 'prefLabel':
            # 'CW' and 'PCW' are Cyc jargon for 'conceptual works'. A node
            # that can only be described as a CW is probably uninteresting.
            obj_words = obj.split()
            if 'CW' not in obj_words and 'PCW' not in obj_words:
                labels[subj] = obj
        if rel_name.endswith('Label'):
            label_sets[standardize_text(obj)].add(subj)

    # Pass 2 over umbel.nt: extract ConceptNet edges.
    for subj, rel, obj, objtag in reader.parse_file(main_file):
        if objtag == 'URL' and acceptable_node(obj) and acceptable_node(subj):
            # Only use nodes for which we've seen preferred labels; this
            # skips some anonymous OWL-cruft nodes.
            if subj not in labels or obj not in labels:
                continue
            subj_uri = standardized_concept_uri('en', labels[subj])
            obj_uri = standardized_concept_uri('en', labels[obj])
            rel_name = resource_name(rel)
            # Only relations listed in REL_MAPPING are handled.
            if rel_name not in REL_MAPPING:
                continue
            # Write the ConceptNet edge and the Semantic Web mappings.
            rel_uri, frame = REL_MAPPING[rel_name]
            surface = frame % (labels[subj], labels[obj])
            out.write(umbel_edge(rel_uri, subj_uri, obj_uri, surface, SOURCE))
            map_out.write_link(rel, full_conceptnet_url(rel_uri))
            map_out.write_link(subj, full_conceptnet_url(subj_uri))
            map_out.write_link(obj, full_conceptnet_url(obj_uri))
        elif rel.endswith('altLabel') and subj in labels:
            # altLabel assigns a different text to the same node; represent
            # that in ConceptNet as a Synonym edge.
            name = obj
            if standardized_concept_name('en', name) == \
                    standardized_concept_name('en', labels[subj]):
                continue
            if set(name.split(' ')) & IGNORED_WORDS:
                continue
            main_label = standardized_concept_uri('en', labels[subj])
            name_text = standardize_text(name)
            if len(label_sets[name_text]) >= 2 or len(name_text) <= 3:
                # The text is shared or very short, so disambiguate it. Cyc
                # does not distinguish texts by their part of speech, so use
                # '_' as the part of speech symbol.
                disambig = un_camel_case(resource_name(subj))
                alt_label = standardized_concept_uri('en', name, '_', disambig)
            else:
                alt_label = standardized_concept_uri('en', name)
            surface = SYN_FRAME % (name, labels[subj])
            out.write(
                umbel_edge('/r/Synonym', alt_label, main_label, surface,
                           SOURCE))

    # Parse umbel_links.nt, mapping DBPedia URLs onto labeled Umbel concepts.
    for subj, rel, obj, objtag in reader.parse_file(dbpedia_link_file):
        if objtag != 'URL' or not acceptable_node(obj) \
                or not acceptable_node(subj):
            continue
        if obj not in labels:
            continue
        subj_label = resource_name(subj).replace('_', ' ')
        subj_uri = translate_dbpedia_url(subj)
        obj_label = labels[obj]
        obj_uri = standardized_concept_uri('en', obj_label)
        rel_name = resource_name(rel)
        if rel_name not in REL_MAPPING:
            continue
        rel_uri, frame = REL_MAPPING[rel_name]
        surface = frame % (subj_label, obj_label)
        out.write(umbel_edge(rel_uri, subj_uri, obj_uri, surface, LINK_SOURCE))
        map_out.write_link(rel, full_conceptnet_url(rel_uri))
        map_out.write_link(subj, full_conceptnet_url(subj_uri))
        map_out.write_link(obj, full_conceptnet_url(obj_uri))