import requests

def associatedWords(verb, relations):
    """
    Return the best nb_results candidates to nounify the pattern.
    """
    # Lemmatization + stemming.
    pattern = normalized_concept_name(default_language, verb)
    uri = "/c/{0}/{1}".format(default_language, pattern)
    # Query a local ConceptNet 5.3 API for edges around the pattern.
    r = requests.get('http://127.0.0.1:8084/data/5.3' + uri,
                     params={'limit': default_lookup_limit}).json()
    # Keep candidates built from edges whose relation is one we care about.
    res = []
    for e in r['edges']:
        if e['rel'] in relations:
            cand = buildCandidate(pattern, e)
            if cand is not None and cand.tag != -1:
                res.append(cand)
    # Score and weight the candidates, then return the nb_results best words.
    for cand in res:
        cand.computeScore()
    computeWeight(res)
    res.sort(key=lambda x: x.score)
    nb_results = min(len(res), default_number_results)
    return {a.word for a in res[-nb_results:]}
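# Usage sketch (hypothetical, not part of the original module): assumes the
# module-level settings above are defined and that a ConceptNet 5.3 API
# server is listening locally on port 8084.
#
#     >>> associatedWords('eat', {'/r/RelatedTo', '/r/Synonym'})
#     {'food', 'meal', ...}   # illustrative output only: best-scoring nouns, as a set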
def normalize(word):
    return normalized_concept_name('en', word)
    # Instrumented variant of the candidate loop in associatedWords, timed
    # with the module-level CLOCK:
    res = []
    for e in r["edges"]:
        if e["rel"] in relations:
            cand = buildCandidate(pattern, e)
            if cand is not None and cand.tag != -1:
                res.append(cand)
    CLOCK.time_step("buildCandidate")

    for cand in res:
        cand.computeScore()
    computeWeight(res)
    res.sort(key=lambda x: x.score)
    CLOCK.time_step("weights")

    nb_results = min(len(res), default_number_results)
    return {a.word for a in res[-nb_results:]}


if __name__ == "__main__":
    end_lk = 0
    if sys.argv.count("-n") == 1:
        # Fix the lookup limit at 100: ./conceptnet_server.py detect -n 100
        default_lookup_limit = int(sys.argv[sys.argv.index("-n") + 1])
        end_lk = 2
    if len(sys.argv) < 2:
        sys.exit("Syntax: ./%s <words to search>" % sys.argv[0])
    for i in range(1, len(sys.argv) - end_lk):
        CLOCK = clock()
        tic = CLOCK.tic
        # Lemmatization + stemming.
        word = normalized_concept_name(default_language, sys.argv[i])
        CLOCK.time_step("lemmatization")
        print(associatedWords(word, {"/r/RelatedTo", "/r/DerivedFrom",
                                     "/r/CapableOf", "/r/Synonym"}))
        print("Total: %s\n" % str(CLOCK.tic - tic))
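# A minimal sketch of the `clock` helper the listings above rely on; its
# real implementation is not shown here, so this is an assumption inferred
# from how CLOCK is used (`tic` reads the current time, `time_step` reports
# the time elapsed since the previous checkpoint).
import time

class clock:
    def __init__(self):
        self.last = time.time()

    @property
    def tic(self):
        # Current timestamp; `CLOCK.tic - tic` above measures the total time.
        return time.time()

    def time_step(self, label):
        # Print how long the named step took, then reset the checkpoint.
        now = time.time()
        print("%s: %f s" % (label, now - self.last))
        self.last = now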
    # Earlier, single-word variant of the same loop; note that `r` is
    # iterated directly here rather than `r['edges']`.
    for e in r:
        if e['rel'] in relations:
            cand = buildCandidate(pattern, e)
            if cand is not None and cand.tag != -1:
                res.append(cand)
    CLOCK.time_step("buildCandidate")

    for cand in res:
        cand.computeScore()
    computeWeight(res)
    res.sort(key=lambda x: x.score)
    CLOCK.time_step("weights")

    # Return a set so that duplicate words are collapsed.
    return {a.word for a in res}


if __name__ == "__main__":
    if len(sys.argv) != 2:
        sys.exit("Syntax: ./%s <word to search>" % sys.argv[0])
    CLOCK = clock()
    # Lemmatization + stemming.
    word = normalized_concept_name(default_language, sys.argv[1])
    CLOCK.time_step("lemmatization")
    print(associatedWords(word, {'/r/RelatedTo', '/r/DerivedFrom',
                                 '/r/CapableOf', '/r/Synonym'}))
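# Hypothetical sketch of the candidate objects both variants manipulate.
# buildCandidate, computeWeight and the real scoring are defined elsewhere;
# this only records the attributes the code above assumes they expose.
class Candidate:
    def __init__(self, word, tag, weight=0.0):
        self.word = word      # surface form of the candidate noun
        self.tag = tag        # POS tag; -1 marks an unusable candidate
        self.weight = weight  # weight assigned by computeWeight
        self.score = 0.0      # ranking key used by res.sort

    def computeScore(self):
        # Placeholder scoring: the real function combines the ConceptNet
        # edge data with the weight before the final sort.
        self.score = self.weight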
import os
from collections import defaultdict

# (MsgpackStreamWriter, NTriplesReader/NTriplesWriter and the other helpers
# and constants used below come from the conceptnet5 package and from this
# module's header, which are not shown in this listing.)

def run_umbel(input_dir, output_file, sw_map_file):
    """
    Read N-Triples files containing Umbel data, outputting a file of
    ConceptNet edges and a file of mappings between the Semantic Web and
    ConceptNet.
    """
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    labels = {}
    label_sets = defaultdict(set)

    # There are two files we want to parse:
    # - umbel.nt, a transformation of umbel.n3, which is available from
    #   https://github.com/structureddynamics/UMBEL/.
    # - umbel_links.nt, distributed with DBPedia 3.9.
    #
    # We parse them both in this file so that umbel_links can reuse the
    # concept names extracted from umbel.nt.
    main_file = os.path.join(input_dir, 'umbel.nt')
    dbpedia_link_file = os.path.join(input_dir, 'umbel_links.nt')

    # Read through umbel.nt once, finding all the "preferred labels". We will
    # use these as the surface texts for the nodes.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(main_file):
        if resource_name(web_rel) == 'prefLabel':
            # 'CW' and 'PCW' are Cyc jargon for 'conceptual works'. If a node
            # cannot be described except as a CW, we're probably not
            # interested in it.
            if 'CW' not in web_obj.split() and 'PCW' not in web_obj.split():
                labels[web_subj] = web_obj
        if resource_name(web_rel).endswith('Label'):
            text = normalize_text(web_obj)
            label_sets[text].add(web_subj)

    # Read through umbel.nt again and extract ConceptNet edges.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(main_file):
        if objtag == 'URL' and acceptable_node(web_obj) and acceptable_node(web_subj):
            # Only use nodes for which we've seen preferred labels.
            # (This skips some anonymous OWL-cruft nodes.)
            if web_subj in labels and web_obj in labels:
                subj_uri = normalized_concept_uri('en', labels[web_subj])
                obj_uri = normalized_concept_uri('en', labels[web_obj])
                rel_name = resource_name(web_rel)
                # Check if this is a relation we want to handle.
                if rel_name in REL_MAPPING:
                    # Write the ConceptNet edges and the mappings to
                    # Semantic Web URLs.
                    rel_uri, frame = REL_MAPPING[rel_name]
                    surface = frame % (labels[web_subj], labels[web_obj])
                    out.write(umbel_edge(rel_uri, subj_uri, obj_uri,
                                         surface, SOURCE))
                    map_out.write_link(web_rel, full_conceptnet_url(rel_uri))
                    map_out.write_link(web_subj, full_conceptnet_url(subj_uri))
                    map_out.write_link(web_obj, full_conceptnet_url(obj_uri))

        # altLabel relations assign different texts to the same node. We'll
        # represent those in ConceptNet with Synonym relations.
        elif web_rel.endswith('altLabel'):
            # Make sure we know what's being labeled.
            if web_subj in labels:
                name = web_obj
                words = name.split(' ')
                if normalized_concept_name('en', name) != normalized_concept_name('en', labels[web_subj]):
                    if not set(words) & IGNORED_WORDS:
                        main_label = normalized_concept_uri('en', labels[web_subj])
                        name_text = normalize_text(name)
                        if len(label_sets[name_text]) >= 2 or len(name_text) <= 3:
                            disambig = un_camel_case(resource_name(web_subj))

                            # Cyc does not distinguish texts by their part of
                            # speech, so use '_' as the part of speech symbol.
                            alt_label = normalized_concept_uri('en', name,
                                                               '_', disambig)
                        else:
                            alt_label = normalized_concept_uri('en', name)
                        surface = SYN_FRAME % (name, labels[web_subj])
                        out.write(umbel_edge('/r/Synonym', alt_label,
                                             main_label, surface, SOURCE))

    for web_subj, web_rel, web_obj, objtag in reader.parse_file(dbpedia_link_file):
        if objtag == 'URL' and acceptable_node(web_obj) and acceptable_node(web_subj):
            if web_obj in labels:
                subj_label = resource_name(web_subj).replace('_', ' ')
                subj_uri = translate_dbpedia_url(web_subj)
                obj_label = labels[web_obj]
                obj_uri = normalized_concept_uri('en', obj_label)
                rel_name = resource_name(web_rel)
                if rel_name in REL_MAPPING:
                    rel_uri, frame = REL_MAPPING[rel_name]
                    surface = frame % (subj_label, obj_label)
                    out.write(umbel_edge(rel_uri, subj_uri, obj_uri,
                                         surface, LINK_SOURCE))
                    map_out.write_link(web_rel, full_conceptnet_url(rel_uri))
                    map_out.write_link(web_subj, full_conceptnet_url(subj_uri))
                    map_out.write_link(web_obj, full_conceptnet_url(obj_uri))
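# Usage sketch (hypothetical paths): run_umbel expects input_dir to contain
# umbel.nt and umbel_links.nt, writes a msgpack stream of ConceptNet edges
# to output_file, and writes Semantic Web mappings to sw_map_file.
if __name__ == '__main__':
    run_umbel('data/raw/umbel',
              'data/edges/umbel.msgpack',
              'data/sw_map/umbel.nt')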