# Assumed imports for these snippets: `requests` for the HTTP lookup and
# normalized_concept_name from conceptnet5.nodes. default_language,
# default_lookup_limit, default_number_results, buildCandidate, computeWeight
# and clock are defined elsewhere in the module.
import sys

import requests

from conceptnet5.nodes import normalized_concept_name


def associatedWords(verb, relations):
    """
    Return the best nb_results candidates to nounify the pattern.
    """
    pattern = normalized_concept_name(default_language, verb)  # Lemmatization+stemming
    uri = "/c/{0}/{1}".format(default_language, pattern)
    r = requests.get('http://127.0.0.1:8084/data/5.3' + uri,
                     params={'limit': default_lookup_limit}).json()
    res = []
    for e in r['edges']:
        if e['rel'] in relations:
            cand = buildCandidate(pattern, e)
            if cand is not None and cand.tag != -1:
                res.append(cand)
    for cand in res:
        cand.computeScore()
    computeWeight(res)
    res.sort(key=lambda x: x.score)
    nb_results = min(len(res), default_number_results)
    return {a.word for a in res[-nb_results:]}
def normalize(word):
    return normalized_concept_name('en', word)


if __name__ == "__main__":
    end_lk = 0
    if sys.argv.count("-n") == 1:  # set the lookup limit, e.g.: ./conceptnet_server.py detect -n 100
        default_lookup_limit = int(sys.argv[sys.argv.index("-n") + 1])
        end_lk = 2
    if len(sys.argv) < 2:
        sys.exit("Syntax: ./%s <words to search>" % sys.argv[0])
    for i in range(1, len(sys.argv) - end_lk):
        CLOCK = clock()
        tic = CLOCK.tic
        word = normalized_concept_name(default_language, sys.argv[i])  # Lemmatization+stemming
        CLOCK.time_step("lemmatization")
        print(associatedWords(word, {"/r/RelatedTo", "/r/DerivedFrom", "/r/CapableOf", "/r/Synonym"}))
        print("Total: %s\n" % str(CLOCK.tic - tic))
Example #5
def associatedWords(word, relations):
    pattern = normalized_concept_name(default_language, word)  # Lemmatization+stemming
    uri = "/c/{0}/{1}".format(default_language, pattern)
    r = requests.get('http://127.0.0.1:8084/data/5.3' + uri,
                     params={'limit': default_lookup_limit}).json()['edges']
    res = []
    for e in r:
        if e['rel'] in relations:
            cand = buildCandidate(pattern, e)
            if cand is not None and cand.tag != -1:
                res.append(cand)
    #for cand in res:
    #    print(cand.word + ' ' + str(cand.weight))
    CLOCK.time_step("buildCandidate")
    for cand in res:
        cand.computeScore()
    computeWeight(res)
    res.sort(key=lambda x: x.score)
    CLOCK.time_step("weights")
    return {a.word for a in res}
    #return { res[i].word for i in range(-15,0)}#size(res)-15,size(res))}
    #return {cand.word for cand in res} # duplicate, set instead
    #return sorted(nodeNN,key = functools.partial(similarity,word))


if __name__ == "__main__":
    if len(sys.argv) != 2:
        sys.exit("Syntax: ./%s <word to search>" % sys.argv[0])
    CLOCK = clock()
    word = normalized_concept_name(default_language,
                                   sys.argv[1])  # Lemmatization+stemming
    CLOCK.time_step("lemmatization")
    print(
        associatedWords(
            word,
            {'/r/RelatedTo', '/r/DerivedFrom', '/r/CapableOf', '/r/Synonym'}))
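
buildCandidate, computeWeight and the Candidate objects they produce are likewise not shown in these snippets. The calls above only pin down their interface: a candidate carries word, tag, weight and score attributes plus a computeScore() method, and None or tag == -1 means "discard". The following is a purely hypothetical sketch of that interface, not the original implementation:

class Candidate:
    def __init__(self, word, tag, weight):
        self.word = word      # surface text of the candidate concept
        self.tag = tag        # POS tag; -1 marks a candidate to discard
        self.weight = weight  # raw ConceptNet edge weight
        self.score = 0.0

    def computeScore(self):
        # placeholder ranking: score by the (possibly normalized) edge weight
        self.score = self.weight


def buildCandidate(pattern, edge):
    # take the concept at the far end of the edge as the candidate word
    # (in the 5.3 API, edge['end'] is a URI string such as '/c/en/dog')
    word = edge.get('end', '').split('/')[-1]
    if not word or word == pattern:
        return None  # skip self-loops and malformed edges
    return Candidate(word, tag=0, weight=edge.get('weight', 1.0))


def computeWeight(candidates):
    # placeholder: normalize weights so they sum to 1 across the set
    total = sum(c.weight for c in candidates) or 1.0
    for c in candidates:
        c.weight /= total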
Example #6
def run_umbel(input_dir, output_file, sw_map_file):
    """
    Read N-Triples files containing Umbel data, outputting a file of
    ConceptNet edges and a file of mappings between the Semantic Web and
    ConceptNet.
    """
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    labels = {}  # node URI -> preferred label text
    label_sets = defaultdict(set)  # normalized label text -> nodes that use it

    # There are two files we want to parse:
    # - umbel.nt, a transformation of umbel.n3, which is available from
    #   https://github.com/structureddynamics/UMBEL/.
    # - umbel_links.nt, distributed with DBPedia 3.9.
    #
    # We parse them both in this file so that umbel_links can reuse the
    # concept names extracted from umbel.nt.
    main_file = os.path.join(input_dir, 'umbel.nt')
    dbpedia_link_file = os.path.join(input_dir, 'umbel_links.nt')

    # Read through umbel.nt once, finding all the "preferred labels". We will
    # use these as the surface texts for the nodes.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(main_file):
        if resource_name(web_rel) == 'prefLabel':
            # 'CW' and 'PCW' are Cyc jargon for 'conceptual works'. If a node
            # cannot be described except as a CW, we're probably not
            # interested in it.
            if 'CW' not in web_obj.split() and 'PCW' not in web_obj.split():
                labels[web_subj] = web_obj
        if resource_name(web_rel).endswith('Label'):
            text = normalize_text(web_obj)
            label_sets[text].add(web_subj)

    # Read through umbel.nt again and extract ConceptNet edges.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(main_file):
        if objtag == 'URL' and acceptable_node(web_obj) and acceptable_node(web_subj):
            # Only use nodes for which we've seen preferred labels.
            # (This skips some anonymous OWL-cruft nodes.)
            if web_subj in labels and web_obj in labels:
                subj_uri = normalized_concept_uri('en', labels[web_subj])
                obj_uri = normalized_concept_uri('en', labels[web_obj])
                rel_name = resource_name(web_rel)
                # Check if this is a relation we want to handle.
                if rel_name in REL_MAPPING:
                    # Write the ConceptNet edges and the mappings to Semantic Web URLs.
                    rel_uri, frame = REL_MAPPING[rel_name]
                    surface = frame % (labels[web_subj], labels[web_obj])
                    out.write(umbel_edge(rel_uri, subj_uri, obj_uri, surface, SOURCE))
                    map_out.write_link(web_rel, full_conceptnet_url(rel_uri))
                    map_out.write_link(web_subj, full_conceptnet_url(subj_uri))
                    map_out.write_link(web_obj, full_conceptnet_url(obj_uri))

        # altLabel relations assign different texts to the same node. We'll
        # represent those in ConceptNet with Synonym relations.
        elif web_rel.endswith('altLabel'):
            # Make sure we know what's being labeled.
            if web_subj in labels:
                name = web_obj
                words = name.split(' ')
                if normalized_concept_name('en', name) != normalized_concept_name('en', labels[web_subj]):
                    if not set(words) & IGNORED_WORDS:
                        main_label = normalized_concept_uri('en', labels[web_subj])
                        name_text = normalize_text(name)
                        if len(label_sets[name_text]) >= 2 or len(name_text) <= 3:
                            disambig = un_camel_case(resource_name(web_subj))

                            # Cyc does not distinguish texts by their part of speech, so use
                            # '_' as the part of speech symbol.
                            alt_label = normalized_concept_uri('en', name, '_', disambig)
                        else:
                            alt_label = normalized_concept_uri('en', name)
                        surface = SYN_FRAME % (name, labels[web_subj])
                        out.write(umbel_edge('/r/Synonym', alt_label, main_label, surface, SOURCE))

    for web_subj, web_rel, web_obj, objtag in reader.parse_file(dbpedia_link_file):
        if objtag == 'URL' and acceptable_node(web_obj) and acceptable_node(web_subj):
            if web_obj in labels:
                subj_label = resource_name(web_subj).replace('_', ' ')
                subj_uri = translate_dbpedia_url(web_subj)
                obj_label = labels[web_obj]
                obj_uri = normalized_concept_uri('en', obj_label)
                rel_name = resource_name(web_rel)
                if rel_name in REL_MAPPING:
                    rel_uri, frame = REL_MAPPING[rel_name]
                    surface = frame % (subj_label, obj_label)
                    out.write(umbel_edge(rel_uri, subj_uri, obj_uri, surface, LINK_SOURCE))
                    map_out.write_link(web_rel, full_conceptnet_url(rel_uri))
                    map_out.write_link(web_subj, full_conceptnet_url(subj_uri))
                    map_out.write_link(web_obj, full_conceptnet_url(obj_uri))
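
REL_MAPPING and SYN_FRAME are module-level constants defined elsewhere in this reader. Their shape is fixed by the code above: each REL_MAPPING entry pairs a ConceptNet relation URI with a surface-text frame containing two %s slots, and SYN_FRAME is such a frame for synonyms. The concrete entries below are illustrative assumptions, not the module's actual table:

REL_MAPPING = {
    # resource name from the N-Triples data -> (ConceptNet relation, surface frame)
    'subClassOf': ('/r/IsA', '[[%s]] is a type of [[%s]]'),
    'type': ('/r/InstanceOf', '[[%s]] is an instance of [[%s]]'),
}

SYN_FRAME = '[[%s]] is a synonym of [[%s]]'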