import re
import urllib.parse

import nerd  # client for the NERD API (nerd.eurecom.fr), used below
import spotlight  # pyspotlight client for the DBpedia Spotlight REST API
from textrazor import TextRazor


def process_spotlight_api(text):
    try:
        entities = spotlight.annotate(
            "http://spotlight.dbpedia.org/rest/annotate",
            text,
            confidence=0.1,
            support=0
        )
    except Exception:
        # Spotlight is unreachable or returned no annotations.
        return []

    link_matches = HyperLink.extract_all_url(text)

    initial_entities = []
    for entity in entities:
        # Adjust the reported offsets by the number of double quotes that
        # appear in the text before the entity.
        occ = text.count('"', 0, entity["offset"] + len(entity["surfaceForm"]) - 1)
        start = entity["offset"] + occ
        end = entity["offset"] + len(entity["surfaceForm"]) + occ

        possible_link = False
        for link_match in link_matches:
            if link_match["start"] <= start and link_match["end"] >= end:
                possible_link = True

        if not possible_link:
            e = {
                "label": entity["surfaceForm"],
                "startOffset": start,
                "endOffset": end,
                "confidence": entity["similarityScore"],
                "provenance": "dbpediaspotlight",
                "types": []
            }

            types = []
            for data_type in entity["types"].split(","):
                link = data_type
                if "DBpedia:" in data_type:
                    link = "http://en.dbpedia.org/resource/" + data_type.split(":")[1]
                if "Freebase:" in data_type:
                    link = "http://www.freebase.com" + data_type.split(":")[1]

                dbpedia_type = {
                    "typeURI": None,
                    "typeLabel": data_type,
                    "entityURI": link,
                    "confidence": entity["similarityScore"],
                    "wikiURI": DbpediaLink.get_english_wikipedia_link_from_english_resource(link)
                }
                types.append(dbpedia_type)

            e["types"].append(types)
            initial_entities.append(e)

    return initial_entities
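

# HyperLink is a project-local helper that is not shown in these examples.
# All three wrappers only rely on extract_all_url returning, for every URL in
# the text, a dict with "start" and "end" character offsets, so entities that
# fall inside a URL can be skipped. A minimal sketch under that assumption;
# the class body here is illustrative, not the project's actual implementation.
class HyperLink:
    URL_PATTERN = re.compile(r"https?://\S+")  # assumption: a simple URL regex

    @staticmethod
    def extract_all_url(text):
        # Report the character span of each URL match in the original text.
        return [
            {"url": m.group(0), "start": m.start(), "end": m.end()}
            for m in HyperLink.URL_PATTERN.finditer(text)
        ]
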
def process_textrazor_api(text):
    client = TextRazor(
        api_key='67ef1ca06614f7d202b23f1444bd7ee1ea2f916b3ecf488f8d39f800',
        extractors=[
            "entities",
            "topics",
            "words",
            "phrases",
            "dependency-trees",
            "senses"
        ]
    )

    try:
        response = client.analyze(text)
    except Exception:
        # TextRazor is unreachable or rejected the request.
        return []

    link_matches = HyperLink.extract_all_url(text)

    initial_entities = []
    for entity in response.entities():
        possible_link = False
        for link_match in link_matches:
            if link_match["start"] <= entity.starting_position and link_match["end"] >= entity.ending_position:
                possible_link = True

        if not possible_link:
            e = {
                "label": entity.matched_text,
                "startOffset": entity.starting_position,
                "endOffset": entity.ending_position,
                "confidence": entity.confidence_score,
                "relevance": entity.relevance_score,
                "provenance": "textrazor",
                "wikipediaLink": entity.wikipedia_link,
                "types": []
            }

            for dbpedia_type in entity.dbpedia_types:
                wiki_link = "http://en.wikipedia.org/wiki/" + dbpedia_type

                dbpedia_type_list = {
                    "typeURI": None,
                    "typeLabel": dbpedia_type,
                    "wikiURI": wiki_link,
                    "entityURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(wiki_link),
                    "confidence": entity.confidence_score
                }

                e["types"].append(dbpedia_type_list)

            for freebase_type in entity.freebase_types:
                freebase_link = "http://www.freebase.com" + freebase_type

                freebase_type_list = {
                    "typeURI": None,
                    "typeLabel": "Freebase:" + freebase_type.replace(" ", ""),
                    "wikiURI": None,
                    "entityURI": freebase_link,
                    "confidence": entity.confidence_score
                }

                e["types"].append(freebase_type_list)

            wiki_type_list = {
                "typeURI": None,
                "typeLabel": [],
                "wikiURI": entity.wikipedia_link,
                "entityURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(entity.wikipedia_link),
                "confidence": entity.confidence_score
            }

            e["types"].append(wiki_type_list)

            initial_entities.append(e)

    return initial_entities
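

# DbpediaLink is another project-local helper that is not shown here. The
# wrappers above only use it to translate between English DBpedia resource
# URIs and English Wikipedia article URLs, which for the two URI schemes that
# appear in this code is a prefix swap. A minimal sketch under that
# assumption; the implementation is illustrative.
class DbpediaLink:
    DBPEDIA_PREFIX = "http://en.dbpedia.org/resource/"
    WIKIPEDIA_PREFIX = "http://en.wikipedia.org/wiki/"

    @staticmethod
    def get_english_wikipedia_link_from_english_resource(resource_uri):
        # Map e.g. .../resource/Paris -> .../wiki/Paris; None if no match.
        if resource_uri and resource_uri.startswith(DbpediaLink.DBPEDIA_PREFIX):
            return DbpediaLink.WIKIPEDIA_PREFIX + resource_uri[len(DbpediaLink.DBPEDIA_PREFIX):]
        return None

    @staticmethod
    def get_english_resource_from_english_wikipedia_link(wiki_uri):
        # Inverse mapping: .../wiki/Paris -> .../resource/Paris.
        if wiki_uri and wiki_uri.startswith(DbpediaLink.WIKIPEDIA_PREFIX):
            return DbpediaLink.DBPEDIA_PREFIX + wiki_uri[len(DbpediaLink.WIKIPEDIA_PREFIX):]
        return None
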
def process_nerd_api(text):
    try:
        timeout = 10
        # NERD expects URL-encoded input; the offsets used below are
        # therefore relative to this quoted text.
        text = urllib.parse.quote_plus(text)
        n = nerd.NERD("nerd.eurecom.fr", "akkqfgos0p85mcubcfgp82rn92d23enu")
        entities = n.extract(text, "combined", timeout)
    except Exception:
        # NERD is unreachable or the extraction failed.
        return []

    link_matches = HyperLink.extract_all_url(text)

    initial_entities = []
    for entity in entities:
        possible_link = False
        for link_match in link_matches:
            if link_match["start"] <= entity["startChar"] and link_match["end"] >= entity["endChar"]:
                possible_link = True

        if not possible_link:
            e = {
                "label": entity["label"],
                "startOffset": entity["startChar"],
                "endOffset": entity["endChar"],
                "confidence": entity["confidence"],
                "provenance": "nerd-" + entity["extractor"],
                "types": [],
            }

            if entity["extractorType"]:
                all_types = entity["extractorType"].split(",")

                for extracted_type in all_types:

                    if "dbpedia" in extracted_type:
                        type_data = {
                            "typeURI": extracted_type,
                            "typeLabel": None,
                            "wikiURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(entity["uri"]),
                            "entityURI": entity["uri"],
                            "confidence": entity["confidence"],
                        }
                    else:
                        type_data = {
                            "typeURI": None,
                            "typeLabel": extracted_type,
                            "wikiURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(entity["uri"]),
                            "entityURI": entity["uri"],
                            "confidence": entity["confidence"],
                        }

                    e["types"].append(type_data)

                if entity["nerdType"]:
                    nerd_type_data = {
                        "typeURI": entity["nerdType"],
                        "typeLabel": entity["nerdType"].split("#")[1],
                        "wikiURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(entity["uri"]),
                        "entityURI": entity["uri"],
                        "confidence": entity["confidence"],
                    }

                    e["types"].append(nerd_type_data)

            initial_entities.append(e)

    return initial_entities
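

# A minimal usage sketch, assuming valid API keys and network access: run all
# three extractors over the same text and pool their entity dicts. The sample
# string is illustrative.
if __name__ == "__main__":
    sample = "Barack Obama visited Paris. Details: http://example.com/story"
    combined = (
        process_spotlight_api(sample)
        + process_textrazor_api(sample)
        + process_nerd_api(sample)
    )
    for ent in combined:
        print(ent["provenance"], ent["label"], ent["startOffset"], ent["endOffset"])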