Example #1
def main_entity_link_text():
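    # Interactive demo: read a line of text from stdin, run the parser and entity linker on it,
    # then aggregate entities found in the cached search-result snippets for that question (if any).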
    globals.read_configuration('config.cfg')
    entity_linker = globals.get_entity_linker()
    parser = globals.get_parser()
    from text2kb.utils import get_questions_serps
    question_search_results = get_questions_serps()
    globals.logger.setLevel("DEBUG")
    import operator
    while True:
        print "Please enter some text: "
        text = sys.stdin.readline().strip().decode('utf-8')
        tokens = parser.parse(text).tokens
        print "Entities:", entity_linker.identify_entities_in_document(tokens, max_token_window=5)
        entities = {}
        tokens = {}

        if text in question_search_results:
            for doc in question_search_results[text][:10]:
                print doc
                title = doc.title
                snippet = doc.snippet
                snippet_tokens = parser.parse(title + "\n" + snippet).tokens
                for token in snippet_tokens:
                    if token.lemma not in tokens:
                        tokens[token.lemma] = 0
                    tokens[token.lemma] += 1
                for entity in entity_linker.identify_entities_in_document(snippet_tokens):
                    if entity['mid'] not in entities:
                        entities[entity['mid']] = entity
                    else:
                        entities[entity['mid']]['count'] += entity['count']
        print sorted(entities.values(), key=operator.itemgetter('count'), reverse=True)[:50]
Example #2
def find_entity_mentions(text, use_tagme=False):
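    # Either query an external TAGME-style web service or fall back to the built-in entity linker.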
    if use_tagme:
        import urllib, httplib, json
        params = urllib.urlencode({
            # Request parameters
            'text': text,
        })

        data = None
        try:
            host, port = globals.config.get("EntityLinker", "tagme-service-url").split(":")
            conn = httplib.HTTPConnection(host, port)
            conn.request("GET", "/get_entities?%s" % params)
            response = conn.getresponse()
            data = response.read()
            conn.close()
        except Exception as ex:
            logger.error(ex.message)
            return []
        if not data:
            return []
        return [{'mid': e['entity'],
                'name': e['entity'],
                'surface_score': float(e['coherence']),
                'score': float(e['rho']),
                'positions': (e['start'], e['end']),
                'count': 1} for e in json.loads(data)]
    else:
        entity_linker = globals.get_entity_linker()
        parser = globals.get_parser()
        tokens = parser.parse(text).tokens
        return entity_linker.identify_entities_in_document(tokens, max_token_window=5, get_main_name=True)
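A minimal usage sketch for the function above (hypothetical; it assumes a valid config.cfg has been loaded and, for the TAGME branch, a reachable service configured as tagme-service-url in the [EntityLinker] section):

globals.read_configuration('config.cfg')
# The example sentence is made up; any short text works.
for mention in find_entity_mentions(u"who founded microsoft"):
    print mention['name'], mention['count']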
Example #3
    def get_snippet_entities(self):
        doc_snippet_entities = get_documents_snippet_entities()
        if self.url in doc_snippet_entities:
            return doc_snippet_entities[self.url]

        logger.warning("Didn't find cached document snippet entities.")
        # If we didn't find snippet entities in cache, use entity linker.
        if self.snippet_entities is None:
            entity_linker = globals.get_entity_linker()
            self.snippet_entities = dict(
                (entity['name'].lower(), entity)
                for entity in entity_linker.identify_entities_in_document(self.get_snippet_tokens(),
                                                                          max_token_window=4,
                                                                          min_surface_score=0.5))
        return self.snippet_entities
Example #4
def main_doc_entities_from_content():
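    # Link entities in the parsed content of the top search results for every question
    # and pickle the resulting url -> entities dictionary to disk.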
    entity_linker = globals.get_entity_linker()
    document_entities_file = globals.config.get('WebSearchFeatures', 'documents-entities-file')
    doc_entities = dict()
    from text2kb.utils import get_documents_content_dict
    from text2kb.utils import get_questions_serps
    question_search_results = get_questions_serps()
    documents_content = get_documents_content_dict(return_parsed_tokens=True)
    index = 0
    for serp in question_search_results.itervalues():
        for doc in serp[:globals.SEARCH_RESULTS_TOPN]:
            if doc.url in documents_content:
                doc_entities[doc.url] = entity_linker.identify_entities_in_document(documents_content[doc.url],
                                                                                    min_surface_score=0.5)
        index += 1
        if index % 100 == 0:
            logger.info("%d SERPs processed" % index)
    with open(document_entities_file, 'wx') as out:
        pickle.dump(doc_entities, out)
Example #5
def entity_link_snippets():
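    # Link entities in the snippets of the top search results, keep the matched token spans,
    # and pickle the resulting url -> entities dictionary to disk.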
    entity_linker = globals.get_entity_linker()
    snippet_entities_file = globals.config.get('WebSearchFeatures', 'document-snippet-entities')
    from text2kb.utils import get_questions_serps
    question_search_results = get_questions_serps()
    doc_snippet_entities = dict()
    for index, serp in enumerate(question_search_results.itervalues()):
        for doc in serp[:globals.SEARCH_RESULTS_TOPN]:
            snippet_tokens = doc.get_snippet_tokens()
            entities = entity_linker.identify_entities_in_document(snippet_tokens)
            for entity in entities:
                entity['matches'] = []
                for position in entity['positions']:
                    entity['matches'].append(snippet_tokens[position[0]:position[1]])
            doc_snippet_entities[doc.url] = entities
        if index % 100 == 0:
            logger.info("Processed %d serps" % index)
    logger.info("Pickling the dictionary...")
    with open(snippet_entities_file, 'wx') as out:
        pickle.dump(doc_snippet_entities, out)
    logger.info("Pickling the dictionary DONE!")
Example #6
    def get_from_config(cls, config_params):
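        # Assemble the translator's components from the configuration and, if a path is
        # configured and present, load the ngram -> notable-type NPMI model from disk.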
        sparql_backend = globals.get_sparql_backend(config_params)
        query_extender = QueryCandidateExtender.init_from_config()
        entity_linker = globals.get_entity_linker()
        parser = globals.get_parser()
        scorer_obj = ranker.SimpleScoreRanker('DefaultScorer')
        ngram_notable_types_npmi_path = config_params.get('QueryCandidateExtender', 'ngram-notable-types-npmi', '')
        notable_types_npmi_threshold = float(config_params.get('QueryCandidateExtender', 'notable-types-npmi-threshold'))
        ngram_notable_types_npmi = None
        if ngram_notable_types_npmi_path and os.path.exists(ngram_notable_types_npmi_path):
            import cPickle as pickle
            try:
                with open(ngram_notable_types_npmi_path, 'rb') as inp:
                    logger.info("Loading types model from disk...")
                    ngram_notable_types_npmi = pickle.load(inp)
            except IOError as exc:
                logger.error("Error reading types model: %s" % str(exc))
                ngram_notable_types_npmi = None

        return SparqlQueryTranslator(sparql_backend, query_extender,
                                     entity_linker, parser, scorer_obj,
                                     ngram_notable_types_npmi,
                                     notable_types_npmi_threshold)