def main_entity_link_text():
    """Interactively link entities in a question and in the snippets of its
    cached search results."""
    globals.read_configuration('config.cfg')
    entity_linker = globals.get_entity_linker()
    parser = globals.get_parser()
    from text2kb.utils import get_questions_serps
    question_search_results = get_questions_serps()
    globals.logger.setLevel("DEBUG")
    import operator
    import sys
    while True:
        print "Please enter some text: "
        text = sys.stdin.readline().strip().decode('utf-8')
        tokens = parser.parse(text).tokens
        print "Entities:", entity_linker.identify_entities_in_document(tokens, max_token_window=5)
        entities = {}
        lemma_counts = {}
        if text in question_search_results:
            # Aggregate entity counts and token lemma counts over the top-10
            # cached search results for this question.
            for doc in question_search_results[text][:10]:
                print doc
                title = doc.title
                snippet = doc.snippet
                snippet_tokens = parser.parse(title + "\n" + snippet).tokens
                for token in snippet_tokens:
                    if token.lemma not in lemma_counts:
                        lemma_counts[token.lemma] = 0
                    lemma_counts[token.lemma] += 1
                for entity in entity_linker.identify_entities_in_document(snippet_tokens):
                    if entity['mid'] not in entities:
                        entities[entity['mid']] = entity
                    else:
                        entities[entity['mid']]['count'] += entity['count']
        print sorted(entities.values(), key=operator.itemgetter('count'), reverse=True)[:50]
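# A minimal sketch of how the interactive loop above could be invoked
# (hypothetical; the module's actual entry point is not shown in this
# excerpt, and config.cfg is assumed to exist in the working directory):
#
#   if __name__ == '__main__':
#       main_entity_link_text()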
def find_entity_mentions(text, use_tagme=False):
    """Return entity mentions found in the given text, either via the remote
    TagMe service or via the local entity linker."""
    if use_tagme:
        import urllib
        import httplib
        import json
        params = urllib.urlencode({
            # Request parameters.
            'text': text,
        })
        data = None
        try:
            host, port = globals.config.get("EntityLinker", "tagme-service-url").split(":")
            conn = httplib.HTTPConnection(host, port)
            conn.request("GET", "/get_entities?%s" % params)
            response = conn.getresponse()
            data = response.read()
            conn.close()
        except Exception as ex:
            # ex.message is deprecated and absent on many exception types;
            # log the exception object itself instead.
            logger.error(ex)
            return []
        if not data:
            return []
        return [{'mid': e['entity'],
                 'name': e['entity'],
                 'surface_score': float(e['coherence']),
                 'score': float(e['rho']),
                 'positions': (e['start'], e['end']),
                 'count': 1}
                for e in json.loads(data)]
    else:
        entity_linker = globals.get_entity_linker()
        parser = globals.get_parser()
        tokens = parser.parse(text).tokens
        return entity_linker.identify_entities_in_document(tokens,
                                                           max_token_window=5,
                                                           get_main_name=True)
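# A minimal usage sketch for find_entity_mentions() (the query string is
# hypothetical; assumes globals.read_configuration() has already been called
# and, for the TagMe path, that EntityLinker/tagme-service-url points at a
# running service):
#
#   mentions = find_entity_mentions(u"who founded microsoft", use_tagme=True)
#   for m in mentions:
#       print m['name'], m['score'], m['positions']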
def get_snippet_entities(self):
    """Return entities mentioned in the document snippet, preferring the
    precomputed cache and falling back to the entity linker."""
    doc_snippet_entities = get_documents_snippet_entities()
    if self.url in doc_snippet_entities:
        return doc_snippet_entities[self.url]
    logger.warning("Didn't find cached document snippet entities.")
    # If we didn't find snippet entities in cache, use entity linker and
    # memoize the result on the instance.
    if self.snippet_entities is None:
        entity_linker = globals.get_entity_linker()
        self.snippet_entities = dict(
            (entity['name'].lower(), entity)
            for entity in entity_linker.identify_entities_in_document(
                self.get_snippet_tokens(),
                max_token_window=4,
                min_surface_score=0.5))
    return self.snippet_entities
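# Usage sketch (`doc` is a hypothetical search result document object). Note
# that a cache hit returns whatever shape entity_link_snippets() stored for
# this url, while the fallback path returns a name -> entity dict:
#
#   entities = doc.get_snippet_entities()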
def main_doc_entities_from_content():
    """Run the entity linker over the parsed content of each search result
    document and pickle the resulting url -> entities dictionary."""
    entity_linker = globals.get_entity_linker()
    document_entities_file = globals.config.get('WebSearchFeatures', 'documents-entities-file')
    doc_entities = dict()
    from text2kb.utils import get_documents_content_dict
    from text2kb.utils import get_questions_serps
    question_search_results = get_questions_serps()
    documents_content = get_documents_content_dict(return_parsed_tokens=True)
    index = 0
    for serp in question_search_results.itervalues():
        for doc in serp[:globals.SEARCH_RESULTS_TOPN]:
            if doc.url in documents_content:
                doc_entities[doc.url] = entity_linker.identify_entities_in_document(
                    documents_content[doc.url], min_surface_score=0.5)
            # The counter advances once per document, so log documents,
            # not SERPs.
            index += 1
            if index % 100 == 0:
                logger.info("%d documents processed" % index)
    # 'wx' requests exclusive creation, so an existing dump file is never
    # silently overwritten.
    with open(document_entities_file, 'wx') as out:
        pickle.dump(doc_entities, out)
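# Reading the pickled dictionary back is a one-liner (a sketch; assumes the
# same WebSearchFeatures/documents-entities-file option as above):
#
#   with open(globals.config.get('WebSearchFeatures',
#                                'documents-entities-file')) as inp:
#       doc_entities = pickle.load(inp)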
def entity_link_snippets():
    """Run the entity linker over search result snippets and pickle a
    dictionary from document url to the entities found in its snippet."""
    entity_linker = globals.get_entity_linker()
    snippet_entities_file = globals.config.get('WebSearchFeatures', 'document-snippet-entities')
    from text2kb.utils import get_questions_serps
    question_search_results = get_questions_serps()
    doc_snippet_entities = dict()
    for index, serp in enumerate(question_search_results.itervalues()):
        for doc in serp[:globals.SEARCH_RESULTS_TOPN]:
            snippet_tokens = doc.get_snippet_tokens()
            entities = entity_linker.identify_entities_in_document(snippet_tokens)
            # Keep the token spans that matched each entity so callers can
            # inspect the exact snippet mention.
            for entity in entities:
                entity['matches'] = []
                for position in entity['positions']:
                    entity['matches'].append(snippet_tokens[position[0]:position[1]])
            doc_snippet_entities[doc.url] = entities
        if index % 100 == 0:
            logger.info("Processed %d serps" % index)
    logger.info("Pickling the dictionary...")
    with open(snippet_entities_file, 'wx') as out:
        pickle.dump(doc_snippet_entities, out)
    logger.info("Pickling the dictionary DONE!")
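# Each cache entry maps a document url to the linker's entity dicts, with an
# extra 'matches' field holding the matched token spans. A sketch of reading
# one entry back (the url is hypothetical):
#
#   with open(globals.config.get('WebSearchFeatures',
#                                'document-snippet-entities')) as inp:
#       cache = pickle.load(inp)
#   for entity in cache['http://example.com/page']:
#       print entity['name'], entity['matches']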
def get_from_config(cls, config_params):
    """Construct a translator from the given configuration, optionally
    loading the ngram -> notable-types NPMI model from disk."""
    sparql_backend = globals.get_sparql_backend(config_params)
    query_extender = QueryCandidateExtender.init_from_config()
    entity_linker = globals.get_entity_linker()
    parser = globals.get_parser()
    scorer_obj = ranker.SimpleScoreRanker('DefaultScorer')
    # ConfigParser.get() takes no default-value argument, so guard the
    # optional option with has_option() instead.
    if config_params.has_option('QueryCandidateExtender', 'ngram-notable-types-npmi'):
        ngram_notable_types_npmi_path = config_params.get('QueryCandidateExtender',
                                                          'ngram-notable-types-npmi')
    else:
        ngram_notable_types_npmi_path = ''
    notable_types_npmi_threshold = float(config_params.get('QueryCandidateExtender',
                                                           'notable-types-npmi-threshold'))
    ngram_notable_types_npmi = None
    if ngram_notable_types_npmi_path and os.path.exists(ngram_notable_types_npmi_path):
        import cPickle as pickle
        try:
            with open(ngram_notable_types_npmi_path, 'rb') as inp:
                logger.info("Loading types model from disk...")
                ngram_notable_types_npmi = pickle.load(inp)
        except IOError as exc:
            logger.error("Error reading types model: %s" % str(exc))
            ngram_notable_types_npmi = None
    return SparqlQueryTranslator(sparql_backend, query_extender, entity_linker,
                                 parser, scorer_obj, ngram_notable_types_npmi,
                                 notable_types_npmi_threshold)
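# The configuration section consumed above might look like this (a sketch;
# the option names are the ones read in get_from_config, the values are
# hypothetical):
#
#   [QueryCandidateExtender]
#   ngram-notable-types-npmi = data/ngram-notable-types-npmi.pickle
#   notable-types-npmi-threshold = 0.5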