def main_entity_link_text():
    globals.read_configuration('config.cfg')
    entity_linker = globals.get_entity_linker()
    parser = globals.get_parser()
    from text2kb.utils import get_questions_serps
    question_search_results = get_questions_serps()
    globals.logger.setLevel("DEBUG")
    import operator
    while True:
        print "Please enter some text: "
        text = sys.stdin.readline().strip().decode('utf-8')
        tokens = parser.parse(text).tokens
        print "Entities:", entity_linker.identify_entities_in_document(tokens, max_token_window=5)
        entities = {}
        tokens = {}
        if text in question_search_results:
            for doc in question_search_results[text][:10]:
                print doc
                title = doc.title
                snippet = doc.snippet
                snippet_tokens = parser.parse(title + "\n" + snippet).tokens
                # Lemma frequencies across the top search-result snippets.
                for token in snippet_tokens:
                    if token.lemma not in tokens:
                        tokens[token.lemma] = 0
                    tokens[token.lemma] += 1
                # Aggregate entity mention counts across the top snippets.
                for entity in entity_linker.identify_entities_in_document(snippet_tokens):
                    if entity['mid'] not in entities:
                        entities[entity['mid']] = entity
                    else:
                        entities[entity['mid']]['count'] += entity['count']
            print sorted(entities.values(), key=operator.itemgetter('count'), reverse=True)[:50]
def find_entity_mentions(text, use_tagme=False):
    """Return entity mentions found in the given text, either via the configured
    TagMe web service (use_tagme=True) or via the local entity linker."""
    if use_tagme:
        import urllib, httplib, json
        params = urllib.urlencode({
            # Request parameters
            'text': text,
        })
        data = None
        try:
            host, port = globals.config.get("EntityLinker", "tagme-service-url").split(":")
            conn = httplib.HTTPConnection(host, port)
            conn.request("GET", "/get_entities?%s" % params)
            response = conn.getresponse()
            data = response.read()
            conn.close()
        except Exception as ex:
            logger.error(ex.message)
            return []
        if not data:
            return []
        return [{'mid': e['entity'],
                 'name': e['entity'],
                 'surface_score': float(e['coherence']),
                 'score': float(e['rho']),
                 'positions': (e['start'], e['end']),
                 'count': 1} for e in json.loads(data)]
    else:
        entity_linker = globals.get_entity_linker()
        parser = globals.get_parser()
        tokens = parser.parse(text).tokens
        return entity_linker.identify_entities_in_document(tokens,
                                                           max_token_window=5,
                                                           get_main_name=True)
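# Minimal usage sketch for find_entity_mentions(); illustrative only. It assumes
# config.cfg is present and the configured parser and entity linker are available
# (or, with use_tagme=True, that the TagMe service from the [EntityLinker]
# tagme-service-url option is reachable). The question text is a made-up example
# and the helper name is not part of this module.
def _example_find_entity_mentions():
    globals.read_configuration('config.cfg')
    for mention in find_entity_mentions(u"who founded microsoft", use_tagme=False):
        # With use_tagme=True each mention is a dict with 'mid', 'name',
        # 'surface_score', 'score', 'positions' and 'count' (see above).
        print mention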
def train_type_model():
    """Train a classifier that predicts whether a Freebase notable type matches a
    question, using n-gram/type features extracted from evaluated queries."""
    globals.read_configuration('config.cfg')
    parser = globals.get_parser()
    scorer_globals.init()

    datasets = ["webquestions_split_train", ]

    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False

    feature_extractor = FeatureExtractor(False, False, n_gram_types_features=True)
    features = []
    labels = []
    for dataset in datasets:
        queries = get_evaluated_queries(dataset, True, parameters)
        for index, query in enumerate(queries):
            tokens = [token.lemma for token in parser.parse(query.utterance).tokens]
            n_grams = get_grams_feats(tokens)

            # Notable types of the gold answer entities are positive examples.
            answer_entities = [mid for answer in query.target_result
                               for mid in KBEntity.get_entityid_by_name(answer, keep_most_triples=True)]
            correct_notable_types = set(filter(lambda x: x,
                                               [KBEntity.get_notable_type(entity_mid)
                                                for entity_mid in answer_entities]))

            # Notable types of candidate answers that are not gold answers are negatives.
            other_notable_types = set()
            for candidate in query.eval_candidates:
                entities = [mid for entity_name in candidate.prediction
                            for mid in KBEntity.get_entityid_by_name(entity_name, keep_most_triples=True)]
                other_notable_types.update(set([KBEntity.get_notable_type(entity_mid)
                                                for entity_mid in entities]))
            incorrect_notable_types = other_notable_types.difference(correct_notable_types)

            for type in correct_notable_types.union(incorrect_notable_types):
                if type in correct_notable_types:
                    labels.append(1)
                else:
                    labels.append(0)
                features.append(feature_extractor.extract_ngram_features(n_grams, [type, ], "type"))

    with open("type_model_data.pickle", 'wb') as out:
        pickle.dump((features, labels), out)

    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(labels)
    vec = DictVectorizer(sparse=True)
    X = vec.fit_transform(features)
    feature_selector = SelectPercentile(chi2, percentile=5).fit(X, labels)
    vec.restrict(feature_selector.get_support())
    X = feature_selector.transform(X)
    type_scorer = SGDClassifier(loss='log', class_weight='auto', n_iter=1000,
                                alpha=1.0, random_state=999, verbose=5)
    type_scorer.fit(X, labels)
    with open("type-model.pickle", 'wb') as out:
        pickle.dump((vec, type_scorer), out)
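# Sketch of how the pickled model produced by train_type_model() could be applied
# at scoring time; the helper name and calling convention are assumptions, not
# part of this module. It relies on the saved DictVectorizer having been
# restricted to the same feature subset the classifier was trained on, so
# vec.transform() yields compatible inputs.
def _example_score_notable_type(n_grams, notable_type):
    with open("type-model.pickle", 'rb') as inp:
        vec, type_scorer = pickle.load(inp)
    feature_extractor = FeatureExtractor(False, False, n_gram_types_features=True)
    feats = feature_extractor.extract_ngram_features(n_grams, [notable_type, ], "type")
    X = vec.transform([feats])
    # Probability that the notable type matches the question (label 1 above).
    return type_scorer.predict_proba(X)[0][1]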
def get_from_config(cls, config_params):
    sparql_backend = globals.get_sparql_backend(config_params)
    query_extender = QueryCandidateExtender.init_from_config()
    entity_linker = globals.get_entity_linker()
    parser = globals.get_parser()
    scorer_obj = ranker.SimpleScoreRanker('DefaultScorer')
    ngram_notable_types_npmi_path = config_params.get('QueryCandidateExtender',
                                                      'ngram-notable-types-npmi', '')
    notable_types_npmi_threshold = float(config_params.get('QueryCandidateExtender',
                                                           'notable-types-npmi-threshold'))
    ngram_notable_types_npmi = None
    if ngram_notable_types_npmi_path and os.path.exists(ngram_notable_types_npmi_path):
        import cPickle as pickle
        try:
            with open(ngram_notable_types_npmi_path, 'rb') as inp:
                logger.info("Loading types model from disk...")
                ngram_notable_types_npmi = pickle.load(inp)
        except IOError as exc:
            logger.error("Error reading types model: %s" % str(exc))
            ngram_notable_types_npmi = None
    return SparqlQueryTranslator(sparql_backend, query_extender, entity_linker,
                                 parser, scorer_obj, ngram_notable_types_npmi,
                                 notable_types_npmi_threshold)
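# Configuration sketch for the options read by get_from_config(); the section and
# option names come from the code above, the values are placeholders/assumptions:
#
#   [QueryCandidateExtender]
#   ngram-notable-types-npmi = /path/to/ngram_notable_types_npmi.pickle
#   notable-types-npmi-threshold = 0.5
#
# A missing or unreadable NPMI pickle is tolerated (ngram_notable_types_npmi
# falls back to None), but the threshold option is always required.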
if __name__ == "__main__":
    extract_npmi_ngram_type_pairs()
    exit()

    # The code below runs only if the exit() call above is removed.
    globals.read_configuration('config.cfg')
    parser = globals.get_parser()
    scorer_globals.init()

    datasets = ["webquestions_split_train", ]
    # datasets = ["webquestions_split_train_externalentities", "webquestions_split_dev_externalentities", ]
    # datasets = ["webquestions_split_train_externalentities3", "webquestions_split_dev_externalentities3", ]

    data = []
    for dataset in datasets:
        queries = load_eval_queries(dataset)
        for index, query in enumerate(queries):
            tokens = [token.token for token in parser.parse(query.utterance).tokens]
            answer_entities = [mid for answer in query.target_result
                               for mid in KBEntity.get_entityid_by_name(answer, keep_most_triples=True)]
            notable_types = [KBEntity.get_notable_type(entity_mid) for entity_mid in answer_entities]
            data.append((tokens, notable_types))