import json
import logging
import re

import Levenshtein
import sexpdata

import settings
# Project-local imports; the exact module paths are assumptions.
from related import RelatedEntities
from lucene_searcher import LuceneSearcher
from evaluation import get_f1_score

logger = logging.getLogger(__name__)

# Assumed definitions: the original source defines these elsewhere.
ALLOWED_CHARS_PATTERN = re.compile(r'[^a-z0-9 ]')
STOPWORDS = frozenset([
    'a', 'an', 'the', 'of', 'in', 'on', 'is', 'are', 'was', 'were',
    'what', 'who', 'where', 'when', 'which', 'how', 'do', 'does', 'did',
])


class Connector(object):
    """Connects natural-language queries to knowledge-base entities via
    Lucene search, with JSON-backed caches for both lookup stages."""

    def __init__(self):
        self.related = RelatedEntities()
        self.searcher = LuceneSearcher(settings.LUCENE_PATH)
        try:
            with open(settings.CONNECTION_CACHE_PATH) as cache_file:
                self.connection_cache = json.load(cache_file)
        except IOError:
            self.connection_cache = {}
        try:
            with open(settings.QUERY_ENTITY_CACHE_PATH) as cache_file:
                self.query_entity_cache = json.load(cache_file)
        except IOError:
            self.query_entity_cache = {}

    def get_query_entities(self, query):
        query_entities = [result[1]['id'] for result in self.query_search(query)]
        logger.debug("Query entities: %r", query_entities)
        return query_entities

    def query_search(self, query):
        """Find entities that are contained as substrings in the query."""
        result = self.query_entity_cache.get(query)
        if result is not None:
            logger.info("Found %d entities in query cache", len(result))
            return result
        normalised_query = ALLOWED_CHARS_PATTERN.sub(' ', query)
        query_terms = [term for term in normalised_query.split()
                       if term not in STOPWORDS]
        logger.debug("Getting query entities for query terms: %r", query_terms)
        all_entities = []
        # Try bigrams first, then single terms, so multi-word entity names
        # are searched before their individual parts.
        subqueries = []
        for i in range(len(query_terms) - 1):
            subqueries.append(' '.join(query_terms[i:i + 2]))
        subqueries += query_terms
        for subquery in subqueries:
            logger.debug("Applying subquery %r", subquery)
            docs = self.searcher.search(subquery)
            # Keep only near-exact name matches for this subquery.
            filtered_docs = [
                doc for doc in docs
                if Levenshtein.ratio(doc['text'].lower(), unicode(subquery)) > 0.8
            ]
            logger.debug("Filtered to %d of %d docs",
                         len(filtered_docs), len(docs))
            scores_docs = [(self.related.get_entity_score(doc['id']), doc)
                           for doc in filtered_docs]
            # Sort on the score alone; falling through to comparing the doc
            # dicts on tied scores is not meaningful.
            sorted_scores_docs = sorted(scores_docs,
                                        key=lambda pair: pair[0],
                                        reverse=True)
            logger.debug("Sorted scores and docs: %r", sorted_scores_docs)
            all_entities += sorted_scores_docs[:10]
        logger.debug("Found %d entities, caching", len(all_entities))
        result = sorted(all_entities, key=lambda pair: pair[0], reverse=True)
        logger.debug("Top entities: %r", result[:10])
        self.query_entity_cache[query] = result
        return result

    def search(self, query, target):
        query_entities = self.get_query_entities(query)
        return self.related.connect_names(query_entities, [target])

    def search_all(self, query, targets):
        query_entities = self.get_query_entities(query)
        return self.related.connect_names(query_entities, targets)

    def apply_connection(self, query, connection):
        results = self.connection_cache.get(get_cache_key(query, connection))
        if results is not None:
            logger.debug("Found %d results in cache", len(results))
            return set(results)
        query_entities = self.get_query_entities(query)
        if len(query_entities) == 0:
            return set()
        result_ids = self.related.apply_connection(query_entities, connection)
        results = set(self.related.get_names(result) for result in result_ids)
        logger.debug("Found %d results, caching", len(results))
        self.connection_cache[get_cache_key(query, connection)] = list(results)
        return results

    def save_cache(self):
        with open(settings.CONNECTION_CACHE_PATH, 'w') as cache_file:
            json.dump(self.connection_cache, cache_file, indent=4)
        with open(settings.QUERY_ENTITY_CACHE_PATH, 'w') as cache_file:
            json.dump(self.query_entity_cache, cache_file, indent=4)
        self.related.save_cache()
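# Illustration only (not part of the original module): a minimal sketch of
# the expansion step inside Connector.query_search. Subqueries are generated
# as bigrams over the stopword-filtered terms first, then the single terms,
# so multi-word entity names are tried before their parts.
def _expand_subqueries_sketch(query_terms):
    subqueries = [' '.join(query_terms[i:i + 2])
                  for i in range(len(query_terms) - 1)]
    return subqueries + query_terms

# >>> _expand_subqueries_sketch(['jamaica', 'currency'])
# ['jamaica currency', 'jamaica', 'currency']
#
# Each subquery is then matched against the Lucene hits and kept only when
# Levenshtein.ratio(doc_text.lower(), subquery) > 0.8, i.e. near-exact name
# matches such as 'jamaca' vs 'jamaica' (ratio ~0.92).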
def symbol_to_string(symbol):
    # sexpdata parses tokens as Symbol objects; plain strings pass through.
    try:
        return symbol.value()
    except AttributeError:
        return symbol


def get_cache_key(query, connection):
    return '___'.join([query, '|'.join(connection)])


if __name__ == "__main__":
    connector = Connector()
    # justin = 'fb:en.justin_bieber'
    # jaxon = 'fb:m.0gxnnwq'
    with open(settings.DATASET_PATH) as data_file:
        examples = json.load(data_file)
    f1_scores = []
    related_entities = RelatedEntities()
    for example in examples[100:200]:
        query = example['utterance']
        target_data = sexpdata.loads(example['targetValue'])
        targets = [symbol_to_string(description[1])
                   for description in target_data[1:]]
        best_score = 0.0
        for target in targets:
            print query, target
            result_ids = connector.search(query, target)
            print result_ids
            results = [related_entities.get_names(e[0]) for e in result_ids]
            print results
            score = get_f1_score(targets, results)
            if score > best_score:
                best_score = score
        f1_scores.append(best_score)
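    # Hedged summary step (not in the original, which collects f1_scores but
    # never reports them): print the mean best-per-example F1 and persist the
    # caches built during the run.
    if f1_scores:
        print "Mean F1:", sum(f1_scores) / len(f1_scores)
    connector.save_cache()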
# Tests for RelatedEntities (the import path is an assumption).
from related import RelatedEntities


def test_connect_multiple_names():
    related = RelatedEntities()
    results = related.connect_names(['fb:en.france'], ['Pouris', 'Lille'])
    print results
    assert results


def test_entity_score():
    related = RelatedEntities()
    france_score = related.get_entity_score('fb:en.france')
    random_score = related.get_entity_score('fb:m.0hndhfh')
    print "France:", france_score, "other:", random_score
    assert france_score > random_score


def test_chinese_language():
    related = RelatedEntities()
    results = related.search_exact(u"Chinese language")
    print results
    assert results


def test_connect_names():
    related = RelatedEntities()
    results = related.connect_names(['fb:en.france'], ['Paris'])
    print results
    assert results


def test_connect_finds_jamaican_dollar():
    related = RelatedEntities()
    results = related.connect_names(['fb:en.jamaica'], ["Jamaican dollar"])
    print results
    assert results
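# A hedged companion test (not in the original suite): get_cache_key in the
# connector module above is a pure function, so its key format can be pinned
# down exactly. The import path is an assumption.
def test_get_cache_key():
    from connector import get_cache_key
    key = get_cache_key('capital of france', ['fb:location.country.capital'])
    print key
    assert key == 'capital of france___fb:location.country.capital'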