def rank_combined(rankings: Tuple[Ranking, Ranking]) -> Ranking:
    """Combines the text-based and example-based rankings.

    If the two average precisions diverge strongly (their min/max ratio is
    below delta_param), the ranking with the higher AP wins outright;
    otherwise scores are merged as a lambda_param-weighted sum.
    """
    lambda_param = D('0.5')
    delta_param = D('0.1')
    combined_ranking: DefaultDict[URIRef, D] = defaultdict(D)

    ranking_text, ranking_example = rankings
    ap_example, ranking_example_data = ranking_example
    ap_text, ranking_text_data = ranking_text

    overlap = min(ap_example, ap_text) / max(ap_example, ap_text)
    L.info("Overlap = %s", overlap)

    if overlap < delta_param and ap_example > ap_text:
        return ap_example, ranking_example_data
    elif overlap < delta_param and ap_example < ap_text:
        return ap_text, ranking_text_data
    else:
        for v, entity in ranking_example_data:
            combined_ranking[entity] += v * lambda_param
        for v, entity in ranking_text_data:
            combined_ranking[entity] += v * (1 - lambda_param)

    return D(1), [(v, k) for k, v in sorted(
        combined_ranking.items(), key=lambda item: item[1], reverse=True)]
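# Usage sketch (illustrative only): `ranking_text` and `ranking_example` are
# assumed to be Ranking tuples of the form (average_precision,
# [(score, entity), ...]) as returned by rank() below.
#
#     ranking_text = rank_text_based(graph, query, entities)
#     ranking_example = rank_examples_based(graph, query, entities)
#     ap, combined = rank_combined((ranking_text, ranking_example))
#     top_score, top_entity = combined[0]  # best-matching entity comes first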
def _example_retrieval_model(preparsed_data: Dict[Triple, D], graph: PPGraph,
                             entity: URIRef) -> D:
    """Rates an entity represented as a set of triples.

    The rate is based on the similarity of the sets.

    Args:
        preparsed_data: preparsed example entities
        graph: RDF triples to use (the graph represents the whole world we know about)
        entity: RDF entity to rank

    Returns:
        Probability
    """
    L.debug('Computing example-based probability for %s', entity)

    # sanity checks
    assert isinstance(graph, PPGraph), 'graph is not PPGraph'
    assert isinstance(entity, URIRef), ['entity is not URIRef', entity]

    # get the set representation of the entity, e_l
    representation = _triples_set_representation(graph, entity)

    # P(e_l | theta_X) = sum(tr in X) P(e_l|tr) * P(tr|theta_X)
    # P(e_l|tr) = 1 if tr in e_l else 0
    # P(tr|theta_X) values are in preparsed_data
    final_probability = D(0)
    for tr in preparsed_data.keys():
        if tr in representation:
            final_probability += preparsed_data[tr]

    L.debug('Probability: %s', final_probability)
    return final_probability
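# Worked example (hypothetical numbers): suppose preparsing produced
# P(tr|theta_X) = {tr1: 0.5, tr2: 0.3, tr3: 0.2} and the entity's set
# representation contains tr1 and tr3 but not tr2. Then
# P(e_l | theta_X) = 0.5 + 0.2 = 0.7 -- only triples shared with the
# examples contribute, so entities overlapping more with the examples
# score higher.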
def test_ppgraph(data_urls: List[str]):
    """Smoke-tests PPGraph: SPARQL queries, triples() and the
    subject/predicate/object helpers."""
    for data_url in data_urls:
        L.info('Test with %s', data_url)
        graph = load_data(data_url)

        # can query
        results = graph.query(PREFIXES + '''SELECT DISTINCT ?s WHERE {
                ?s rdf:type foaf:Person
            } LIMIT 10''')
        assert len(results) == 10

        # triples
        t_subject = URIRef('http://dbpedia.org/resource/John_Markoff')
        t_predicate = RDF.type
        t_object = URIRef('http://dbpedia.org/class/yago/LivingThing100004258')
        for s, p, o in graph.triples((t_subject, t_predicate, t_object)):
            assert s == t_subject
            assert p == t_predicate
            assert o == t_object

        # subject/predicate/object helpers
        for p, o in graph.predicate_objects(t_subject):
            assert isinstance(p, (URIRef, Literal, BNode))
            assert isinstance(o, (URIRef, Literal, BNode))
def parse(self, *args, **kwargs):
    """Parses a local file into the graph, switching the backend from a
    remote SPARQLStore to a local ConjunctiveGraph if necessary."""
    if isinstance(self.store, SPARQLStore):
        L.warning(
            'Switching PPGraph backend from remote endpoint to local files')
        self.store = ConjunctiveGraph()
        self._size = None  # will need to be recomputed
    return self.store.parse(*args, **kwargs)
def get_and_store_data(sparql_endpoint: str, out_filename: str,
                       entities: List[URIRef]):
    """Query remote endpoint for triples and save them in a local file.

    For every entity in the list dump triples like:
        entity -> w/e -> Literal
        entity -> w/e -> URI
        URI -> label -> Literal
        w/e -> w/e -> entity
    """
    entities_amount = len(entities)
    L.info('Getting data from remote endpoint "%s" for %d entities',
           sparql_endpoint, entities_amount)

    # load the endpoint
    try:
        graph = load_data(sparql_endpoint)
    except Exception as e:
        L.error('Error when loading data from `%s`: %s', sparql_endpoint, e)
        return

    # get triples for every entity in the list
    for i, entity in enumerate(entities):
        L.info('%d / %d', i, entities_amount)
        result = []

        # entity -> w/e -> w/e
        for triple_predicate, triple_object in graph.predicate_objects(
                subject=entity):
            # skip blank nodes
            if isinstance(triple_object, BNode):
                continue
            tr = (entity, triple_predicate, triple_object)
            result.append(tr)

            # URI -> label -> Literal
            if isinstance(triple_object, URIRef):
                try:
                    label = graph.label(triple_object)
                    if label:
                        result.append((triple_object, RDFS.label, label))
                except Exception:
                    pass

        # w/e -> w/e -> entity
        for triple_subject, triple_predicate in graph.subject_predicates(
                object=entity):
            # skip blank nodes
            if isinstance(triple_subject, BNode):
                continue
            result.append((triple_subject, triple_predicate, entity))

        L.debug('Saving %d triples', len(result))
        with open(out_filename, 'a', encoding='utf8') as f:
            for tr in result:
                f.write(' '.join(map(n3_format, tr)))
                f.write(' <http://dbpedia.org/>')
                f.write(' .\n')
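# Each saved line is an N-Quads statement whose graph/context is hardcoded to
# <http://dbpedia.org/>. An illustrative (made-up) output line:
#
#   <http://dbpedia.org/resource/John_Markoff> <http://www.w3.org/2000/01/rdf-schema#label> "John Markoff"@en <http://dbpedia.org/> .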
def _examples_preparsing(graph: PPGraph, input_data: Query) -> Dict[Triple, D]:
    """Converts example entities to frequencies (numbers of occurrences).

    Most of the final probability depends only on the examples,
    so we need to compute it only once (not for every entity to rank).
    """
    # unpack query
    _, examples = input_data
    L.debug('Preparsing data for %d examples', len(examples))

    # get set representations
    examples_representations = []
    for example in examples:
        examples_representations.append(
            _triples_set_representation(graph, example))

    # n(tr, x) = 1 if tr in x else 0
    # denominator of P(tr|theta_X) = sum(tr in all x in X) sum(x in X) n(tr, x)
    denominator = D(0)
    for example_representation in examples_representations:
        for tr in example_representation:
            for x in examples_representations:
                if tr in x:
                    denominator += 1
    L.debug('Denominator: %s', denominator)

    # P(e_l | theta_X) = sum(tr in X) P(e_l|tr) * P(tr|theta_X)
    # P(tr|theta_X) = sum(x in X) n(tr, x) / denominator
    # we can precompute P(tr|theta_X)
    preparsed_examples = dict()
    for example_representation in examples_representations:
        for tr in example_representation:
            numerator = D(0)
            for x in examples_representations:
                if tr in x:
                    numerator += 1
            preparsed_examples[tr] = numerator / denominator

    L.debug('-' * 20)
    return preparsed_examples
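# Worked example (hypothetical): with two examples X = {x1, x2} where
# x1 = {tr1, tr2} and x2 = {tr1}, the denominator counts n(tr, x) once per
# (representation, triple) pair: tr1 via x1 contributes 2, tr2 via x1
# contributes 1, and tr1 via x2 contributes 2 again, so denominator = 5.
# The precomputed values are then P(tr1|theta_X) = 2/5 and P(tr2|theta_X) = 1/5.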
def _triples_set_representation(graph: PPGraph, entity: URIRef) -> Set[Triple]:
    """Creates the set representation of the entity.

    The set contains all triples that have the entity as a subject
    (outlinks) or as an object (inlinks).

    Args:
        graph: RDF triples to use (the graph represents the whole world we know about)
        entity: RDF entity to rank

    Returns:
        set of RDF triples
    """
    L.debug('Computing triples set representation of %s', entity)

    # sanity checks
    assert isinstance(graph, PPGraph), ['graph is not PPGraph', graph]
    assert isinstance(entity, URIRef), ['entity is not URIRef', entity]

    result = set()

    # outlinks
    for triple_predicate, triple_object in graph.predicate_objects(
            subject=entity):
        if isinstance(triple_object, Literal):
            # literal-valued triples are stored with a None subject, so
            # identical attributes of different entities compare as equal
            result.add((None, triple_predicate, triple_object))
        elif isinstance(triple_object, URIRef):
            result.add((entity, triple_predicate, triple_object))
    outlinks = len(result)
    L.debug('%s-> outlinks: %s', ' ' * 4, outlinks)

    # inlinks
    for triple_subject, triple_predicate in graph.subject_predicates(
            object=entity):
        result.add((triple_subject, triple_predicate, entity))
    L.debug('%s-> inlinks: %s', ' ' * 4, len(result) - outlinks)

    return result
def rank(input_data: Query, preparsing_function: PreparsingFunc,
         retrieval_model: RetrievalModel, graph: PPGraph,
         entities_to_rank: List[URIRef]) \
        -> Ranking:
    """Rates entities based on the provided model and input query.

    Args:
        input_data: query, it is passed to preparsing_function
        preparsing_function: function that takes input_data and returns
            preparsed data for retrieval_model
        retrieval_model: function implementing the rating
        graph: RDF triples to use
        entities_to_rank: list of entities that should be rated

    Returns:
        Tuple (average precision, ranking), where the ranking is a sorted
        list of (rate, entity) tuples, best-matching entities first
    """
    _, examples = input_data
    entities_to_rank_amount = len(entities_to_rank)
    entities_to_rank_progress = max(1, entities_to_rank_amount // 10)
    L.info('Ranking %d entities', entities_to_rank_amount)

    # preparse before the loop for efficiency
    preparsed_data = preparsing_function(graph, input_data)

    ranking_score: D

    # do the ranking
    ranking: List[Tuple[D, URIRef]] = []
    for i, entity in enumerate(entities_to_rank):
        if i % entities_to_rank_progress == 0:
            L.info(' ~> ranking entity no %d / %d', i,
                   entities_to_rank_amount)

        # score the entity
        ranking_score = retrieval_model(preparsed_data, graph, entity)

        # insert and keep sorted
        bisect.insort_right(ranking, (ranking_score, entity))
        L.debug('-' * 20)

    # min/max normalization + best scored first
    max_val = ranking[-1][0]
    min_val = ranking[0][0]
    norm_denominator = max_val - min_val

    # rank the examples themselves, for future use in the combined approach
    ranking_with_examples = ranking[:]
    for i, entity in enumerate(examples):
        example_ranking = retrieval_model(preparsed_data, graph, entity)
        bisect.insort_right(ranking_with_examples, (example_ranking, entity))
    ranking_with_examples = ranking_with_examples[::-1]

    retrieved_with_examples: List[bool] = []
    count_found_examples = 0
    for _, entity in ranking_with_examples:
        # assumed amount of relevant entities
        if count_found_examples == 10:
            break
        if entity in examples:
            count_found_examples += 1
            retrieved_with_examples.append(True)
        else:
            retrieved_with_examples.append(False)

    # average precision
    ap = statistical_stats(retrieved_with_examples)['AvgPrec']

    L.info(" ~> normalization min = %s, max = %s", min_val, max_val)
    L.info(" ~> AP = %s", ap)

    return ap, [((v - min_val) / norm_denominator, entity)
                for v, entity in ranking[::-1]]
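# Normalization sketch (hypothetical numbers): raw scores [2, 5, 10] give
# min_val = 2 and max_val = 10, so the returned scores are
# [(10-2)/8, (5-2)/8, (2-2)/8] = [1, 0.375, 0], best first. Note that if
# every entity scored the same, norm_denominator would be zero; callers are
# assumed to supply scores with at least two distinct values.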
def _text_representation(graph: PPGraph,
                         entity: URIRef) -> Dict[str, DefaultDict[str, int]]:
    """Creates the text representation of the entity.

    The entity is represented with triples that have the entity as a subject.
    Such triples are divided into:
    - attributes: with literal objects
    - types: with 'type' predicates like /subject or /22-rdf-syntax-ns#type
    - links: all others

    Finally, all URIs are expanded to text via the rdfs:label predicate.

    Args:
        graph(PPGraph)
        entity(URIRef)

    Returns:
        dict with keys: attributes, types, links;
        values are term counters built from literals and rdf labels
    """
    L.debug('Computing text representation of %s', entity)

    # sanity checks
    assert isinstance(graph, PPGraph), ['graph is not PPGraph', graph]
    assert isinstance(entity, URIRef), ['entity is not URIRef', entity]

    # URIs treated as type predicates
    type_uris = (RDF.type,
                 URIRef('http://www.w3.org/2004/02/skos/core#subject'),
                 URIRef('http://purl.org/dc/elements/1.1/subject'))

    # term counters for each representation
    attributes: DefaultDict[str, int] = defaultdict(int)
    types: DefaultDict[str, int] = defaultdict(int)
    links: DefaultDict[str, int] = defaultdict(int)

    entities_without_label = 0

    # stop early once every representation holds at least `threshold` terms
    threshold = 999

    # iterate over all triples with the entity as the subject
    for triple_predicate, triple_object in graph.predicate_objects(entity):
        cs_to_use = None
        value_to_use = None

        if isinstance(triple_object, Literal):
            cs_to_use = attributes
            value_to_use = triple_object
        elif isinstance(triple_object, URIRef):
            value_to_use = graph.label(triple_object)
            if not value_to_use or len(value_to_use) == 0:
                entities_without_label += 1
                continue
            if triple_predicate in type_uris:
                cs_to_use = types
            else:
                cs_to_use = links
        else:
            continue

        for o in normalize_relation(value_to_use).split():
            cs_to_use[o] += 1

        if all([
                sum(cs.values()) >= threshold
                for cs in [attributes, types, links]
        ]):
            break

    result = {'attributes': attributes, 'types': types, 'links': links}

    if entities_without_label > 0:
        L.debug('%d skipped, because of missing label',
                entities_without_label)
    L.debug(
        'Found: %s, %s, %s', *[
            ' '.join([str(sum(cs.values())), 'terms in', cs_name])
            for cs_name, cs in result.items()
        ])
    return result
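# Illustrative (made-up) result for some entity:
#
#   {'attributes': {'john': 1, 'markoff': 1},
#    'types': {'person': 1, 'journalist': 1},
#    'links': {'new': 2, 'york': 2, 'times': 1}}
#
# Terms are whitespace-split, normalized words from literals and labels,
# mapped to their occurrence counts.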
def _text_retrieval_model(preparsed_data: Tuple[List[str], int],
                          graph: PPGraph, entity: URIRef) -> D:
    """Rates entity represented as text.

    The rate is equal to the probability of the entity being relevant to
    the relation. The probability formula is based on a language modeling
    approach.

    The Dirichlet model computation is based on:
    http://mlwiki.org/index.php/Smoothing_for_Language_Models#Dirichlet_Prior_Smoothing
    https://www.coursera.org/lecture/text-retrieval/lesson-4-6-smoothing-methods-part-1-kM6Ie
    http://ciir.cs.umass.edu/pubfiles/ir-445.pdf (4)
    http://profsite.um.ac.ir/~monsefi/machine-learning/pdf/Machine-Learning-Tom-Mitchell.pdf

    Args:
        preparsed_data: preparsed relation, precomputed Dirichlet parameters
        graph: RDF triples to use (the graph represents the whole world we know about)
        entity: RDF entity to rank

    Returns:
        Probability
    """
    L.debug('Computing text-based probability for %s', entity)

    # sanity checks
    assert isinstance(graph, PPGraph), 'graph is not PPGraph'
    assert isinstance(entity, URIRef), ['entity is not URIRef', entity]

    # unpack input data
    relation, ni = preparsed_data

    # get text representations of the entity, theta_e
    representations = _text_representation(graph, entity)

    # precompute numbers of terms
    representations_lengths = {
        cs_name: sum(cs.values())
        for cs_name, cs in representations.items()
    }

    # denominator of the "Dirichlet smoothed model of the entire collection
    # of triples":
    #   P(t|theta_c) = sum(D in theta_c) tf(t,D) / sum(D in theta_c) |D|
    #   D = node text representation
    #   theta_c = collection of nodes
    # we do not compute that, too time consuming
    #
    # probability_collection_denominator = D(0)
    # for node in graph.all_nodes():
    #     if isinstance(node, Literal):
    #         triple_object_text = node
    #     else:
    #         triple_object_text = graph.label(node)
    #     probability_collection_denominator += len(normalize_relation(triple_object_text))

    # P(t|theta_c) should depend on the term t,
    # but assume it is 1/ni, according to (4), page 182
    probability_collection = D(1) / D(ni)

    # these weights are experimental
    representation_weights = {
        'attributes': D('0.4'),
        'types': D('0.4'),
        'links': D('0.2')
    }

    # P(R | theta_e) = product(t in R) P(t | theta_w_e)
    final_probability = D('1.0')
    for t in relation:
        L.debug('%s-> processing term %s', ' ' * 4, repr(t))

        # P(t | theta_w_e) = sum(cs in representations) P(t | theta_cs_e) * P(cs)
        term_probability = D('0.0')
        for cs_name, cs in representations.items():
            # tf(t,e) is the term frequency of t in the representation document of e
            # http://mlwiki.org/index.php/TF-IDF#Term_Frequency
            tf = cs[t]

            # "Dirichlet smoothed model of the entire collection of triples"
            # P(t|theta_c) = sum(D in theta_c) tf(t,D) / sum(D in theta_c) |D|
            # we do not compute that, too time consuming
            #
            # probability_collection_nominator = D(0)
            # for node in graph.all_nodes():
            #     if isinstance(node, Literal):
            #         triple_object_text = node
            #     else:
            #         triple_object_text = graph.label(node)
            #     probability_collection_nominator += normalize_relation(triple_object_text).count(t)
            # probability_collection = probability_collection_nominator / probability_collection_denominator

            # P(t | theta_cs_e) = [tf(t,e) + ni*P(t|theta_c)] / [|e| + ni]
            representation_probability = D(tf + ni * probability_collection)
            representation_probability /= representations_lengths[cs_name] + ni

            L.debug('%s-> probability for %s: %s (tf=%d, |e|=%d)', ' ' * 8,
                    cs_name, representation_probability.quantize(D_PREC), tf,
                    representations_lengths[cs_name])

            # do the addition
            term_probability += representation_probability * \
                representation_weights[cs_name]

        L.debug('%s-> term probability: %s', ' ' * 8,
                term_probability.quantize(D_PREC))

        # do the multiplication
        final_probability *= term_probability

    L.debug('Probability: %s', final_probability)
    return final_probability
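# Worked example (hypothetical numbers): with ni = 100, P(t|theta_c) = 1/100,
# tf(t,e) = 3 in `attributes` and |e| = 47 terms there:
#
#   P(t | theta_attributes_e) = (3 + 100 * 0.01) / (47 + 100) = 4 / 147 ~ 0.0272
#
# Multiplied by the weight 0.4, `attributes` contributes ~0.0109 to the term
# probability; `types` and `links` are added analogously.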
def main():
    # cmd line args
    parser = argparse.ArgumentParser(
        description='Capture data from remote SPARQL endpoint '
        'and save it to a local file in N-Quads format')
    parser.add_argument('filename', help='File to save data in')
    parser.add_argument(
        'sample_file',
        help='YAML file with entities as list of URIs under `sample_key` key')
    parser.add_argument(
        'sample_key',
        nargs='?',
        default='relevant',
        help='YAML key for entities list')
    parser.add_argument(
        '-e',
        '--endpoint',
        dest='sparql_endpoint',
        default=SPARQL_ENDPOINT,
        help='SPARQL endpoint url')
    parser.add_argument(
        "-v", "--verbose", help="debug output", action="store_true")

    # args parsing and sanity checks
    args = parser.parse_args()

    L.setLevel('INFO')
    if args.verbose:
        L.setLevel('DEBUG')

    if isfile(args.filename):
        L.warning('File `%s` exists, will append to it!', args.filename)

    if not isfile(args.sample_file):
        L.error('File `%s` does not exist, aborting!', args.sample_file)
        exit(1)

    try:
        with open(args.sample_file, 'r', encoding='utf8') as f:
            sample_data = safe_load(f)
    except YAMLError as e:
        L.error('Error loading sample file `%s`: %s', args.sample_file, e)
        exit(1)

    if not isinstance(sample_data, dict):
        L.error('Sample data must be a dictionary!')
        exit(1)

    if args.sample_key not in sample_data.keys():
        L.error('`%s` key not found in sample data', args.sample_key)
        exit(1)

    # do the job
    entities: List[URIRef] = list(map(URIRef, sample_data[args.sample_key]))
    get_and_store_data(args.sparql_endpoint, args.filename, entities)
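# Usage sketch (the script filename is hypothetical):
#
#   python get_data.py dump.nq sample.yml relevant -e https://dbpedia.org/sparql
#
# appends triples for every URI listed under the `relevant` key of
# sample.yml to dump.nq.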
def data_from_sample_file(sample_file: str) -> \
        Tuple[str, List[URIRef], List[URIRef], List[URIRef]]:
    """Parses a sample file"""
    L.info('Preparing ranking for sample file `%s`', sample_file)

    if not isfile(sample_file):
        L.error('File `%s` does not exist, aborting!', sample_file)
        raise SyntaxError

    try:
        with open(sample_file, 'r', encoding='utf8') as f:
            sample_data = safe_load(f)
    except (UnicodeDecodeError, YAMLError) as e:
        L.error('Error loading sample file `%s`: %s', sample_file, e)
        raise SyntaxError

    if not isinstance(sample_data, dict):
        L.error('Sample data must be a dictionary!')
        raise SyntaxError

    for required_key in ['topic', 'relevant', 'not_relevant']:
        if required_key not in sample_data.keys():
            L.error('`%s` key not found in sample data', required_key)
            raise SyntaxError

    # convert strings to URIRefs and prepare data
    relevant = list(map(URIRef, sample_data['relevant']))
    not_relevant = list(map(URIRef, sample_data['not_relevant']))

    if len(relevant) == 0:
        L.error('No relevant entities specified in the sample data')
        raise SyntaxError

    examples_amount = EXAMPLES_AMOUNT
    random_examples = True
    if 'examples' in sample_data:
        try:
            examples_amount = int(sample_data['examples'])
            random_examples = False
            L.info('Using top %d entities as examples', examples_amount)
        except Exception as e:
            L.error('Error reading amount of examples from YAML file: %s', e)

    if len(relevant) <= examples_amount:
        L.warning(
            'There are only %d relevant entities in sample data `%s`, '
            'trimming the amount of examples', len(relevant), sample_file)

    # select random examples from the relevant entities
    if random_examples:
        shuffle(relevant)

    # prepare entities
    examples = relevant[:examples_amount]
    relevant = relevant[examples_amount:]
    entities_to_rank = relevant[:] + not_relevant[:]

    return sample_data['topic'], examples, entities_to_rank, relevant
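# Illustrative sample file (URIs are placeholders; `examples` is optional and,
# when present, selects the top entries instead of a random sample):
#
#   topic: American journalists
#   examples: 3
#   relevant:
#     - http://dbpedia.org/resource/John_Markoff
#     - ...
#   not_relevant:
#     - ...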
def load_data(data_url: str, old_graph: Optional[PPGraph] = None) -> PPGraph:
    """Creates a new PPGraph or adds triples to the provided one.

    Args:
        data_url: path to an RDF file or URL of a SPARQL endpoint,
            passing a URL will invalidate old_graph
        old_graph: existing graph, will add triples to it

    Returns:
        Graph with triples loaded from data_url
        (lazy loaded in case of a SPARQL endpoint)
    """
    if old_graph:
        graph = old_graph
    else:
        graph = PPGraph(ConjunctiveGraph())

    if isfile(data_url):
        L.info('Loading triples from file `%s`', data_url)
        data_format = guess_format(data_url)
        graph.parse(data_url, format=data_format)
    elif isdir(data_url):
        L.info('Loading triples from files in directory `%s`', data_url)
        for extension in TRIPLE_FILE_EXTENSIONS:
            triples_files = glob(f'{data_url}/*.{extension}')
            if len(triples_files) > 0:
                L.info('Found %d `.%s` files', len(triples_files), extension)
            for i, triples_file in enumerate(triples_files):
                data_format = guess_format(triples_file)
                L.debug('%d / %d (`%s`), data format: %s', i,
                        len(triples_files), triples_file, data_format)
                graph.parse(triples_file, format=data_format)
    else:
        L.info('Using remote graph from SPARQL endpoint `%s`', data_url)
        graph = PPGraph(SPARQLStore(data_url))

        # fail early
        try:
            graph.query('''SELECT DISTINCT ?s WHERE {
                    ?s rdf:type foaf:Person
                } LIMIT 1''')
        except Exception as e:
            L.error("Can't load data from remote endpoint")
            raise e

    return graph
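# Usage sketch (paths are illustrative): the same entry point handles a
# single file, a directory of triple files, or a SPARQL endpoint, and an
# existing graph can be extended with more local data.
#
#     graph = load_data('./pp_data/dump.nq')           # single file
#     graph = load_data('./pp_data/', graph)           # every triple file in a dir
#     graph = load_data('https://dbpedia.org/sparql')  # remote, lazy-loaded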
def statistical_stats(retrieved: List[bool]) -> Dict[str, D]:
    """Computes R-Precision and average precision for a list of flags
    marking whether the entity at each rank position was relevant."""
    r_precision = D(0)
    avg_prec = D(0)
    if len(retrieved) != 0:
        r_precision = D(sum(retrieved)) / len(retrieved)
        relevant_so_far = D(0)
        for i, is_relevant in enumerate(retrieved, 1):
            if is_relevant:
                relevant_so_far += 1
                avg_prec += relevant_so_far / i
        avg_prec /= len(retrieved)
    return {'R-Precision': r_precision, 'AvgPrec': avg_prec}


if __name__ == '__main__':
    L.setLevel('DEBUG')
    L.info('Running utils.py tests')

    # remote
    data_urls_to_test = [SPARQL_ENDPOINT]

    # one file
    local_files = glob('./pp_data/*.nq')
    if local_files:
        data_urls_to_test.append(local_files[0])

    # all files in a directory
    data_urls_to_test.append('./pp_data/')

    test_ppgraph(data_urls_to_test)
    L.info('Passed')
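# Worked example: retrieved = [True, False, True] gives
#   R-Precision = 2/3
#   AvgPrec = (1/1 + 2/3) / 3 = 5/9
# precision is accumulated only at the relevant rank positions (1 and 3).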
def evaluation(graph: PPGraph, evaluation_data: str):
    """Runs all three ranking methods on every sample file and prints
    per-sample and mean statistics."""
    samples = glob(path_join(evaluation_data, '*.yml'))

    # collect all entities
    entities_to_rank_unique: Set[URIRef] = set()
    for sample_file in samples:
        try:
            _, examples, entities_to_rank_part, _ = data_from_sample_file(
                sample_file)
        except SyntaxError:
            L.error('Error when loading data')
            return
        entities_to_rank_unique.update(examples)
        entities_to_rank_unique.update(entities_to_rank_part)
    entities_to_rank: List[URIRef] = list(entities_to_rank_unique)

    mean_stats: Dict[str, DefaultDict[str, D]] = {
        'text': defaultdict(D),
        'examples': defaultdict(D),
        'combined': defaultdict(D)
    }
    mean_stats_denominator = {'text': 0, 'examples': 0, 'combined': 0}

    # do the ranking for every sample file
    for sample_file in samples:
        print(f'Stats for `{sample_file}`:')
        try:
            topic, examples, _, relevant = data_from_sample_file(sample_file)
        except Exception as e:
            L.error('Error when loading data: %s', e)
            return

        entities_to_rank_wo_examples = entities_to_rank[:]
        for example in examples:
            if example in entities_to_rank_wo_examples:
                entities_to_rank_wo_examples.remove(example)

        ranking_text = rank_text_based(graph, (topic, examples),
                                       entities_to_rank_wo_examples)
        ranking_example = rank_examples_based(graph, (topic, examples),
                                              entities_to_rank_wo_examples)
        ranking_combined = rank_combined((ranking_text, ranking_example))
        rankings = {
            'text': ranking_text[1],
            'examples': ranking_example[1],
            'combined': ranking_combined[1]
        }

        # evaluate every ranking
        for ranking_type in mean_stats.keys():
            print(f'  Ranking with `{ranking_type}-based` method')

            # how many top entities we would return in the ideal case,
            # the paper sets this to 100
            evaluation_limit = len(relevant)

            retrieved: List[bool] = []
            for i, (ranking_score, entity) in enumerate(
                    rankings[ranking_type]):
                if i < evaluation_limit:
                    if entity in relevant:
                        retrieved.append(True)
                        print(f'OO {entity} - {ranking_score}')
                    else:
                        retrieved.append(False)
                        print(f'xx {entity} - {ranking_score}')
                else:
                    break

            stats = statistical_stats(retrieved)
            for k, v in stats.items():
                print(f'    {k} -> {v.quantize(D_PREC)}')
                mean_stats[ranking_type][k] += v
            mean_stats_denominator[ranking_type] += 1

    print('Mean stats:')
    for ranking_type in mean_stats.keys():
        print(f'  Ranking with `{ranking_type}-based` method')
        for k, v in mean_stats[ranking_type].items():
            print(
                f'    Mean-{k} -> {(v / mean_stats_denominator[ranking_type]).quantize(D_PREC)}'
            )
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Evaluate the ebes library.')
    parser.add_argument(
        'evaluation_data',
        help='Path to directory with triple files (.nq) and sample files (.yml)'
    )
    parser.add_argument(
        "-v", "--verbose", help="debug output", action="store_true")
    args = parser.parse_args()

    L.setLevel('WARNING')
    if args.verbose:
        L.setLevel('DEBUG')

    print('Loading graphs...')
    graph = load_graph(args.evaluation_data)

    evaluation(graph, args.evaluation_data)