Example #1
def rank_combined(rankings: Tuple[Ranking, Ranking]) -> Ranking:
    lambda_param = D('0.5')
    delta_param = D('0.1')

    combined_ranking: DefaultDict[URIRef, D] = defaultdict(D)

    ranking_text, ranking_example = rankings
    ap_example, ranking_example_data = ranking_example
    ap_text, ranking_text_data = ranking_text
    overlap = min(ap_example, ap_text) / max(ap_example, ap_text)

    L.info("Overlap = %s", overlap)

    if overlap < delta_param and ap_example > ap_text:
        return ap_example, ranking_example_data

    elif overlap < delta_param and ap_example < ap_text:
        return ap_text, ranking_text_data

    else:
        for v, entity in ranking_example_data:
            combined_ranking[entity] += v * lambda_param

        for v, entity in ranking_text_data:
            combined_ranking[entity] += v * (1 - lambda_param)

        return D(1), [(v, k) for k, v in sorted(
            combined_ranking.items(), key=lambda item: item[1], reverse=True)]
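
For illustration, a minimal usage sketch (entity names and scores are made up; plain strings stand in for URIRefs). With delta_param = 0.1, the two average precisions below overlap enough (0.4 / 0.5 = 0.8 >= 0.1) that both rankings are interpolated with weight lambda_param = 0.5:

from decimal import Decimal as D

ranking_text = (D('0.5'), [(D('1.0'), 'e1'), (D('0.2'), 'e2')])
ranking_example = (D('0.4'), [(D('0.8'), 'e2'), (D('0.1'), 'e1')])

# e1 -> 0.1 * 0.5 + 1.0 * 0.5 = 0.55
# e2 -> 0.8 * 0.5 + 0.2 * 0.5 = 0.50
ap, combined = rank_combined((ranking_text, ranking_example))
assert [entity for _, entity in combined] == ['e1', 'e2']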
Example #2
def _example_retrieval_model(preparsed_data: Dict[Triple, D], graph: PPGraph,
                             entity: URIRef):
    """Rates entity represented as set of triples.

    Rate is based on the similarity of sets.

    Args:
        preparsed_data: preparsed example entities
        graph: RDF triples to use (graph represents whole world we know about)
        entity: RDF entity to rank

    Returns:
        Probability
    """
    L.debug('Computing example-based probability for %s', entity)

    # sanity checks
    assert isinstance(graph, PPGraph), 'graph is not PPGraph'
    assert isinstance(entity, URIRef), ['entity is not URIRef', entity]

    # get set representations of the entity, e_l
    representation = _triples_set_representation(graph, entity)

    # P(e_l | theta_X) = sum(tr in X) P(e_l|tr) * P(tr|theta_X)
    # P(e_l|tr) = 1 if tr in e_l else 0
    # P(tr|theta_X) are in preparsed_data
    final_probability = D(0)
    for tr in preparsed_data.keys():
        if tr in representation:
            final_probability += preparsed_data[tr]

    L.debug('Probability: %s', final_probability)
    return final_probability
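
Stripped of the RDF machinery, the loop above just sums the precomputed P(tr|theta_X) mass over the triples the entity shares with the examples. A toy computation under that reading (t1..t3 are hypothetical triples):

from decimal import Decimal as D

preparsed_data = {'t1': D('0.5'), 't2': D('0.3'), 't3': D('0.2')}
representation = {'t1', 't3', 't9'}  # entity shares t1 and t3 with the examples

# P(e_l|theta_X) = 0.5 + 0.2 = 0.7
assert sum((p for tr, p in preparsed_data.items() if tr in representation), D(0)) == D('0.7')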
Example #3
def test_ppgraph(data_urls: List[str]):
    for data_url in data_urls:
        L.info('Test with %s', data_url)
        graph = load_data(data_url)

        # can query
        results = graph.query(PREFIXES + '''SELECT DISTINCT ?s 
                   WHERE { 
                      ?s rdf:type foaf:Person
                   } LIMIT 10''')
        assert len(results) == 10

        # triples
        t_subject = URIRef('http://dbpedia.org/resource/John_Markoff')
        t_predicate = RDF.type
        t_object = URIRef('http://dbpedia.org/class/yago/LivingThing100004258')
        for s, p, o in graph.triples((t_subject, t_predicate, t_object)):
            assert s == t_subject
            assert p == t_predicate
            assert o == t_object

        # subject/predicate/object helpers
        for p, o in graph.predicate_objects(t_subject):
            assert isinstance(p, (URIRef, Literal, BNode))
            assert isinstance(o, (URIRef, Literal, BNode))
Example #4
def parse(self, *args, **kwargs):
    if isinstance(self.store, SPARQLStore):
        L.warning(
            'Switching PPGraph backend from remote endpoint to local files'
        )
        self.store = ConjunctiveGraph()
    self._size = None  # will need to recompute that
    return self.store.parse(*args, **kwargs)
Example #5
def get_and_store_data(sparql_endpoint: str, out_filename: str,
                       entities: List[URIRef]):
    """Query remote endpoint for triples and save them in a local file.

    For every entity in the list, dump triples like:
        entity -> any predicate -> Literal
        entity -> any predicate -> URI
        URI -> label -> Literal
        anything -> any predicate -> entity
    """
    entities_amount = len(entities)
    L.info('Getting data from remote endpoint "%s" for %d entities',
           sparql_endpoint, entities_amount)

    # load the endpoint
    try:
        graph = load_data(sparql_endpoint)
    except Exception as e:
        L.error('Error when loading data from `%s`: %s', sparql_endpoint, e)
        return

    # get triples for every entity in the list
    for i, entity in enumerate(entities):
        L.info('%d / %d', i, entities_amount)
        result = []

        # entity -> any predicate -> any object
        for triple_predicate, triple_object in graph.predicate_objects(
                subject=entity):
            # skip blank nodes
            if isinstance(triple_object, BNode):
                continue

            tr = (entity, triple_predicate, triple_object)
            result.append(tr)

            # URI -> label -> Literal
            if isinstance(triple_object, URIRef):
                try:
                    label = graph.label(triple_object)
                    if label:
                        result.append((triple_object, RDFS.label, label))
                except Exception:
                    pass

        # anything -> any predicate -> entity
        for triple_subject, triple_predicate in graph.subject_predicates(
                object=entity):
            # skip blank nodes
            if isinstance(triple_subject, BNode):
                continue

            result.append((triple_subject, triple_predicate, entity))

        L.debug('Saving %d triples', len(result))
        with open(out_filename, 'a', encoding='utf8') as f:
            for tr in result:
                f.write(' '.join(map(n3_format, tr)))
                f.write(' <http://dbpedia.org/> .\n')
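
Each line written to out_filename is one N-Quads statement with a fixed graph label, assuming n3_format renders nodes in N3/N-Quads syntax; an illustrative output line:

# <http://dbpedia.org/resource/John_Markoff> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://xmlns.com/foaf/0.1/Person> <http://dbpedia.org/> .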
Example #6
def _examples_preparsing(graph: PPGraph, input_data: Query) -> Dict[Triple, D]:
    """Convert example entities to frequency (number of occurences).

    Most of the final probability depends only on examples.
    So we need to compute it only once (not for every entity to rank).
    """
    # unpack query
    _, examples = input_data

    L.debug('Preparsing data for %d examples', len(examples))

    # get set representations
    examples_representations = []
    for example in examples:
        examples_representations.append(
            _triples_set_representation(graph, example))

    # n(tr, x) = 1 if tr in x else 0
    # denominator of P(tr|theta_X) = sum(tr in union of X) sum(x in X) n(tr, x)
    denominator = D(0)
    unique_triples = set().union(*examples_representations)
    for tr in unique_triples:
        for x in examples_representations:
            if tr in x:
                denominator += 1
    L.debug('Denominator: %s', denominator)

    # P(e_l | theta_X) = sum(tr in X) P(e_l|tr) * P(tr|theta_X)
    # P(tr|theta_X) = sum(x in X) n(tr, x) / denominator
    # we can precompute P(tr|theta_X)
    preparsed_examples = dict()
    for tr in unique_triples:
        numerator = D(0)
        for x in examples_representations:
            if tr in x:
                numerator += 1
        preparsed_examples[tr] = numerator / denominator

    L.debug('-' * 20)
    return preparsed_examples
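
A worked toy computation of P(tr|theta_X) for two hypothetical example representations; t1 occurs in both examples, so the denominator is 2 + 1 + 1 = 4 and the probabilities sum to one:

from decimal import Decimal as D

x1 = {'t1', 't2'}
x2 = {'t1', 't3'}
unique_triples = x1 | x2

denominator = D(sum(tr in x for tr in unique_triples for x in (x1, x2)))
preparsed = {tr: D(sum(tr in x for x in (x1, x2))) / denominator
             for tr in unique_triples}

assert preparsed['t1'] == D('0.5')      # 2 / 4
assert sum(preparsed.values()) == D(1)  # a proper distribution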
Example #7
def _triples_set_representation(graph: PPGraph, entity: URIRef) -> Set[Triple]:
    """Creates set representation of the entity.

    Set contains all triples that have the entity as a subject (outlinks)
    or an object (inlinks).

    Args:
        graph: RDF triples to use (graph represents whole world we know about)
        entity: RDF entity to rank

    Returns:
        set of RDF triples
    """
    L.debug('Computing triples set representation of %s', entity)

    # sanity checks
    assert isinstance(graph, PPGraph), ['graph is not PPGraph', graph]
    assert isinstance(entity, URIRef), ['entity is not URIRef', entity]

    result = set()

    # outlinks
    for triple_predicate, triple_object in graph.predicate_objects(
            subject=entity):
        if isinstance(triple_object, Literal):
            # store literal-valued triples with a None subject so identical
            # attribute values can match across different entities
            result.add((None, triple_predicate, triple_object))
        elif isinstance(triple_object, URIRef):
            result.add((entity, triple_predicate, triple_object))
    outlinks = len(result)
    L.debug('%s-> outlinks: %s', ' ' * 4, outlinks)

    # inlinks
    for triple_subject, triple_predicate in graph.subject_predicates(
            object=entity):
        result.add((triple_subject, triple_predicate, entity))
    L.debug('%s-> inlinks: %s', ' ' * 4, len(result) - outlinks)

    return result
Example #8
def rank(input_data: Query, preparsing_function: PreparsingFunc,
         retrieval_model: RetrievalModel, graph: PPGraph,
         entities_to_rank: List[URIRef]) -> Ranking:
    """Rates entities based on provided model and input query.

    Args:
        input_data: query, it is passed to preparsing_function
        preparsing_function: function that takes input_data and returns preparsed data for retrieval_model
        retrieval_model: function implementing rating
        graph: RDF triples to use
        entities_to_rank: list of entities that should be rated

    Returns:
        Ordered/sorted list containing tuples: (rate, entity),
        best-matching entities come first
    """
    _, examples = input_data
    entities_to_rank_amount = len(entities_to_rank)
    entities_to_rank_progress = max(1, entities_to_rank_amount // 10)
    L.info('Ranking %d entities', entities_to_rank_amount)

    # preparse before the loop for efficiency
    preparsed_data = preparsing_function(graph, input_data)
    ranking_score: D

    # do the ranking
    ranking: List[Tuple[D, URIRef]] = []
    for i, entity in enumerate(entities_to_rank):
        if i % entities_to_rank_progress == 0:
            L.info(' ~> ranking entity no %d / %d', i, entities_to_rank_amount)

        # score entity
        ranking_score = retrieval_model(preparsed_data, graph, entity)

        # insert and sort
        bisect.insort_right(ranking, (ranking_score, entity))
        L.debug('-' * 20)

    # min/max normalization + best scored first
    max_val = ranking[-1][0]
    min_val = ranking[0][0]
    norm_denominator = max_val - min_val
    if norm_denominator == 0:
        # all entities scored the same; avoid division by zero
        norm_denominator = D(1)

    # rank examples themselves, for future use in combined approach
    ranking_with_examples = ranking[:]
    for i, entity in enumerate(examples):
        example_ranking = retrieval_model(preparsed_data, graph, entity)
        bisect.insort_right(ranking_with_examples, (example_ranking, entity))
    ranking_with_examples = ranking_with_examples[::-1]

    retrived_with_examples: List[bool] = []
    count_found_examples = 0
    for _, entity in ranking_with_examples:
        # assumed amount of relevant entities
        if count_found_examples == 10:
            break

        if entity in examples:
            count_found_examples += 1
            retrived_with_examples.append(True)
        else:
            retrived_with_examples.append(False)

    # average precision
    ap = statistical_stats(retrived_with_examples)['AvgPrec']

    L.info(" ~> normalization min = %s, max = %s", min_val, max_val)
    L.info(" ~> AP = %s", ap)
    return ap, [((v - min_val) / norm_denominator, entity)
                for v, entity in ranking[::-1]]
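
The final min/max normalization maps the best score to 1 and the worst to 0; a toy check with hypothetical scores:

from decimal import Decimal as D

scores = sorted([D('2'), D('5'), D('4')])
min_val, max_val = scores[0], scores[-1]
normalized = [(v - min_val) / (max_val - min_val) for v in scores]
assert normalized == [D(0), D(2) / D(3), D(1)]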
Example #9
def _text_representation(graph: PPGraph,
                         entity: URIRef) -> Dict[str, DefaultDict[str, int]]:
    """Creates text representation of the entity.

    Entity is represented with triples that have the entity as a subject. Such triples
    are then divided into:
        - attributes: with literal objects
        - types: with 'type' predicates like /subject or /22-rdf-syntax-ns#type
        - links: all other
    Finally, all URIs are expanded to text via the rdfs:label predicate.

    Args:
        graph(PPGraph)
        entity(URIRef)

    Returns:
        dict with keys: attributes, types, links;
        values map terms (from literals and rdfs labels) to their counts
    """
    L.debug('Computing text representation of %s', entity)

    # sanity checks
    assert isinstance(graph, PPGraph), ['graph is not PPGraph', graph]
    assert isinstance(entity, URIRef), ['entity is not URIRef', entity]

    # use these URIs for types
    type_uris = (RDF.type,
                 URIRef('http://www.w3.org/2004/02/skos/core#subject'),
                 URIRef('http://purl.org/dc/elements/1.1/subject'))

    # term counts for each part of the representation
    attributes: DefaultDict[str, int] = defaultdict(int)
    types: DefaultDict[str, int] = defaultdict(int)
    links: DefaultDict[str, int] = defaultdict(int)
    entities_without_label = 0

    # stop once each representation part has at least `threshold` terms
    threshold = 999

    # iterate over all triples with the entity as the subject
    for triple_predicate, triple_object in graph.predicate_objects(entity):
        cs_to_use = None
        value_to_use = None

        if isinstance(triple_object, Literal):
            cs_to_use = attributes
            value_to_use = triple_object

        elif isinstance(triple_object, URIRef):
            value_to_use = graph.label(triple_object)
            if not value_to_use or len(value_to_use) == 0:
                entities_without_label += 1
                continue

            if triple_predicate in type_uris:
                cs_to_use = types
            else:
                cs_to_use = links

        else:
            continue

        for o in normalize_relation(value_to_use).split():
            cs_to_use[o] += 1

        if all([
                sum(cs.values()) >= threshold
                for cs in [attributes, types, links]
        ]):
            break

    result = {'attributes': attributes, 'types': types, 'links': links}
    if entities_without_label > 0:
        L.debug('%d skipped, because of missing label', entities_without_label)
    L.debug(
        'Found: %s, %s, %s', *[
            ' '.join([str(sum(cs.values())), 'terms in', cs_name])
            for cs_name, cs in result.items()
        ])
    return result
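
Illustrative shape of the returned value (hypothetical terms and counts):

# {'attributes': defaultdict(int, {'john': 1, 'markoff': 1}),
#  'types':      defaultdict(int, {'person': 1, 'journalist': 1}),
#  'links':      defaultdict(int, {'new': 2, 'york': 2, 'times': 1})}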
Example #10
def _text_retrieval_model(preparsed_data: Tuple[List[str], int],
                          graph: PPGraph, entity: URIRef) -> D:
    """Rates entity represented as text.

    Rate is equal to the probability of the entity being relevant to the relation.
    Probability formula is based on a language modeling approach.

    Dirichlet model computation is based on:
        http://mlwiki.org/index.php/Smoothing_for_Language_Models#Dirichlet_Prior_Smoothing
        https://www.coursera.org/lecture/text-retrieval/lesson-4-6-smoothing-methods-part-1-kM6Ie
        http://ciir.cs.umass.edu/pubfiles/ir-445.pdf
        (4) http://profsite.um.ac.ir/~monsefi/machine-learning/pdf/Machine-Learning-Tom-Mitchell.pdf

    Args:
        preparsed_data: preparsed relation, precomputed dirichlet parameters
        graph: RDF triples to use (graph represents whole world we know about)
        entity: RDF entity to rank

    Returns:
        Probability
    """
    L.debug('Computing text-based probability for %s', entity)

    # sanity checks
    assert isinstance(graph, PPGraph), 'graph is not PPGraph'
    assert isinstance(entity, URIRef), ['entity is not URIRef', entity]

    # unpack input data
    relation, ni = preparsed_data

    # get text representations of the entity, theta_e
    representations = _text_representation(graph, entity)

    # precompute number of terms
    representations_lengths = {
        cs_name: sum(cs.values())
        for cs_name, cs in representations.items()
    }

    # denominator of "Dirichlet smoothed model of the entire collection of triples"
    # P(t|theta_c) == sum(D in theta_c)tf(t,D) / sum(D in theta_c)|D|
    # D = node text representation
    # theta_c = collection of nodes
    # we do not compute that, too time consuming
    #
    # probability_collection_denominator = D(0)
    # for node in graph.all_nodes():
    #     if isinstance(node, Literal):
    #         triple_object_text = node
    #     else:
    #         triple_object_text = graph.label(node)
    #     probability_collection_denominator += len(normalize_relation(triple_object_text))

    # P(t|theta_c) should depend on the term t,
    # but assume it is 1/ni, according to (4), page 182
    probability_collection = D(1) / D(ni)

    # these weights are experimental
    representation_weights = {
        'attributes': D('0.4'),
        'types': D('0.4'),
        'links': D('0.2')
    }

    # P(R | theta_e) == product(t in R) P(t | theta_w_e)
    final_probability = D('1.0')
    for t in relation:
        L.debug('%s-> processing term %s', ' ' * 4, repr(t))

        # P(t | theta_w_e) == sum(cs in representations) P(t | theta_cs_e) * P(cs)
        term_probability = D('0.0')
        for cs_name, cs in representations.items():
            # tf(t,e) is the term frequency of t in the representation document of e
            # http://mlwiki.org/index.php/TF-IDF#Term_Frequency
            tf = cs[t]

            # "Dirichlet smoothed model of the entire collection of triples"
            # P(t|theta_c) == sum(D in theta_c)tf(t,D) / sum(D in theta_c)|D|
            # we do not compute that, too time consuming
            #
            # probability_collection_nominator = D(0)
            # for node in graph.all_nodes():
            #     if isinstance(node, Literal):
            #         triple_object_text = node
            #     else:
            #         triple_object_text = graph.label(node)
            #     probability_collection_nominator += normalize_relation(triple_object_text).count(t)
            # probability_collection = probability_collection_nominator / probability_collection_denominator

            # P(t | theta_cs_e) == [tf(t,e) + ni*P(t|theta_c)] / [|e| + ni]
            representation_probability = D(tf + ni * probability_collection)
            representation_probability /= representations_lengths[cs_name] + ni
            L.debug('%s-> probability for %s: %s (tf=%d, |e|=%d)', ' ' * 8,
                    cs_name, representation_probability.quantize(D_PREC), tf,
                    representations_lengths[cs_name])

            # do the addition
            term_probability += representation_probability * \
                representation_weights[cs_name]

        L.debug('%s-> term probability: %s', ' ' * 8,
                term_probability.quantize(D_PREC))

        # do the multiplication
        final_probability *= term_probability

    L.debug('Probability: %s', final_probability)
    return final_probability
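
A worked toy instance of the smoothed estimate P(t | theta_cs_e) = (tf(t,e) + ni * P(t|theta_c)) / (|e| + ni), with hypothetical numbers:

from decimal import Decimal as D

tf = 2                       # term frequency in one representation
ni = 80                      # Dirichlet parameter
length = 120                 # |e|, number of terms in that representation
p_collection = D(1) / D(ni)  # collection model approximated as 1/ni

p = (tf + ni * p_collection) / (length + ni)
assert p == D(3) / D(200)    # (2 + 1) / (120 + 80)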
Example #11
def main():
    # cmd line args
    parser = argparse.ArgumentParser(description='Capture data from \
        remote SPARQL endpoint and save it to a local file in N-Quads format')
    parser.add_argument('filename', help='File to save data in')
    parser.add_argument(
        'sample_file',
        help='YAML file with entities as list of URIs under `sample_key` key')
    parser.add_argument(
        'sample_key', nargs='?', default='relevant',
        help='YAML key for entities list')
    parser.add_argument(
        '-e',
        '--endpoint',
        dest='sparql_endpoint',
        default=SPARQL_ENDPOINT,
        help='SPARQL endpoint url')
    parser.add_argument("-v", "--verbose", help="debug output",
                        action="store_true")

    # args parsing and sanity checks
    args = parser.parse_args()

    L.setLevel('INFO')
    if args.verbose:
        L.setLevel('DEBUG')

    if isfile(args.filename):
        L.warning('File `%s` exists, will append to it!', args.filename)

    if not isfile(args.sample_file):
        L.error('File `%s` does not exist, aborting!', args.sample_file)
        exit(1)

    try:
        with open(args.sample_file, 'r', encoding='utf8') as f:
            sample_data = safe_load(f)
    except YAMLError as e:
        L.error('Error loading sample file `%s`: %s', args.sample_file, e)
        exit(1)

    if not isinstance(sample_data, dict):
        L.error('Sample data must be dictionary!')
        exit(1)

    if args.sample_key not in sample_data.keys():
        L.error('`%s` key not found in sample data', args.sample_key)
        exit(1)

    # do the job
    entities: List[URIRef] = list(map(URIRef, sample_data[args.sample_key]))
    get_and_store_data(args.sparql_endpoint, args.filename, entities)
Example #12
def data_from_sample_file(sample_file: str) -> \
        Tuple[str, List[URIRef], List[URIRef], List[URIRef]]:
    """Parses sample file"""
    L.info('Preparing ranking for sample file `%s`', sample_file)

    if not isfile(sample_file):
        L.error('File `%s` does not exist, aborting!', sample_file)
        raise SyntaxError

    try:
        with open(sample_file, 'r', encoding='utf8') as f:
            sample_data = safe_load(f)
    except (UnicodeDecodeError, YAMLError) as e:
        L.error('Error loading sample file `%s`: %s', sample_file, e)
        raise SyntaxError

    if not isinstance(sample_data, dict):
        L.error('Sample data must be dictionary!')
        raise SyntaxError

    for required_key in ['topic', 'relevant', 'not_relevant']:
        if required_key not in sample_data.keys():
            L.error('`%s` key not found in sample data', required_key)
            raise SyntaxError

    # convert strings to URIRefs and prepare data
    relevant = list(map(URIRef, sample_data['relevant']))
    not_relevant = list(map(URIRef, sample_data['not_relevant']))

    if len(relevant) == 0:
        L.error('No relevant entities specified in the sample data')
        raise SyntaxError

    examples_amount = EXAMPLES_AMOUNT
    random_examples = True
    if 'examples' in sample_data:
        try:
            examples_amount = int(sample_data['examples'])
            random_examples = False
            L.info('Using top %d entities as examples', examples_amount)
        except Exception as e:
            L.error('Error reading amount of examples from YAML file: %s', e)

    if len(relevant) <= examples_amount:
        L.warning(
            'There are only %d relevant entities in sample data `%s`, trimming the number of examples',
            len(relevant), sample_file)

    # select random examples from relevant entities
    if random_examples:
        shuffle(relevant)

    # prepare entities
    examples = relevant[:examples_amount]
    relevant = relevant[examples_amount:]
    entities_to_rank = relevant[:] + not_relevant[:]

    return sample_data['topic'], examples, entities_to_rank, relevant
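
Illustrative sample_data after safe_load, consistent with the keys checked above (URIs shortened):

# {'topic': 'american journalists',
#  'examples': 2,
#  'relevant': ['http://dbpedia.org/resource/John_Markoff', '...'],
#  'not_relevant': ['...']}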
Example #13
def load_data(data_url: str, old_graph: Optional[PPGraph] = None) -> PPGraph:
    """Create new PPGraph or add triples to the provided one.

    Args:
        data_url: path to an RDF file or URL of a SPARQL endpoint;
                    passing a URL will invalidate old_graph
        old_graph: existing graph, will add triples to it

    Returns:
        Graph with triples loaded from data_url (lazy loaded in case of SPARQL endpoint)
    """
    if old_graph:
        graph = old_graph
    else:
        graph = PPGraph(ConjunctiveGraph())

    if isfile(data_url):
        L.info('Loading triples from file `%s`', data_url)
        data_format = guess_format(data_url)
        graph.parse(data_url, format=data_format)

    elif isdir(data_url):
        L.info('Loading triples from files in directory `%s`', data_url)
        for extension in TRIPLE_FILE_EXTENSIONS:
            triples_files = glob(f'{data_url}/*.{extension}')
            if len(triples_files) > 0:
                L.info('Found %d `.%s` files', len(triples_files), extension)

            for i, triples_file in enumerate(triples_files):
                data_format = guess_format(triples_file)
                L.debug('%d / %d (`%s`), data format: %s', i,
                        len(triples_files), triples_file, data_format)
                graph.parse(triples_file, format=data_format)

    else:
        L.info('Using remote graph from SPARQL endpoint `%s`', data_url)
        graph = PPGraph(SPARQLStore(data_url))

        # early fail
        try:
            graph.query('''SELECT DISTINCT ?s 
                   WHERE { 
                      ?s rdf:type foaf:Person
                   } LIMIT 1''')
        except Exception as e:
            L.error("Can't load data from remote endpoint")
            raise e

    return graph
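
Illustrative calls (paths and endpoint URL are examples only):

# local file, then extend the same graph with a directory of triple files
graph = load_data('./pp_data/dump.nq')
graph = load_data('./pp_data/', old_graph=graph)

# remote endpoint: returns a fresh, lazily-queried graph
graph = load_data('https://dbpedia.org/sparql')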
Example #14
    r_precision = D(0)
    avg_prec = D(0)

    if len(retrived) != 0:
        r_precision = D(sum(retrived)) / len(retrived)
        relevant_so_far = D(0)
        for i, is_relevant in enumerate(retrived, 1):
            if is_relevant:
                relevant_so_far += 1
                avg_prec += relevant_so_far / i
        avg_prec /= len(retrived)

    return {'R-Precision': r_precision, 'AvgPrec': avg_prec}
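
A worked example, assuming the fragment above is the tail of a statistical_stats(retrived) helper, as its callers in Examples #8 and #15 suggest; with hits at ranks 1 and 3 out of three retrieved entities:

from decimal import Decimal as D

stats = statistical_stats([True, False, True])
assert stats['R-Precision'] == D(2) / D(3)           # 2 hits / 3 retrieved
assert stats['AvgPrec'] == (D(1) + D(2) / D(3)) / 3  # (1/1 + 2/3) / 3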


if __name__ == '__main__':
    L.setLevel('DEBUG')
    L.info('Running utils.py tests')

    # remote
    data_urls_to_test = [SPARQL_ENDPOINT]

    # one file
    local_files = glob('./pp_data/*.nq')
    if local_files:
        data_urls_to_test.append(local_files[0])

    # all files in a directory
    data_urls_to_test.append('./pp_data/')

    test_ppgraph(data_urls_to_test)
    L.info('Passed')
Example #15
def evaluation(graph: PPGraph, evaluation_data: str):
    samples = glob(path_join(evaluation_data, '*.yml'))

    # collect all entities
    entities_to_rank_unique: Set[URIRef] = set()
    for sample_file in samples:
        try:
            _, examples, entities_to_rank_part, _ = data_from_sample_file(
                sample_file)
        except SyntaxError:
            L.error('Error when loading data')
            return

        entities_to_rank_unique.update(examples)
        entities_to_rank_unique.update(entities_to_rank_part)

    entities_to_rank: List[URIRef] = list(entities_to_rank_unique)
    mean_stats: Dict[str, DefaultDict[str, D]] = {
        'text': defaultdict(D),
        'examples': defaultdict(D),
        'combined': defaultdict(D)
    }
    mean_stats_denominator = {'text': 0, 'examples': 0, 'combined': 0}

    # do ranking for every sample file
    for sample_file in samples:
        print(f'Stats for `{sample_file}`:')
        try:
            topic, examples, _, relevant = data_from_sample_file(sample_file)
        except Exception as e:
            L.error('Error when loading data: %s', e)
            return

        entities_to_rank_wo_examples = entities_to_rank[:]
        for example in examples:
            if example in entities_to_rank_wo_examples:
                entities_to_rank_wo_examples.remove(example)

        ranking_text = rank_text_based(graph, (topic, examples),
                                       entities_to_rank_wo_examples)
        ranking_example = rank_examples_based(graph, (topic, examples),
                                              entities_to_rank_wo_examples)
        ranking_combined = rank_combined((ranking_text, ranking_example))
        rankings = {
            'text': ranking_text[1],
            'examples': ranking_example[1],
            'combined': ranking_combined[1]
        }

        # make the ranking
        for ranking_type in mean_stats.keys():
            print(f'  Ranking with `{ranking_type}-based` method')

            # how many top entities we would return in ideal case
            # paper sets this to 100
            evaluation_limit = len(relevant)
            retrived: List[bool] = []
            for i, (ranking_score,
                    entity) in enumerate(rankings[ranking_type]):
                if i < evaluation_limit:
                    if entity in relevant:
                        retrived.append(True)
                        print(f'OO {entity} - {ranking_score}')
                    else:
                        retrived.append(False)
                        print(f'xx {entity} - {ranking_score}')
                else:
                    break

            stats = statistical_stats(retrived)
            for k, v in stats.items():
                print(f'    {k} -> {v.quantize(D_PREC)}')
                mean_stats[ranking_type][k] += v
            mean_stats_denominator[ranking_type] += 1

    print('Mean stats:')
    for ranking_type in mean_stats.keys():
        print(f'  Ranking with `{ranking_type}-based` method')
        for k, v in mean_stats[ranking_type].items():
            print(
                f'    Mean-{k} -> {(v / mean_stats_denominator[ranking_type]).quantize(D_PREC)}'
            )
Example #16

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Evaluate ebes library.')
    parser.add_argument(
        'evaluation_data',
        help='Path to directory with triple files (.nq) and sample files (.yml)'
    )
    parser.add_argument("-v",
                        "--verbose",
                        help="debug output",
                        action="store_true")

    args = parser.parse_args()

    L.setLevel('WARNING')
    if args.verbose:
        L.setLevel('DEBUG')

    print('Loading graphs...')
    graph = load_graph(args.evaluation_data)

    evaluation(graph, args.evaluation_data)