Example #1
    def calculate_normalized_discounted_cumulative_gain_at_p(self, p=10, ranking_params=None):
        topic_doc_judgements = self.get_topic_assessments()
        params_id = ranking_params_to_params_id(ranking_params)
        result_files = self.get_result_files(params_id)

        ideal_rankings = {}
        qrels = {}
        for topic_id, judgements in topic_doc_judgements.items():
            ideal_rankings[topic_id] = sorted(judgements.values(), reverse=True)
            qrels[topic_id] = pd.DataFrame(
                list(judgements.items()), columns=['doc_id', 'rel'])

        ndcgs = []
        for result_file in result_files:
            topic_id = self.path_to_topic_id(result_file)

            df = pd.read_csv(result_file, converters={'doc_id': str})
            df = df.merge(qrels[topic_id], on='doc_id', how='left')
            df['rel'] = df['rel'].fillna(0)

            dcg_p = dcg(df.rel, p)
            idcg_p = dcg(ideal_rankings[topic_id], p)
            ndcgs.append(safe_div(dcg_p, idcg_p))

        if params_id not in self.results:
            self.results[params_id] = {'ranking_params': ranking_params, 'metrics': {}}
        self.results[params_id]['metrics']['NDCG@%d' % p] = safe_div(sum(ndcgs), len(ndcgs))
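
The NDCG computation above relies on dcg and safe_div helpers defined elsewhere. A minimal sketch of plausible implementations, assuming the classic log2-discounted gain (the original dcg may use a different gain variant) and a safe_div that returns 0 for a zero denominator:

    import math

    def safe_div(numerator, denominator):
        # Hypothetical helper: return 0 instead of raising ZeroDivisionError.
        return numerator / denominator if denominator else 0

    def dcg(rels, p):
        # Hypothetical helper: DCG@p with the classic log2(rank + 1) discount,
        # accepting any sequence of relevance grades (list or pandas Series).
        return sum(rel / math.log2(rank + 1)
                   for rank, rel in enumerate(list(rels)[:p], start=1))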
Example #2
    def calculate_precision_at_n(self, n=10, ranking_params=None):
        params_id = ranking_params_to_params_id(ranking_params)
        result_files = self.get_result_files(params_id)

        o_eval_details_dir = os.path.join(self.o_assessments_path, params_id)
        os.makedirs(o_eval_details_dir, exist_ok=True)
        o_eval_details_file = os.path.join(o_eval_details_dir, 'p_at_%d-precision_at_%d_per_topic.csv' % (n, n))

        with open(o_eval_details_file, 'w') as ef:
            writer = csv.writer(ef)
            writer.writerow(['topic_id', 'p_at_%d' % n])

            precisions_at_n = []
            for result_file in result_files:
                topic_id = self.path_to_topic_id(result_file)

                with open(result_file, 'r') as rf:
                    reader = csv.DictReader(rf)
                    results = []
                    for row in itertools.islice(reader, n):
                        results.append(row['relevant'] == 'True')

                    precision_at_n = results.count(True) / n
                    precisions_at_n.append(precision_at_n)
                    writer.writerow([topic_id, precision_at_n])

            if params_id not in self.results:
                self.results[params_id] = {'ranking_params': ranking_params, 'metrics': {}}
            self.results[params_id]['metrics']['P@%d' % n] = safe_div(sum(precisions_at_n), len(precisions_at_n))
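
P@n is the fraction of relevant documents among the top n results, which is exactly what results.count(True) / n computes above. A toy illustration with made-up values:

    # Toy illustration: relevance flags for the top n=5 results of one topic.
    results = [True, False, True, True, False]
    n = 5
    precision_at_n = results.count(True) / n  # 3 / 5 = 0.6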
Example #3
    def calculate_mean_average_precision(self, ranking_params=None):
        topic_doc_judgements = self.get_topic_assessments()
        params_id = ranking_params_to_params_id(ranking_params)
        result_files = self.get_result_files(params_id)

        o_eval_details_dir = os.path.join(self.o_assessments_path, params_id)
        os.makedirs(o_eval_details_dir, exist_ok=True)
        o_eval_details_file = os.path.join(o_eval_details_dir, 'map_average_precision_per_topic.csv')

        with open(o_eval_details_file, 'w') as ef:
            writer = csv.writer(ef)
            writer.writerow(['topic_id', 'avg_precision'])

            num_rel_per_topic = {}
            for topic_id, judgements in topic_doc_judgements.items():
                num_rel_per_topic[topic_id] = 0
                for doc_id, rel in judgements.items():
                    if rel > 0:
                        num_rel_per_topic[topic_id] += 1

            avg_precisions = []
            for result_file in result_files:
                topic_id = self.path_to_topic_id(result_file)

                precisions = []
                with open(result_file, 'r') as rf:
                    reader = csv.DictReader(rf)
                    results = []
                    for row in reader:
                        results.append(row['relevant'] == 'True')

                    # Precision is accumulated only at ranks where a relevant document appears.
                    for i in range(1, len(results) + 1):
                        if not results[i - 1]:
                            continue
                        top_i = results[:i]
                        precisions.append(safe_div(sum(top_i), len(top_i)))

                    avg_precision = safe_div(sum(precisions), num_rel_per_topic[topic_id])
                    avg_precisions.append(avg_precision)
                    writer.writerow([topic_id, avg_precision])

            if params_id not in self.results:
                self.results[params_id] = {'ranking_params': ranking_params, 'metrics': {}}
            self.results[params_id]['metrics']['MAP'] = safe_div(sum(avg_precisions), len(avg_precisions))
            # This is an approximation of np.prod(avg_precisions) ** (1 / len(avg_precisions)) that works with zero values.
            self.results[params_id]['metrics']['GMAP'] = gmean(avg_precisions)
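
The average precision computed above sums the precision at every rank where a relevant document appears and divides by the total number of relevant documents in the assessments, not just those retrieved, so relevant documents that were never returned lower the score. A toy illustration with made-up values:

    # Toy illustration: relevant documents retrieved at ranks 1 and 3,
    # with 3 relevant documents in the assessments overall.
    results = [True, False, True, False, False]
    num_rel = 3
    precisions = [1 / 1, 2 / 3]                # precision at ranks 1 and 3
    avg_precision = sum(precisions) / num_rel  # (1.0 + 0.667) / 3 ≈ 0.556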
Example #4
    async def get_topic_results(self, ranking_params=None, topic_filter=None):
        topic_doc_judgements = self.get_topic_assessments()
        valid_ids = self.get_valid_ids()
        valid_categories_per_id = self.get_valid_categories_per_id()

        topics = etree.parse(self.task.topics_path)

        params_id = ranking_params_to_params_id(ranking_params)

        o_results_path = os.path.join(self.o_results_path, params_id)
        os.makedirs(o_results_path, exist_ok=True)

        if self.retrieval_task == Index.RetrievalTask.entity_retrieval:
            xpath_topic = '//inex_topic'
            xpath_topic_id = '@topic_id'
        else:
            xpath_topic = '//topic'
            xpath_topic_id = '@id'

        if params_id not in self.stats:
            self.stats[params_id] = {
                'ranking_params': ranking_params,
                'query_time': {}
            }

        for topic in topics.xpath(xpath_topic):
            if self.interrupt:
                logger.warning("Evaluation task was interruped")
                break

            topic_id = get_first(topic.xpath(xpath_topic_id))

            if topic_id not in topic_doc_judgements:
                logger.warning(
                    "Skipping topic '%s', since it is not present in the assessments"
                    % topic_id)
                continue

            if topic_filter and topic_id not in topic_filter:
                logger.warning("Skipping topic '%s'" % topic_id)
                continue

            if self.retrieval_task == Index.RetrievalTask.entity_retrieval \
                    and self.query_type == Index.QueryType.entity:
                # Related Entity Finding / Entity List Completion
                query = '||'.join(topic.xpath('entities/entity/text()'))
            else:
                # Document Retrieval / Entity Retrieval
                query = get_first(topic.xpath('title/text()'))

            logger.info(
                "Obtaining results for query '%s' of topic '%s' using '%s' index at '%s'"
                % (query, topic_id, self.task.index_type,
                   self.task.index_location))
            start_time = time.time()
            engine_response = await self.index.search(
                query,
                0,
                10000,
                query_type=self.query_type,
                task=self.retrieval_task,
                base_index_location=self.task.base_index_location,
                base_index_type=self.task.base_index_type,
                ranking_function=self.task.ranking_function,
                ranking_params=ranking_params)
            elapsed_ms = int(round((time.time() - start_time) * 1000))
            self.stats[params_id]['query_time'][topic_id] = elapsed_ms

            results = engine_response['results']

            # Filtering by valid IDs (e.g., for some entities that are not explicitly a part of the collection)
            if valid_ids:
                logger.info(
                    "Filtering results with valid IDs (only %d IDs are valid)"
                    % len(valid_ids))
                results = [
                    result for result in results if result['id'] in valid_ids
                ]

            # Filtering by categories (only considers results with a category matching the query category)
            if valid_categories_per_id:
                logger.info(
                    "Filtering results by category (based on a dictionary with %d entries)"
                    % len(valid_categories_per_id))

                categories = set(topic.xpath('categories/category/text()'))
                results = [
                    result for result in results
                    if self.is_valid_categories(categories,
                                                valid_categories_per_id.get(
                                                    result['id'], []),
                                                match='substring')
                ]

            with open(os.path.join(o_results_path, '%s.csv' % topic_id),
                      'w',
                      newline='') as f:
                writer = csv.writer(f)
                writer.writerow(['rank', 'score', 'doc_id', 'relevant'])
                for i, result in enumerate(results, start=1):
                    doc_id = result['id']
                    score = result['score']
                    relevant = topic_doc_judgements[topic_id].get(doc_id, 0) > 0
                    writer.writerow([i, score, doc_id, relevant])

        self.stats[params_id]['total_query_time'] = sum(
            self.stats[params_id]['query_time'].values())
        self.stats[params_id]['avg_query_time'] = (
            self.stats[params_id]['total_query_time'] /
            len(self.stats[params_id]['query_time']))
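
The topic parsing relies on a get_first helper that is not shown in this excerpt. A plausible sketch, assuming it simply returns the first XPath match or a default when the result list is empty:

    def get_first(items, default=None):
        # Hypothetical sketch: lxml XPath queries return lists, so take the first hit or a default.
        return items[0] if items else default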
Example #5
    async def get_topic_results(self, ranking_params=None, topic_filter=None):
        topic_doc_judgements = self.get_topic_assessments()

        with open(self.task.topics_path, 'r') as topics_f:
            data = topics_f.read()

        topics = re.findall(
            r'<top>.*?<num>.*?Number:.*?(\d+).*?<title>.*?([^<]+).*?</top>',
            data, re.MULTILINE | re.DOTALL)

        topics = [(topic_id.strip(), query.strip())
                  for topic_id, query in topics]

        params_id = ranking_params_to_params_id(ranking_params)

        o_results_path = os.path.join(self.o_results_path, params_id)
        os.makedirs(o_results_path, exist_ok=True)

        if params_id not in self.stats:
            self.stats[params_id] = {
                'ranking_params': ranking_params,
                'query_time': {}
            }

        with open(os.path.join(o_results_path, '%s.res' % self.task.run_id),
                  'w') as trec_f:
            for topic_id, query in topics:
                if self.interrupt:
                    logger.warning("Evaluation task was interruped")
                    break

                if topic_id not in topic_doc_judgements:
                    logger.warning(
                        "Skipping topic '%s', since it is not present in the assessments"
                        % topic_id)
                    continue

                if topic_filter and topic_id not in topic_filter:
                    logger.warning("Skipping topic '%s'" % topic_id)
                    continue

                logger.info(
                    "Obtaining results for query '%s' of topic '%s' using '%s' index at '%s'"
                    % (query, topic_id, self.task.index_type,
                       self.task.index_location))
                start_time = time.time()
                engine_response = await self.index.search(
                    query,
                    0,
                    10000,
                    task=Index.RetrievalTask.document_retrieval,
                    base_index_location=self.task.base_index_location,
                    base_index_type=self.task.base_index_type,
                    ranking_function=self.task.ranking_function,
                    ranking_params=ranking_params)
                elapsed_ms = int(round((time.time() - start_time) * 1000))
                self.stats[params_id]['query_time'][topic_id] = elapsed_ms

                with open(os.path.join(o_results_path, '%s.csv' % topic_id),
                          'w',
                          newline='') as f:
                    writer = csv.writer(f)
                    writer.writerow(['rank', 'score', 'doc_id', 'relevant'])
                    for i, result in enumerate(engine_response['results'], start=1):
                        doc_id = result['id']
                        score = result['score']
                        relevant = topic_doc_judgements[topic_id].get(doc_id, 0) > 0
                        writer.writerow([i, score, doc_id, relevant])
                        trec_f.write(
                            "%s Q0 %s %s %s %s\n" %
                            (topic_id, doc_id, i, score, self.task.run_id))

        self.stats[params_id]['total_query_time'] = sum(
            self.stats[params_id]['query_time'].values())
        self.stats[params_id]['avg_query_time'] = (
            self.stats[params_id]['total_query_time'] /
            len(self.stats[params_id]['query_time']))
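
Each line appended to the .res file follows the standard TREC run format read by trec_eval: topic ID, the literal Q0, document ID, rank, score, and run tag. An illustrative line with made-up values:

    # 401 Q0 doc-123 1 12.3456 my_run_id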
Example #6
    def calculate_precision_recall(self, ranking_params=None):
        # topic_id -> doc_id -> num_relevant_chars
        topic_doc_judgements = self.get_topic_assessments()

        params_id = ranking_params_to_params_id(ranking_params)
        result_files = self.get_result_files(params_id)

        o_eval_details_dir = os.path.join(self.o_assessments_path, params_id)
        os.makedirs(o_eval_details_dir, exist_ok=True)
        o_eval_details_file = os.path.join(o_eval_details_dir, 'precision_recall_per_topic.csv')

        with open(o_eval_details_file, 'w') as ef:
            writer = csv.writer(ef)
            writer.writerow(['topic_id', 'tp', 'fp', 'tn', 'fn', 'precision', 'recall', 'f0.5', 'f1', 'f2'])

            tps = []
            fps = []
            tns = []
            fns = []
            precisions = []
            recalls = []
            f_0_5_scores = []
            f_1_scores = []
            f_2_scores = []

            for result_file in result_files:
                topic_id = self.path_to_topic_id(result_file)

                with open(result_file, 'r') as rf:
                    reader = csv.DictReader(rf)
                    result_doc_ids = {row['doc_id'] for row in reader}

                    tp = fp = tn = fn = 0

                    # Positives, i.e., documents in the list of results.
                    for doc_id in result_doc_ids:
                        relevant = topic_doc_judgements.get(topic_id, {}).get(doc_id, 0) > 0
                        if relevant:
                            tp += 1
                        else:
                            fp += 1

                    # True negative counts are only approximate: only assessed documents that weren't
                    # retrieved are considered (relevant ones count as false negatives, non-relevant ones as true negatives).
                    for doc_id, judgment in topic_doc_judgements.get(topic_id, {}).items():
                        # Skip documents already counted as positives.
                        if doc_id not in result_doc_ids:
                            relevant = judgment > 0
                            if relevant:
                                fn += 1
                            else:
                                tn += 1

                    # print(topic_id, "num_ret =", tp+fp, "num_rel =", tp+fn, "num_rel_ret =", tp)

                    tps.append(tp)
                    fps.append(fp)
                    tns.append(tn)
                    fns.append(fn)

                    logger.debug(
                        "%s - TP(%d) + FP(%d) + TN(%d) + FN(%d) = %d" % (topic_id, tp, fp, tn, fn, tp + fp + tn + fn))

                    precision = safe_div(tp, tp + fp)
                    precisions.append(precision)

                    recall = safe_div(tp, tp + fn)
                    recalls.append(recall)

                    f_0_5_score = self.f_score(precision, recall, beta=0.5)
                    f_0_5_scores.append(f_0_5_score)

                    f_1_score = self.f_score(precision, recall, beta=1)
                    f_1_scores.append(f_1_score)

                    f_2_score = self.f_score(precision, recall, beta=2)
                    f_2_scores.append(f_2_score)

                    writer.writerow([topic_id, tp, fp, tn, fn, precision, recall, f_0_5_score, f_1_score, f_2_score])

            if params_id not in self.results:
                self.results[params_id] = {'ranking_params': ranking_params, 'metrics': {}}
            self.results[params_id]['metrics']['Micro Avg Prec'] = safe_div(sum(tps), sum(tps) + sum(fps))
            self.results[params_id]['metrics']['Micro Avg Rec'] = safe_div(sum(tps), sum(tps) + sum(fns))
            self.results[params_id]['metrics']['Macro Avg Prec'] = safe_div(sum(precisions), len(precisions))
            self.results[params_id]['metrics']['Macro Avg Rec'] = safe_div(sum(recalls), len(recalls))

            self.results[params_id]['metrics']['Micro Avg F0_5'] = self.f_score(
                self.results[params_id]['metrics']['Micro Avg Prec'],
                self.results[params_id]['metrics']['Micro Avg Rec'], beta=0.5)
            self.results[params_id]['metrics']['Micro Avg F1'] = self.f_score(
                self.results[params_id]['metrics']['Micro Avg Prec'],
                self.results[params_id]['metrics']['Micro Avg Rec'], beta=1)
            self.results[params_id]['metrics']['Micro Avg F2'] = self.f_score(
                self.results[params_id]['metrics']['Micro Avg Prec'],
                self.results[params_id]['metrics']['Micro Avg Rec'], beta=2)

            self.results[params_id]['metrics']['Macro Avg F0_5'] = self.f_score(
                self.results[params_id]['metrics']['Macro Avg Prec'],
                self.results[params_id]['metrics']['Macro Avg Rec'], beta=0.5)
            self.results[params_id]['metrics']['Macro Avg F1'] = self.f_score(
                self.results[params_id]['metrics']['Macro Avg Prec'],
                self.results[params_id]['metrics']['Macro Avg Rec'], beta=1)
            self.results[params_id]['metrics']['Macro Avg F2'] = self.f_score(
                self.results[params_id]['metrics']['Macro Avg Prec'],
                self.results[params_id]['metrics']['Macro Avg Rec'], beta=2)

            # Same as TREC set_F.0.25 (beta^2 = 0.25 <=> 0.5^2 = 0.25), set_F.1 and set_F.4 (beta^2 = 4 <=> 2^2 = 4)
            self.results[params_id]['metrics']['F0_5'] = safe_div(sum(f_0_5_scores), len(f_0_5_scores))
            self.results[params_id]['metrics']['F1'] = safe_div(sum(f_1_scores), len(f_1_scores))
            self.results[params_id]['metrics']['F2'] = safe_div(sum(f_2_scores), len(f_2_scores))
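
The F-measures above depend on an f_score method parameterised by beta that is not part of this excerpt. A minimal sketch, assuming the usual F-beta definition and the same zero-safe behaviour as safe_div:

    def f_score(self, precision, recall, beta=1):
        # Hypothetical sketch: F-beta, the weighted harmonic mean of precision and recall.
        beta_sq = beta ** 2
        denominator = beta_sq * precision + recall
        return (1 + beta_sq) * precision * recall / denominator if denominator else 0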
Example #7
    def get_results_summary(self, headers, metrics, decimals, fmt):
        tasks = list(self.db['evaluation_tasks'].find(
            {'results': {
                '$exists': 1
            }}))
        if len(tasks) < 1:
            return

        with tempfile.NamedTemporaryFile() as tmp_file:
            columns = headers[:] if headers else []
            if metrics:
                columns.extend(metrics)
            df = pd.DataFrame(columns=columns)

            for task in tasks:
                task = EvaluationTask(**task)

                if not hasattr(task, 'results'):
                    continue

                for result in task.results.values():
                    values = [None] * len(headers or [])
                    if 'Run ID' in columns:
                        values[columns.index('Run ID')] = task.run_id

                    if 'Type' in columns:
                        values[columns.index('Type')] = task.index_type

                    if 'Parameters' in columns:
                        params_id = ranking_params_to_params_id(
                            result['ranking_params'])
                        values[columns.index('Parameters')] = params_id_to_str(
                            params_id)

                    if 'Location' in columns:
                        values[columns.index('Location')] = task.index_location

                    values.extend([
                        result['metrics'][metric]
                        if metric in result['metrics']
                        and result['metrics'][metric] != '' else np.nan
                        for metric in metrics
                    ])

                    df = pd.concat([df, pd.DataFrame([values], columns=columns)])

            df = df.reset_index(drop=True)

            float_format_str = "%%.%df" % decimals

            def float_format(v):
                # Format Python and NumPy float scalars; return everything else as a plain string.
                if isinstance(v, (float, np.floating)):
                    return float_format_str % v
                return str(v)

            if fmt == 'csv':
                tmp_file.write(
                    df.to_csv(index=False,
                              float_format=float_format_str).encode('utf-8'))
            elif fmt == 'tex':
                for metric in metrics:
                    if metric not in df:
                        continue

                    max_idx = df[metric].idxmax()
                    df.loc[max_idx, metric] = '{\\bf %s}' % float_format(
                        df[metric][max_idx])
                    df.loc[~df.index.isin([max_idx]),
                           metric] = df.loc[~df.index.isin([max_idx]),
                                            metric].apply(float_format)

                tmp_file.write(
                    df.to_latex(index=False, escape=False).encode('utf-8'))
            elif fmt == 'html':
                for metric in metrics:
                    if metric not in df:
                        continue

                    max_idx = df[metric].idxmax()
                    df.loc[max_idx, metric] = '<b>%s</b>' % float_format(
                        df[metric][max_idx])
                    df.loc[~df.index.isin([max_idx]),
                           metric] = df.loc[~df.index.isin([max_idx]),
                                            metric].apply(float_format)

                if 'Parameters' in df:
                    df.Parameters = df.Parameters.apply(
                        lambda param_str: '<br>'.join(param_str[1:-1].split(', '))
                        if param_str != 'No parameters' else param_str)

                tmp_file.write(
                    df.to_html(
                        index=False,
                        escape=False,
                        border=0,
                        justify='left',
                        classes='table table-sm table-scroll table-striped').
                    encode('utf-8'))

            # Flush and rewind so the caller can read the rendered summary from the start of the file.
            tmp_file.flush()
            tmp_file.seek(0)
            yield tmp_file
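
Because the temporary file is yielded from inside the with block, it is deleted as soon as the generator is resumed, so callers should read it before iterating further. A hypothetical usage sketch (the evaluation instance and the chosen headers, metrics and format are illustrative):

    # Hypothetical usage: render a CSV summary and print it while the temporary file still exists.
    for tmp_file in evaluation.get_results_summary(
            headers=['Run ID', 'Type', 'Parameters', 'Location'],
            metrics=['MAP', 'GMAP', 'NDCG@10', 'P@10'],
            decimals=4,
            fmt='csv'):
        print(tmp_file.read().decode('utf-8'))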