def get_author_ranking_average_exact(query, index, k=10, tfidf=False):
    """
    (OBSOLETE) Create a dictionary that maps author ids to their exact relevancy and their rank
    with respect to the query. This uses the average paper representations per author.

    Parameters:
    query (string): The search query
    index (obj): The loaded FAISS index populated by paper embeddings
    k (int): The number of authors to retrieve
    tfidf (boolean): Whether the tf-idf embeddings are used for retrieval instead of SBERT.

    Returns:
    ranking (dict): Mapping from authors to their query relevancy and rank.
    """
    query = query.lower()
    results = retrieve_results_average(query, index, k, tfidf=tfidf)
    candidate_authors = list(unique_everseen(results[0]))

    # We remove duplicate authors for now, while preserving order (their highest position)
    # authors = list(unique_everseen([get_first_author_by_id(str(rid))["id"] for rid in candidate_papers]))
    relevancies = [
        check_if_author_relevant(int(a), query) for a in candidate_authors
    ]

    ranking = {}

    # Keep only the first (and therefore highest) rank for each author.
    for rank, (author, relevancy) in enumerate(zip(candidate_authors, relevancies)):
        if author not in ranking:
            ranking[author] = {"relevancy": relevancy, "rank": rank}

    return ranking
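
# Illustrative sketch (not from the original code): a tiny, runnable demonstration of the
# deduplication step used above. unique_everseen (assumed here to come from more_itertools)
# drops repeated author ids while preserving each author's first, i.e. highest, rank.
def _example_dedup_preserves_rank():
    from more_itertools import unique_everseen

    retrieved_author_ids = [42, 7, 42, 13, 7]
    deduplicated = list(unique_everseen(retrieved_author_ids))
    assert deduplicated == [42, 7, 13]  # each author keeps its first occurrence
    # The ranking dict built above then looks like:
    # {42: {"relevancy": True, "rank": 0}, 7: {"relevancy": False, "rank": 1}, ...}
    return deduplicated
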
def prune_results_for_authors_wo_tags_average(results, query, how_many=10):
    """
    (NOT USED) Prunes the retrieved papers by filtering out all papers whose authors
    either have no tags in our dataset or are not in our dataset at all. This is needed
    to be able to perform evaluation. This uses the average paper representations per author.

    Parameters:
    query (string): The search query
    results (tuple): The retrieved papers and their distances
    how_many (int): The number of papers to return

    Returns:
    A pruned list of the papers most similar to the query and the corresponding cosine distances
    """
    ids = results[0]
    distances = results[1]

    relevant_ids = []
    relevant_distances = []
    for aid, ad in zip(ids, distances):
        relevancy = check_if_author_relevant(int(aid), query)

        if relevancy != 'Not in the dataset or no tags present!':
            relevant_ids.append(aid)
            relevant_distances.append(ad)

    return relevant_ids[:how_many], relevant_distances[:how_many]
def prune_results_for_authors_wo_tags(results, query, how_many=10):
    """
    (NOT USED) Prunes the retrieved papers by filtering out all papers whose authors
    either have no tags in our dataset or are not in our dataset at all. This is needed
    to be able to perform evaluation.

    Parameters:
    query (string): The search query
    results (tuple): The retrieved papers and their distances
    how_many (int): The number of papers to return

    Returns:
    A pruned list of the papers most similar to the query and the corresponding cosine distances
    """
    ids = results[0]
    distances = results[1]

    relevant_ids = []
    relevant_distances = []
    # For now, if the first author of a paper is not in the dataset, the paper is thrown away,
    # because evaluation currently only looks at the first author. If we later adopt another strategy
    # for selecting an author per paper, this logic can be changed back to "all authors not in the set".
    for rid, rd in zip(ids, distances):
        authors = [a["id"] for a in get_authors_by_id(str(rid))]
        relevancy = [check_if_author_relevant(int(a), query) for a in authors]
        if relevancy[0] != 'Not in the dataset or no tags present!':
            relevant_ids.append(rid)
            relevant_distances.append(rd)

    return relevant_ids[:how_many], relevant_distances[:how_many]
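
# Illustrative sketch (not from the original code) of the pruning pattern shared by the two
# prune_* helpers above, with a hypothetical stand-in for check_if_author_relevant so the
# snippet runs on its own. Ids and distances are filtered in lock-step so both lists stay aligned.
def _example_prune_pattern(ids, distances, known_author_ids, how_many=10):
    def stub_check_if_author_relevant(author_id):
        # Hypothetical stub: authors missing from the dataset get the same sentinel
        # string that the real check_if_author_relevant returns.
        if author_id not in known_author_ids:
            return 'Not in the dataset or no tags present!'
        return True

    kept_ids, kept_distances = [], []
    for aid, dist in zip(ids, distances):
        if stub_check_if_author_relevant(aid) != 'Not in the dataset or no tags present!':
            kept_ids.append(aid)
            kept_distances.append(dist)
    return kept_ids[:how_many], kept_distances[:how_many]


# e.g. _example_prune_pattern([1, 2, 3], [0.9, 0.8, 0.7], {1, 3}) -> ([1, 3], [0.9, 0.7])
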
def get_author_ranking_exact_v2(query, index, k=10, tfidf=False, strategy="uniform",
                                normalized=False, norm_alpha=100, extra_term=10):
    """
    Produces an author ranking for a given query and adds a relevancy flag to each author
    based on the exact topic evaluation criteria. Used for evaluating the system.
    
    Parameters:
    query (string): The search query
    index (obj): The loaded FAISS index populated by paper embeddings
    k (int): The number of authors to retrieve
    tfidf (bool): Whether the tf-idf embeddings are used for retrieval instead of SBERT.
    strategy (string): The data fusion strategy used for assigning author score per paper
    normalized (bool): Whether normalization should be applied to the scores, boosting less prolific
    authors and "punishing" highly prolific authors
    norm_alpha (int or float): The inverse strength of normalization (higher alpha means less normalization)
    extra_term (int): Extra normalization damping term, further reduces normalization effect
    
    Returns:
    ranking (dict): A mapping of authors to their retrieved rank and their 
    relevancy in relation to the query
    """
    if tfidf:
        i, d = get_most_similar_ids(query.lower(), index, 100, tfidf_clf)
    else:
        i, d = get_most_similar_ids(query.lower(), index, 100)

    author_score_dict = create_score_author_dict(query, i, d, strategy,
                                                 normalized=normalized, normalization_alpha=norm_alpha,
                                                 extra_normalization_term=extra_term)

    top_n = produce_authors_ranking(author_score_dict)[:k]

    relevancies = [check_if_author_relevant(int(aid), query) for aid, _ in top_n]

    ranking = {}

    # Keep only the first (and therefore highest) rank for each author.
    for rank, (author, relevancy) in enumerate(zip([a[0] for a in top_n], relevancies)):
        if author not in ranking:
            ranking[author] = {"relevancy": relevancy, "rank": rank}

    return ranking
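
# Illustrative sketch (not from the original code): one way the ranking returned by
# get_author_ranking_exact_v2 could be consumed during evaluation. Assuming the relevancy
# flag is a boolean for authors that are in the dataset, precision@k is the fraction of the
# top-k retrieved authors that are relevant to the query.
def _example_precision_at_k(ranking, k=10):
    top_k = [entry for entry in ranking.values() if entry["rank"] < k]
    relevant = sum(1 for entry in top_k if entry["relevancy"] is True)
    return relevant / max(len(top_k), 1)


# e.g. _example_precision_at_k({1: {"relevancy": True, "rank": 0},
#                               2: {"relevancy": False, "rank": 1}}, k=2) == 0.5
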
def create_score_author_dict(query, retrieved_paper_ids, retrieved_distances, strategy="uniform", normalized=False,
                             average_pub_count=58,
                             normalization_alpha=1, extra_normalization_term=10):
    """
    Create a dictionary where each author gets a score in relation to the query. 
    The author ranking is assembled through a document-centric voting model process: 
    first, for each top retrieved paper, its score is assigned to each of the paper 
    authors following one of the data fusion strategies. Next, all the scores per author
    are aggregated into a mapping of authors to scores. Finally, a combination function (expCombSUM) 
    is applied to all author scores. These scores are returned per author in combination with the papers 
    that contributed to that score (for explainability's sake).
    
    Parameters:
    query (string): The search query
    retrieved_paper_ids (list): The papers that were retrieved from the FAISS index as 
    nearest neighbours for the query
    retrieved_distances (list): The distances from the query for each paper that were retrieved 
    from the FAISS index as nearest neighbours for the query
    strategy (string): The data fusion strategy used for assigning author score per paper
    normalized (bool): Whether normalization should be applied to the scores, boosting less prolific
    authors and "punishing" highly prolific authors
    average_pub_count (int): Average publication count for the authors in our dataset. Used for normalization
    normalization_alpha (int or float): The inverse strength of normalization (higher alpha means less normalization)
    extra_normalization_term (int): Extra normalization damping term, further reduces normalization effect
    
    
    Returns:
    authorship_scores (dict): A mapping between authors and their calculated score in relation to the query.
    """
    def expCombSUM(list_of_scores):
        return sum([math.exp(score) for score in list_of_scores])

    def normalize_score(score, l_pro, average_l=average_pub_count, alpha=normalization_alpha):
        normalized_score = score * math.log(1 + alpha * (average_l / (l_pro + extra_normalization_term)), 2)
        return normalized_score

    scores_per_author = defaultdict(list)
    reasons_per_author = defaultdict(list)
    for pi, score in zip(retrieved_paper_ids, retrieved_distances):
        # Keep only authors that exist in our data (i.e. are in the dataset and have tags).
        authors = [item["id"] for item in get_authors_by_id(str(pi)) if
                   check_if_author_relevant(int(item["id"]), query) != 'Not in the dataset or no tags present!']
        if authors:
            if strategy == "uniform":
                score_per_author = score / len(authors)
                for author in authors:
                    if normalized:
                        pub_count = retrieve_pub_count_by_id(int(author))
                        normalized_score = normalize_score(score_per_author, pub_count)
                        scores_per_author[author].append(normalized_score)
                    else:
                        scores_per_author[author].append(score_per_author)
                    reasons_per_author[author].append({"paper": pi, "score": score})
            elif strategy == "binary":
                score_per_author = score
                for author in authors:
                    if normalized:
                        pub_count = retrieve_pub_count_by_id(int(author))
                        normalized_score = normalize_score(score_per_author, pub_count)
                        scores_per_author[author].append(normalized_score)
                    else:
                        scores_per_author[author].append(score_per_author)
                    reasons_per_author[author].append({"paper": pi, "score": score})
            elif strategy == "descending":
                decay_factor = 1
                for author in authors:
                    if normalized:
                        score_d = score * decay_factor
                        pub_count = retrieve_pub_count_by_id(int(author))
                        normalized_score = normalize_score(score_d, pub_count)
                        scores_per_author[author].append(normalized_score)
                        decay_factor -= 0.2
                    else:
                        scores_per_author[author].append(score * decay_factor)
                        decay_factor -= 0.2
                    reasons_per_author[author].append({"paper": pi, "score": score})
            elif strategy == "parabolic":
                # TODO: normalization is not implemented for this strategy, since we do not run it in this configuration.
                decay_factor = 0.8
                scores_per_author[authors[0]].append(score)
                scores_per_author[authors[-1]].append(score)
                reasons_per_author[authors[0]].append({"paper": pi, "score": score})
                reasons_per_author[authors[-1]].append({"paper": pi, "score": score})
                for author in authors[1:-1]:
                    scores_per_author[author].append(score * decay_factor)
                    decay_factor -= 0.2
                    reasons_per_author[author].append({"paper": pi, "score": score})

    authorship_scores = {k: {"score": expCombSUM(v),
                             "reasons": reasons_per_author[k]} for k, v in scores_per_author.items()}

    return authorship_scores
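
# Illustrative worked example (not from the original code) of the two formulas used above,
# with made-up numbers. expCombSUM exponentiates each per-paper score before summing, so one
# highly similar paper can outweigh several weakly similar ones. The normalization factor
# log2(1 + alpha * average_l / (l_pro + extra_term)) boosts authors with few publications
# and dampens very prolific ones.
def _example_scoring_formulas():
    import math

    # expCombSUM over the per-paper scores accumulated by one author:
    scores = [0.9, 0.4, 0.4]
    exp_comb_sum = sum(math.exp(s) for s in scores)  # ~2.46 + 1.49 + 1.49 ~= 5.44

    # Normalizing a single score for an author with 10 publications, using the defaults
    # above (average_pub_count=58, normalization_alpha=1, extra_normalization_term=10):
    score, pub_count = 0.9, 10
    normalized = score * math.log(1 + 1 * (58 / (pub_count + 10)), 2)  # ~0.9 * 1.96 ~= 1.77

    return exp_comb_sum, normalized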