def __calculate_avg_doc_length(self):
    """Average the word counts of every paper, per document part.

    Word counts are collected for the whole document (from
    ``paper.word_hist``) and for the introduction, background, methods,
    results and discussion sections (via ``sections_to_word_hist``).

    Returns:
        dict: ``"whole-document"`` plus the five ``IMRaDType`` names, each
        mapped to ``mean(<collected word counts>, True)``.
    """
    document_counts = []
    introduction_counts = []
    background_counts = []
    methods_counts = []
    results_counts = []
    discussion_counts = []

    for paper in self.get_all_paper():
        document_counts.append(paper.word_hist.number_of_words())

        introduction_hist = sections_to_word_hist(paper.get_introduction())
        introduction_counts.append(introduction_hist.number_of_words())

        background_hist = sections_to_word_hist(paper.get_background())
        background_counts.append(background_hist.number_of_words())

        methods_hist = sections_to_word_hist(paper.get_methods())
        methods_counts.append(methods_hist.number_of_words())

        results_hist = sections_to_word_hist(paper.get_results())
        results_counts.append(results_hist.number_of_words())

        discussion_hist = sections_to_word_hist(paper.get_discussion())
        discussion_counts.append(discussion_hist.number_of_words())

    # The second positional argument of mean() is passed through unchanged;
    # its meaning is defined by the project's mean helper.
    return {
        "whole-document": mean(document_counts, True),
        IMRaDType.INTRODUCTION.name: mean(introduction_counts, True),
        IMRaDType.BACKGROUND.name: mean(background_counts, True),
        IMRaDType.METHODS.name: mean(methods_counts, True),
        IMRaDType.RESULTS.name: mean(results_counts, True),
        IMRaDType.DISCUSSION.name: mean(discussion_counts, True),
    }
Example #2
0
    def get_ranking(paper, queries, settings, api):
        """Score ``paper`` against ``queries`` using tf * log10(N / df).

        Args:
            paper: object exposing ``word_hist`` and
                ``get_sections_with_imrad_type(imrad)``.
            queries: mapping of section key (``"whole-document"`` or an IMRaD
                type) to a whitespace-separated query string.
            settings: mapping providing ``"df"`` (indexed as
                ``df[section][word]``; presumably document frequencies) and
                ``"N"`` (presumably the number of papers).
            api: not used by this ranking.

        Returns:
            tuple: ``(total_score, details)`` where ``details`` maps each
            section key to its ``"sumwords"``, per-word ``"keys"`` and
            section ``"score"``.
        """
        details = {}
        doc_freqs = settings["df"]
        paper_count = settings["N"]

        for imrad, query in queries.items():
            if imrad != "whole-document":
                hist = sections_to_word_hist(
                    paper.get_sections_with_imrad_type(imrad))
            else:
                hist = paper.word_hist

            per_word = {}
            for word in query.split():
                doc_freq = doc_freqs[imrad][word]
                if doc_freq:
                    # Only words with a non-zero df are scored; a zero df
                    # would otherwise divide by zero in the idf term.
                    tf_val = hist.get_tf(word)
                    idf_val = math.log10(paper_count / doc_freq)
                    per_word[word] = {
                        "tfidf": tf_val * idf_val,
                        "tf": tf_val,
                        "idf": idf_val,
                    }

            details[imrad] = {
                "sumwords": sum(hist.values()),
                "keys": per_word,
                "score": sum(entry["tfidf"] for entry in per_word.values()),
            }

        total_score = sum(section["score"] for section in details.values())
        return total_score, details
 def __calculate_all_papers_hist(self):
     """Accumulate one ``WordHist`` per document part over all papers.

     Each paper's combined histogram and its per-section histograms (built
     with ``sections_to_word_hist``) are appended to the matching
     accumulator.

     Returns:
         dict: ``"whole-document"`` plus the five ``IMRaDType`` names, each
         mapped to its accumulated ``WordHist``.
     """
     all_papers = self.get_all_paper()
     (intro_hist, background_hist, methods_hist,
      results_hist, discussion_hist, document_hist) = (
         WordHist() for _ in range(6))

     for paper in all_papers:
         document_hist.append(paper.get_combined_word_hist())
         intro_hist.append(sections_to_word_hist(paper.get_introduction()))
         background_hist.append(
             sections_to_word_hist(paper.get_background()))
         methods_hist.append(sections_to_word_hist(paper.get_methods()))
         results_hist.append(sections_to_word_hist(paper.get_results()))
         discussion_hist.append(
             sections_to_word_hist(paper.get_discussion()))

     return {
         "whole-document": document_hist,
         IMRaDType.INTRODUCTION.name: intro_hist,
         IMRaDType.BACKGROUND.name: background_hist,
         IMRaDType.METHODS.name: methods_hist,
         IMRaDType.RESULTS.name: results_hist,
         IMRaDType.DISCUSSION.name: discussion_hist,
     }
def get_ignoring_keys(paper, queries, importance_sections):
    """Collect the keys that ``query_to_keys`` yields for the given queries.

    Args:
        paper: object exposing ``word_hist`` and
            ``get_sections_with_imrad_type(imrad)``.
        queries: mapping of section key to query string.
        importance_sections: when falsy, only the ``"whole-document"`` query
            is resolved against the paper's full word histogram; when truthy,
            every other section query is resolved against that section's
            histogram and the ``"whole-document"`` entry is skipped.

    Returns:
        The result of ``query_to_keys`` for the whole document, or a list
        concatenating the per-section results.
    """
    if importance_sections:
        collected = []
        for imrad, query in queries.items():
            if imrad != "whole-document":
                section_hist = sections_to_word_hist(
                    paper.get_sections_with_imrad_type(imrad))
                collected.extend(section_hist.query_to_keys(query))
        return collected

    return paper.word_hist.query_to_keys(queries["whole-document"])
    def get_ranking(paper, queries, settings, api):
        """Score ``paper`` against ``queries`` with a DFR-style term weight.

        (The result dict is named ``dfr``; presumably "divergence from
        randomness" -- confirm against the project documentation.)

        Args:
            paper: object exposing ``word_hist`` and
                ``get_sections_with_imrad_type(imrad)``.
            queries: mapping of section key (``"whole-document"`` or an IMRaD
                type) to a whitespace-separated query string.
            settings: not used by this ranking.
            api: object whose ``client`` provides ``get_avg_doc_length()``,
                ``get_all_papers_hist()`` and ``get_number_of_papers()``.

        Returns:
            tuple: ``(total_score, details)`` where ``details`` maps each
            section key to its ``"sumwords"``, per-word ``"keys"`` and
            section ``"score"``.
        """
        dfr = {}

        avg_doc_length_dict = api.client.get_avg_doc_length()
        all_papers_hist_dict = api.client.get_all_papers_hist()
        N = api.client.get_number_of_papers()

        for imrad, query in queries.items():
            if imrad == "whole-document":
                hist = paper.word_hist
            else:
                sections = paper.get_sections_with_imrad_type(imrad)
                hist = sections_to_word_hist(sections)

            # + 1... avoid dividing by zero
            doc_length = hist.number_of_words() + 1
            avg_doc_length = avg_doc_length_dict[imrad]
            all_papers_hist = all_papers_hist_dict[imrad]

            query_words = query.split()
            key_values = {}
            for querie_word in query_words:
                # f_ij == term frequency, normalised by relative doc length
                f_ij_norm = hist.get_tf(querie_word) * (avg_doc_length /
                                                        doc_length)

                # Skip words absent from this paper or from the collection;
                # both would break the logarithms / division below.
                if f_ij_norm == 0 or querie_word not in all_papers_hist:
                    continue

                pi_i = all_papers_hist[querie_word] / N
                # NOTE(review): the first two terms use log10 while the last
                # uses the natural log; left as-is, confirm this is intended.
                p_kic = f_ij_norm * math.log10(f_ij_norm / pi_i) + (
                    pi_i +
                    (1 / (12 * f_ij_norm + 1)) - f_ij_norm) * math.log10(
                        math.e) + 0.5 * math.log(2 * math.pi * f_ij_norm)
                p_kidi = 1 / (f_ij_norm + 1)

                w_ij = p_kic * p_kidi
                # Count whole query words. str.count on the raw query string
                # would also count the word as a substring of longer words
                # (e.g. "cat" inside "category").
                f_iq = query_words.count(querie_word)
                rank = f_iq * w_ij

                key_values[querie_word] = {
                    "rank": rank,
                    "f_iq": f_iq,
                    "f_ij_norm": f_ij_norm,
                    "pi_i": pi_i,
                    "p_kic": p_kic,
                    "p_kidi": p_kidi
                }
            dfr[imrad] = {
                "sumwords": sum(hist.values()),
                "keys": key_values,
                "score": sum(val["rank"] for val in key_values.values())
            }

        return sum(rating["score"] for rating in dfr.values()), dfr
    def __create_hists(queries, papers):
        """Build the word histogram of every paper for every queried section.

        Args:
            queries: mapping whose keys are section keys
                (``"whole-document"`` or an IMRaD type); the query strings
                themselves are not used here.
            papers: iterable of objects exposing ``id``, ``word_hist`` and
                ``get_sections_with_imrad_type(imrad)``.

        Returns:
            dict: ``{section_key: {paper.id: histogram}}``.
        """
        hists = {}
        for imrad, _query in queries.items():
            by_paper = {}
            for paper in papers:
                by_paper[paper.id] = (
                    paper.word_hist
                    if imrad == "whole-document"
                    else sections_to_word_hist(
                        paper.get_sections_with_imrad_type(imrad))
                )
            hists[imrad] = by_paper
        return hists
Example #7
0
    def get_ranking(paper, queries, settings, api):
        """Score ``paper`` against ``queries`` with a BM25-style weight.

        Args:
            paper: object exposing ``word_hist`` and
                ``get_sections_with_imrad_type(imrad)``.
            queries: mapping of section key (``"whole-document"`` or an IMRaD
                type) to a whitespace-separated query string.
            settings: mapping providing ``"k1"``, ``"b"``, ``"df"`` (indexed
                as ``df[section][word]``) and ``"N"``.
            api: object whose ``client.get_avg_doc_length()`` returns the
                average document length per section key.

        Returns:
            tuple: ``(total_score, details)`` where ``details`` maps each
            section key to its ``"sumwords"``, per-word ``"keys"`` and
            section ``"score"``.
        """
        k1 = copy.deepcopy(settings.get("k1"))
        b = copy.deepcopy(settings.get("b"))

        details = {}
        doc_freqs = settings["df"]
        paper_count = settings["N"]
        avg_lengths = api.client.get_avg_doc_length()

        for imrad, query in queries.items():
            if imrad != "whole-document":
                hist = sections_to_word_hist(
                    paper.get_sections_with_imrad_type(imrad))
            else:
                hist = paper.word_hist

            doc_length = hist.number_of_words() + 1
            avg_doc_length = avg_lengths[imrad]

            per_word = {}
            for word in query.split():
                doc_freq = doc_freqs[imrad][word]
                if not doc_freq:
                    # Words with a zero/missing df value are not scored.
                    continue

                tf_val = hist.get_tf(word)
                idf_val = math.log10(
                    (paper_count - doc_freq + 0.5) / (doc_freq + 0.5))

                numerator = tf_val * (k1 + 1)
                length_norm = 1 - b + b * (doc_length / avg_doc_length)
                denominator = tf_val + k1 * length_norm

                per_word[word] = {
                    "bm25": idf_val * (numerator / denominator),
                    "tf": tf_val,
                    "idf": idf_val,
                }

            details[imrad] = {
                "sumwords": sum(hist.values()),
                "keys": per_word,
                "score": sum(entry["bm25"] for entry in per_word.values()),
            }

        total_score = sum(section["score"] for section in details.values())
        return total_score, details
Example #8
0
    def get_ranking(paper, queries, settings, api):
        """Score ``paper`` against ``queries`` by summing raw term frequencies.

        Each section's rank and word list are computed independently, so a
        section's ``"rank"`` covers only its own query words and the returned
        total counts every section exactly once.

        Args:
            paper: object exposing ``word_hist`` and
                ``get_sections_with_imrad_type(imrad)``.
            queries: mapping of section key (``"whole-document"`` or an IMRaD
                type) to a whitespace-separated query string.
            settings: not used by this ranking.
            api: not used by this ranking.

        Returns:
            tuple: ``(total_rank, info)`` where ``info`` maps each section key
            to ``"rank"`` (sum of the term frequencies of its query words),
            ``"sumwords"`` and ``"keyvalues"`` (a list of ``[word, tf]``
            pairs for that section only).
        """
        info = {}

        for imrad_type, query in queries.items():
            if imrad_type == "whole-document":
                hist = paper.word_hist
            else:
                sections = paper.get_sections_with_imrad_type(imrad_type)
                hist = sections_to_word_hist(sections)

            # Fresh accumulators per section: sharing them across sections
            # made every later "rank" a running total and double-counted
            # earlier sections in the final sum.
            word_value = [[word, hist.get_tf(word)] for word in query.split()]
            section_rank = sum((rank for _, rank in word_value), 0.0)

            info[imrad_type] = {
                "rank": section_rank,
                "sumwords": sum(hist.values()),
                "keyvalues": word_value
            }

        return sum(entry["rank"] for entry in info.values()), info