def calculate_overall_ranking(self, raw_queries, settings):
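        """Compute mean average precision (MAP) with and without importance weighting of
        sections and print one LaTeX table row ("mode & #queries & MAP") per mode."""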
        api = API()
        mean_ap_whole = []
        mean_ap_doc = []

        queries = self.__raw_queries_to_queries(raw_queries)
        settings["mode"] = Mode.without_importance_to_sections
        settings_sec = copy.deepcopy(settings)
        settings_sec["mode"] = Mode.importance_to_sections

        for i, query in enumerate(queries):
            progressBar(i, len(queries))
            ranked_papers_whole = api.get_papers({"whole-document": query["search_query"]}, settings)
            ranked_papers_sec = api.get_papers({query["imrad"]: query["search_query"]}, settings_sec)

            relevant_paper = [api.get_paper(reference["paper_id"]) for reference in query["references"]]

            ap_whole = self.average_precision(ranked_papers_whole, relevant_paper)
            ap_doc = self.average_precision(ranked_papers_sec, relevant_paper)

            mean_ap_whole.append(ap_whole)
            mean_ap_doc.append(ap_doc)

        result_whole = sum(mean_ap_whole) / len(mean_ap_whole)
        result_doc = sum(mean_ap_doc) / len(mean_ap_doc)
        print()
        print("{} & {} & {}".format(Mode.without_importance_to_sections.name.replace("_", " "), len(mean_ap_whole),
                                    round(result_whole, 4)))
        print("{} & {} & {}".format(Mode.importance_to_sections.name.replace("_", " "), len(mean_ap_doc),
                                    round(result_doc, 4)))
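
# For reference, a minimal sketch of what the average_precision helper used above
# presumably computes, assuming it receives the ranked result list and the list of
# relevant papers and matches them by paper id (hypothetical, not the actual code):
def average_precision_sketch(ranked_papers, relevant_papers):
    relevant_ids = {paper.id for paper in relevant_papers}
    hits, precision_sum = 0, 0.0
    for rank, paper in enumerate(ranked_papers, start=1):
        if paper.id in relevant_ids:
            hits += 1
            precision_sum += hits / rank
    # average over the number of relevant papers (standard AP definition)
    return precision_sum / len(relevant_ids) if relevant_ids else 0.0
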
def create_graph():
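    """Build an undirected citation graph (papers as nodes, reference links as edges)
    and print basic graph statistics plus the degree distribution."""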
    print('create graph')
    api = API()
    papers = api.get_all_paper()
    g = nx.Graph()
    g.clear()

    for paper in papers:
        references = [
            x.get_paper_id() for x in paper.references if x.get_paper_id()
        ]

        for ref_id in references:
            g.add_edge(str(paper.id), str(ref_id))

    degrees = [len(g.edges(node)) for node in g.nodes]

    for degree in degrees:
        if degree == 0:
            print("warning: node with degree 0 found")

    print("# nodes: ", g.number_of_nodes())
    print("# edges: ", g.number_of_edges())
    print("# components: ", len(list(nx.connected_components(g))))
    print("max degree: ", max(degrees))
    print("mean degree: ", round(mean(degrees), 4))
    print("median degree: ", statistics.median(degrees))
    print("diameter: ", nx.diameter(g), " (maximum eccentricity - max path)")
    print("periphery: ", len(nx.periphery(g)),
          " (# nodes eccentricity equal to the diameter)")
    create_degree_distribution(degrees, 'Degree Distribution', '#00365A', 13,
                               100, 3.5)
def admin_index():
    if session.get('logged_in'):
        api = API()
        papers = api.get_all_paper()
        return render_template('admin/papers.html', papers=papers)
    else:
        return render_template('admin/index.html')
def __add_user():
    print("Add new admin to the database")
    name = input("username: ")
    # The statements that read the password and store the new admin were masked
    # ("******") in the source and are omitted here.
    print("Welcome on board {}".format(name))
def user_info():
    if not session.get('logged_in'):
        return redirect('admin/')

    api = API()
    users = api.get_all_user()
    return render_template('admin/users.html', users=users)
def remove_duplicates_from_cited_by():
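    """Deduplicate each paper's cited_by list (preserving order) and persist the change."""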
    print("\nRemove Duplicates")
    api = API()
    papers = api.get_all_paper()

    for i, paper in enumerate(papers):
        progressBar(i, len(papers))
        paper.cited_by = list(dict.fromkeys(paper.cited_by))
        api.client.update_paper(paper)
def paper_info(paper_id):
    api = API()
    papers = api.get_all_paper()
    id_to_filename = {paper.id: paper.filename for paper in papers}
    paper = api.get_paper(paper_id)

    return render_template('admin/paper_info.html',
                           paper=paper,
                           id_to_filename=id_to_filename)
def analize_chapters():
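    """Count how often characteristic keywords occur in the section headings of each
    IMRaD part and print the resulting distributions."""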
    api = API()
    papers = api.get_all_paper()
    introduction, background, methods, result, discussion = {}, {}, {}, {}, {}
    print("# papers: ", len(papers))
    for paper in papers:
        intro_titles = [
            sec.heading_proceed for sec in paper.get_introduction()
        ]
        back_titles = [sec.heading_proceed for sec in paper.get_background()]
        methods_titles = [sec.heading_proceed for sec in paper.get_methods()]
        result_titles = [sec.heading_proceed for sec in paper.get_results()]
        discuss_titles = [
            sec.heading_proceed for sec in paper.get_discussion()
        ]

        intro_word = __is_word_in_titles(intro_titles, ["introduct"])
        back_word = __is_word_in_titles(back_titles,
                                        ["relat work", "background"])
        methods_word = __is_word_in_titles(methods_titles,
                                           ["method", "approach", "model"])
        results_word = __is_word_in_titles(result_titles,
                                           ["result", "experi", "evalu"])
        discuss_word = __is_word_in_titles(
            discuss_titles, ["discuss", "conclus", "futur work"])

        if intro_word:
            introduction[intro_word] = introduction.get(intro_word, 0) + 1

        if back_word:
            background[back_word] = background.get(back_word, 0) + 1

        if methods_word:
            methods[methods_word] = methods.get(methods_word, 0) + 1

        if results_word:
            result[results_word] = result.get(results_word, 0) + 1

        if discuss_word:
            discussion[discuss_word] = discussion.get(discuss_word, 0) + 1

    print("introduction:")
    print_imrad(introduction, len(papers))
    print("related work:")
    print_imrad(background, len(papers))
    print("methods:")
    print_imrad(methods, len(papers))
    print("result:")
    print_imrad(result, len(papers))
    print("discussion:")
    print_imrad(discussion, len(papers))
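
# A minimal sketch of the __is_word_in_titles helper used above, assuming it returns
# the first keyword that occurs as a substring of any processed heading and None
# otherwise (hypothetical, not the actual implementation):
def __is_word_in_titles_sketch(titles, keywords):
    for keyword in keywords:
        if any(keyword in title for title in titles):
            return keyword
    return None
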
def print_circles(circles):
    api = API()
    tmp = []
    for circle in circles:
        tmp_circle_array = []
        for node in circle:
            tmp_circle_array.append(api.get_paper(node).filename)
        tmp.append(tmp_circle_array)
    print(tmp)
    print(circles)
def create_directed_graph():
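    """Build a directed citation graph (edges point from citing paper to cited paper),
    remove a few known citation cycles, and print graph statistics."""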
    print('\ncreate directed graph')
    api = API()
    papers = api.get_all_paper()
    dg = nx.DiGraph()
    dg.clear()

    for paper in papers:
        references = [
            x.get_paper_id() for x in paper.references if x.get_paper_id()
        ]

        for ref_id in references:
            dg.add_edge(str(paper.id), str(ref_id))

    # Data cleaning: five pairs of preprints in the dataset cite each other,
    # which would otherwise introduce cycles into the citation graph.
    dg.remove_edge('5c52a9b9bf51c50be97c5145', '5c529cbdbf51c5359dce35f3')
    dg.remove_edge('5b0565406919df52a704f32c', '5b05673b6919df52a704f375')
    dg.remove_edge('5b97b226bf51c561194d9f1f', '5b05682a6919df52a704f395')
    dg.remove_edge('5c52a4f9bf51c50be97c5111', '5c533345bf51c5335baca21a')
    dg.remove_edge('5b97b0aebf51c561194d9f09', '5b97b31ebf51c561194d9f2a')
    print("# nodes: ", dg.number_of_nodes())
    print("# edges: ", dg.number_of_edges())
    print("# cycles: ", len(list(nx.simple_cycles(dg))))
    print("# strongly connected components: ",
          len(list(nx.strongly_connected_components(dg))))
    print("Dag longest path: ", len(nx.dag_longest_path(dg)))

    in_degrees = []
    out_degrees = []
    root_nodes = []
    for node in dg.nodes:
        if len(dg.in_edges(node)) > 0:
            in_degrees.append(len(dg.in_edges(node)))

        if len(dg.out_edges(node)) > 0:
            out_degrees.append(len(dg.out_edges(node)))

        if len(dg.out_edges(node)) == 0:
            root_nodes.append(node)

    print("# root nodes: ", len(root_nodes))
    print("In Degree:")
    print("  max degree: ", max(in_degrees))
    print("  mean degree: ", round(mean(in_degrees), 4))
    print("  median degree: ", statistics.median(in_degrees))
    print("\nOut Degree:")
    print("  max degree: ", max(out_degrees))
    print("  mean degree: ", round(mean(out_degrees), 4))
    print("  median degree: ", statistics.median(out_degrees))
    create_degree_distribution(in_degrees, 'In-Degree Distribution', '#33691e',
                               20, 100, 10)
    create_degree_distribution(out_degrees, 'Out-Degree Distribution',
                               '#e65100')
def admin_login():
    if request.method == 'GET':
        return redirect('admin/')
    api = API()
    if api.check_user_login(request.form['username'],
                            request.form['password']):
        session['logged_in'] = True
        papers = api.get_all_paper()
        return render_template('admin/papers.html', papers=papers)
    else:
        return redirect('admin/')
def evaluate_ranking_time(paper, settings):
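    """Time a single query-by-paper ranking run for the given paper and settings and
    print the elapsed wall-clock time."""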
    print("Evaluate ", settings["algorithm"])
    api = API()
    settings["mode"] = Mode.importance_to_sections
    settings["use-unclassified-chapters"] = True
    start = time.time()
    ranked_papers, queries = api.get_papers_with_paper(paper.filename,
                                                       settings)
    end = time.time()
    elapsed_time = end - start
    print("Elapsed time: ", elapsed_time)
    def test_importance_to_sections(self):
        api = API()
        papers = api.get_all_paper()
        settings = {"mode": Mode.importance_to_sections}

        queries = paper_to_queries(papers[0], settings)
        self.assertEqual(len(queries["whole-document"]), 0)
        self.assertGreater(len(queries[IMRaDType.INTRODUCTION.name]), 0)
        self.assertGreater(len(queries[IMRaDType.BACKGROUND.name]), 0)
        self.assertGreater(len(queries[IMRaDType.METHODS.name]), 0)
        self.assertGreater(len(queries[IMRaDType.RESULTS.name]), 0)
        self.assertGreater(len(queries[IMRaDType.DISCUSSION.name]), 0)
    def test_import_intro_search_back(self):
        api = API()
        papers = api.get_all_paper()
        settings = {
            "mode": Mode.areas,
            "input-area": Area.Introduction,
            "search-area": Area.Background
        }

        queries = paper_to_queries(papers[0], settings)
        self.assertEqual(len(queries["whole-document"]), 0)
        self.assertEqual(len(queries[IMRaDType.INTRODUCTION.name]), 0)
        self.assertGreater(len(queries[IMRaDType.BACKGROUND.name]), 0)
        self.assertEqual(len(queries[IMRaDType.METHODS.name]), 0)
        self.assertEqual(len(queries[IMRaDType.RESULTS.name]), 0)
        self.assertEqual(len(queries[IMRaDType.DISCUSSION.name]), 0)
Beispiel #15
0
def __add_files(folder):
    api = API()
    text_file = open("newpapers.txt", "a+")

    for filename in os.listdir(os.path.abspath(folder)):
        print('CURRENT FILE: ' + str(filename))

        if filename.endswith('.pdf'):
            src = folder + "/" + filename
            dst = UPLOAD_FOLDER + filename
            shutil.move(src, dst)
            try:
                paper = api.add_paper(filename)
                text_file.write(str(paper.id) + "\n")
            except (IOError, OSError, ClassificationError, DocumentTooLarge,
                    PaperInStorage) as e:
                print(e)
    text_file.close()
Beispiel #16
0
def evaluate_query_time(num_papers):
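    """Time query-by-paper ranking for the first num_papers papers and print per-paper,
    total, and mean elapsed times."""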
    api = API()
    papers = api.get_all_paper()[:num_papers]
    settings = TFIDF.get_default_config()
    settings["mode"] = Mode.importance_to_sections
    settings["use-unclassified-chapters"] = True

    all_elapsed_times = []
    for paper in papers:
        start = time.time()
        api.get_papers_with_paper(paper.filename, settings)
        end = time.time()
        elapsed_time = end - start
        print("Elapsed time: ", elapsed_time)
        all_elapsed_times.append(elapsed_time)

    print("Overall time: ", sum(all_elapsed_times))
    print("Mean: ", mean(all_elapsed_times))
def check_references():
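    """Check that every resolved reference is mirrored in the referenced paper's
    cited_by list; reset and try to repair inconsistent links."""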
    print("\nCheck References")
    api = API()
    papers = api.get_all_paper()

    for i, paper in enumerate(papers):
        progressBar(i, len(papers))

        other_papers = [p for p in papers if p.id != paper.id]
        for reference in paper.references:
            if not reference.get_paper_id():
                continue

            ref_paper = api.get_paper(reference.get_paper_id())
            if ref_paper.cited_by.count(paper.id) == 0:
                print()
                reference.paper_id = []
                api.client.update_paper(paper)
                repair_corrupt_reference(reference, paper, other_papers, api)
    def test_simple_ranking(self):
        queries = {
            IMRaDType.INTRODUCTION.name: "aaa",
            IMRaDType.BACKGROUND.name: "",
            IMRaDType.METHODS.name: "aaa bbb ccc ddd eee fff",
            IMRaDType.RESULTS.name: "",
            IMRaDType.DISCUSSION.name: "",
            "whole-document": "ggg aaa ccc"
        }

        settings = {"importance_sections": False, **TF.get_default_config()}

        api = API()
        ret = api.get_papers(queries, settings)

        self.assertGreater(len(ret), 0)
def check_cited_by():
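    """Check that every cited_by entry points to a stored paper that actually references
    this paper; drop or re-link inconsistent entries."""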
    print("\nCheck Cited by")
    api = API()
    papers = api.get_all_paper()

    for i, paper in enumerate(papers):
        progressBar(i, len(papers))
        # iterate over a copy, since entries may be removed from cited_by below
        for cited_paper_id in list(paper.cited_by):
            if not api.contains_paper(cited_paper_id):
                paper.cited_by.remove(cited_paper_id)
                api.client.update_paper(paper)
                continue

            cited_paper = api.get_paper(cited_paper_id)
            cited_paper_refs = [ref.get_paper_id() for ref in cited_paper.references if ref.get_paper_id()]

            if cited_paper_refs.count(paper.id) == 0:
                print()
                paper.cited_by.remove(cited_paper_id)
                api.client.update_paper(paper)
                link_references_to_paper(cited_paper, paper, api)
def __link_references_to_paper():
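    """Cross-link references between stored papers (all of them, or only the new ones
    listed in newpapers.txt), checkpointing progress to finished_papers.txt."""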
    api = API()
    all_papers = api.get_all_paper()

    finished_files = []
    if not os.path.isfile(REQ_DATA_PATH + "finished_papers.txt"):
        with open(REQ_DATA_PATH + "finished_papers.txt", 'wb') as fp:
            pickle.dump(finished_files, fp)

    with open(REQ_DATA_PATH + "finished_papers.txt", 'rb') as fp:
        finished_files = pickle.load(fp)

    if os.path.isfile("newpapers.txt"):
        papers = []
        with open("newpapers.txt", 'r') as fp:
            for paper_id in fp:
                papers.append(api.get_paper(paper_id.rstrip()))
    else:
        papers = api.get_all_paper()

    for i, paper in enumerate(papers, start=1):
        print("(", i, "/", len(papers), ")")

        if paper.id in finished_files:
            continue

        other_papers = [p for p in all_papers if p.id != paper.id]
        for other_paper in other_papers:
            if os.path.isfile("newpapers.txt"):
                link_references_to_paper(other_paper, paper, api)

            link_references_to_paper(paper, other_paper, api)

        finished_files.append(paper.id)
        with open(REQ_DATA_PATH + "finished_papers.txt", 'wb') as fp:
            pickle.dump(finished_files, fp)
def remove_link(paper_id):
    if not session.get('logged_in'):
        return redirect('admin/')

    api = API()
    api.remove__link_of_paper(paper_id, request.form['ref_paper_id'])

    papers = api.get_all_paper()
    id_to_filename = {paper.id: paper.filename for paper in papers}
    paper = api.get_paper(paper_id)
    return render_template('admin/paper_info.html',
                           paper=paper,
                           id_to_filename=id_to_filename)
    def calculate_ranking_sections(self, raw_queries, settings):
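        """Compute mean average precision (MAP) when searching each IMRaD section
        separately and print one LaTeX table row per section."""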
        api = API()
        mean_ap_intro, mean_ap_background, mean_ap_methods, mean_ap_result, mean_ap_discussion = [], [], [], [], []

        queries = self.__raw_queries_to_queries(raw_queries)

        for i, query in enumerate(queries):
            progressBar(i, len(queries))
            relevant_paper = [api.get_paper(reference["paper_id"]) for reference in query["references"]]

            ranked_papers_intro = api.get_papers({IMRaDType.INTRODUCTION.name: query["search_query"]}, settings)
            ranked_papers_background = api.get_papers({IMRaDType.BACKGROUND.name: query["search_query"]}, settings)
            ranked_papers_methods = api.get_papers({IMRaDType.METHODS.name: query["search_query"]}, settings)
            ranked_papers_result = api.get_papers({IMRaDType.RESULTS.name: query["search_query"]}, settings)
            ranked_papers_discussion = api.get_papers({IMRaDType.DISCUSSION.name: query["search_query"]}, settings)

            ap_intro = self.average_precision(ranked_papers_intro, relevant_paper)
            ap_background = self.average_precision(ranked_papers_background, relevant_paper)
            ap_methods = self.average_precision(ranked_papers_methods, relevant_paper)
            ap_result = self.average_precision(ranked_papers_result, relevant_paper)
            ap_discussion = self.average_precision(ranked_papers_discussion, relevant_paper)

            mean_ap_intro.append(ap_intro)
            mean_ap_background.append(ap_background)
            mean_ap_methods.append(ap_methods)
            mean_ap_result.append(ap_result)
            mean_ap_discussion.append(ap_discussion)

        print()
        print("{} & {} & {}".format(Mode.only_introduction.name.replace("_", " "),
                                    len(mean_ap_intro), sum(mean_ap_intro) / len(mean_ap_intro)))
        print("{} & {} & {}".format(Mode.only_background.name.replace("_", " "),
                                    len(mean_ap_background), sum(mean_ap_background) / len(mean_ap_background)))
        print("{} & {} & {}".format(Mode.only_methods.name.replace("_", " "),
                                    len(mean_ap_methods), sum(mean_ap_methods) / len(mean_ap_methods)))
        print("{} & {} & {}".format(Mode.only_results.name.replace("_", " "),
                                    len(mean_ap_result), sum(mean_ap_result) / len(mean_ap_result)))
        print("{} & {} & {}".format(Mode.only_discussion.name.replace("_", " "),
                                    len(mean_ap_discussion), sum(mean_ap_discussion) / len(mean_ap_discussion)))
# encoding: utf-8
import json
import os

from flask import Blueprint, render_template, request, current_app, send_file

from engine.api import API
from engine.datastore.ranking.ranked_boolean_retrieval import RankedBoolean
from engine.datastore.ranking.tfidf import TFIDF
from engine.datastore.models.section import IMRaDType

backend = Blueprint('backend', __name__)
api = API()


@backend.route('/', methods=["GET", "POST"])
def index():
    if request.method == "GET":
        return render_template('index.html',
                               error=None,
                               algorithm=api.get_ranking_algos())

    queries = {
        "whole-document": request.form['whole_text'],
        IMRaDType.INTRODUCTION.name: request.form['intro_text'],
        IMRaDType.BACKGROUND.name: request.form['background_text'],
        IMRaDType.METHODS.name: request.form['methods_text'],
        IMRaDType.RESULTS.name: request.form['results_text'],
        IMRaDType.DISCUSSION.name: request.form['discussion_text']
    }
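
    # Hypothetical continuation (not part of the original snippet): choose a ranking
    # configuration and render the ranked results, for example:
    #     settings = TFIDF.get_default_config()
    #     papers = api.get_papers(queries, settings)
    #     return render_template('index.html', papers=papers,
    #                            algorithm=api.get_ranking_algos(), error=None)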
    print("Elapsed time: ", elapsed_time)


def evaluate_query_time(num_papers):
    api = API()
    papers = api.get_all_paper()[:num_papers]
    settings = TFIDF.get_default_config()
    settings["mode"] = Mode.importance_to_sections
    settings["use-unclassified-chapters"] = True

    all_elapsed_times = []
    for paper in papers:
        start = time.time()
        api.get_papers_with_paper(paper.filename, settings)
        end = time.time()
        elapsed_time = start - end
        print("Elapsed time: ", elapsed_time)
        all_elapsed_times.append(elapsed_time)

    print("Overall time: ", sum(all_elapsed_times))
    print("Mean: ", mean(all_elapsed_times))


if __name__ == "__main__":
    # evaluate_query_time(10)
    api = API()
    papers = api.get_all_paper()
    evaluate_ranking_time(papers[0], RankedBoolean.get_default_config())
    evaluate_ranking_time(papers[0],
                          DivergenceFromRandomness.get_default_config())
    def __init__(self):
        self.api = API()