def calculate_overall_ranking(self, raw_queries, settings):
    # Compare mean average precision of whole-document search against
    # section-aware search; the output lines are LaTeX table rows.
    api = API()
    mean_ap_whole = []
    mean_ap_doc = []
    queries = self.__raw_queries_to_queries(raw_queries)

    settings["mode"] = Mode.without_importance_to_sections
    settings_sec = copy.deepcopy(settings)
    settings_sec["mode"] = Mode.importance_to_sections

    for i, query in enumerate(queries):
        progressBar(i, len(queries))
        ranked_papers_whole = api.get_papers({"whole-document": query["search_query"]}, settings)
        ranked_papers_sec = api.get_papers({query["imrad"]: query["search_query"]}, settings_sec)
        relevant_paper = [api.get_paper(reference["paper_id"]) for reference in query["references"]]

        ap_whole = self.average_precision(ranked_papers_whole, relevant_paper)
        ap_doc = self.average_precision(ranked_papers_sec, relevant_paper)
        mean_ap_whole.append(ap_whole)
        mean_ap_doc.append(ap_doc)

    result_whole = sum(mean_ap_whole) / len(mean_ap_whole)
    result_doc = sum(mean_ap_doc) / len(mean_ap_doc)

    print()
    print("{} & {} & {}".format(Mode.without_importance_to_sections.name.replace("_", " "),
                                len(mean_ap_whole), round(result_whole, 4)))
    print("{} & {} & {}".format(Mode.importance_to_sections.name.replace("_", " "),
                                len(mean_ap_doc), round(result_doc, 4)))

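# self.average_precision is used above and in calculate_ranking_sections but is not
# shown in this file. A minimal sketch of the standard metric, assuming ranked_papers
# is ordered best-first and papers compare by id (both assumptions, not the project's
# actual implementation):
def average_precision(self, ranked_papers, relevant_paper):
    relevant_ids = {paper.id for paper in relevant_paper}
    hits, precision_sum = 0, 0.0
    for rank, paper in enumerate(ranked_papers, start=1):
        if paper.id in relevant_ids:
            hits += 1
            precision_sum += hits / rank
    return precision_sum / len(relevant_ids) if relevant_ids else 0.0
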
def create_graph():
    print('create graph')
    api = API()
    papers = api.get_all_paper()

    g = nx.Graph()
    g.clear()

    for paper in papers:
        references = [x.get_paper_id() for x in paper.references if x.get_paper_id()]
        for ref_id in references:
            g.add_edge(str(paper.id), str(ref_id))

    degrees = [len(g.edges(node)) for node in g.nodes]

    # Sanity check: no node should be isolated, since every node was added via an edge.
    for degree in degrees:
        if degree == 0:
            print("nope!")

    print("# nodes: ", g.number_of_nodes())
    print("# edges: ", g.number_of_edges())
    print("# components: ", len(list(nx.connected_components(g))))
    print("max degree: ", max(degrees))
    print("mean degree: ", round(mean(degrees), 4))
    print("median degree: ", statistics.median(degrees))
    print("diameter: ", nx.diameter(g), " (maximum eccentricity - max path)")
    print("periphery: ", len(nx.periphery(g)), " (# nodes with eccentricity equal to the diameter)")

    create_degree_distribution(degrees, 'Degree Distribution', '#00365A', 13, 100, 3.5)

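# create_degree_distribution is not defined in this file, and the meaning of the three
# trailing numbers in the calls above is not recoverable from this section. A minimal
# matplotlib sketch, assuming they are an x-axis limit, a y-axis limit, and a vertical
# marker (e.g. the mean degree) - all of which are assumptions:
import matplotlib.pyplot as plt

def create_degree_distribution(degrees, title, color, x_max=None, y_max=None, marker=None):
    plt.figure()
    plt.hist(degrees, bins=range(1, max(degrees) + 2), color=color)
    if x_max is not None:
        plt.xlim(0, x_max)
    if y_max is not None:
        plt.ylim(0, y_max)
    if marker is not None:
        plt.axvline(marker, linestyle='--', color='black')  # assumed: highlight the mean
    plt.title(title)
    plt.xlabel('degree')
    plt.ylabel('# nodes')
    plt.savefig(title.lower().replace(' ', '_') + '.png')  # assumed output path
    plt.close()
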
def admin_index():
    if 'logged_in' in session.keys() and session['logged_in']:
        api = API()
        papers = api.get_all_paper()
        return render_template('admin/papers.html', papers=papers)

    return render_template('admin/index.html')

def __add_user():
    print("Add new admin to the database")
    name = input("username: ")
    # ****** (credential-handling code redacted in the source) ******
    print("Welcome on board {}".format(name))

def user_info():
    if not ('logged_in' in session.keys() and session['logged_in']):
        return redirect('admin/')

    api = API()
    users = api.get_all_user()
    return render_template('admin/users.html', users=users)

def remove_duplicates_from_cited_by():
    print("\nRemove Duplicates")
    api = API()
    papers = api.get_all_paper()

    for i, paper in enumerate(papers):
        progressBar(i, len(papers))
        # dict.fromkeys de-duplicates while preserving the original order
        paper.cited_by = list(dict.fromkeys(paper.cited_by))
        api.client.update_paper(paper)

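# progressBar is used throughout this module but not defined here. A minimal sketch of
# a one-line console progress bar with (current, total) semantics - the formatting is
# an assumption, not the project's implementation:
import sys

def progressBar(current, total, bar_length=40):
    fraction = (current + 1) / total
    filled = int(bar_length * fraction)
    bar = '=' * filled + '-' * (bar_length - filled)
    sys.stdout.write('\r[{}] {}/{}'.format(bar, current + 1, total))
    sys.stdout.flush()
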
def paper_info(paper_id):
    api = API()
    papers = api.get_all_paper()
    id_to_filename = {paper.id: paper.filename for paper in papers}
    paper = api.get_paper(paper_id)
    return render_template('admin/paper_info.html', paper=paper, id_to_filename=id_to_filename)

def analize_chapters():
    api = API()
    papers = api.get_all_paper()
    introduction, background, methods, result, discussion = {}, {}, {}, {}, {}
    print("# papers: ", len(papers))

    for paper in papers:
        intro_titles = [sec.heading_proceed for sec in paper.get_introduction()]
        back_titles = [sec.heading_proceed for sec in paper.get_background()]
        methods_titles = [sec.heading_proceed for sec in paper.get_methods()]
        result_titles = [sec.heading_proceed for sec in paper.get_results()]
        discuss_titles = [sec.heading_proceed for sec in paper.get_discussion()]

        intro_word = __is_word_in_titles(intro_titles, ["introduct"])
        back_word = __is_word_in_titles(back_titles, ["relat work", "background"])
        methods_word = __is_word_in_titles(methods_titles, ["method", "approach", "model"])
        results_word = __is_word_in_titles(result_titles, ["result", "experi", "evalu"])
        discuss_word = __is_word_in_titles(discuss_titles, ["discuss", "conclus", "futur work"])

        if intro_word:
            introduction[intro_word] = introduction.get(intro_word, 0) + 1
        if back_word:
            background[back_word] = background.get(back_word, 0) + 1
        if methods_word:
            methods[methods_word] = methods.get(methods_word, 0) + 1
        if results_word:
            result[results_word] = result.get(results_word, 0) + 1
        if discuss_word:
            discussion[discuss_word] = discussion.get(discuss_word, 0) + 1

    print("introduction:")
    print_imrad(introduction, len(papers))
    print("related work:")
    print_imrad(background, len(papers))
    print("methods:")
    print_imrad(methods, len(papers))
    print("result:")
    print_imrad(result, len(papers))
    print("discussion:")
    print_imrad(discussion, len(papers))

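# __is_word_in_titles and print_imrad are not defined in this file. Minimal sketches,
# assuming the first returns the first keyword that occurs as a substring of any title
# (else None) and the second prints each keyword's count as a share of all papers -
# both behaviors are assumptions:
def __is_word_in_titles(titles, words):
    for word in words:
        if any(word in title for title in titles):
            return word
    return None

def print_imrad(counts, num_papers):
    for word, count in counts.items():
        print("  {}: {} ({}%)".format(word, count, round(100 * count / num_papers, 2)))
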
def print_circles(circles):
    api = API()
    tmp = []
    for circle in circles:
        tmp_circle_array = []
        for node in circle:
            tmp_circle_array.append(api.get_paper(node).filename)
        tmp.append(tmp_circle_array)
    print(tmp)
    print(circles)

def create_directed_graph():
    print('\ncreate directed graph')
    api = API()
    papers = api.get_all_paper()

    dg = nx.DiGraph()
    dg.clear()

    for paper in papers:
        references = [x.get_paper_id() for x in paper.references if x.get_paper_id()]
        for ref_id in references:
            dg.add_edge(str(paper.id), str(ref_id))

    # Data cleaning: the dataset contains five pairs of preprints that cite each other,
    # which would otherwise introduce cycles. Remove one edge of each pair.
    dg.remove_edge('5c52a9b9bf51c50be97c5145', '5c529cbdbf51c5359dce35f3')
    dg.remove_edge('5b0565406919df52a704f32c', '5b05673b6919df52a704f375')
    dg.remove_edge('5b97b226bf51c561194d9f1f', '5b05682a6919df52a704f395')
    dg.remove_edge('5c52a4f9bf51c50be97c5111', '5c533345bf51c5335baca21a')
    dg.remove_edge('5b97b0aebf51c561194d9f09', '5b97b31ebf51c561194d9f2a')

    print("# nodes: ", dg.number_of_nodes())
    print("# edges: ", dg.number_of_edges())
    print("# cycles: ", len(list(nx.simple_cycles(dg))))
    print("# strongly connected components: ", len(list(nx.strongly_connected_components(dg))))
    print("Dag longest path: ", len(nx.dag_longest_path(dg)))

    in_degrees = []
    out_degrees = []
    root_nodes = []  # papers with no outgoing edges, i.e. citing no other paper in the dataset
    for node in dg.nodes:
        if len(dg.in_edges(node)) > 0:
            in_degrees.append(len(dg.in_edges(node)))
        if len(dg.out_edges(node)) > 0:
            out_degrees.append(len(dg.out_edges(node)))
        if len(dg.out_edges(node)) == 0:
            root_nodes.append(node)

    print("# root nodes: ", len(root_nodes))
    print("In Degree:")
    print("  max degree: ", max(in_degrees))
    print("  mean degree: ", round(mean(in_degrees), 4))
    print("  median degree: ", statistics.median(in_degrees))
    print("\nOut Degree:")
    print("  max degree: ", max(out_degrees))
    print("  mean degree: ", round(mean(out_degrees), 4))
    print("  median degree: ", statistics.median(out_degrees))

    create_degree_distribution(in_degrees, 'In-Degree Distribution', '#33691e', 20, 100, 10)
    create_degree_distribution(out_degrees, 'Out-Degree Distribution', '#e65100')

def admin_login():
    if request.method == 'GET':
        return redirect('admin/')

    api = API()
    if api.check_user_login(request.form['username'], request.form['password']):
        session['logged_in'] = True
        papers = api.get_all_paper()
        return render_template('admin/papers.html', papers=papers)

    return redirect('admin/')

def evaluate_ranking_time(paper, settings):
    print("Evaluate ", settings["algorithm"])
    api = API()
    settings["mode"] = Mode.importance_to_sections
    settings["use-unclassified-chapters"] = True

    start = time.time()
    ranked_papers, queries = api.get_papers_with_paper(paper.filename, settings)
    end = time.time()

    elapsed_time = end - start
    print("Elapsed time: ", elapsed_time)

def test_importance_to_sections(self):
    api = API()
    papers = api.get_all_paper()
    settings = {"mode": Mode.importance_to_sections}
    queries = paper_to_queries(papers[0], settings)

    self.assertEqual(len(queries["whole-document"]), 0)
    self.assertGreater(len(queries[IMRaDType.INTRODUCTION.name]), 0)
    self.assertGreater(len(queries[IMRaDType.BACKGROUND.name]), 0)
    self.assertGreater(len(queries[IMRaDType.METHODS.name]), 0)
    self.assertGreater(len(queries[IMRaDType.RESULTS.name]), 0)
    self.assertGreater(len(queries[IMRaDType.DISCUSSION.name]), 0)

def test_import_intro_search_back(self):
    api = API()
    papers = api.get_all_paper()
    settings = {
        "mode": Mode.areas,
        "input-area": Area.Introduction,
        "search-area": Area.Background
    }
    queries = paper_to_queries(papers[0], settings)

    self.assertEqual(len(queries["whole-document"]), 0)
    self.assertEqual(len(queries[IMRaDType.INTRODUCTION.name]), 0)
    self.assertGreater(len(queries[IMRaDType.BACKGROUND.name]), 0)
    self.assertEqual(len(queries[IMRaDType.METHODS.name]), 0)
    self.assertEqual(len(queries[IMRaDType.RESULTS.name]), 0)
    self.assertEqual(len(queries[IMRaDType.DISCUSSION.name]), 0)

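# paper_to_queries is exercised by both tests above but not defined in this file. A
# minimal sketch of the two tested modes, assuming section text is reachable via
# sec.text and that Mode.areas routes the input area's text under the search area's
# key (the attribute names and the routing rule are assumptions):
def paper_to_queries(paper, settings):
    sections = {
        IMRaDType.INTRODUCTION: paper.get_introduction(),
        IMRaDType.BACKGROUND: paper.get_background(),
        IMRaDType.METHODS: paper.get_methods(),
        IMRaDType.RESULTS: paper.get_results(),
        IMRaDType.DISCUSSION: paper.get_discussion(),
    }
    queries = {"whole-document": ""}
    queries.update({imrad.name: "" for imrad in sections})

    if settings["mode"] == Mode.importance_to_sections:
        for imrad, secs in sections.items():
            queries[imrad.name] = " ".join(sec.text for sec in secs)
    elif settings["mode"] == Mode.areas:
        input_secs = sections[IMRaDType[settings["input-area"].name.upper()]]
        search_key = IMRaDType[settings["search-area"].name.upper()].name
        queries[search_key] = " ".join(sec.text for sec in input_secs)
    return queries
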
def __add_files(folder):
    api = API()
    with open("newpapers.txt", "a+") as text_file:
        for filename in os.listdir(os.path.abspath(folder)):
            print('CURRENT FILE: ' + str(filename))
            if filename.endswith('.pdf'):
                src = folder + "/" + filename
                dst = UPLOAD_FOLDER + filename
                shutil.move(src, dst)
                try:
                    paper = api.add_paper(filename)
                    text_file.write(str(paper.id) + "\n")
                except (IOError, OSError, ClassificationError, DocumentTooLarge, PaperInStorage) as e:
                    print(e)

def evaluate_query_time(num_papers):
    api = API()
    papers = api.get_all_paper()[:num_papers]
    settings = TFIDF.get_default_config()
    settings["mode"] = Mode.importance_to_sections
    settings["use-unclassified-chapters"] = True

    all_elapsed_times = []
    for paper in papers:
        start = time.time()
        api.get_papers_with_paper(paper.filename, settings)
        end = time.time()

        elapsed_time = end - start
        print("Elapsed time: ", elapsed_time)
        all_elapsed_times.append(elapsed_time)

    print("Overall time: ", sum(all_elapsed_times))
    print("Mean: ", mean(all_elapsed_times))

def check_references():
    print("\nCheck References")
    api = API()
    papers = api.get_all_paper()

    for i, paper in enumerate(papers):
        progressBar(i, len(papers))
        other_papers = [p for p in papers if p.id != paper.id]

        for reference in paper.references:
            if not reference.get_paper_id():
                continue

            ref_paper = api.get_paper(reference.get_paper_id())
            if ref_paper.cited_by.count(paper.id) == 0:
                print()  # newline so the following output is not appended to the progress bar
                reference.paper_id = []
                api.client.update_paper(paper)
                repair_corrupt_reference(reference, paper, other_papers, api)

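# repair_corrupt_reference is not defined in this file. A minimal sketch, assuming it
# re-resolves the cleared reference by title against the remaining papers and restores
# the link on both sides (the matching rule and the update calls are assumptions):
def repair_corrupt_reference(reference, paper, other_papers, api):
    for candidate in other_papers:
        if reference.title and reference.title.lower() == candidate.title.lower():
            reference.paper_id = [candidate.id]
            candidate.cited_by.append(paper.id)
            api.client.update_paper(paper)
            api.client.update_paper(candidate)
            return
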
def test_simple_ranking(self):
    queries = {
        IMRaDType.INTRODUCTION.name: "aaa",
        IMRaDType.BACKGROUND.name: "",
        IMRaDType.METHODS.name: "aaa bbb ccc ddd eee fff",
        IMRaDType.RESULTS.name: "",
        IMRaDType.DISCUSSION.name: "",
        "whole-document": "ggg aaa ccc"
    }
    settings = {**{"importance_sections": False}, **TF.get_default_config()}

    api = API()
    ret = api.get_papers(queries, settings)
    self.assertGreater(len(ret), 0)

def check_cited_by():
    print("\nCheck Cited by")
    api = API()
    papers = api.get_all_paper()

    for i, paper in enumerate(papers):
        progressBar(i, len(papers))

        # Iterate over a copy: entries may be removed from paper.cited_by below.
        for cited_paper_id in list(paper.cited_by):
            if not api.contains_paper(cited_paper_id):
                paper.cited_by.remove(cited_paper_id)
                api.client.update_paper(paper)
                continue

            cited_paper = api.get_paper(cited_paper_id)
            cited_paper_refs = [ref.get_paper_id() for ref in cited_paper.references if ref.get_paper_id()]
            if cited_paper_refs.count(paper.id) == 0:
                print()  # newline so the following output is not appended to the progress bar
                paper.cited_by.remove(cited_paper_id)
                api.client.update_paper(paper)
                link_references_to_paper(cited_paper, paper, api)

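# link_references_to_paper is used above and in __link_references_to_paper below, but
# is not defined in this file. A minimal sketch, assuming it matches an unresolved
# reference of citing_paper against cited_paper by title and, on a hit, stores the
# link on both sides (the matching rule and the update calls are assumptions):
def link_references_to_paper(citing_paper, cited_paper, api):
    for reference in citing_paper.references:
        if reference.get_paper_id():
            continue  # already linked
        if reference.title and reference.title.lower() == cited_paper.title.lower():
            reference.paper_id = [cited_paper.id]
            cited_paper.cited_by.append(citing_paper.id)
            api.client.update_paper(citing_paper)
            api.client.update_paper(cited_paper)
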
def __link_references_to_paper():
    api = API()
    all_papers = api.get_all_paper()

    # Resume support: ids of already-processed papers are persisted between runs.
    finished_files = []
    if not os.path.isfile(REQ_DATA_PATH + "finished_papers.txt"):
        with open(REQ_DATA_PATH + "finished_papers.txt", 'wb') as fp:
            pickle.dump(finished_files, fp)

    with open(REQ_DATA_PATH + "finished_papers.txt", 'rb') as fp:
        finished_files = pickle.load(fp)

    # If newpapers.txt exists, only link the papers listed there; otherwise link all.
    if os.path.isfile("newpapers.txt"):
        papers = []
        with open("newpapers.txt", 'r') as fp:
            for paper_id in fp:
                papers.append(api.get_paper(paper_id.rstrip()))
    else:
        papers = api.get_all_paper()

    i = 0
    for paper in papers:
        i += 1
        print("(", i, "/", len(papers), ")")
        if paper.id in finished_files:
            continue

        other_papers = [p for p in all_papers if p.id != paper.id]
        for other_paper in other_papers:
            if os.path.isfile("newpapers.txt"):
                link_references_to_paper(other_paper, paper, api)
            link_references_to_paper(paper, other_paper, api)

        finished_files.append(paper.id)
        with open(REQ_DATA_PATH + "finished_papers.txt", 'wb') as fp:
            pickle.dump(finished_files, fp)

def remove_link(paper_id):
    if not ('logged_in' in session.keys() and session['logged_in']):
        return redirect('admin/')

    api = API()
    api.remove__link_of_paper(paper_id, request.form['ref_paper_id'])
    papers = api.get_all_paper()
    id_to_filename = {paper.id: paper.filename for paper in papers}
    paper = api.get_paper(paper_id)
    return render_template('admin/paper_info.html', paper=paper, id_to_filename=id_to_filename)

def calculate_ranking_sections(self, raw_queries, settings):
    api = API()
    mean_ap_intro, mean_ap_background, mean_ap_methods, mean_ap_result, mean_ap_discussion = [], [], [], [], []
    queries = self.__raw_queries_to_queries(raw_queries)

    for i, query in enumerate(queries):
        progressBar(i, len(queries))
        relevant_paper = [api.get_paper(reference["paper_id"]) for reference in query["references"]]

        ranked_papers_intro = api.get_papers({IMRaDType.INTRODUCTION.name: query["search_query"]}, settings)
        ranked_papers_background = api.get_papers({IMRaDType.BACKGROUND.name: query["search_query"]}, settings)
        ranked_papers_methods = api.get_papers({IMRaDType.METHODS.name: query["search_query"]}, settings)
        ranked_papers_result = api.get_papers({IMRaDType.RESULTS.name: query["search_query"]}, settings)
        ranked_papers_discussion = api.get_papers({IMRaDType.DISCUSSION.name: query["search_query"]}, settings)

        mean_ap_intro.append(self.average_precision(ranked_papers_intro, relevant_paper))
        mean_ap_background.append(self.average_precision(ranked_papers_background, relevant_paper))
        mean_ap_methods.append(self.average_precision(ranked_papers_methods, relevant_paper))
        mean_ap_result.append(self.average_precision(ranked_papers_result, relevant_paper))
        mean_ap_discussion.append(self.average_precision(ranked_papers_discussion, relevant_paper))

    print()
    print("{} & {} & {}".format(Mode.only_introduction.name.replace("_", " "),
                                len(mean_ap_intro), sum(mean_ap_intro) / len(mean_ap_intro)))
    print("{} & {} & {}".format(Mode.only_background.name.replace("_", " "),
                                len(mean_ap_background), sum(mean_ap_background) / len(mean_ap_background)))
    print("{} & {} & {}".format(Mode.only_methods.name.replace("_", " "),
                                len(mean_ap_methods), sum(mean_ap_methods) / len(mean_ap_methods)))
    print("{} & {} & {}".format(Mode.only_results.name.replace("_", " "),
                                len(mean_ap_result), sum(mean_ap_result) / len(mean_ap_result)))
    print("{} & {} & {}".format(Mode.only_discussion.name.replace("_", " "),
                                len(mean_ap_discussion), sum(mean_ap_discussion) / len(mean_ap_discussion)))

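# __raw_queries_to_queries is used by calculate_overall_ranking and
# calculate_ranking_sections but is not shown in this file. A minimal sketch, assuming
# each raw query already carries its query text, IMRaD label, and references, and that
# the helper only keeps queries whose references resolve to stored papers (an assumption):
def __raw_queries_to_queries(self, raw_queries):
    queries = []
    for raw in raw_queries:
        references = [ref for ref in raw.get("references", []) if ref.get("paper_id")]
        if references:
            queries.append({"search_query": raw["search_query"],
                            "imrad": raw["imrad"],
                            "references": references})
    return queries
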
# encoding: utf-8
import json
import os

from flask import Blueprint, render_template, request, current_app, send_file

from engine.api import API
from engine.datastore.ranking.ranked_boolean_retrieval import RankedBoolean
from engine.datastore.ranking.tfidf import TFIDF
from engine.datastore.models.section import IMRaDType

backend = Blueprint('backend', __name__)
api = API()


@backend.route('/', methods=["GET", "POST"])
def index():
    if request.method == "GET":
        return render_template('index.html', error=None, algorithm=api.get_ranking_algos())

    queries = {
        "whole-document": request.form['whole_text'],
        IMRaDType.INTRODUCTION.name: request.form['intro_text'],
        IMRaDType.BACKGROUND.name: request.form['background_text'],
        IMRaDType.METHODS.name: request.form['methods_text'],
        IMRaDType.RESULTS.name: request.form['results_text'],
        IMRaDType.DISCUSSION.name: request.form['discussion_text']
    }

print("Elapsed time: ", elapsed_time) def evaluate_query_time(num_papers): api = API() papers = api.get_all_paper()[:num_papers] settings = TFIDF.get_default_config() settings["mode"] = Mode.importance_to_sections settings["use-unclassified-chapters"] = True all_elapsed_times = [] for paper in papers: start = time.time() api.get_papers_with_paper(paper.filename, settings) end = time.time() elapsed_time = start - end print("Elapsed time: ", elapsed_time) all_elapsed_times.append(elapsed_time) print("Overall time: ", sum(all_elapsed_times)) print("Mean: ", mean(all_elapsed_times)) if __name__ == "__main__": # evaluate_query_time(10) api = API() papers = api.get_all_paper() evaluate_ranking_time(papers[0], RankedBoolean.get_default_config()) evaluate_ranking_time(papers[0], DivergenceFromRandomness.get_default_config())
def __init__(self):
    self.api = API()