import os
import pickle
import statistics
import time
from statistics import mean

import networkx as nx
from flask import redirect, render_template, request, session

# Project-local names used below (API, Mode, Area, IMRaDType, TFIDF,
# RankedBoolean, DivergenceFromRandomness, REQ_DATA_PATH, paper_to_queries,
# link_references_to_paper, repair_corrupt_reference, evaluate_ranking_time,
# print_imrad, progressBar, create_degree_distribution) are imported from the
# repository's own modules.


def admin_index():
    if session.get('logged_in'):
        api = API()
        papers = api.get_all_paper()
        return render_template('admin/papers.html', papers=papers)
    return render_template('admin/index.html')
def create_graph():
    print('create graph')
    api = API()
    papers = api.get_all_paper()
    g = nx.Graph()
    for paper in papers:
        references = [x.get_paper_id() for x in paper.references if x.get_paper_id()]
        for ref_id in references:
            g.add_edge(str(paper.id), str(ref_id))

    degrees = [len(g.edges(node)) for node in g.nodes]
    # sanity check: every node was added via an edge, so no degree should be 0
    for degree in degrees:
        if degree == 0:
            print("nope!")

    print("# nodes: ", g.number_of_nodes())
    print("# edges: ", g.number_of_edges())
    print("# components: ", len(list(nx.connected_components(g))))
    print("max degree: ", max(degrees))
    print("mean degree: ", round(mean(degrees), 4))
    print("median degree: ", statistics.median(degrees))
    # note: nx.diameter requires a connected graph
    print("diameter: ", nx.diameter(g), " (maximum eccentricity, i.e. the longest shortest path)")
    print("periphery: ", len(nx.periphery(g)), " (# nodes whose eccentricity equals the diameter)")

    create_degree_distribution(degrees, 'Degree Distribution', '#00365A', 13, 100, 3.5)
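# create_degree_distribution is defined elsewhere in the repository. A minimal
# sketch of what such a plotting helper might look like, assuming matplotlib;
# the meaning of the trailing numeric arguments (taken here as x-limit,
# y-limit, and bar width) is an assumption, not confirmed by the source.
import matplotlib.pyplot as plt
from collections import Counter

def create_degree_distribution(degrees, title, color, xmax=None, ymax=None, bar_width=0.8):
    counts = Counter(degrees)  # degree -> number of nodes with that degree
    plt.bar(list(counts.keys()), list(counts.values()), width=bar_width, color=color)
    plt.title(title)
    plt.xlabel('degree')
    plt.ylabel('# nodes')
    if xmax is not None:
        plt.xlim(0, xmax)
    if ymax is not None:
        plt.ylim(0, ymax)
    plt.show()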
def remove_duplicates_from_cited_by():
    print("\nRemove Duplicates")
    api = API()
    papers = api.get_all_paper()
    for i, paper in enumerate(papers):
        progressBar(i, len(papers))
        # dict.fromkeys drops duplicates while preserving insertion order
        paper.cited_by = list(dict.fromkeys(paper.cited_by))
        api.client.update_paper(paper)
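# progressBar is a small console helper defined elsewhere in the repository.
# A minimal sketch consistent with the calls above (the rendering details are
# assumptions):
import sys

def progressBar(current, total, width=40):
    filled = int(width * (current + 1) / total)
    bar = '#' * filled + '-' * (width - filled)
    sys.stdout.write('\r[{}] {}/{}'.format(bar, current + 1, total))
    sys.stdout.flush()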
def paper_info(paper_id):
    api = API()
    papers = api.get_all_paper()
    id_to_filename = {paper.id: paper.filename for paper in papers}
    paper = api.get_paper(paper_id)
    return render_template('admin/paper_info.html', paper=paper, id_to_filename=id_to_filename)
def analize_chapters():
    api = API()
    papers = api.get_all_paper()
    introduction, background, methods, result, discussion = {}, {}, {}, {}, {}
    print("# papers: ", len(papers))
    for paper in papers:
        intro_titles = [sec.heading_proceed for sec in paper.get_introduction()]
        back_titles = [sec.heading_proceed for sec in paper.get_background()]
        methods_titles = [sec.heading_proceed for sec in paper.get_methods()]
        result_titles = [sec.heading_proceed for sec in paper.get_results()]
        discuss_titles = [sec.heading_proceed for sec in paper.get_discussion()]

        intro_word = __is_word_in_titles(intro_titles, ["introduct"])
        back_word = __is_word_in_titles(back_titles, ["relat work", "background"])
        methods_word = __is_word_in_titles(methods_titles, ["method", "approach", "model"])
        results_word = __is_word_in_titles(result_titles, ["result", "experi", "evalu"])
        discuss_word = __is_word_in_titles(discuss_titles, ["discuss", "conclus", "futur work"])

        if intro_word:
            introduction[intro_word] = introduction.get(intro_word, 0) + 1
        if back_word:
            background[back_word] = background.get(back_word, 0) + 1
        if methods_word:
            methods[methods_word] = methods.get(methods_word, 0) + 1
        if results_word:
            result[results_word] = result.get(results_word, 0) + 1
        if discuss_word:
            discussion[discuss_word] = discussion.get(discuss_word, 0) + 1

    print("introduction:")
    print_imrad(introduction, len(papers))
    print("related work:")
    print_imrad(background, len(papers))
    print("methods:")
    print_imrad(methods, len(papers))
    print("result:")
    print_imrad(result, len(papers))
    print("discussion:")
    print_imrad(discussion, len(papers))
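# __is_word_in_titles and print_imrad are helpers defined elsewhere in this
# module. Minimal sketches consistent with how they are used above; the exact
# behavior is an assumption: __is_word_in_titles presumably returns the first
# keyword that occurs as a substring of any title (or None), and print_imrad
# presumably prints each heading with its absolute and relative frequency.
def __is_word_in_titles(titles, words):
    for word in words:
        if any(word in title for title in titles):
            return word
    return None

def print_imrad(counts, num_papers):
    for heading, count in sorted(counts.items(), key=lambda kv: kv[1], reverse=True):
        print("  {}: {} ({:.1%})".format(heading, count, count / num_papers))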
def create_directed_graph():
    print('\ncreate directed graph')
    api = API()
    papers = api.get_all_paper()
    dg = nx.DiGraph()
    for paper in papers:
        references = [x.get_paper_id() for x in paper.references if x.get_paper_id()]
        for ref_id in references:
            dg.add_edge(str(paper.id), str(ref_id))

    # Data cleaning: five pairs of preprints in the dataset cite each other.
    # Remove one direction of each pair, otherwise the citation graph
    # contains cycles.
    dg.remove_edge('5c52a9b9bf51c50be97c5145', '5c529cbdbf51c5359dce35f3')
    dg.remove_edge('5b0565406919df52a704f32c', '5b05673b6919df52a704f375')
    dg.remove_edge('5b97b226bf51c561194d9f1f', '5b05682a6919df52a704f395')
    dg.remove_edge('5c52a4f9bf51c50be97c5111', '5c533345bf51c5335baca21a')
    dg.remove_edge('5b97b0aebf51c561194d9f09', '5b97b31ebf51c561194d9f2a')

    print("# nodes: ", dg.number_of_nodes())
    print("# edges: ", dg.number_of_edges())
    print("# cycles: ", len(list(nx.simple_cycles(dg))))
    print("# strongly connected components: ", len(list(nx.strongly_connected_components(dg))))
    print("DAG longest path: ", len(nx.dag_longest_path(dg)))

    in_degrees = []
    out_degrees = []
    root_nodes = []  # papers that cite no other paper in the dataset
    for node in dg.nodes:
        if len(dg.in_edges(node)) > 0:
            in_degrees.append(len(dg.in_edges(node)))
        if len(dg.out_edges(node)) > 0:
            out_degrees.append(len(dg.out_edges(node)))
        else:
            root_nodes.append(node)

    print("# root nodes: ", len(root_nodes))
    print("In Degree:")
    print(" max degree: ", max(in_degrees))
    print(" mean degree: ", round(mean(in_degrees), 4))
    print(" median degree: ", statistics.median(in_degrees))
    print("\nOut Degree:")
    print(" max degree: ", max(out_degrees))
    print(" mean degree: ", round(mean(out_degrees), 4))
    print(" median degree: ", statistics.median(out_degrees))

    create_degree_distribution(in_degrees, 'In-Degree Distribution', '#33691e', 20, 100, 10)
    create_degree_distribution(out_degrees, 'Out-Degree Distribution', '#e65100')
def admin_login():
    if request.method == 'GET':
        return redirect('admin/')
    api = API()
    if api.check_user_login(request.form['username'], request.form['password']):
        session['logged_in'] = True
        papers = api.get_all_paper()
        return render_template('admin/papers.html', papers=papers)
    return redirect('admin/')
def test_importance_to_sections(self):
    api = API()
    papers = api.get_all_paper()
    settings = {"mode": Mode.importance_to_sections}
    queries = paper_to_queries(papers[0], settings)
    self.assertEqual(len(queries["whole-document"]), 0)
    self.assertGreater(len(queries[IMRaDType.INTRODUCTION.name]), 0)
    self.assertGreater(len(queries[IMRaDType.BACKGROUND.name]), 0)
    self.assertGreater(len(queries[IMRaDType.METHODS.name]), 0)
    self.assertGreater(len(queries[IMRaDType.RESULTS.name]), 0)
    self.assertGreater(len(queries[IMRaDType.DISCUSSION.name]), 0)
def remove_link(paper_id):
    if not session.get('logged_in'):
        return redirect('admin/')
    api = API()
    api.remove__link_of_paper(paper_id, request.form['ref_paper_id'])
    papers = api.get_all_paper()
    id_to_filename = {paper.id: paper.filename for paper in papers}
    paper = api.get_paper(paper_id)
    return render_template('admin/paper_info.html', paper=paper, id_to_filename=id_to_filename)
def __link_references_to_paper():
    api = API()
    all_papers = api.get_all_paper()

    # Load (or initialize) the list of papers already processed, so the job
    # can be resumed after an interruption.
    finished_files = []
    if not os.path.isfile(REQ_DATA_PATH + "finished_papers.txt"):
        with open(REQ_DATA_PATH + "finished_papers.txt", 'wb') as fp:
            pickle.dump(finished_files, fp)
    with open(REQ_DATA_PATH + "finished_papers.txt", 'rb') as fp:
        finished_files = pickle.load(fp)

    # If newpapers.txt exists, only link the papers listed there; otherwise
    # link the whole corpus.
    if os.path.isfile("newpapers.txt"):
        papers = []
        with open("newpapers.txt", 'r') as fp:
            for paper_id in fp:
                papers.append(api.get_paper(paper_id.rstrip()))
    else:
        papers = api.get_all_paper()

    for i, paper in enumerate(papers, start=1):
        print("(", i, "/", len(papers), ")")
        if paper.id in finished_files:
            continue
        other_papers = [p for p in all_papers if p.id != paper.id]
        for other_paper in other_papers:
            if os.path.isfile("newpapers.txt"):
                link_references_to_paper(other_paper, paper, api)
            link_references_to_paper(paper, other_paper, api)
        finished_files.append(paper.id)
        with open(REQ_DATA_PATH + "finished_papers.txt", 'wb') as fp:
            pickle.dump(finished_files, fp)
def test_import_intro_search_back(self):
    api = API()
    papers = api.get_all_paper()
    settings = {
        "mode": Mode.areas,
        "input-area": Area.Introduction,
        "search-area": Area.Background
    }
    queries = paper_to_queries(papers[0], settings)
    self.assertEqual(len(queries["whole-document"]), 0)
    self.assertEqual(len(queries[IMRaDType.INTRODUCTION.name]), 0)
    self.assertGreater(len(queries[IMRaDType.BACKGROUND.name]), 0)
    self.assertEqual(len(queries[IMRaDType.METHODS.name]), 0)
    self.assertEqual(len(queries[IMRaDType.RESULTS.name]), 0)
    self.assertEqual(len(queries[IMRaDType.DISCUSSION.name]), 0)
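# Both test methods above belong to a unittest.TestCase subclass whose class
# definition is not part of this excerpt. Assuming a standard project layout,
# they can be run with the stock runner, e.g.:
#
#   python -m unittest discover tests
#
# (the "tests" directory name is an assumption about the repository layout).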
def evaluate_query_time(num_papers):
    api = API()
    papers = api.get_all_paper()[:num_papers]
    settings = TFIDF.get_default_config()
    settings["mode"] = Mode.importance_to_sections
    settings["use-unclassified-chapters"] = True
    all_elapsed_times = []
    for paper in papers:
        start = time.time()
        api.get_papers_with_paper(paper.filename, settings)
        end = time.time()
        elapsed_time = end - start  # duration of this query
        print("Elapsed time: ", elapsed_time)
        all_elapsed_times.append(elapsed_time)
    print("Overall time: ", sum(all_elapsed_times))
    print("Mean: ", mean(all_elapsed_times))
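# Design note: time.time() measures wall-clock time and can jump when the
# system clock is adjusted; time.perf_counter() is the idiomatic choice for
# measuring elapsed intervals. A minimal variant of the timing loop above:
#
#   start = time.perf_counter()
#   api.get_papers_with_paper(paper.filename, settings)
#   elapsed_time = time.perf_counter() - start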
def check_references():
    print("\nCheck References")
    api = API()
    papers = api.get_all_paper()
    for i, paper in enumerate(papers):
        progressBar(i, len(papers))
        other_papers = [p for p in papers if p.id != paper.id]
        for reference in paper.references:
            if not reference.get_paper_id():
                continue
            ref_paper = api.get_paper(reference.get_paper_id())
            if ref_paper.cited_by.count(paper.id) == 0:
                # inconsistent link: the referenced paper does not list this
                # paper in its cited_by; reset the link and repair it
                print()  # line break after the progress bar
                reference.paper_id = []
                api.client.update_paper(paper)
                repair_corrupt_reference(reference, paper, other_papers, api)
def check_cited_by():
    print("\nCheck Cited by")
    api = API()
    papers = api.get_all_paper()
    for i, paper in enumerate(papers):
        progressBar(i, len(papers))
        # iterate over a copy, since entries may be removed from the list
        for cited_paper_id in list(paper.cited_by):
            if not api.contains_paper(cited_paper_id):
                # the citing paper no longer exists; drop the stale entry
                paper.cited_by.remove(cited_paper_id)
                api.client.update_paper(paper)
                continue
            cited_paper = api.get_paper(cited_paper_id)
            cited_paper_refs = [ref.get_paper_id() for ref in cited_paper.references
                                if ref.get_paper_id()]
            if cited_paper_refs.count(paper.id) == 0:
                # inconsistent link: the citing paper does not actually
                # reference this paper; remove the entry and re-link
                print()  # line break after the progress bar
                paper.cited_by.remove(cited_paper_id)
                api.client.update_paper(paper)
                link_references_to_paper(cited_paper, paper, api)
print("Elapsed time: ", elapsed_time) def evaluate_query_time(num_papers): api = API() papers = api.get_all_paper()[:num_papers] settings = TFIDF.get_default_config() settings["mode"] = Mode.importance_to_sections settings["use-unclassified-chapters"] = True all_elapsed_times = [] for paper in papers: start = time.time() api.get_papers_with_paper(paper.filename, settings) end = time.time() elapsed_time = start - end print("Elapsed time: ", elapsed_time) all_elapsed_times.append(elapsed_time) print("Overall time: ", sum(all_elapsed_times)) print("Mean: ", mean(all_elapsed_times)) if __name__ == "__main__": # evaluate_query_time(10) api = API() papers = api.get_all_paper() evaluate_ranking_time(papers[0], RankedBoolean.get_default_config()) evaluate_ranking_time(papers[0], DivergenceFromRandomness.get_default_config())