def setup_class(cls):
    # Start from clean indices.
    Keywords.search().delete()
    Faculty.search().delete()
    sleep(3)

    will = Faculty(meta={"id": 379}, name="William.Allison",
                   full_name="Allison, William.", faculty_id=379,
                   email="*****@*****.**")
    will.save()
    will_keywords = Keywords(
        faculty_id=379,
        datasource="test",
        approach_id=0,
        keywords=["zebrafish", "evolutionary processes", "marine"])
    will_keywords.save()

    vince = Faculty(meta={"id": 356}, name="Vincent.Bouchard",
                    full_name="Bouchard, Vincent", faculty_id=356,
                    email="*****@*****.**")
    vince.save()
    vince_keywords = Keywords(
        faculty_id=356,
        datasource="test",
        approach_id=0,
        keywords=["string theory", "number theory", "mathematics"])
    vince_keywords.save()

    # Give elasticsearch time to index the new documents.
    sleep(3)
def test_create__success(self):
    link = 'http://www.researcherid.com/rid/A-2612-2014'
    rid = ResearchIdPageScrape()
    obj = Faculty(name="Test.Prof", faculty_id=110, email="*****@*****.**")
    obj.research_id = link
    res = rid.run(obj)
    print(res)
    assert res is not None
def test_create__success(self):
    link = 'https://scholar.google.ca/citations?user=KffJRdgAAAAJ&hl=en&oi=sra'
    ga = GoogleScholarPageScrape()
    obj = Faculty(name="Test.Prof", faculty_id=110, email="*****@*****.**")
    obj.google_scholar = link
    res = ga.run(obj)
    print(res)
    assert res is not None
def test_create__success(self):
    link = 'http://www.researcherid.com/rid/C-6729-2008'
    rid = ResearchIdPageScrape()
    obj = Faculty(name="Test.Prof", faculty_id=110, email="*****@*****.**")
    obj.research_id = link
    res = rid.is_requirement_satisfied(obj)
    assert res is True
    res = rid.run(obj)
    print(res)
    assert res is not None
def run(self, data): """Performs a scraping of a faculty members ResearchId page. :param data is a faculty object :return: last faculty member handled """ no_text_count = 0 for faculty in data: faculty_name = faculty.name search_results = Faculty.search().query( 'match', name=faculty_name).execute() if len(search_results) > 1: # Shouldn't happen, but could. raise WorkflowException( "Professor id is ambiguous during search ... More than 1 result" ) search_dup = Document.search().query( 'match', faculty_id=faculty.faculty_id).query("match", source="ResearchId") search_dup.delete() faculty = search_results[0] if faculty.research_id is not None: scraper = ScraperFactory.create_scraper( faculty.research_id, ScraperType.RESEARCHID) scrapps = scraper.get_scrapps() keywords_and_description = scrapps[0] titles = scrapps[1:] doc = Document() doc.faculty_id = faculty.faculty_id doc.source = "ResearchId" try: doc.text = keywords_and_description.meta_data[ "description"] except: print("No description") doc.text = "" try: doc.user_keywords = keywords_and_description.meta_data[ "keywords"] except: print("No keywords") doc.save() for scrapp in titles: doc = Document() doc.source = "ResearchId" doc.faculty_id = faculty.faculty_id doc.text = scrapp.title doc.save() else: no_text_count += 1 print("NO TEXT COUNT = ", no_text_count) return faculty
def run(self, data): """Performs a scraping of a faculty members GoogleScholar page. :param data is a faculty object :return: list of faculty members """ faculty = data if isinstance(faculty, str): search_results = Faculty.search().query( 'match', name=faculty_name).execute() if len(search_results) > 1: # Shouldn't happen, but could. raise WorkflowException( "Professor id is ambiguous during search ... More than 1 result" ) faculty = search_results[0] faculty_name = faculty.name Document.search().query('match', faculty_id=faculty.faculty_id) \ .query("match", source="GoogleScholar") \ .delete() if faculty.google_scholar is not None and "http" in faculty.google_scholar: scraper = ScraperFactory.create_scraper(faculty.google_scholar, ScraperType.GOOGLESCHOLAR) scrapps = scraper.get_scrapps() for scrapp in scrapps: doc = Document() doc.source = "GoogleScholar" doc.faculty_id = faculty.faculty_id doc.text = scrapp.title doc.save() return faculty
def create_grant(json_data, write=True):
    """Creates a grant Document from a JSON representation.

    :param dict json_data: Dictionary representation of the JSON data.
    :param bool write: Boolean switch that will enable writing to elastic.
    """
    schema = GrantSchema()
    try:
        grant = schema.load(json_data)
    except ValidationError as err:
        raise DataIngestionException(
            "Missing one of the required fields of the schema. {}".format(
                err.messages))
    # Need to find a faculty member with a matching name so we can build a new document.
    search_results = Faculty.search().query(
        'match', full_name=grant["faculty_name"]).execute()
    if len(search_results) < 1:
        return
    faculty = search_results[0]
    # TODO: There is no spot for titles in the document...
    grant_doc = Document(faculty_id=faculty.faculty_id,
                         source=grant["source"],
                         text=grant["text"])
    if write:
        grant_doc.save()
def add_name_search_results(faculty_with_keywords, pf_query):
    """Inserts the results of pf_query on the faculty index into faculty_with_keywords.

    If a faculty member is returned from the query but does not currently exist in
    faculty_with_keywords, the faculty member, along with their entire keyword set,
    is inserted into the dictionary (it is updated in place).

    :param faculty_with_keywords: Dictionary of faculty ids to keywords.
    :param pf_query: Postfix query created by the Query Builder.
    """
    # Add functionality of searching names in query.
    q_builder = builder.QueryBuilder()
    name_elastic_query = q_builder.build(pf_query, search_field="full_name")
    names_response = Faculty.search().query(name_elastic_query).execute()
    for faculty in names_response:
        # We already have the faculty member who was searched in the results.
        if faculty.faculty_id in faculty_with_keywords:
            continue
        faculty_keywords = Keywords.search() \
            .query('match', faculty_id=faculty.faculty_id).execute()
        faculty_with_keywords[faculty.faculty_id] = faculty_keywords
def run(self, data): """Performs a scraping of a faculty members GoogleScholar page. :param data is a faculty object :return: last faculty member handled """ no_text_count = 0 for faculty in data: faculty_name = faculty.name search_results = Faculty.search().query('match', name=faculty_name).execute() if len(search_results) > 1: # Shouldn't happen, but could. raise WorkflowException("Professor id is ambiguous during search ... More than 1 result") search_dup = Document.search().query('match', faculty_id=faculty.faculty_id).query("match", source="GoogleScholar") search_dup.delete() faculty = search_results[0] if faculty.google_scholar is not None and "http" in faculty.google_scholar: scraper = ScraperFactory.create_scraper(faculty.google_scholar, ScraperType.GOOGLESCHOLAR) scrapps = scraper.get_scrapps() for scrapp in scrapps: doc = Document() doc.source = "GoogleScholar" doc.faculty_id = faculty.faculty_id doc.text = scrapp.title doc.save() else: no_text_count += 1 print("NO TEXT COUNT = ", no_text_count) return faculty
def get(self): """HTTP Get that enables boolean query processing and search.""" query = request.args.get('query') if query is None: abort(400) q_parser = parser.QueryParser() q_builder = builder.QueryBuilder() pf_query = q_parser.parse_query(query) elastic_query = q_builder.build(pf_query) # response = Faculty.search().query(elastic_query).execute() response = Keywords.search().query(elastic_query).execute() faculty_with_keywords = set() for keywords in response: faculty_with_keywords.add(keywords.faculty_id) schema = FacultySchema() results = [ schema.dump(Faculty.safe_get(faculty_id)) for faculty_id in faculty_with_keywords ] return {"data": results}
def run(self, data): """Performs a scraping of a faculty members ResearchId page. :param data is a faculty object :return: last faculty member handled """ faculty = data if isinstance(faculty, str): search_results = Faculty.search().query('match', name=faculty).execute() if len(search_results) > 1: # Shouldn't happen, but could. raise WorkflowException( "Professor id is ambiguous during search ... More than 1 result" ) faculty = search_results[0] faculty_name = faculty.name Document.search().query('match', faculty_id=faculty.faculty_id) \ .query("match", source="ResearchId") \ .delete() print("Running researchid scrape on {}. Research id {}.".format( faculty_name, faculty.research_id)) if faculty.research_id is not None: scraper = ScraperFactory.create_scraper(faculty.research_id, ScraperType.RESEARCHID) scrapps = scraper.get_scrapps() keywords_and_description = scrapps[0] titles = scrapps[1:] doc = Document() doc.faculty_id = faculty.faculty_id doc.source = "ResearchId" try: doc.text = keywords_and_description.meta_data["description"] except: print("No description") doc.text = "" try: doc.user_keywords = keywords_and_description.meta_data[ "keywords"] except: print("No keywords") doc.save() for scrapp in titles: doc = Document() doc.source = "ResearchId" doc.faculty_id = faculty.faculty_id doc.text = scrapp.title doc.save() return faculty
def run(self, data): """ Searches through all results in elastic search :param data: str or Faculty instance. :return: all faculty """ s = Faculty.search() allFaculty = [faculty for faculty in s.scan()] return allFaculty
def run(self, data): """Updates a Faculty members information in Elasticsearch, based on the result of a scrape. :param data: list of tuples of form <str, Scrapp> :return: The updated instance of a Faculty model. """ faculty_name = data[0] scrapp = data[1] search_results = Faculty.search().query('match', name=faculty_name).execute() if len(search_results) > 1: # Shouldn't happen, but could. raise WorkflowException( "Faculty name is ambiguous during search... More than 1 result" ) faculty = search_results[0] Document.search().query('match', faculty_id=faculty.faculty_id) \ .query("match", source="Profile") \ .delete() if "orcid_link" in scrapp.meta_data: faculty.orc_id = scrapp.meta_data["orcid_link"] if "researchid_link" in scrapp.meta_data: faculty.research_id = scrapp.meta_data["researchid_link"] if "googlescholar_link" in scrapp.meta_data: faculty.google_scholar = scrapp.meta_data["googlescholar_link"] if "text" in scrapp.meta_data: doc_search = Document.search().query('match', faculty_id=faculty.faculty_id) \ .query('match', source = "profile") \ .execute() try: doc = doc_search[0] except IndexError: doc = Document() doc.faculty_id = faculty.faculty_id doc.source = "profile" doc.text = scrapp.meta_data["text"] doc.date = datetime.now() doc.save() faculty.save() return faculty
def test_create(self):
    prof = Faculty()
    prof.name = "name"
    prof.email = "*****@*****.**"
    prof.faculty_id = 1
    prof.department = "cs"
    assert prof.department == "cs"
def get(self, faculty_id):
    """HTTP Get for the faculty resource.

    Currently returns an HTML page, but should instead return the Faculty
    object as JSON.

    :param faculty_id: The id as it is in elasticsearch. This id is defined by
        the forum data dump.
    :return: HTTP 404 if the given id does not exist. HTTP 200 if the id exists
        and the GET operation succeeds.
    """
    faculty = Faculty.safe_get(faculty_id)
    if faculty is None:
        abort(404)
    return make_response(render_template("faculty.html", faculty=faculty),
                         200, {'content-type': 'text/html'})
def get(self): """HTTP Get for the faculty list resource. Returns a list of faculty members from elasticsearch. :param page: URL Parameter for the page to fetch. Default - 0. :param results: URL Parameter for the number of results to return per page. Default - 20. :return: """ search = Faculty.search() query, pagination_info = paginate_query(request, search) response = query.execute() schema = FacultySchema() results = [schema.dump(faculty) for faculty in response] return {"pagination": pagination_info, "data": results}
def post(self):
    """HTTP Post that starts a scraping workflow for a faculty member."""
    to_run = request.args.get('run')
    faculty = request.args.get('faculty')
    if not to_run or not faculty:
        abort(400)
    try:
        task_list = TASKLIST[to_run]
    except KeyError:
        abort(400)
    workflow = Workflow(task_list, Faculty.safe_get(faculty))
    run_workflow.apply_async((workflow,), countdown=1)
    return 200
def get(self): """HTTP Get that enables boolean query processing and search.""" query = request.args.get('query') if query is None: abort(400) q_parser = parser.QueryParser() q_builder = builder.QueryBuilder() pf_query = q_parser.parse_query(query) elastic_query = q_builder.build(pf_query) response = Faculty.search().query(elastic_query).execute() schema = FacultySchema() results = [schema.dump(faculty) for faculty in response] return {"data": results}
def get(self): """HTTP Get for the faculty list resource. Returns a list of faculty members from elasticsearch. :param page: URL Parameter for the page to fetch. Default - 0. :param results: URL Parameter for the number of results to return per page. Default - 20. :return: """ page = request.args.get("page", default=0, type=int) results = request.args.get("results", default=20, type=int) # Get the slice of data to retrieve first = page * results last = (page * results) + results search = Faculty.search() count = search.count() query = search[first:last] response = query.execute() schema = FacultySchema() results = [schema.dump(faculty) for faculty in response] has_previous = True if page > 0 else False has_next = True if last < count else False previous = page - 1 if has_previous else None next = page + 1 if has_next else None return { "pagination": { "has_previous": has_previous, "has_next": has_next, "previous_page": previous, "current_page": page, "next_page": next, }, "data": results }
def create_results(faculty_with_keywords, dept_filter):
    """Creates the JSON representation of each faculty member, including all keywords.

    :param faculty_with_keywords: A dictionary of ids to lists of keywords. The
        keywords are inserted into the faculty object before being dumped to JSON.
    :param dept_filter: List of string departments to be included in the results.
        If a professor does not belong to one of the departments, they are not
        included. All professors are included if the filter is empty.
    :returns: List of JSON objects, each representing a faculty member and keywords.
    """
    # Build JSON representations with nested keywords.
    schema = FacultySchema()
    results = []
    for faculty_id, keywords in faculty_with_keywords.items():
        faculty = Faculty.safe_get(faculty_id)
        if faculty is None or \
                (len(dept_filter) > 0 and faculty.department not in dept_filter):
            continue
        faculty.generated_keywords = keywords
        results.append(schema.dump(faculty))
    return results
        if isinstance(data, Faculty):
            return FacultyNames.validate_name(data.name)

    def run(self, data):
        """Performs a scraping of a faculty member's directory page.

        :param data: str or Faculty instance.
        :return: tuple of the input data and the Scrapp produced by scraping
            the faculty directory page.
        """
        print("Running {} on {}".format(self.task_name, data))
        if isinstance(data, str):
            faculty_name = data
        else:
            faculty_name = data.name
        faculty_directory_url = URLs.build_faculty_url(faculty_name)
        scraper = ScraperFactory.create_scraper(faculty_directory_url,
                                                ScraperType.PROFILE)
        scrapp = scraper.get_scrapps()[0]
        ret_data = (data, scrapp)
        return ret_data


if __name__ == "__main__":
    from elasticsearch_dsl import connections
    connections.create_connection()
    Faculty.init()
            # Remove any previously scraped GoogleScholar documents for this faculty member.
            search_dup = Document.search().query(
                'match', faculty_id=faculty.faculty_id).query(
                "match", source="GoogleScholar")
            search_dup.delete()
            faculty = search_results[0]
            if faculty.google_scholar is not None and "http" in faculty.google_scholar:
                scraper = ScraperFactory.create_scraper(faculty.google_scholar,
                                                        ScraperType.GOOGLESCHOLAR)
                scrapps = scraper.get_scrapps()
                for scrapp in scrapps:
                    doc = Document()
                    doc.source = "GoogleScholar"
                    doc.faculty_id = faculty.faculty_id
                    doc.text = scrapp.title
                    doc.save()
            else:
                no_text_count += 1
        print("NO TEXT COUNT = ", no_text_count)
        return faculty


if __name__ == "__main__":
    from elasticsearch_dsl import connections
    connections.create_connection()
    Faculty.init()
    Document.init()
    search = Faculty.search()
    all_faculty = [faculty for faculty in search.scan()]
    task = GoogleScholarPageScrape()
    task.run(all_faculty)
def teardown_class(cls):
    Faculty.get(id=379).delete()
    Faculty.get(id=356).delete()
    Keywords.search().query('match', faculty_id=379).delete()
    Keywords.search().query('match', faculty_id=356).delete()