def run(self, data):
    """Performs a scraping of a faculty member's GoogleScholar page.

    :param data: a Faculty object, or a faculty name as a string
    :return: the faculty member handled
    """
    faculty = data
    if isinstance(faculty, str):
        search_results = Faculty.search().query(
            'match', name=faculty).execute()
        if len(search_results) > 1:
            # Shouldn't happen, but could.
            raise WorkflowException(
                "Professor id is ambiguous during search ... More than 1 result"
            )
        faculty = search_results[0]

    # Remove any previously scraped GoogleScholar documents for this
    # faculty member before re-scraping (delete-by-query).
    Document.search().query('match', faculty_id=faculty.faculty_id) \
        .query("match", source="GoogleScholar") \
        .delete()

    if faculty.google_scholar is not None and "http" in faculty.google_scholar:
        scraper = ScraperFactory.create_scraper(faculty.google_scholar,
                                                ScraperType.GOOGLESCHOLAR)
        scrapps = scraper.get_scrapps()
        for scrapp in scrapps:
            doc = Document()
            doc.source = "GoogleScholar"
            doc.faculty_id = faculty.faculty_id
            doc.text = scrapp.title
            doc.save()

    return faculty
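
# A minimal sketch of the scraper-side objects these tasks assume. The real
# ScraperFactory, ScraperType, and Scrapp are defined elsewhere in this repo;
# the fields below are inferred only from how scrapps are consumed here
# (scrapp.title and scrapp.meta_data), so treat them as illustrative.
from dataclasses import dataclass, field


@dataclass
class ScrappSketch:
    """Illustrative stand-in for the objects get_scrapps() returns."""
    title: str = ""                                # a single publication title
    meta_data: dict = field(default_factory=dict)  # e.g. description/keywords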
def run(self, data):
    """Performs a scraping of a faculty member's ResearchId page.

    :param data: a Faculty object, or a faculty name as a string
    :return: the faculty member handled
    """
    faculty = data
    if isinstance(faculty, str):
        search_results = Faculty.search().query('match', name=faculty).execute()
        if len(search_results) > 1:
            # Shouldn't happen, but could.
            raise WorkflowException(
                "Professor id is ambiguous during search ... More than 1 result"
            )
        faculty = search_results[0]
    faculty_name = faculty.name

    # Remove any previously scraped ResearchId documents for this
    # faculty member before re-scraping (delete-by-query).
    Document.search().query('match', faculty_id=faculty.faculty_id) \
        .query("match", source="ResearchId") \
        .delete()

    print("Running researchid scrape on {}. Research id {}.".format(
        faculty_name, faculty.research_id))

    if faculty.research_id is not None:
        scraper = ScraperFactory.create_scraper(faculty.research_id,
                                                ScraperType.RESEARCHID)
        scrapps = scraper.get_scrapps()

        # The first scrapp carries the profile's keywords and description;
        # the rest are publication titles.
        keywords_and_description = scrapps[0]
        titles = scrapps[1:]

        doc = Document()
        doc.faculty_id = faculty.faculty_id
        doc.source = "ResearchId"
        try:
            doc.text = keywords_and_description.meta_data["description"]
        except KeyError:
            print("No description")
            doc.text = ""
        try:
            doc.user_keywords = keywords_and_description.meta_data["keywords"]
        except KeyError:
            print("No keywords")
        doc.save()

        for scrapp in titles:
            doc = Document()
            doc.source = "ResearchId"
            doc.faculty_id = faculty.faculty_id
            doc.text = scrapp.title
            doc.save()

    return faculty
def run(self, data):
    """Updates a faculty member's information in Elasticsearch, based on
    the result of a scrape.

    :param data: tuple of the form (faculty_name: str, scrapp: Scrapp)
    :return: the updated instance of a Faculty model
    """
    faculty_name = data[0]
    scrapp = data[1]

    search_results = Faculty.search().query('match', name=faculty_name).execute()
    if len(search_results) > 1:
        # Shouldn't happen, but could.
        raise WorkflowException(
            "Faculty name is ambiguous during search ... More than 1 result"
        )
    faculty = search_results[0]

    # Remove any previously scraped profile documents for this faculty
    # member before updating (delete-by-query).
    Document.search().query('match', faculty_id=faculty.faculty_id) \
        .query("match", source="Profile") \
        .delete()

    if "orcid_link" in scrapp.meta_data:
        faculty.orc_id = scrapp.meta_data["orcid_link"]
    if "researchid_link" in scrapp.meta_data:
        faculty.research_id = scrapp.meta_data["researchid_link"]
    if "googlescholar_link" in scrapp.meta_data:
        faculty.google_scholar = scrapp.meta_data["googlescholar_link"]

    if "text" in scrapp.meta_data:
        # Reuse the existing profile document if one survives, otherwise
        # create a new one. The source name matches the delete above.
        doc_search = Document.search().query('match', faculty_id=faculty.faculty_id) \
            .query('match', source="Profile") \
            .execute()
        try:
            doc = doc_search[0]
        except IndexError:
            doc = Document()
        doc.faculty_id = faculty.faculty_id
        doc.source = "Profile"
        doc.text = scrapp.meta_data["text"]
        doc.date = datetime.now()
        doc.save()

    faculty.save()
    return faculty
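
# For illustration only: the shape of the (faculty_name, scrapp) input this
# task consumes. A SimpleNamespace stands in for the real Scrapp, and every
# concrete value below is made up.
from types import SimpleNamespace

EXAMPLE_PROFILE_INPUT = (
    "Jane Doe",
    SimpleNamespace(meta_data={
        "orcid_link": "https://orcid.org/0000-0000-0000-0000",
        "researchid_link": "https://www.researcherid.com/rid/A-0000-0000",
        "googlescholar_link": "https://scholar.google.com/citations?user=EXAMPLE",
        "text": "Short biography scraped from the faculty profile page.",
    }),
)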
def run(self, data):
    """Performs a scraping of faculty members' GoogleScholar pages.

    :param data: iterable of Faculty objects
    :return: last faculty member handled
    """
    no_text_count = 0
    for faculty in data:
        faculty_name = faculty.name
        search_results = Faculty.search().query('match', name=faculty_name).execute()
        if len(search_results) > 1:
            # Shouldn't happen, but could.
            raise WorkflowException(
                "Professor id is ambiguous during search ... More than 1 result"
            )
        faculty = search_results[0]

        # Remove any previously scraped GoogleScholar documents for this
        # faculty member before re-scraping (delete-by-query).
        Document.search().query('match', faculty_id=faculty.faculty_id) \
            .query("match", source="GoogleScholar") \
            .delete()

        if faculty.google_scholar is not None and "http" in faculty.google_scholar:
            scraper = ScraperFactory.create_scraper(faculty.google_scholar,
                                                    ScraperType.GOOGLESCHOLAR)
            scrapps = scraper.get_scrapps()
            for scrapp in scrapps:
                doc = Document()
                doc.source = "GoogleScholar"
                doc.faculty_id = faculty.faculty_id
                doc.text = scrapp.title
                doc.save()
        else:
            no_text_count += 1

    print("NO TEXT COUNT = ", no_text_count)
    return faculty
def run(self, data):
    """Performs a scraping of faculty members' ResearchId pages.

    :param data: iterable of Faculty objects
    :return: last faculty member handled
    """
    no_text_count = 0
    for faculty in data:
        faculty_name = faculty.name
        search_results = Faculty.search().query(
            'match', name=faculty_name).execute()
        if len(search_results) > 1:
            # Shouldn't happen, but could.
            raise WorkflowException(
                "Professor id is ambiguous during search ... More than 1 result"
            )
        faculty = search_results[0]

        # Remove any previously scraped ResearchId documents for this
        # faculty member before re-scraping (delete-by-query).
        Document.search().query('match', faculty_id=faculty.faculty_id) \
            .query("match", source="ResearchId") \
            .delete()

        if faculty.research_id is not None:
            scraper = ScraperFactory.create_scraper(faculty.research_id,
                                                    ScraperType.RESEARCHID)
            scrapps = scraper.get_scrapps()

            # The first scrapp carries the profile's keywords and
            # description; the rest are publication titles.
            keywords_and_description = scrapps[0]
            titles = scrapps[1:]

            doc = Document()
            doc.faculty_id = faculty.faculty_id
            doc.source = "ResearchId"
            try:
                doc.text = keywords_and_description.meta_data["description"]
            except KeyError:
                print("No description")
                doc.text = ""
            try:
                doc.user_keywords = keywords_and_description.meta_data["keywords"]
            except KeyError:
                print("No keywords")
            doc.save()

            for scrapp in titles:
                doc = Document()
                doc.source = "ResearchId"
                doc.faculty_id = faculty.faculty_id
                doc.text = scrapp.title
                doc.save()
        else:
            no_text_count += 1

    print("NO TEXT COUNT = ", no_text_count)
    return faculty
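
# The tasks above repeat the same resolve-then-purge sequence. A possible
# shared helper, sketched here under the Faculty/Document/WorkflowException
# names already used in this module (not part of the current code):
def _resolve_faculty_and_purge(faculty_name, source):
    """Look up a faculty member by name and delete their existing
    documents from the given source, returning the Faculty hit."""
    search_results = Faculty.search().query('match', name=faculty_name).execute()
    if len(search_results) > 1:
        # Shouldn't happen, but could.
        raise WorkflowException(
            "Professor id is ambiguous during search ... More than 1 result"
        )
    faculty = search_results[0]
    Document.search().query('match', faculty_id=faculty.faculty_id) \
        .query('match', source=source) \
        .delete()
    return faculty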
def get(self):
    """HTTP GET for the document list resource.

    Returns a paginated list of documents from Elasticsearch.

    :param page: URL parameter for the page to fetch. Default - 0.
    :param results: URL parameter for the number of results to return
        per page. Default - 20.
    :param id: URL parameter to filter the results by faculty id.
    :param source: URL parameter to filter the results by document source.
    :return: a dict with pagination metadata and the page of documents
    """
    page = request.args.get("page", default=0, type=int)
    results_per_page = request.args.get("results", default=20, type=int)
    faculty_id = request.args.get("id", type=int)
    source = request.args.get("source", type=str)

    # Compute the slice of data to retrieve.
    first = page * results_per_page
    last = first + results_per_page

    search = Document.search()

    # Apply filters based on id and source, if given.
    if faculty_id is not None:
        search = search.filter('match', faculty_id=faculty_id)
    if source is not None:
        search = search.filter('match', source=source)

    count = search.count()
    response = search[first:last].execute()

    schema = DocumentSchema()
    results = [schema.dump(document) for document in response]

    has_previous = page > 0
    has_next = last < count
    previous_page = page - 1 if has_previous else None
    next_page = page + 1 if has_next else None

    return {
        "pagination": {
            "has_previous": has_previous,
            "has_next": has_next,
            "previous_page": previous_page,
            "current_page": page,
            "next_page": next_page,
        },
        "data": results
    }
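
# Example request/response for this resource. The /documents route is an
# assumption for illustration; the response shape follows the return value
# above.
#
#   GET /documents?page=1&results=20&id=42&source=GoogleScholar
#
#   {
#       "pagination": {
#           "has_previous": true,
#           "has_next": false,
#           "previous_page": 0,
#           "current_page": 1,
#           "next_page": null
#       },
#       "data": [ ... one DocumentSchema dump per matching document ... ]
#   }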