def run(self, data):
    """Scrape each faculty member's GoogleScholar page into Documents.

    For every faculty member in *data*, look up the canonical Faculty
    record in Elasticsearch, purge previously scraped GoogleScholar
    documents, then save one Document per scraped title.

    :param data: iterable of faculty objects (each with ``name`` and
        ``faculty_id`` attributes).
    :return: the last faculty member handled.
    :raises WorkflowException: if a faculty name matches zero or more
        than one Faculty record in Elasticsearch.
    """
    no_text_count = 0
    for faculty in data:
        faculty_name = faculty.name
        search_results = Faculty.search().query(
            'match', name=faculty_name).execute()
        if len(search_results) > 1:
            # Shouldn't happen, but could.
            raise WorkflowException(
                "Professor id is ambiguous during search ... More than 1 result")
        if not search_results:
            # Guard: indexing search_results[0] below would otherwise
            # raise a bare IndexError.
            raise WorkflowException(
                "No faculty found during search for name: {}".format(
                    faculty_name))
        # Purge any previously scraped GoogleScholar docs for this member
        # so the scrape below fully replaces them.
        search_dup = Document.search().query(
            'match', faculty_id=faculty.faculty_id).query(
                "match", source="GoogleScholar")
        search_dup.delete()
        faculty = search_results[0]
        if faculty.google_scholar is not None and "http" in faculty.google_scholar:
            scraper = ScraperFactory.create_scraper(
                faculty.google_scholar, ScraperType.GOOGLESCHOLAR)
            scrapps = scraper.get_scrapps()
            for scrapp in scrapps:
                doc = Document()
                doc.source = "GoogleScholar"
                doc.faculty_id = faculty.faculty_id
                doc.text = scrapp.title
                doc.save()
        else:
            # No usable GoogleScholar link for this member.
            no_text_count += 1
    print("NO TEXT COUNT = ", no_text_count)
    return faculty
def run(self, data):
    """Scrape each faculty member's ResearchId page into Documents.

    For every faculty member in *data*, look up the canonical Faculty
    record, purge previously scraped ResearchId documents, then save
    one Document holding the description/keywords plus one Document
    per publication title.

    :param data: iterable of faculty objects (each with ``name`` and
        ``faculty_id`` attributes).
    :return: the last faculty member handled.
    :raises WorkflowException: if a faculty name matches zero or more
        than one Faculty record in Elasticsearch.
    """
    no_text_count = 0
    for faculty in data:
        faculty_name = faculty.name
        search_results = Faculty.search().query(
            'match', name=faculty_name).execute()
        if len(search_results) > 1:
            # Shouldn't happen, but could.
            raise WorkflowException(
                "Professor id is ambiguous during search ... More than 1 result"
            )
        if not search_results:
            # Guard: indexing search_results[0] below would otherwise
            # raise a bare IndexError.
            raise WorkflowException(
                "No faculty found during search for name: {}".format(
                    faculty_name))
        # Purge any previously scraped ResearchId docs for this member
        # so the scrape below fully replaces them.
        search_dup = Document.search().query(
            'match', faculty_id=faculty.faculty_id).query(
                "match", source="ResearchId")
        search_dup.delete()
        faculty = search_results[0]
        if faculty.research_id is not None:
            scraper = ScraperFactory.create_scraper(
                faculty.research_id, ScraperType.RESEARCHID)
            scrapps = scraper.get_scrapps()
            # By scraper contract the first scrapp carries the profile
            # metadata; the rest are publication titles.
            keywords_and_description = scrapps[0]
            titles = scrapps[1:]
            doc = Document()
            doc.faculty_id = faculty.faculty_id
            doc.source = "ResearchId"
            # Narrowed from a bare except: only the dict lookup's
            # KeyError indicates a missing field.
            try:
                doc.text = keywords_and_description.meta_data[
                    "description"]
            except KeyError:
                print("No description")
                doc.text = ""
            try:
                doc.user_keywords = keywords_and_description.meta_data[
                    "keywords"]
            except KeyError:
                print("No keywords")
            doc.save()
            for scrapp in titles:
                doc = Document()
                doc.source = "ResearchId"
                doc.faculty_id = faculty.faculty_id
                doc.text = scrapp.title
                doc.save()
        else:
            # No ResearchId link for this member.
            no_text_count += 1
    print("NO TEXT COUNT = ", no_text_count)
    return faculty
def run(self, data):
    """Scrape a single faculty member's GoogleScholar page into Documents.

    Accepts either a faculty object or a faculty name string; a string
    is resolved to a Faculty record via Elasticsearch first.

    :param data: a faculty object, or a faculty name as ``str``.
    :return: the faculty member handled.
    :raises WorkflowException: if a name string matches zero or more
        than one Faculty record in Elasticsearch.
    """
    faculty = data
    if isinstance(faculty, str):
        # BUG FIX: the original queried with `faculty_name`, which is
        # not assigned until after this branch (NameError); the string
        # input itself is the name to search for.
        search_results = Faculty.search().query(
            'match', name=faculty).execute()
        if len(search_results) > 1:
            # Shouldn't happen, but could.
            raise WorkflowException(
                "Professor id is ambiguous during search ... More than 1 result"
            )
        if not search_results:
            # Guard: indexing search_results[0] below would otherwise
            # raise a bare IndexError.
            raise WorkflowException(
                "No faculty found during search for name: {}".format(faculty))
        faculty = search_results[0]
    faculty_name = faculty.name
    # Purge previously scraped GoogleScholar docs so this scrape fully
    # replaces them.
    Document.search().query('match', faculty_id=faculty.faculty_id) \
        .query("match", source="GoogleScholar") \
        .delete()
    if faculty.google_scholar is not None and "http" in faculty.google_scholar:
        scraper = ScraperFactory.create_scraper(faculty.google_scholar,
                                                ScraperType.GOOGLESCHOLAR)
        scrapps = scraper.get_scrapps()
        for scrapp in scrapps:
            doc = Document()
            doc.source = "GoogleScholar"
            doc.faculty_id = faculty.faculty_id
            doc.text = scrapp.title
            doc.save()
    return faculty
def create_grant(json_data, write=True):
    """Create a grant Document from a JSON representation.

    Validates *json_data* against :class:`GrantSchema`, resolves the
    owning faculty member by full name, and builds a Document holding
    the grant text. (The original docstring incorrectly claimed this
    creates a Faculty instance.)

    :param dict json_data: Dictionary representation of the JSON data.
    :param bool write: Boolean switch that will enable writing to elastic.
    :raises DataIngestionException: if required schema fields are missing.
    """
    schema = GrantSchema()
    try:
        grant = schema.load(json_data)
    except ValidationError as err:
        raise DataIngestionException(
            "Missing one of the required fields of the schema. {}".format(
                err.messages))
    # Need to find a faculty with matching name so we can build a new document
    search_results = Faculty.search().query(
        'match', full_name=grant["faculty_name"]).execute()
    if len(search_results) < 1:
        # Best-effort: silently skip grants whose faculty member is not
        # indexed yet.
        return
    faculty = search_results[0]
    # TODO: There is no spot for titles in the document...
    grant_doc = Document(faculty_id=faculty.faculty_id,
                         source=grant["source"],
                         text=grant["text"])
    if write:
        grant_doc.save()
def run(self, data):
    """Scrape a single faculty member's ResearchId page into Documents.

    Accepts either a faculty object or a faculty name string; a string
    is resolved to a Faculty record via Elasticsearch first.

    :param data: a faculty object, or a faculty name as ``str``.
    :return: the faculty member handled.
    :raises WorkflowException: if a name string matches zero or more
        than one Faculty record in Elasticsearch.
    """
    faculty = data
    if isinstance(faculty, str):
        search_results = Faculty.search().query('match',
                                                name=faculty).execute()
        if len(search_results) > 1:
            # Shouldn't happen, but could.
            raise WorkflowException(
                "Professor id is ambiguous during search ... More than 1 result"
            )
        if not search_results:
            # Guard: indexing search_results[0] below would otherwise
            # raise a bare IndexError.
            raise WorkflowException(
                "No faculty found during search for name: {}".format(faculty))
        faculty = search_results[0]
    faculty_name = faculty.name
    # Purge previously scraped ResearchId docs so this scrape fully
    # replaces them.
    Document.search().query('match', faculty_id=faculty.faculty_id) \
        .query("match", source="ResearchId") \
        .delete()
    print("Running researchid scrape on {}. Research id {}.".format(
        faculty_name, faculty.research_id))
    if faculty.research_id is not None:
        scraper = ScraperFactory.create_scraper(faculty.research_id,
                                                ScraperType.RESEARCHID)
        scrapps = scraper.get_scrapps()
        # By scraper contract the first scrapp carries the profile
        # metadata; the rest are publication titles.
        keywords_and_description = scrapps[0]
        titles = scrapps[1:]
        doc = Document()
        doc.faculty_id = faculty.faculty_id
        doc.source = "ResearchId"
        # Narrowed from a bare except: only the dict lookup's KeyError
        # indicates a missing field.
        try:
            doc.text = keywords_and_description.meta_data["description"]
        except KeyError:
            print("No description")
            doc.text = ""
        try:
            doc.user_keywords = keywords_and_description.meta_data["keywords"]
        except KeyError:
            print("No keywords")
        doc.save()
        for scrapp in titles:
            doc = Document()
            doc.source = "ResearchId"
            doc.faculty_id = faculty.faculty_id
            doc.text = scrapp.title
            doc.save()
    return faculty
def run(self, data):
    """Update a Faculty member's information in Elasticsearch from a scrape.

    Copies any ORCID / ResearchId / GoogleScholar links found in the
    scrapp's metadata onto the Faculty record, and stores the scraped
    profile text as a Document.

    :param data: tuple of form ``(faculty_name: str, scrapp: Scrapp)``.
    :return: The updated instance of a Faculty model.
    :raises WorkflowException: if the faculty name matches zero or more
        than one Faculty record in Elasticsearch.
    """
    faculty_name = data[0]
    scrapp = data[1]
    search_results = Faculty.search().query('match',
                                            name=faculty_name).execute()
    if len(search_results) > 1:
        # Shouldn't happen, but could.
        raise WorkflowException(
            "Faculty name is ambiguous during search... More than 1 result"
        )
    if not search_results:
        # Guard: indexing search_results[0] below would otherwise raise
        # a bare IndexError.
        raise WorkflowException(
            "No faculty found during search for name: {}".format(faculty_name))
    faculty = search_results[0]
    # NOTE(review): this delete targets source "Profile" while the save
    # below writes source "profile" — a match query lowercases both so
    # they collide in practice, but the casing should be unified.
    # Also, the doc_search below runs *after* this delete, so reusing an
    # existing document is only possible if the deletion has not yet
    # been refreshed — confirm whether the delete is intended.
    Document.search().query('match', faculty_id=faculty.faculty_id) \
        .query("match", source="Profile") \
        .delete()
    if "orcid_link" in scrapp.meta_data:
        faculty.orc_id = scrapp.meta_data["orcid_link"]
    if "researchid_link" in scrapp.meta_data:
        faculty.research_id = scrapp.meta_data["researchid_link"]
    if "googlescholar_link" in scrapp.meta_data:
        faculty.google_scholar = scrapp.meta_data["googlescholar_link"]
    if "text" in scrapp.meta_data:
        doc_search = Document.search() \
            .query('match', faculty_id=faculty.faculty_id) \
            .query('match', source="profile") \
            .execute()
        # Reuse an existing profile document if one survives, else
        # create a fresh one (EAFP on the empty result set).
        try:
            doc = doc_search[0]
        except IndexError:
            doc = Document()
            doc.faculty_id = faculty.faculty_id
            doc.source = "profile"
        doc.text = scrapp.meta_data["text"]
        doc.date = datetime.now()
        doc.save()
    faculty.save()
    return faculty