def run(self, data):
    """Updates a faculty member's information in Elasticsearch, based on the result of a profile scrape.

    :param data: tuple of (faculty name, Scrapp)
    :return: The updated instance of a Faculty model.
    """
    faculty_name = data[0]
    scrapp = data[1]

    search_results = Faculty.search().query('match', name=faculty_name).execute()
    if len(search_results) > 1:
        # Shouldn't happen, but could.
        raise WorkflowException(
            "Faculty name is ambiguous during search... More than 1 result"
        )

    faculty = search_results[0]

    # Remove previously stored profile documents for this faculty member.
    Document.search().query('match', faculty_id=faculty.faculty_id) \
        .query('match', source="profile") \
        .delete()

    if "orcid_link" in scrapp.meta_data:
        faculty.orc_id = scrapp.meta_data["orcid_link"]

    if "researchid_link" in scrapp.meta_data:
        faculty.research_id = scrapp.meta_data["researchid_link"]

    if "googlescholar_link" in scrapp.meta_data:
        faculty.google_scholar = scrapp.meta_data["googlescholar_link"]

    if "text" in scrapp.meta_data:
        # Reuse an existing profile document if one is still present,
        # otherwise create a new one.
        doc_search = Document.search().query('match', faculty_id=faculty.faculty_id) \
            .query('match', source="profile") \
            .execute()
        try:
            doc = doc_search[0]
        except IndexError:
            doc = Document()

        doc.faculty_id = faculty.faculty_id
        doc.source = "profile"
        doc.text = scrapp.meta_data["text"]
        doc.date = datetime.now()
        doc.save()

    faculty.save()
    return faculty
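# A minimal usage sketch for the profile-update task above (not part of the
# original code).  The task class name ``ProfileUpdateTask`` and the way the
# ``Scrapp`` is constructed here are assumptions; the real workflow builds the
# (name, Scrapp) tuple from an earlier scraping step.
#
#     scrapp = Scrapp()  # construction details are an assumption
#     scrapp.meta_data["orcid_link"] = "https://orcid.org/0000-0000-0000-0000"
#     scrapp.meta_data["text"] = "Research-profile text scraped from the page."
#     updated = ProfileUpdateTask().run(("Jane Doe", scrapp))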
def run(self, data):
    """Performs a scraping of a faculty member's Google Scholar page.

    :param data: a Faculty object (or a faculty name string)
    :return: the faculty member that was handled
    """
    faculty = data

    if isinstance(faculty, str):
        # Allow a name to be passed instead of a Faculty instance.
        search_results = Faculty.search().query('match', name=faculty).execute()
        if len(search_results) > 1:
            # Shouldn't happen, but could.
            raise WorkflowException(
                "Professor id is ambiguous during search ... More than 1 result"
            )
        faculty = search_results[0]

    faculty_name = faculty.name

    # Remove previously scraped Google Scholar documents for this faculty member.
    Document.search().query('match', faculty_id=faculty.faculty_id) \
        .query('match', source="GoogleScholar") \
        .delete()

    if faculty.google_scholar is not None and "http" in faculty.google_scholar:
        scraper = ScraperFactory.create_scraper(faculty.google_scholar,
                                                ScraperType.GOOGLESCHOLAR)
        try:
            scrapps = scraper.get_scrapps()
        except ScraperException:
            return faculty

        # Store each scraped publication title as its own document.
        for scrapp in scrapps:
            doc = Document()
            doc.source = "GoogleScholar"
            doc.faculty_id = faculty.faculty_id
            doc.text = scrapp.title
            doc.date = datetime.now()
            doc.save()

    return faculty
def run(self, data):
    """Performs a scraping of a faculty member's ResearchId page.

    :param data: a Faculty object (or a faculty name string)
    :return: the faculty member that was handled
    """
    faculty = data

    if isinstance(faculty, str):
        # Allow a name to be passed instead of a Faculty instance.
        search_results = Faculty.search().query('match', name=faculty).execute()
        if len(search_results) > 1:
            # Shouldn't happen, but could.
            raise WorkflowException(
                "Professor id is ambiguous during search ... More than 1 result"
            )
        faculty = search_results[0]

    faculty_name = faculty.name

    # Remove previously scraped ResearchId documents and keywords for this faculty member.
    Document.search().query('match', faculty_id=faculty.faculty_id) \
        .query('match', source="ResearchId") \
        .delete()

    Keywords.search().query('match', faculty_id=faculty.faculty_id) \
        .query('match', approach_id="4") \
        .delete()

    print("Running researchid scrape on {}. Research id {}.".format(
        faculty_name, faculty.research_id))

    if faculty.research_id is not None:
        scraper = ScraperFactory.create_scraper(faculty.research_id,
                                                ScraperType.RESEARCHID)
        try:
            scrapps = scraper.get_scrapps()
        except ScraperException:
            return faculty

        # The first scrapp carries the profile's keywords and description;
        # the rest are publication titles or abstracts.
        keywords_and_description = scrapps[0]
        titles = scrapps[1:]

        doc = Document()
        doc.faculty_id = faculty.faculty_id
        doc.source = "ResearchId"

        keywords = Keywords()
        keywords.faculty_id = faculty.faculty_id
        keywords.datasource = "user_keywords"
        keywords.approach_id = "4"

        try:
            doc.text = keywords_and_description.meta_data["description"]
        except KeyError:
            print("No description")
            doc.text = ""

        try:
            doc.user_keywords = keywords_and_description.meta_data["keywords"]
            keywords.keywords = keywords_and_description.meta_data["keywords"]
        except KeyError:
            print("No keywords")

        doc.date = datetime.now()
        doc.save()
        keywords.save()

        for scrapp in titles:
            doc = Document()
            doc.faculty_id = faculty.faculty_id
            if scrapp.data_source == ScraperType.RESEARCHID:
                doc.source = "ResearchId"
                doc.text = scrapp.title
            else:
                doc.source = "ResearchIdAbstract"
                doc.text = scrapp.meta_data["text"]
            doc.date = datetime.now()
            doc.save()

    return faculty
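# Design note (not part of the original code): the name-based Faculty lookup
# and the per-source Document cleanup are repeated in all three tasks above,
# so they could be factored into shared helpers.  A sketch, assuming
# module-level helpers and the same imports as this module; the helper names
# are assumptions.
#
#     def find_unique_faculty(name):
#         """Return the single Faculty matching ``name``; raise if ambiguous."""
#         results = Faculty.search().query('match', name=name).execute()
#         if len(results) > 1:
#             raise WorkflowException(
#                 "Faculty name is ambiguous during search... More than 1 result"
#             )
#         return results[0]
#
#     def delete_documents_for(faculty_id, source):
#         """Delete previously scraped Documents for ``faculty_id`` from ``source``."""
#         Document.search().query('match', faculty_id=faculty_id) \
#             .query('match', source=source) \
#             .delete()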