def getUniversitiesID():
    global university_names
    global universities_id_scopus
    global universities_names_scopus
    bf.getElementByXpath(AFFILIATIONS_TAB_XPATH).click()
    for name in university_names:
        bf.getElementByXpath(AFFILIATION_INPUT_XPATH).click()
        bf.driver.find_element_by_xpath(AFFILIATION_INPUT_XPATH).clear()
        # Search without the trailing acronym, e.g. "King Abdulaziz University (KAU)" -> "King Abdulaziz University"
        name_search = name.split(" (")[0]
        bf.driver.find_element_by_xpath(AFFILIATION_INPUT_XPATH).send_keys(name_search)
        time.sleep(2)
        bf.getElementByXpath(SEARCH_AFFILIATION_BUTTON_XPATH).click()
        try:
            # Click on the first university link in the results table
            bf.getElementByXpath("/html/body/div[1]/div/div[1]/div[1]/div/div[3]/form/div[4]/div[2]/div/div/div[3]/table/tbody/tr/td[1]/div/div[1]/span/a").click()
            university_id = WebDriverWait(bf.driver, 5).until(
                EC.element_to_be_clickable((By.CLASS_NAME, "affId"))).text
            print(university_id)
            university_id = university_id.replace("Affiliation ID: ", "")
            universities_id_scopus[str(name)] = str(university_id)
            university_name = WebDriverWait(bf.driver, 5).until(
                EC.element_to_be_clickable((By.CLASS_NAME, "wordBreakWord"))).text
            universities_names_scopus[str(name)] = str(university_name)
        except Exception:
            print("University name not found: " + name)
            universities_id_scopus[str(name)] = None
            universities_names_scopus[str(name)] = None
        finally:
            # Return to the affiliation search page via the header link
            bf.getElementByXpath("/html/body/div[1]/div/div[1]/header/div[2]/a/span").click()
    return universities_id_scopus
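# getUniversitiesID relies on the bf.getElementByXpath helper defined in the base
# functions module, which is not shown here. A minimal sketch of such a helper,
# assuming it wraps Selenium's explicit waits the same way the WebDriverWait calls
# above do (hypothetical illustration, not the actual bf implementation):
def _example_get_element_by_xpath(driver, xpath, timeout=10):
    # Block until the element is clickable, then return it; raises TimeoutException otherwise
    return WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.XPATH, xpath)))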
def getDataBetweenTwoYears(startYear, endYear):
    base_functions.openBrowser()
    base_functions.changeURL(qs_arab_url)
    time.sleep(5)
    base_functions.getElementByXpath(ACCEPT_BUTTON_XPATH).click()
    for year in range(startYear, endYear + 1):
        data = scrapeData(qs_arab_url + str(year))
        base_functions.copyToFile(data, base_functions.results_folder_name + str(year))
    base_functions.closeBrowser()
def getCitaions(uni, year):
    bf.getElementByXpath(SELECT_ALL_ARROW_XPATH).click()
    bf.getElementByXpath(SELECT_ALL_LABEL_XPATH).click()
    bf.getElementByXpath(VIEW_CITATIONS_BUTTON_XPATH).click()
    print("\nLoad:")
    waitPageLoading()
    time.sleep(2)
    waitPageLoading()
    bf.getElementById(START_YEAR_ID).click()
    year_path_map = getStartYearPaths()
    bf.getElementByXpath(year_path_map['2014']).click()  # NOT TO BE HARDCODED. We should get current year - 5 (or 4)
    bf.getElementById(UPDATE_OVERVIEW_BUTTON_ID).click()
    waitPageLoading()
    citations = {}
    citations["uni"] = uni
    citations["publications_year"] = year
    for citation_year in range(2014, 2022):  # NOT TO BE HARDCODED. We should get current year - 5 (or 4) till - 2
        try:
            # Linked citation counts are rendered inside a strong tag
            path = "//*[@id='year_" + str(citation_year) + "']/a/span/strong"
            nb_citation = bf.driver.find_element_by_xpath(path).get_attribute("innerHTML")
        except Exception:
            # Fall back to the plain cell text when there is no link
            path = "//*[@id='year_" + str(citation_year) + "']"
            nb_citation = bf.getElementByXpath(path).text
        citations[str(citation_year)] = nb_citation
    return citations
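# The '2014' start year and the 2014-2021 citation range above are hardcoded, as the
# comments note. A small helper like the following could derive them from the current
# date instead (a sketch, assuming the window starts at "current year - 5"; the name
# and exact bounds are illustrative, not part of the existing code):
def _citation_year_range(span=5):
    from datetime import date
    current_year = date.today().year
    start_year = current_year - span
    # e.g. in 2022 this yields start_year=2017 and years [2017, ..., 2022]
    return start_year, list(range(start_year, current_year + 1))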
def getDocumentsBySubject(uni, year):
    bf.getElementById(ANALYSE_SEARCH_RESULTS_ID).click()
    bf.getElementById(ANALYSE_SUBJECT_MINIGRAPH_ID).click()
    documents = {}
    documents["uni"] = uni
    documents["year"] = year
    nb_of_subjects = len(bf.driver.find_elements_by_xpath(SUBJECT_AREA_TABLE_XPATH))
    for i in range(1, nb_of_subjects + 1):
        path = "/html/body/div[1]/div/div[1]/div[2]/div/div[3]/form/div[2]/section[2]/div/div[7]/div[1]/div/table/tbody/tr[" + str(i) + "]"
        subject = bf.getElementByXpath(path + "/td[1]").text
        nb_documents = bf.getElementByXpath(path + "/td[2]/a/span").text
        documents[subject] = nb_documents
    return documents
def getStartYearPaths():
    # Map each year label in the start-year dropdown to its XPath
    year_paths = {}
    for i in range(1, 11):
        path = "/html/body/div[2]/ul/li[" + str(i) + "]/div"
        content = bf.getElementByXpath(path).text
        year_paths[content] = path
    return year_paths
def executeQuery(query_filename, scopusNames_filename, scopusId_filename):
    publications_list = []
    citations_list = []
    pub_per_subject_list = []
    error_list = []
    query = bf.loadFile(query_filename)
    names = bf.loadJsonFile(scopusNames_filename)
    ids = bf.loadJsonFile(scopusId_filename)
    for key in names.keys():  # key = "King Abdulaziz University (KAU)"
        for year in range(2014, 2022):  # to be changed to only current year
            try:
                if names[key] is not None:
                    uni_query = query.replace("*UNIVERSITY_NAME*", names[key]).replace("*UNIVERSITY_ID*", ids[key]).replace("*YEAR*", str(year))
                    enterQuery(uni_query)
                    nb_publications = getNbOfDocuments(key, year)
                    if int(nb_publications.replace(',', '')) == 0:
                        # No documents returned for this university/year
                        print("Error in " + key + " " + str(year))
                        error_list.append(key + "-" + str(year))
                        bf.changeURL(scopus_url)
                    else:
                        citations = getCitaions(key, year)
                        goBack()
                        documents_per_subject = getDocumentsBySubject(key, year)
                        # Return to the search page via the header link
                        bf.getElementByXpath("/html/body/div[1]/div[1]/div[1]/header/div[2]/a/span").click()
                        publications_list.append(nb_publications)
                        citations_list.append(citations)
                        pub_per_subject_list.append(documents_per_subject)
            except Exception:
                print("Error in " + key + " " + str(year))
                error_list.append(key + "-" + str(year))
                bf.changeURL(scopus_url)
    # print("publications_list")
    # print(publications_list)
    # print("citations_list")
    # print(citations_list)
    # print("pub_per_subject_list")
    # print(pub_per_subject_list)
    return [publications_list, citations_list, pub_per_subject_list, error_list]
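# executeQuery fills the *UNIVERSITY_NAME*, *UNIVERSITY_ID* and *YEAR* placeholders in the
# template loaded from query_filename. The template itself is not part of this file; one
# plausible shape, using Scopus advanced-search fields AF-ID and PUBYEAR (an assumption,
# the real template and the affiliation ID below are only placeholders), would be:
_EXAMPLE_QUERY_TEMPLATE = 'AF-ID("*UNIVERSITY_NAME*" *UNIVERSITY_ID*) AND PUBYEAR = *YEAR*'
# After the three replace() calls this would read, for example:
# AF-ID("King Abdulaziz University" 60000000) AND PUBYEAR = 2020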
def goBackToResults():
    bf.getElementByXpath(BACK_TORESULTS_BUTTON_XPATH).click()
def goBack():
    bf.getElementByXpath(BACK_BUTTON_XPATH).click()
def enterQuery(query):
    bf.driver.find_element_by_xpath(QUERY_STRING_INPUT_XPATH).clear()
    bf.driver.find_element_by_xpath(QUERY_STRING_INPUT_XPATH).send_keys(query)
    bf.getElementByXpath(SEARCH_BUTTON_XPATH).click()
def gotoAdvancedSearch():
    bf.getElementByXpath(ADVANCED_SEARCH_BUTTON_XPATH).click()
def login():
    bf.getElementByXpath(SIGNIN_BUTTON_XPATH).click()
    bf.driver.find_element_by_xpath(EMAIL_INPUT_XPATH).send_keys(scopus_email_address)
    bf.getElementByXpath(CONTINUE_BUTTON_XPATH).click()
    bf.driver.find_element_by_xpath(PASSWORD_INPUT_XPATH).send_keys(scopus_password)
    bf.getElementByXpath(SUBMIT_SIGN_IN_BUTTON_XPATH).click()
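# login() reads scopus_email_address and scopus_password from module-level constants. A
# sketch of pulling them from environment variables instead, so credentials do not live
# in the source (SCOPUS_EMAIL / SCOPUS_PASSWORD and the helper name are assumptions):
def _load_scopus_credentials():
    import os
    # Raises KeyError if either variable is missing from the environment
    return os.environ["SCOPUS_EMAIL"], os.environ["SCOPUS_PASSWORD"]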