def click_more_details(driver):
    """Open the "More details" section of an institution page.

    The link with id ``institutionDetailsLink`` is clicked first. If that
    lookup or click fails, the element whose text contains
    'More details on this Institution' is clicked instead. Afterwards the
    function sleeps for a randomized number of seconds.

    Args:
        driver: Selenium WebDriver object already showing the page that
            contains the link.

    Returns:
        The same ``driver`` object that was passed in.

    Raises:
        Exception: whatever the fallback lookup/click raises when it fails
            as well (the fallback is not guarded).
    """
    try:
        driver.find_element_by_xpath(
            "//a[@id='institutionDetailsLink']").click()
    except Exception:
        # ``except Exception`` instead of a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed by the fallback.
        logger.debug('institutionDetailsLink failed, '
                     'falling back to the link text')
        driver.find_element_by_xpath(
            "//*[contains(text(), 'More details on this Institution')]"
        ).click()

    # NOTE(review): the outer runi() receives a single argument here, while
    # other call sites use runi(low, high) -- confirm this is intended and
    # not meant to be ``4 + runi(-2, 1)``.
    t = runi(4 + runi(-2, 1))
    logger.debug('opened more details link, waiting for {} s'.format(t))
    time.sleep(t)
    return driver
def remove_all_universities(driver):
    """Click 'Remove all entities from this section' on the current page.

    The element is located by its text and clicked twice, with a randomized
    pause after each click. Any failure is logged as a warning (including
    the traceback) and otherwise ignored, so the caller always gets the
    driver back.

    Args:
        driver: Selenium WebDriver object showing the page with the list.

    Returns:
        The same ``driver`` object that was passed in.
    """
    remove_xpath = (
        "//*[contains(text(), 'Remove all entities from this section')]")

    logger.debug('removing the universities from the list')
    time.sleep(2 + runi(-0.5, 0.5))
    try:
        # The same text is looked up and clicked twice; presumably the
        # second click confirms the removal -- TODO confirm.
        for _ in range(2):
            driver.find_element_by_xpath(remove_xpath).click()
            time.sleep(5 + runi(-1, 1))
        logger.debug('all institutions removed succesfully')
    except Exception:
        # Best-effort step: keep going, but record why it failed.
        # ``except Exception`` lets KeyboardInterrupt / SystemExit through.
        logger.warning(
            'error has occured during removing all institutions '
            'from the list',
            exc_info=True)
    time.sleep(3 + runi(-0.5, 0.5))
    return driver
def open_link(driver, link):
    """Navigate the browser to ``link`` and wait a randomized time.

    Args:
        driver: Selenium WebDriver object.
        link: URL passed to ``driver.get``.

    Returns:
        The same ``driver`` object that was passed in.

    Raises:
        Exception: any error raised by ``driver.get`` is logged as a
            warning (with traceback) and re-raised unchanged.
    """
    try:
        driver.get(link)
    except Exception:
        logger.warning('error during opening the link, {}'.format(link),
                       exc_info=True)
        raise
    logger.debug('link opened successfully, {}'.format(link))

    # NOTE(review): the outer runi() receives a single argument here, while
    # other call sites use runi(low, high) -- confirm this is intended.
    t = runi(10 + runi(-3, 1))
    logger.debug('opened main link, waiting for {} s'.format(t))
    time.sleep(t)
    return driver
def open_scival_log_in(driver):
    """Open the SciVal full login page and wait a randomized time.

    Args:
        driver: Selenium WebDriver object.

    Returns:
        The same ``driver`` object that was passed in.

    Raises:
        Exception: any error raised by ``driver.get`` is logged as a
            warning (with traceback) and re-raised unchanged.
    """
    link = 'https://scival.com/customer/authenticate/loginfull'
    try:
        driver.get(link)
    except Exception:
        logger.warning('error has occured during opening browser',
                       exc_info=True)
        raise
    logger.debug('driver opened succesfully')

    # NOTE(review): the outer runi() receives a single argument here, while
    # other call sites use runi(low, high) -- confirm this is intended.
    t = runi(10 + runi(-3, 1))
    logger.debug('opened main link, waiting for {} s'.format(t))
    time.sleep(t)
    return driver
def put_scival_credentials(driver):
    """Type the SciVal credentials into the login form and submit it.

    The username is a literal in this function; the password comes from
    ``read_credentials('password')``. Both input fields are located (with a
    randomized pause after each lookup) before anything is typed. The form
    is submitted by sending RETURN after the password, followed by a final
    randomized pause.

    Args:
        driver: Selenium WebDriver object showing the login form.

    Returns:
        The same ``driver`` object that was passed in.
    """
    login = '******'
    secret = read_credentials('password')

    # Locate both inputs first, pausing after each lookup.
    login_box = driver.find_element_by_xpath("//input[@id='username']")
    time.sleep(runi(5 + runi(-3, 1)))
    secret_box = driver.find_element_by_xpath(
        "//input[@id='password-input-password']")
    time.sleep(runi(4 + runi(-3, 1)))

    # Fill the form; RETURN after the password submits it.
    login_box.send_keys(login)
    secret_box.send_keys(secret, Keys.RETURN)
    time.sleep(runi(10 + runi(-3, 1)))
    return driver
def open_scopus_link(driver):
    """Walk the browser to the Scopus advanced search form.

    Three pages are opened in sequence (sources page, basic search form,
    advanced search form). After each page load the function sleeps for a
    randomized time and then calls ``close_pop_up_window``.

    Args:
        driver: Selenium WebDriver object.

    Returns:
        The same ``driver`` object that was passed in.
    """
    driver.implicitly_wait(2)  # seconds

    # To get to advanced search we go through several links; each entry is
    # (url, log message template for the wait that follows).
    hops = (
        ('https://www.scopus.com/sources?zone=&origin=NO%20ORIGIN%20DEFINED',
         'opened main link, waiting for {} s'),
        ('https://www.scopus.com/search/form.uri?zone=TopNavBar&origin=sbrowse&display=basic',
         'opened search link, waiting for {} s'),
        ('https://www.scopus.com/search/form.uri?display=advanced&clear=t&origin=searchbasic&txGid=fc476bc6f6c3a112a577edd9f6f26e14',
         'opened advanced search link, waiting for {} s'),
    )

    for url, message in hops:
        driver.get(url)
        pause = runi(10 + runi(-3, 1))
        logger.debug(message.format(pause))
        time.sleep(pause)
        close_pop_up_window(driver)

    return driver
reg_com_year = re.compile(reg_exp_year) xpath_est = '//tbody//tr//th[text()="Established"]' xpath_founded = '//tbody//tr//th[text()="Founded"]' xpath_est_date = '//tbody//tr//th[text()="Established"]/following-sibling::td' xpath_est_date_short = './following-sibling::td' # driver.find_element_by_xpath('//tbody//tr//th[@value="Established"]') # for index, row in df.iloc[:10,:].iterrows(): for index, row in df.iterrows(): time.sleep(2+runi(0.5, 1.5)) if row[cname_date] != '': continue aff = row[cname_name] logger.debug("extracting data for {}".format(aff)) aff = row[cname_name] aff_name_in_link = aff.replace(' ', '_') aff_link = link + aff_name_in_link try:
def main(n, year, metricType, ack_params, metrics_params):
    """Fetch patent counts from Scopus advanced search and store them.

    For every record returned by ``coll_ack.find_valid_parent_ids`` a query
    is built with ``create_query``, run through ``get_patent_count`` and the
    result is written to the metrics collection; the acknowledgement
    collection is updated with ``ack = 1`` on success or ``ack = -1`` on a
    generic error. The loop stops at the first error of any kind.

    Args:
        n: forwarded to ``find_valid_parent_ids`` (presumably a limit on the
            number of records -- TODO confirm).
        year: year used in the query and stored with each result.
        metricType: metric type forwarded to ``find_valid_parent_ids`` and
            stored with each result.
        ack_params: mapping with keys 'db_name', 'coll_name' and
            'query_type'.
        metrics_params: mapping with keys 'db_name' and 'coll_name'.

    Returns:
        The WebDriver object. Note that ``driver.quit()`` has already been
        called on it by the time it is returned.
    """
    adv_search_link = 'https://www.scopus.com/search/form.uri?display=advanced&clear=t&origin=searchbasic&txGid=fc476bc6f6c3a112a577edd9f6f26e14'
    logger.debug('downloaded results table')

    db_name_ack = ack_params['db_name']
    coll_name_ack = ack_params['coll_name']
    db_name_metrics = metrics_params['db_name']
    coll_name_metrics = metrics_params['coll_name']
    parent_field = 'scival_id'
    child_id_field = 'child_id'

    coll_ack = mongo_metric_ack(db_name=db_name_ack, coll_name=coll_name_ack)
    coll_metrics = mongo_scopus_metrics(db_name=db_name_metrics,
                                        coll_name=coll_name_metrics)
    valid_dicts = coll_ack.find_valid_parent_ids(metricType, str(year), n)

    logger.debug('opening browser with scopus advanced search link')
    timeout = 60
    fp = webdriver.FirefoxProfile()
    fp.set_preference("http.response.timeout", timeout)
    fp.set_preference("dom.max_script_run_time", timeout)
    binary = FirefoxBinary('/usr/lib/firefox/firefox')
    driver = webdriver.Firefox(firefox_binary=binary, firefox_profile=fp)

    try:
        driver.get(adv_search_link)
        time.sleep(5)
        logger.debug('doing search')
    except Exception:
        # ``except Exception`` (not a bare ``except``) so Ctrl-C still
        # aborts the run; the traceback is logged instead of being dropped.
        logger.warning('error has occured during opening browser',
                       exc_info=True)
    else:
        for valid_dict in valid_dicts:
            logger.debug('opening advanced search link')
            driver.get(adv_search_link)
            time.sleep(5 + runi(-1, 1))
            close_pop_up_window(driver)

            parent_id = valid_dict[parent_field]
            aff_id = valid_dict[child_id_field]
            aff_name = valid_dict['name']

            logger.debug('creating query for search')
            # query shape depends on ack_params['query_type']
            query = create_query(aff_id, aff_name,
                                 ack_params['query_type'], year)
            logger.debug('query is {}'.format(query))

            q = {
                parent_field: parent_id,
                'metricType': metricType,
                'year': year
            }
            metric_response = q.copy()
            ack_response = q.copy()

            try:
                logger.debug('getting patent_count for {}'.format(aff_id))
                patent_count = get_patent_count(driver, query)
            except TimeoutException as e:
                # A timeout stops the whole run without recording an ack.
                logger.warning('timeout error')
                print(e)
                break
            except Exception as e:
                # Any other failure is recorded as ack = -1 and also stops
                # the run.
                logger.warning('error has occured')
                logger.warning(e)
                ack_response['ack'] = -1
                coll_ack.update_item_by_year(parent_field, **ack_response)
                break
            else:
                # saving response and ack
                metric_response['value'] = patent_count
                ack_response['ack'] = 1
                logger.debug(
                    'number of patents has been retrieved succesfully')
                logger.debug('number of patents for {} is {}'.format(
                    parent_id, patent_count))
                logger.debug("updating metrics and ack dbs")
                coll_metrics.update_item_by_year(parent_field,
                                                 **metric_response)
                coll_ack.update_item_by_year(parent_field, **ack_response)
                logger.debug("updating metrics and ack dbs finished")
                print(ack_response)
    finally:
        # Always close the browser, whatever happened above.
        driver.quit()
    return driver
def get_patent_count(driver, query):
    """Run ``query`` in Scopus advanced search and read the patent count.

    The query is typed into the ``searchfield`` element and submitted, the
    result page is reloaded, and the text of the ``patentLink`` element is
    converted with ``extract_integer_``. Before returning, the function
    clicks back into the query editor.

    Args:
        driver: Selenium WebDriver object showing the advanced search form.
        query: query string to type into the search field.

    Returns:
        0 when the "no documents" alert is present, when the patent
        container is hidden, or when reading the patent count failed;
        otherwise the value returned by ``extract_integer_``.

    Raises:
        TimeoutException: when ``searchResFormId`` does not appear within
            60 seconds (raised by the explicit wait).
        RuntimeError: when the patent link exists but never became visible
            during polling, so there is no value to parse.
        Exception: errors from the unguarded element lookups/clicks
            propagate unchanged.
    """
    # Click to activate the text field; sometimes 'contentEditLabel' has to
    # be clicked, sometimes 'searchfield'.
    logger.debug('clicking on search input field')
    try:
        logger.debug('clicking on contentEditLabel')
        driver.find_element_by_id('contentEditLabel').click()
    except Exception:
        logger.debug('clicking on searchfield')
        driver.find_element_by_id('searchfield').click()

    # Fill the text field and send the request.
    element = driver.find_element_by_id('searchfield')
    logger.debug('clearing search field')
    element.clear()
    logger.debug('entering query into search field')
    element.send_keys(query, Keys.RETURN)
    time.sleep(4 + runi(0, 1))

    logger.debug('waiting for page to be downloaded')
    wait(driver, 60).until(
        EC.presence_of_element_located((By.ID, "searchResFormId")))
    # Reload the result page by requesting its own URL again.
    driver.get(driver.current_url)
    time.sleep(10 + runi(0, 1))

    # No documents found.
    if driver_has_element_by_xpath(
            driver, '//div[@class="alert alert-danger"]/a[@class="close"]'):
        logger.debug('no document found for here')
        return 0

    # Documents found.
    a = 0
    patent_value = None  # stays None if no value was ever read
    try:
        # Poll up to five times, one second apart; a hidden container ends
        # the polling immediately.
        to_wait = -5
        while to_wait < 0:
            logger.debug('retrieving patent_count')
            patent_hidden_element = driver.find_element_by_id(
                'hubLinksContainer')
            if patent_hidden_element.get_attribute('class') == 'hidden':
                logger.debug('no patent elements were found')
                patent_value = 0
                to_wait = 5
            elif driver.find_element_by_id('patentLink').is_displayed():
                logger.debug('getting #patentLink once more')
                patent_element = driver.find_element_by_id('patentLink')
                patent_value = patent_element.text
            time.sleep(1)
            to_wait = to_wait + 1
    except Exception:
        # Best-effort: the original code had a no-op ``assert`` on a string
        # literal here, so the failure was invisible. Keep returning 0 but
        # make the problem show up in the log.
        # NOTE(review): a failure here is indistinguishable from a real
        # count of 0 for the caller -- confirm that is acceptable.
        logger.warning('error during retrieving patent_count',
                       exc_info=True)
        driver.find_element_by_xpath(
            "//button[@title='Edit search query']").click()
    else:
        if patent_value is None:
            # Previously this surfaced as an accidental UnboundLocalError.
            raise RuntimeError(
                'patentLink never became visible, patent count not read')
        logger.debug('patent_value is {}'.format(patent_value))
        a = extract_integer_(patent_value)
        logger.debug('extracted integer is {}'.format(a))

    time.sleep(2 + runi(0, 1))
    try:
        logger.debug('clicking on editAuthSearch')
        driver.find_element_by_id('editAuthSearch').click()
    except Exception:
        logger.debug('error during clicking on editAuthSearch')
        logger.debug('instead find "Edit search query" field by force')
        driver.find_element_by_xpath(
            "//button[@title='Edit search query']").click()
    time.sleep(2 + runi(0, 1))
    return a
try: table, extracted_name, child_count = extract_table(driver) logger.debug('table extracted succesfully') except: logger.warning('table could not be extracted') else: logger.debug('saving the child_ids of {}'.format(valid_ids[i])) table.to_excel('data/child_id/{}.xlsx'.format(valid_ids[i]), index=False) logger.debug('saving acknoledge to the table') df.loc[df.id == valid_ids[i], 'scopus_id_downloaded'] = 1 df.to_csv(fname, index=False) time.sleep(5 + runi(-1, 1)) parent_aff = {'name': valid_names[i], 'scival_id': valid_ids[i] } append_scopus_ids_to_parent(db_ids, table, parent_aff) logger.debug('opening link to remove all universities') driver = open_link(driver, a) driver = remove_all_universities(driver) time.sleep(3 + runi(-1, 1)) except: