def parse(self, response):
    # Configure a headless Chrome webdriver
    if platform.system() == 'Windows':
        chromedriver_path = get_path([self.ROOT_DIR, "libs", "chromedriver.exe"])
    else:
        chromedriver_path = get_path([self.ROOT_DIR, "libs", "chromedriver"])
    op = webdriver.ChromeOptions()
    op.add_argument('headless')
    self.driver = webdriver.Chrome(executable_path=chromedriver_path, options=op)
    self.driver.implicitly_wait(60)

    # Monitor the crawl progress by URL
    monitor_url = response.url
    citation_id = response.url.split("cites=")[1].split("&")[0].replace(",", "_")
    citations = []

    # Make the request, get the rendered page and extract its content
    self.driver.get(response.url)
    page_response = TextResponse(url=response.url,
                                 body=self.driver.page_source,
                                 encoding='utf-8')
    self.parse_one_page(page_response, citations)

    # Click 'Next page' and parse each page until the button disappears
    while True:
        try:
            # Wait 15s before the next click
            time.sleep(15)
            # Click the 'Next' button while it is still shown
            self.driver.find_element_by_xpath(
                "//div[@id='gs_n']//td[@align='left']/a").click()
            # Parse the newly loaded page and update the monitored URL
            next_page_response = TextResponse(url=self.driver.current_url,
                                              body=self.driver.page_source,
                                              encoding='utf-8')
            self.parse_one_page(next_page_response, citations)
            monitor_url = self.driver.current_url
        except Exception:
            # No 'Next' button left (or the click failed): stop paginating
            self.driver.close()
            break

    # Output
    csv_path = get_path([self.ROOT_DIR, "data", "citations", self.input_dir,
                         self.input_file.split(".")[0],
                         "papers-of-citeID-{}.csv".format(citation_id)])
    write_csv(citations, csv_path)
    pkl_path = get_path([self.ROOT_DIR, "data", "citations", self.input_dir,
                         self.input_file.split(".")[0],
                         "papers-of-citeID-{}.pkl".format(citation_id)])
    write_pickle(citations, pkl_path)
    monitor_file = get_path([self.ROOT_DIR, "data", "monitors",
                             "crawled_papers_{}_{}".format(
                                 self.input_dir.replace("authors_", ""),
                                 self.input_file.replace("papers-of-", ""))])
    monitor_crawler(monitor_file, monitor_url)
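The helper functions get_path, write_csv, write_pickle and monitor_crawler used above come from the project's utility modules and are not shown in these snippets. A minimal sketch of what they might look like, assuming get_path simply joins path segments, the writers dump a list of dicts, and the monitor file is an append-only log of what has been crawled:

# Hypothetical sketch of the helpers used above; the real utils.tools
# implementations may differ.
import csv
import os
import pickle


def get_path(parts):
    # Join path segments into a platform-specific path (assumption)
    return os.path.join(*parts)


def write_csv(records, csv_path):
    # Dump a list of dicts to CSV, one column per dictionary key (assumption)
    if not records:
        return
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=list(records[0].keys()))
        writer.writeheader()
        writer.writerows(records)


def write_pickle(records, pkl_path):
    # Persist the same records as a pickle file
    with open(pkl_path, "wb") as f:
        pickle.dump(records, f)


def monitor_crawler(monitor_file, entry):
    # Append the last crawled URL/id so an interrupted crawl can be resumed (assumption)
    with open(monitor_file, "a") as f:
        f.write("{}\n".format(entry))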
def parse(self, response):
    # Configure a headless Chrome webdriver
    if platform.system() == 'Windows':
        chromedriver_path = get_path([self.ROOT_DIR, "libs", "chromedriver.exe"])
    else:
        chromedriver_path = get_path([self.ROOT_DIR, "libs", "chromedriver"])
    op = webdriver.ChromeOptions()
    op.add_argument('headless')
    self.driver = webdriver.Chrome(executable_path=chromedriver_path, options=op)
    self.driver.implicitly_wait(60)

    # Make the request and give the page time to load
    self.driver.get(response.url)
    time.sleep(30)

    # Extract the organization id(s) from the institution links
    try:
        org_hrefs = response.xpath("//*[@class='gs_ob_inst_r']//a//@href").getall()
        org_id = ' | '.join(href.split('org=')[1].split('&')[0] for href in org_hrefs)
    except Exception as e:
        print(e)
        org_id = 'Not found | {}'.format(response.url)

    # The organization name that was used in the search query
    name_for_searching_gs = response.url.split('&q=')[1].split('&')[0]
    with open('top_asia_id.csv', "a") as f:
        f.write("\n%s, %s" % (name_for_searching_gs, org_id))

    self.driver.close()
def parse(self, response):
    # Configure a headless Chrome webdriver
    if platform.system() == 'Windows':
        chromedriver_path = get_path([self.ROOT_DIR, "libs", "chromedriver.exe"])
    else:
        chromedriver_path = get_path([self.ROOT_DIR, "libs", "chromedriver"])
    op = webdriver.ChromeOptions()
    op.add_argument('headless')
    self.driver = webdriver.Chrome(executable_path=chromedriver_path, options=op)
    self.driver.implicitly_wait(60)

    # Get user_id from the URL
    user_id = response.url.split("user=")[1].split("&")[0]

    # Make the request
    self.driver.get(response.url)

    # The citation statistics table has one value per cell; missing cells count as 0
    rows = response.xpath("//table[@id='gsc_rsb_st']//tbody//td[@class='gsc_rsb_std']")
    values = [int(cell.xpath("text()").get()) if cell.xpath("text()").get() is not None else 0
              for cell in rows]
    (citations_all, citations_s2014, hindex_all, hindex_s2014,
     i10index_all, i10index_s2014) = values[:6]

    # Open the co-authors dialog and parse it, if the author has any co-authors
    coauthor_ids = ''
    coauthor_names = ''
    try:
        self.driver.find_element_by_xpath("//button[@id='gsc_coauth_opn']").click()
        coauthors_response = TextResponse(url=self.driver.current_url,
                                          body=self.driver.page_source,
                                          encoding='utf-8')
        (coauthor_ids, coauthor_names) = self.parse_coauthors(coauthors_response)
    except Exception:
        pass

    # Output: append one line per author to the h-index comparison file
    output_path = get_path([self.ROOT_DIR, "data", "sample_comparision", "hindex.csv"])
    with open(output_path, "a") as f:
        f.write("\n%s, %d, %d, %d, %d, %d, %d, %s, %s" %
                (user_id, citations_all, citations_s2014,
                 hindex_all, hindex_s2014, i10index_all, i10index_s2014,
                 coauthor_ids, coauthor_names))

    self.driver.close()
    time.sleep(30)
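parse_coauthors is called here (and again in the last snippet) but is not shown. A minimal sketch under the assumption that each co-author in the dialog is represented by a profile link whose href carries a user= parameter; the XPath is a guess and may need to be narrowed to the dialog container:

def parse_coauthors(self, response):
    # Hypothetical sketch: collect co-author ids and names from the co-authors
    # dialog. The XPath below is an assumption about the page markup.
    ids = []
    names = []
    for link in response.xpath("//a[contains(@href, 'user=')]"):
        href = link.xpath("@href").get()
        name = " ".join(link.xpath(".//text()").getall()).strip()
        if href and name:
            ids.append(href.split("user=")[1].split("&")[0])
            names.append(name)
    # Return the same ' | '-separated strings the callers write to CSV
    return " | ".join(ids), " | ".join(names)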
def start_requests(self):
    organizations_path = get_path([self.ROOT_DIR, "data", "organizations.txt"])
    organizations_df = pd.read_csv(organizations_path)
    urls = organizations_df['URL'].to_list()
    # Note: the slice keeps only the last URL in the list
    for url in urls[len(urls) - 1:]:
        yield scrapy.Request(url=url, callback=self.parse)
def start_requests(self):
    authors_path = get_path([self.ROOT_DIR, "data", "authors", self.input_file])
    authors_df = pd.read_csv(authors_path)
    # Build one profile URL per author id
    urls = ('https://scholar.google.com/citations?hl=en&user='
            + authors_df['AuthorID'].astype(str)).to_list()
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse)
def start_requests(self):
    urls = []
    if self.input_file == '':
        # No input file given: collect cited-by URLs from every CSV in the directory
        path = get_path([self.ROOT_DIR, "data", "papers", self.input_dir, "*.csv"])
        for path_file in glob.glob(path):
            df = pd.read_csv(path_file)
            urls.extend(df['Cited_url'].dropna().to_list())
    else:
        path = get_path([self.ROOT_DIR, "data", "papers", self.input_dir, self.input_file])
        urls = pd.read_csv(path)['Cited_url'].dropna().to_list()
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse)
import pandas as pd

from configuration import ROOT_DIR
from utils.mapping import get_id_orgs, get_name_orgs
from utils.tools import get_path

eurecom_id = get_id_orgs(names=['eurecom'])['eurecom']
authors_path = get_path([ROOT_DIR, "data", "authors",
                         'authors_orgID_{}.csv'.format(eurecom_id)])
authors_df = pd.read_csv(authors_path)
print(authors_df.head(5))
def parse(self, response):
    # Configure a headless Chrome webdriver
    if platform.system() == 'Windows':
        chromedriver_path = get_path([self.ROOT_DIR, "libs", "chromedriver.exe"])
    else:
        chromedriver_path = get_path([self.ROOT_DIR, "libs", "chromedriver"])
    op = webdriver.ChromeOptions()
    op.add_argument('headless')
    self.driver = webdriver.Chrome(executable_path=chromedriver_path, options=op)
    self.driver.implicitly_wait(60)

    # Extract the organization id
    org_id = response.url.split("org=")[1].split("&")[0]

    # List of authors of the organization
    authors = []
    monitor_url = ''

    # Make the request and parse the first page
    self.driver.get(response.url)
    page_response = TextResponse(url=response.url,
                                 body=self.driver.page_source,
                                 encoding='utf-8')
    self.parse_one_page(page_response, authors)

    # Click 'Next page' and parse until the button is disabled
    while True:
        try:
            # Wait 15s before the next click
            time.sleep(15)
            # Click the 'Next' button while it is not disabled
            self.driver.find_element_by_xpath(
                "//div[@id='gsc_authors_bottom_pag']//button[@aria-label='Next'][not(@disabled)]"
            ).click()
            # Parse the newly loaded page and update the monitored URL
            next_page_response = TextResponse(url=self.driver.current_url,
                                              body=self.driver.page_source,
                                              encoding='utf-8')
            self.parse_one_page(next_page_response, authors)
            monitor_url = self.driver.current_url
        except Exception:
            self.driver.close()
            break

    # Output
    if len(authors) > 0:
        csv_path = get_path([self.ROOT_DIR, "data", "authors",
                             "authors_orgID_{}.csv".format(org_id)])
        write_csv(authors, csv_path)
        pkl_path = get_path([self.ROOT_DIR, "data", "authors",
                             "authors_orgID_{}.pkl".format(org_id)])
        write_pickle(authors, pkl_path)
        monitor_file = get_path([self.ROOT_DIR, "data", "monitors",
                                 'crawled_organizations.txt'])
        monitor_crawler(monitor_file, monitor_url)
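parse_one_page is what actually fills the authors list but is not part of this snippet. A minimal sketch, assuming each author card on the organization page uses the gsc_1usr / gs_ai_* class names (these selectors are assumptions and may need updating if Google Scholar changes its markup); the AuthorID field matches the column the author-profile start_requests above reads:

def parse_one_page(self, page_response, authors):
    # Hypothetical sketch: extract one dict per author card on the page.
    # The gsc_1usr / gs_ai_* class names are assumptions about the markup.
    for card in page_response.xpath("//div[contains(@class, 'gsc_1usr')]"):
        profile_href = card.xpath(".//h3[@class='gs_ai_name']/a/@href").get() or ''
        author = {
            'AuthorID': profile_href.split('user=')[1].split('&')[0] if 'user=' in profile_href else '',
            'Name': card.xpath(".//h3[@class='gs_ai_name']/a/text()").get(),
            'Affiliation': card.xpath(".//div[@class='gs_ai_aff']/text()").get(),
            'CitedBy': card.xpath(".//div[@class='gs_ai_cby']/text()").get(),
        }
        authors.append(author)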
def parse(self, response):
    # Configure a headless Chrome webdriver
    if platform.system() == 'Windows':
        chromedriver_path = get_path([self.ROOT_DIR, "libs", "chromedriver.exe"])
    else:
        chromedriver_path = get_path([self.ROOT_DIR, "libs", "chromedriver"])
    op = webdriver.ChromeOptions()
    op.add_argument('headless')
    self.driver = webdriver.Chrome(executable_path=chromedriver_path, options=op)
    self.driver.implicitly_wait(60)

    # Extract user_id
    user_id = response.url.split("user=")[1].split("&")[0]

    # Make the request
    self.driver.get(response.url)
    response_after_click = None

    # Click 'Show more' until the button is disabled
    while True:
        try:
            # Wait 15 seconds before the next click
            time.sleep(15)
            # Click 'Show more' while it is not disabled
            self.driver.find_element_by_xpath(
                '//button[@id="gsc_bpf_more"][not(@disabled)]').click()
        except Exception:
            # All papers are loaded: capture the fully expanded page
            response_after_click = TextResponse(url=response.url,
                                                body=self.driver.page_source,
                                                encoding='utf-8')
            # Close the webdriver
            self.driver.close()
            break

    # List of papers for the given user
    papers = []

    # Each paper is one row of the publications table
    rows = response_after_click.xpath("//table[@id='gsc_a_t']//tr[@class='gsc_a_tr']")
    if len(rows) > 0:
        # For each paper, get title, authors, venue, citations URL, citation count and year
        for row in rows:
            title = row.xpath(".//td[@class='gsc_a_t']/a/text()").get()
            authors = row.xpath(".//td[@class='gsc_a_t']/div[1]/text()").get()
            conference = row.xpath(".//td[@class='gsc_a_t']/div[2]/text()").get()
            cited_url = row.xpath(".//td[@class='gsc_a_c']/a[@class='gsc_a_ac gs_ibl']/@href").get()
            cited_count = row.xpath(".//td[@class='gsc_a_c']/a/text()").get()
            year = row.xpath(".//td[@class='gsc_a_y']/span/text()").get()
            cited_count = 0 if cited_count is None else int(cited_count)
            year = 0 if year is None else int(year)

            # One dictionary per paper
            paper = {
                'Title': title,
                'Authors': authors,
                'Platform': conference,
                'Cited_url': cited_url,
                'Cited_count': cited_count,
                'Year': year
            }
            papers.append(paper)

    # Output
    organization = self.input_file.split('.')[0]
    csv_path = get_path([self.ROOT_DIR, "data", "papers", organization,
                         "papers-of-authorID-{}.csv".format(user_id)])
    write_csv(papers, csv_path)
    pkl_path = get_path([self.ROOT_DIR, "data", "papers", organization,
                         "papers-of-authorID-{}.pkl".format(user_id)])
    write_pickle(papers, pkl_path)
    monitor_file = get_path([self.ROOT_DIR, "data", "monitors",
                             'crawled_{}'.format(self.input_file)])
    monitor_crawler(monitor_file, user_id)
def parse(self, response):
    # Configure a headless Chrome webdriver
    if platform.system() == 'Windows':
        chromedriver_path = get_path([self.ROOT_DIR, "libs", "chromedriver.exe"])
    else:
        chromedriver_path = get_path([self.ROOT_DIR, "libs", "chromedriver"])
    op = webdriver.ChromeOptions()
    op.add_argument('headless')
    self.driver = webdriver.Chrome(executable_path=chromedriver_path, options=op)
    self.driver.implicitly_wait(60)

    # Get user_id from the URL
    user_id = response.url.split("user=")[1].split("&")[0]

    # Make the request
    self.driver.get(response.url)

    # Profile description lines and the links they contain
    info_tags = response.xpath("//div[@id='gsc_prf_i']//div[@class='gsc_prf_il']")
    info_texts = info_tags.xpath(".//text()").getall()
    info_links = info_tags.xpath(".//a/@href").getall()
    info_texts_str = " | ".join(info_texts)
    info_links_str = " | ".join(info_links)

    # The citation statistics table has one value per cell; missing cells count as 0
    rows = response.xpath("//table[@id='gsc_rsb_st']//tbody//td[@class='gsc_rsb_std']")
    values = [int(cell.xpath("text()").get()) if cell.xpath("text()").get() is not None else 0
              for cell in rows]
    (citations_all, citations_s2014, hindex_all, hindex_s2014,
     i10index_all, i10index_s2014) = values[:6]

    # Open the co-authors dialog and parse it, if the author has any co-authors
    coauthor_ids = ''
    coauthor_names = ''
    try:
        self.driver.find_element_by_xpath("//button[@id='gsc_coauth_opn']").click()
        coauthors_response = TextResponse(url=self.driver.current_url,
                                          body=self.driver.page_source,
                                          encoding='utf-8')
        (coauthor_ids, coauthor_names) = self.parse_coauthors(coauthors_response)
    except Exception:
        pass

    info = {
        "AuthorID": user_id,
        "CitationsAll": citations_all,
        "CitationsS2014": citations_s2014,
        "hIndexAll": hindex_all,
        "hIndexS2014": hindex_s2014,
        "i10IndexAll": i10index_all,
        "i10IndexS2014": i10index_s2014,
        "CoAuthorIDs": coauthor_ids,
        "CoAuthorNames": coauthor_names,
        "Description": info_texts_str,
        "DescriptionURLs": info_links_str
    }

    # Output: append one line per author to the info file
    csv_path = get_path([self.ROOT_DIR, "data", "info",
                         "info-{}.csv".format(self.author_file.split(".")[0])])
    with open(csv_path, "a") as f:
        f.write("\n%s, %d, %d, %d, %d, %d, %d, %s, %s, %s, %s" %
                (user_id, citations_all, citations_s2014,
                 hindex_all, hindex_s2014, i10index_all, i10index_s2014,
                 coauthor_ids, coauthor_names, info_texts_str, info_links_str))

    monitor_file = get_path([self.ROOT_DIR, "data", "monitors",
                             'crawled_info_{}'.format(self.author_file)])
    monitor_crawler(monitor_file, user_id)
    self.driver.close()
    time.sleep(30)
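The parse callbacks above belong to Scrapy spiders. One way to run such a spider programmatically is with CrawlerProcess; the spider class, module path and input_file value below are placeholders, not the project's real names:

# Hypothetical runner: spider class, module path and input_file are placeholders.
from scrapy.crawler import CrawlerProcess

from spiders.authors import AuthorsSpider  # assumed module/class name

process = CrawlerProcess(settings={
    "DOWNLOAD_DELAY": 5,      # be gentle with Google Scholar
    "ROBOTSTXT_OBEY": False,
})
process.crawl(AuthorsSpider, input_file="authors_orgID_12345.csv")  # placeholder file name
process.start()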