def parse(self, response):
        # Configure webdriver
        if platform.system() == 'Windows':
            chromedriver_path = get_path(
                [self.ROOT_DIR, "libs", "chromedriver.exe"])
        else:
            chromedriver_path = get_path(
                [self.ROOT_DIR, "libs", "chromedriver"])
        op = webdriver.ChromeOptions()
        op.add_argument('headless')
        self.driver = webdriver.Chrome(executable_path=chromedriver_path,
                                       options=op)
        self.driver.implicitly_wait(60)

        # Monitor the process by url
        monitor_url = response.url

        citation_id = response.url.split("cites=")[1].split("&")[0].replace(
            ",", "_")

        citations = []

        # Make request, get response and extract content
        self.driver.get(response.url)
        page_response = TextResponse(url=response.url,
                                     body=self.driver.page_source,
                                     encoding='utf-8')
        self.parse_one_page(page_response, citations)

        # Click 'Next page' and get content until this button is disabled
        while True:
            try:
                # Wait 15s before next click
                time.sleep(15)

                # Click button 'Next' when it is not hidden
                self.driver.find_element_by_xpath(
                    "//div[@id='gs_n']//td[@align='left']/a").click()

                # Get content, parse and monitor
                next_page_response = TextResponse(url=self.driver.current_url,
                                                  body=self.driver.page_source,
                                                  encoding='utf-8')
                self.parse_one_page(next_page_response, citations)
                monitor_url = self.driver.current_url
            except:
                self.driver.close()
                break

        # Output
        csv_path = get_path([self.ROOT_DIR, "data", "citations", self.input_dir, self.input_file.split(".")[0], \
                            "papers-of-citeID-{}.csv".format(citation_id)])
        write_csv(citations, csv_path)
        pkl_path = get_path([self.ROOT_DIR, "data", "citations", self.input_dir, self.input_file.split(".")[0], \
                            "papers-of-citeID-{}.pkl".format(citation_id)])
        write_pickle(citations, pkl_path)

        monitor_file = get_path([self.ROOT_DIR, "data", "monitors", "crawled_papers_{}_{}"\
                                .format(self.input_dir.replace("authors_",""), self.input_file.replace("papers-of-",""))])
        monitor_crawler(monitor_file, monitor_url)
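These examples lean on a few project helpers (get_path, write_csv, write_pickle, monitor_crawler) whose definitions are not shown here. Example #7 below imports get_path from utils.tools; the rest are sketched below purely for readability, under the assumption that they behave as their names and call sites suggest.

import csv
import os
import pickle


def get_path(parts):
    # Sketch: join path components into an OS-appropriate path.
    return os.path.join(*parts)


def write_csv(records, csv_path):
    # Assumed behaviour: write a list of dicts to CSV, one row per dict.
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=list(records[0].keys()))
        writer.writeheader()
        writer.writerows(records)


def write_pickle(records, pkl_path):
    # Assumed behaviour: serialize the same records with pickle.
    with open(pkl_path, "wb") as f:
        pickle.dump(records, f)


def monitor_crawler(monitor_file, entry):
    # Assumed behaviour: append the last crawled URL/ID so a run can be resumed.
    with open(monitor_file, "a") as f:
        f.write("{}\n".format(entry))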
Example #2
    def parse(self, response):
        print("hello")
        # Configure webdriver
        if platform.system() == 'Windows':
            chromedriver_path = get_path(
                [self.ROOT_DIR, "libs", "chromedriver.exe"])
        else:
            chromedriver_path = get_path(
                [self.ROOT_DIR, "libs", "chromedriver"])
        op = webdriver.ChromeOptions()
        op.add_argument('headless')
        self.driver = webdriver.Chrome(executable_path=chromedriver_path,
                                       options=op)
        self.driver.implicitly_wait(60)
        # Make request
        self.driver.get(response.url)
        time.sleep(30)

        try:
            org_id = ''
            org_ids = response.xpath(
                "//*[@class='gs_ob_inst_r']//a//@href").getall()
            org_id = ' | '.join(
                i.split('org=')[1].split('&')[0] for i in org_ids)
        except Exception as e:
            print(e)
            org_id = 'Not found | {}'.format(response.url)

        name_for_searching_gs = response.url.split('&q=')[1].split('&')[0]

        with open('top_asia_id.csv', "a") as f:
            f.write("\n%s, %s" % (name_for_searching_gs, org_id))
            f.close()
        self.driver.close()
    def parse(self, response):
        # Configure webdriver
        if platform.system() == 'Windows':
            chromedriver_path = get_path([self.ROOT_DIR, "libs", "chromedriver.exe"])
        else:
            chromedriver_path = get_path([self.ROOT_DIR, "libs", "chromedriver"])
        op = webdriver.ChromeOptions()
        op.add_argument('headless')
        self.driver = webdriver.Chrome(executable_path=chromedriver_path, options=op)
        self.driver.implicitly_wait(60)
        
        # Get user_id from URL
        user_id = response.url.split("user=")[1].split("&")[0]
        
        # Make request
        self.driver.get(response.url)

        # Get all rows in the table from webdriver response, each paper is in a row
        rows = response.xpath("//table[@id='gsc_rsb_st']//tbody//td[@class='gsc_rsb_std']")

        citations_all = int(rows[0].xpath("text()").get()) if rows[0].xpath("text()").get() is not None else 0
        citations_s2014 = int(rows[1].xpath("text()").get()) if rows[1].xpath("text()").get() is not None else 0
        hindex_all = int(rows[2].xpath("text()").get()) if rows[2].xpath("text()").get() is not None else 0
        hindex_s2014 = int(rows[3].xpath("text()").get()) if rows[3].xpath("text()").get() is not None else 0
        i10index_all = int(rows[4].xpath("text()").get()) if rows[4].xpath("text()").get() is not None else 0
        i10index_s2014 = int(rows[5].xpath("text()").get()) if rows[5].xpath("text()").get() is not None else 0

        coauthor_ids = ''
        coauthor_names = ''
        try:
            self.driver.find_element_by_xpath("//button[@id='gsc_coauth_opn']").click()
            coauthors_response = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
            (coauthor_ids, coauthor_names) = self.parse_coauthors(coauthors_response)
        except:
            pass
        
        output_path = get_path([self.ROOT_DIR, "data", "sample_comparision", "hindex.csv"])
        with open(output_path, "a") as f:
            f.write("\n%s, %d, %d, %d, %d, %d, %d, %s, %s" % (user_id,citations_all,citations_s2014,\
                                                            hindex_all,hindex_s2014,i10index_all,i10index_s2014, \
                                                            coauthor_ids, coauthor_names))
            f.close()
        self.driver.close()

        time.sleep(30)
    def start_requests(self):

        organizations_path = get_path(
            [self.ROOT_DIR, "data", "organizations.txt"])
        organizations_df = pd.read_csv(organizations_path)
        urls = organizations_df['URL'].to_list()

        for url in urls[len(urls) - 1:]:
            yield scrapy.Request(url=url, callback=self.parse)
Example #5
    def start_requests(self):
        authors_path = get_path([self.ROOT_DIR, "data", "authors", self.input_file])
        authors_df = pd.read_csv(authors_path)
        
        urls = ('https://scholar.google.com/citations?hl=en&user=' +
                authors_df['AuthorID'].astype(str)).to_list()

        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)
    def start_requests(self):
        urls = []
        try:
            if self.input_file == '':
                path = get_path(
                    [self.ROOT_DIR, "data", "papers", self.input_dir, "*.csv"])
                for path_file in glob.glob(path):
                    df = pd.read_csv(path_file)
                    urls.extend(df['Cited_url'].dropna().to_list())
            else:
                path = get_path([
                    self.ROOT_DIR, "data", "papers", self.input_dir,
                    self.input_file
                ])
                urls = pd.read_csv(path)['Cited_url'].dropna().to_list()
        except:
            raise

        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)
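The spiders above read self.input_file, self.input_dir and self.ROOT_DIR without showing where those attributes come from. Below is a minimal sketch of how they could be wired up as Scrapy spider arguments; the spider class and name are hypothetical, but passing -a key=value pairs into the spider constructor is Scrapy's standard mechanism.

import scrapy

from configuration import ROOT_DIR  # as imported in Example #7


class CitationsSpider(scrapy.Spider):
    # Hypothetical spider class; the name is not taken from the examples above.
    name = "citations"
    ROOT_DIR = ROOT_DIR  # reuse the project-level constant

    def __init__(self, input_dir="", input_file="", *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Scrapy passes -a key=value pairs from the command line into __init__.
        self.input_dir = input_dir
        self.input_file = input_file

# Example invocation (hypothetical file names):
#   scrapy crawl citations -a input_dir=authors_orgID_123 -a input_file=papers-of-authorID-abc.csv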
Example #7
import pandas as pd

from configuration import ROOT_DIR

from utils.mapping import get_id_orgs, get_name_orgs
from utils.tools import get_path

eurecom_id = get_id_orgs(names=['eurecom'])['eurecom']
authors_path = get_path([ROOT_DIR, "data", "authors", 'authors_orgID_{}.csv'.format(eurecom_id)])
authors_df = pd.read_csv(authors_path)
print(authors_df.head(5))
    def parse(self, response):
        # Configure webdriver
        if platform.system() == 'Windows':
            chromedriver_path = get_path(
                [self.ROOT_DIR, "libs", "chromedriver.exe"])
        else:
            chromedriver_path = get_path(
                [self.ROOT_DIR, "libs", "chromedriver"])
        op = webdriver.ChromeOptions()
        op.add_argument('headless')
        self.driver = webdriver.Chrome(executable_path=chromedriver_path,
                                       options=op)
        self.driver.implicitly_wait(60)

        # Extract organization id
        org_id = response.url.split("org=")[1].split("&")[0]

        # List of authors of the organization
        authors = []
        monitor_url = ''

        # Make request
        self.driver.get(response.url)

        # First page of the request
        page_response = TextResponse(url=response.url,
                                     body=self.driver.page_source,
                                     encoding='utf-8')

        self.parse_one_page(page_response, authors)

        # Click 'Next page' and parse until this button is disabled
        while True:
            try:
                # Wait 15s before next click
                time.sleep(15)
                # Find button 'Next' when it is not hidden
                self.driver.find_element_by_xpath(
                    "//div[@id='gsc_authors_bottom_pag']//button[@aria-label='Next'][not(@disabled)]"
                ).click()
                # Create a variable for response from webdriver
                next_page_response = TextResponse(url=self.driver.current_url,
                                                  body=self.driver.page_source,
                                                  encoding='utf-8')
                self.parse_one_page(next_page_response, authors)

                monitor_url = self.driver.current_url
            except:
                self.driver.close()
                break

        # Output
        if len(authors) > 0:
            csv_path = get_path([
                self.ROOT_DIR, "data", "authors",
                "authors_orgID_{}.csv".format(org_id)
            ])
            write_csv(authors, csv_path)

            pkl_path = get_path([
                self.ROOT_DIR, "data", "authors",
                "authors_orgID_{}.pkl".format(org_id)
            ])
            write_pickle(authors, pkl_path)

            monitor_file = get_path([
                self.ROOT_DIR, "data", "monitors", 'crawled_organizations.txt'
            ])
            monitor_crawler(monitor_file, monitor_url)
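parse_one_page is called repeatedly above but is not included in these snippets. A minimal sketch of what it might look like for the organization author-listing pages follows; the XPath class names are assumptions about Google Scholar's markup, and only the AuthorID field is confirmed by the examples (Example #5 reads an AuthorID column from the resulting CSV).

    def parse_one_page(self, page_response, authors):
        # The gsc_1usr / gs_ai_name / gs_ai_aff class names are assumptions.
        for entry in page_response.xpath("//div[contains(@class, 'gsc_1usr')]"):
            name = entry.xpath(".//h3[@class='gs_ai_name']//a/text()").get()
            profile_url = entry.xpath(".//h3[@class='gs_ai_name']//a/@href").get()
            affiliation = entry.xpath(".//div[@class='gs_ai_aff']/text()").get()

            # Profile links look like /citations?hl=en&user=<id>; keep only the id.
            author_id = ''
            if profile_url and 'user=' in profile_url:
                author_id = profile_url.split('user=')[1].split('&')[0]

            authors.append({
                'AuthorID': author_id,
                'Name': name,
                'Affiliation': affiliation,
            })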
Example #9
    def parse(self, response):
        # Configure webdriver
        if platform.system() == 'Windows':
            chromedriver_path = get_path([self.ROOT_DIR, "libs", "chromedriver.exe"])
        else:
            chromedriver_path = get_path([self.ROOT_DIR, "libs", "chromedriver"])
        op = webdriver.ChromeOptions()
        op.add_argument('headless')
        self.driver = webdriver.Chrome(executable_path=chromedriver_path, options=op)
        self.driver.implicitly_wait(60)
        
        # Extract user_id
        user_id = response.url.split("user=")[1].split("&")[0]
        
        # Make request
        self.driver.get(response.url)

        response_after_click = None

        # Click 'Show more' until the button is disabled
        while True:
            try:
                # Wait 15 seconds before next click
                time.sleep(15)
                # Click button 'Show more' while it is not disabled
                self.driver.find_element_by_xpath('//button[@id="gsc_bpf_more"][not(@disabled)]').click()
            except:
                # Get response after clicking completely
                response_after_click = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
                # Close webdriver
                self.driver.close()
                break

        # Create a list of papers for the given user
        papers = []

        # Get all rows in the table from webdriver response, each paper is in a row
        rows = response_after_click.xpath("//table[@id='gsc_a_t']//tr[@class='gsc_a_tr']")

        if len(rows) > 0:
            # In each paper, get title, authors, conference, url of citations, count of citations and year of publication
            for row in rows:
                title = row.xpath(".//td[@class='gsc_a_t']/a/text()").get()
                authors = row.xpath(".//td[@class='gsc_a_t']/div[1]/text()").get()
                conference = row.xpath(".//td[@class='gsc_a_t']/div[2]/text()").get()
                cited_url = row.xpath(".//td[@class='gsc_a_c']/a[@class='gsc_a_ac gs_ibl']/@href").get()
                cited_count = row.xpath(".//td[@class='gsc_a_c']/a/text()").get()
                year = row.xpath(".//td[@class='gsc_a_y']/span/text()").get()

                cited_count = 0 if cited_count is None else int(cited_count)
                year = 0 if year is None else int(year)

                # Create a dictionary for each paper
                paper = {
                    'Title': title,
                    'Authors': authors,
                    'Platform': conference,
                    'Cited_url': cited_url,
                    'Cited_count': cited_count,
                    'Year': year
                }

                papers.append(paper)
            
            # Output
            organization = self.input_file.split('.')[0]
            csv_path = get_path([self.ROOT_DIR, "data", "papers", organization, \
                                "papers-of-authorID-{}.csv".format(user_id)])
            write_csv(papers, csv_path)
            pkl_path = get_path([self.ROOT_DIR, "data", "papers", organization, \
                                "papers-of-authorID-{}.pkl".format(user_id)])
            write_pickle(papers, pkl_path)

            monitor_file = get_path([self.ROOT_DIR, "data", "monitors", 'crawled_{}'.format(self.input_file)])
            monitor_crawler(monitor_file, user_id)
    def parse(self, response):
        # Configure webdriver
        if platform.system() == 'Windows':
            chromedriver_path = get_path(
                [self.ROOT_DIR, "libs", "chromedriver.exe"])
        else:
            chromedriver_path = get_path(
                [self.ROOT_DIR, "libs", "chromedriver"])
        op = webdriver.ChromeOptions()
        op.add_argument('headless')
        self.driver = webdriver.Chrome(executable_path=chromedriver_path,
                                       options=op)
        self.driver.implicitly_wait(60)

        # Get user_id from URL
        user_id = response.url.split("user=")[1].split("&")[0]

        # Make request
        self.driver.get(response.url)

        info_tags = response.xpath(
            "//div[@id='gsc_prf_i']//div[@class='gsc_prf_il']")
        info_texts = info_tags.xpath(".//text()").getall()
        info_links = info_tags.xpath(".//a/@href").getall()

        info_texts_str = " | ".join(info_texts)
        info_links_str = " | ".join(info_links)

        # Get all rows in the table from webdriver response, each paper is in a row
        rows = response.xpath(
            "//table[@id='gsc_rsb_st']//tbody//td[@class='gsc_rsb_std']")

        citations_all = int(rows[0].xpath("text()").get()) if rows[0].xpath(
            "text()").get() is not None else 0
        citations_s2014 = int(rows[1].xpath("text()").get()) if rows[1].xpath(
            "text()").get() is not None else 0
        hindex_all = int(rows[2].xpath("text()").get()) if rows[2].xpath(
            "text()").get() is not None else 0
        hindex_s2014 = int(rows[3].xpath("text()").get()) if rows[3].xpath(
            "text()").get() is not None else 0
        i10index_all = int(rows[4].xpath("text()").get()) if rows[4].xpath(
            "text()").get() is not None else 0
        i10index_s2014 = int(rows[5].xpath("text()").get()) if rows[5].xpath(
            "text()").get() is not None else 0

        coauthor_ids = ''
        coauthor_names = ''
        try:
            self.driver.find_element_by_xpath(
                "//button[@id='gsc_coauth_opn']").click()
            coauthors_response = TextResponse(url=self.driver.current_url,
                                              body=self.driver.page_source,
                                              encoding='utf-8')
            (coauthor_ids,
             coauthor_names) = self.parse_coauthors(coauthors_response)
        except:
            pass

        info = {
            "AuthorID": user_id,
            "CitationsAll": citations_all,
            "CitationsS2014": citations_s2014,
            "hIndexAll": hindex_all,
            "hIndexS2014": hindex_s2014,
            "i10IndexAll": i10index_all,
            "i10IndexS2014": i10index_s2014,
            "CoAuthorIDs": coauthor_ids,
            "CoAuthorNames": coauthor_names,
            "Description": info_texts_str,
            "DescriptionURLs": info_links_str
        }

        # Output
        csv_path = get_path([
            self.ROOT_DIR, "data", "info",
            "info-{}.csv".format(self.author_file.split(".")[0])
        ])
        with open(csv_path, "a") as f:
            f.write("\n%s, %d, %d, %d, %d, %d, %d, %s, %s, %s, %s" % \
                    (user_id,citations_all,citations_s2014,\
                    hindex_all,hindex_s2014,i10index_all,i10index_s2014, \
                    coauthor_ids, coauthor_names, info_texts_str,info_links_str,))
            f.close()

        monitor_file = get_path([
            self.ROOT_DIR, "data", "monitors",
            'crawled_info_{}'.format(self.author_file)
        ])
        monitor_crawler(monitor_file, user_id)

        self.driver.close()

        time.sleep(30)
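parse_coauthors is referenced in the last two examples but not defined in them. Below is a minimal sketch under the assumption that it collects co-author profile links from the co-author dialog and returns two ' | '-joined strings matching how they are written to CSV; the selector is a guess about Google Scholar's markup, not taken from the original code.

    def parse_coauthors(self, coauthors_response):
        # Assumed behaviour: return (ids, names) as ' | '-joined strings. The
        # XPath below scans the whole page; a real implementation would scope
        # it to the co-author dialog element.
        ids, names = [], []
        for link in coauthors_response.xpath("//a[contains(@href, 'user=')]"):
            href = link.xpath("@href").get() or ''
            name = link.xpath("text()").get()
            if name and 'user=' in href:
                ids.append(href.split('user=')[1].split('&')[0])
                names.append(name)
        return ' | '.join(ids), ' | '.join(names)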