コード例 #1
0
    def parse_ARXIV(self, response):
        """
        Locate the PDF link on an ArXiv page, report it through print_url and, unless the spider is in print-only
        mode, hand back a download item for it

        :param response: Response object containing the ArXiv page dedicated to the BKC paper; response.meta['title']
                         is read when a download item is produced
        :return: self.files_to_download with 'file_urls' and 'title' filled in, or None when print_only is set or no
                 link was found on the page
        """
        # The first href under the '.full-text' element is taken as the PDF link
        href = response.css('.full-text a::attr(href)').extract_first()

        # The link is always reported, even when it is None
        print_url(self, response, href, self.name.upper())

        if self.print_only or href is None:
            return None

        # NOTE(review): the item is the dict stored on the spider, not a fresh copy - confirm this sharing is intended
        item = self.files_to_download
        item['file_urls'] = [response.urljoin(href)]
        item['title'] = response.meta['title']
        return item
コード例 #2
0
    def parse_SSRN(self, response):
        """
        Locate the download link on an SSRN (Social Science Research Network) page, report it through print_url and,
        unless the spider is in print-only mode, hand back a download item for it

        :param response: Response object containing the SSRN page dedicated to the BKC paper; response.meta['title']
                         is read when a download item is produced
        :return: self.files_to_download with 'file_urls' and 'title' filled in, or None when print_only is set or no
                 link was found on the page
        """
        # The href of the first '.download-button' element is taken as the paper link
        href = response.css('.download-button::attr(href)').extract_first()

        # The link is always reported, even when it is None
        print_url(self, response, href, self.name.upper())

        if self.print_only or href is None:
            return None

        # The absolute URL gets '&download=yes' appended
        # (presumably this makes SSRN serve the file itself - TODO confirm)
        download_url = response.urljoin(href) + '&download=yes'

        # NOTE(review): the item is the dict stored on the spider, not a fresh copy - confirm this sharing is intended
        item = self.files_to_download
        item['file_urls'] = [download_url]
        item['title'] = response.meta['title']
        return item
コード例 #3
0
    def parse_bkc(self, response):
        """
        Inspect the page dedicated to a single publication on the BKC website and decide how to reach its pdf: the
        navigation links are scanned in page order and the first one that mentions a known repository (SSRN, DASH,
        ArXiv) or contains '.pdf' is used

        :param response: Response object containing the BKC page dedicated to a single publication
        :return: in testing mode, True/False telling whether a usable link was found; otherwise a Request to the
                 repository page (carrying the title in its meta), a download item for a direct pdf link, or None
                 when nothing has to be downloaded
        """
        # (substring looked for in the link, name of the spider method handling that repository);
        # a handler name of None marks a direct pdf link that needs no further crawling
        routes = (
            ('ssrn', 'parse_SSRN'),
            ('dash', 'parse_DASH'),
            ('arxiv', 'parse_ARXIV'),
            ('.pdf', None),
        )

        meta = {'title': response.css('meta[name=title]::attr(content)').extract_first()}
        hrefs = response.css('.c-detail__nav a::attr(href)').extract()

        target = None
        callback = None
        if hrefs is not None:
            for href in hrefs:
                # Markers are tried in the order of the table; the first one contained in the link wins
                route = next((entry for entry in routes if entry[0] in href), None)
                if route is None:
                    continue
                target = href
                if route[1] is not None:
                    # Resolved lazily so that only the handler actually needed is looked up
                    callback = getattr(self, route[1])
                break
        matched = target is not None

        # Testing mode only reports whether a link was recognised
        if self.testing:
            return matched

        if not matched:
            print_url(self, response, None, self.name.upper())
            return None

        if callback is not None:
            # NOTE(review): an empty entry keyed by the link is added to the shared dict - purpose not visible here
            self.files_to_download[target] = dict()
            return response.follow(target.replace('&download=yes', ''), callback, meta=meta)

        # Direct pdf link: report it and, unless printing only, start a fresh download item
        print_url(self, response, target, self.name.upper())
        if self.print_only:
            return None

        self.files_to_download = dict()
        self.files_to_download['file_urls'] = [response.urljoin(target)]
        self.files_to_download['title'] = meta['title']
        return self.files_to_download
コード例 #4
0
    def parse_DASH(self, response):
        """
        Locate the download link on a DASH (Digital Access to Scholarship at Harvard) page, report it through
        print_url and, unless the spider is in print-only mode, hand back a download item for it

        :param response: Response object containing the DASH page dedicated to the BKC paper; response.meta['title']
                         is read when a download item is produced
        :return: self.files_to_download with 'file_urls' and 'title' filled in, or None when print_only is set or no
                 link was found on the page
        """
        # The first href under the '.dash-item-download' element is taken as the PDF link
        href = response.css('.dash-item-download a::attr(href)').extract_first()

        # The link is always reported, even when it is None
        print_url(self, response, href, self.name.upper())

        if self.print_only or href is None:
            return None

        # NOTE(review): the item is the dict stored on the spider, not a fresh copy - confirm this sharing is intended
        item = self.files_to_download
        item['file_urls'] = [response.urljoin(href)]
        item['title'] = response.meta['title']
        return item
コード例 #5
0
    def parse_isp(self, response):
        """
        Go through every link ending in '.pdf' on the page dedicated to a single publication on the ISP website,
        report each one through print_url and yield a download item for it

        :param response: Response object containing the ISP page dedicated to a single publication
        :return: generator; in testing mode it yields one Response per pdf link (built from the raw href), otherwise
                 one {"file_urls": [...]} item per link unless print_only is set
        """
        hrefs = response.css('a[href$=".pdf"]::attr(href)').extract()

        for href in hrefs:
            # Testing mode skips reporting and downloading and only exposes the link that was found
            if self.testing:
                yield Response(url=href)
                continue

            print_url(self, response, href, self.name.upper())

            if self.print_only or href is None:
                continue
            yield {"file_urls": [response.urljoin(href)]}
コード例 #6
0
    def parse_inc(self, response):
        """
        Takes the page dedicated to a single publication on the INC website, gets the pdf URL (if present on the page)
        and calls the function print_url

        The first link ending in '.pdf' is preferred; when there is none, the href of the first '.pwk-link' element
        is used instead.

        :param response: Response object containing the INC page dedicated to a single publication
        :return: in testing mode, True/False telling whether a link was found; otherwise a {"file_urls": [...]} item
                 for the link, or None when print_only is set or no link was found
        """
        pdf_file = response.css('a[href$=".pdf"]::attr(href)').extract_first()

        if not pdf_file:
            # No direct pdf link on the page: fall back to the '.pwk-link' element
            pdf_file = response.css('.pwk-link::attr(href)').extract_first()

        # Testing mode only reports whether a link was found; nothing is printed or downloaded
        if self.testing:
            return pdf_file is not None

        print_url(self, response, pdf_file, self.name.upper())

        # Scrapy accepts only requests, items or None from a callback, so the "nothing to download" case
        # returns None here rather than falling through to a bool.
        if self.print_only or pdf_file is None:
            return None
        return {"file_urls": [response.urljoin(pdf_file)]}