def parse_ARXIV(self, response):
    """
    Takes the ArXiv web page, gets the .pdf URL and reports it through print_url.

    Unless the spider runs in print-only mode, the absolute .pdf URL and the title
    carried in ``response.meta['title']`` are also recorded in ``self.files_to_download``
    and emitted as an item.

    :param response: Response object containing the ArXiv page dedicated to the BKC paper
    :return: a dict copy of ``self.files_to_download`` (with 'file_urls' and 'title' set)
             when a .pdf link was found and print-only mode is off, otherwise None
    """
    pdf_css = '.full-text a::attr(href)'
    pdf_file = response.css(pdf_css).extract_first()
    print_url(self, response, pdf_file, self.name.upper())

    if self.print_only or pdf_file is None:
        return None

    self.files_to_download['file_urls'] = [response.urljoin(pdf_file)]
    self.files_to_download['title'] = response.meta['title']
    # The spider-wide dict is overwritten by every later response, so hand out a
    # shallow copy: returning the dict itself would make all emitted items the very
    # same object, and a later page would silently rewrite items already returned.
    return dict(self.files_to_download)
def parse_SSRN(self, response):  # Parse the SSRN page to obtain the paper link
    """
    Takes the SSRN (Social Science Research Network) web page, gets the .pdf URL and
    reports it through print_url.

    Unless the spider runs in print-only mode, the absolute .pdf URL (with
    '&download=yes' appended) and the title carried in ``response.meta['title']`` are
    also recorded in ``self.files_to_download`` and emitted as an item.

    :param response: Response object containing the SSRN (Social Science Research Network)
                     page dedicated to the BKC paper
    :return: a dict copy of ``self.files_to_download`` (with 'file_urls' and 'title' set)
             when a .pdf link was found and print-only mode is off, otherwise None
    """
    pdf_css = '.download-button::attr(href)'
    pdf_file = response.css(pdf_css).extract_first()
    print_url(self, response, pdf_file, self.name.upper())

    if self.print_only or pdf_file is None:
        return None

    self.files_to_download['file_urls'] = [response.urljoin(pdf_file) + '&download=yes']
    self.files_to_download['title'] = response.meta['title']
    # The spider-wide dict is overwritten by every later response, so hand out a
    # shallow copy: returning the dict itself would make all emitted items the very
    # same object, and a later page would silently rewrite items already returned.
    return dict(self.files_to_download)
def parse_bkc(self, response):  # Parse a BKC paper page
    """
    Takes the page dedicated to a single publication on the BKC website and either
    continues crawling to a publications repository or downloads the pdf (if present
    on the page itself).

    The navigation links are scanned in page order; the first link containing 'ssrn',
    'dash', 'arxiv' or '.pdf' (checked in that order for each link) decides what happens.

    :param response: Response object containing the BKC page dedicated to a single publication
    :return: - in testing mode: True if a usable link was found, False otherwise
             - for a repository link: the Request produced by ``response.follow`` for the
               publications repository page dedicated to the current publication
             - for a direct .pdf link (print-only mode off): a dict with 'file_urls' and 'title'
             - otherwise None
    """
    website_css = '.c-detail__nav a::attr(href)'
    title_css = 'meta[name=title]::attr(content)'

    meta = {'title': response.css(title_css).extract_first()}
    links = response.css(website_css).extract()

    # URL marker -> name of the callback that handles that repository. The callback is
    # looked up on the spider only once a marker actually matches.
    repositories = (
        ('ssrn', 'parse_SSRN'),
        ('dash', 'parse_DASH'),
        ('arxiv', 'parse_ARXIV'),
    )

    found = False
    parser = None
    link = None
    for link in links or ():
        callback_name = next(
            (name for marker, name in repositories if marker in link), None)
        if callback_name is not None:
            parser = getattr(self, callback_name)
            found = True
            break
        if '.pdf' in link:
            # Direct download: no repository page to visit.
            found = True
            break

    if self.testing:
        return found

    if not found:
        print_url(self, response, None, self.name.upper())
        return None

    if parser is not None:
        # NOTE(review): this adds an empty entry keyed by the link to the shared dict;
        # confirm that the key is meant to end up in the items built from that dict.
        self.files_to_download[link] = dict()
        return response.follow(link.replace('&download=yes', ''), parser, meta=meta)

    print_url(self, response, link, self.name.upper())
    if self.print_only:
        return None

    self.files_to_download = {
        'file_urls': [response.urljoin(link)],
        'title': meta['title'],
    }
    # Emit a shallow copy: the dict kept on the spider is mutated again while later
    # pages are processed, which would otherwise alter the item returned here.
    return dict(self.files_to_download)
def parse_DASH(self, response):  # Parse the DASH page to obtain the paper link
    """
    Takes the DASH (Digital Access to Scholarship at Harvard) web page, gets the .pdf URL
    and reports it through print_url.

    Unless the spider runs in print-only mode, the absolute .pdf URL and the title
    carried in ``response.meta['title']`` are also recorded in ``self.files_to_download``
    and emitted as an item.

    :param response: Response object containing the DASH (Digital Access to Scholarship at
                     Harvard) page dedicated to the BKC paper
    :return: a dict copy of ``self.files_to_download`` (with 'file_urls' and 'title' set)
             when a .pdf link was found and print-only mode is off, otherwise None
    """
    pdf_css = '.dash-item-download a::attr(href)'
    pdf_file = response.css(pdf_css).extract_first()
    print_url(self, response, pdf_file, self.name.upper())

    if self.print_only or pdf_file is None:
        return None

    self.files_to_download['file_urls'] = [response.urljoin(pdf_file)]
    self.files_to_download['title'] = response.meta['title']
    # The spider-wide dict is overwritten by every later response, so hand out a
    # shallow copy: returning the dict itself would make all emitted items the very
    # same object, and a later page would silently rewrite items already returned.
    return dict(self.files_to_download)
def parse_isp(self, response):
    """
    Handle the page dedicated to a single publication on the ISP website.

    Every anchor whose href ends in ".pdf" is processed in page order. In testing mode
    a Response built from the raw href is yielded for each link; otherwise the link is
    reported through print_url and, unless print-only mode is on, an item carrying the
    absolute URL is yielded.

    :param response: Response object containing the ISP page dedicated to a single publication
    :return: generator of Response objects (testing mode) or of {"file_urls": [...]} dicts
    """
    hrefs = response.css('a[href$=".pdf"]::attr(href)').extract()
    for href in hrefs:
        if self.testing:
            yield Response(url=href)
            continue

        print_url(self, response, href, self.name.upper())
        if self.print_only or href is None:
            # Nothing to download for this link.
            continue
        yield {"file_urls": [response.urljoin(href)]}
def parse_inc(self, response):
    """
    Handle the page dedicated to a single publication on the INC website.

    The first anchor whose href ends in ".pdf" is used; when there is none (or its href
    is empty) the first '.pwk-link' href is tried instead. Outside testing mode the
    result is reported through print_url.

    :param response: Response object containing the INC page dedicated to a single publication
    :return: {"file_urls": [absolute_url]} when a link was found and neither testing nor
             print-only mode is active; otherwise True if a link was found, else False
    """
    href = response.css('a[href$=".pdf"]::attr(href)').extract_first()  # Parse paper page
    if not href:
        # Fall back to the alternative link markup.
        href = response.css('.pwk-link::attr(href)').extract_first()

    has_link = href is not None
    if self.testing:
        return has_link

    print_url(self, response, href, self.name.upper())
    if not self.print_only and has_link:
        return {"file_urls": [response.urljoin(href)]}
    return has_link