def _get_api_result(search: Search, start_record: Optional[int] = 0) -> dict:  # pragma: no cover
    """
    This method returns results from the arXiv database using the provided search parameters

    Parameters
    ----------
    search : Search
        A search instance
    start_record : Optional[int]
        Sequence number of the first record to fetch, by default 0

    Returns
    -------
    dict
        A result from the arXiv database
    """

    url = _get_search_url(search, start_record)

    return common_util.try_success(
        lambda: xmltodict.parse(DefaultSession().get(url).content), 2, pre_delay=1)
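
# Illustrative sketch (not part of the original module): shows how the dict returned
# above is typically navigated. xmltodict maps arXiv's Atom response to nested dicts,
# so the 'feed'/'entry' keys below are assumptions about that layout.
def _example_iterate_arxiv_entries(search: Search) -> list:
    results = _get_api_result(search, start_record=0)
    entries = results.get('feed', {}).get('entry', [])
    # arXiv returns a single dict (not a list) when the feed has only one entry
    if isinstance(entries, dict):
        entries = [entries]
    return entries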
def _get_paper_metadata(doi: str) -> dict:  # pragma: no cover
    """
    Get a paper's metadata from a provided DOI

    Parameters
    ----------
    doi : str
        The paper DOI

    Returns
    -------
    dict
        The ACM paper metadata, or None if there's no metadata available
    """

    form = {
        'dois': doi,
        'targetFile': 'custom-bibtex',
        'format': 'bibTex'
    }

    response = common_util.try_success(lambda: DefaultSession().post(
        f'{BASE_URL}/action/exportCiteProcCitation', data=form).json(), 2)

    if response is not None and response.get('items', None) is not None and len(response.get('items')) > 0:
        return response['items'][0][doi]
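
# Illustrative sketch (not part of the original module): the dict returned by
# _get_paper_metadata is the CiteProc (CSL-JSON) item exported by ACM, so common
# fields can be read as below. The 'title' key is an assumption about that format.
def _example_read_acm_title(doi: str):
    metadata = _get_paper_metadata(doi)
    if metadata is None:
        return None
    return metadata.get('title')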
def _get_publication_entry(publication_issn: str, api_token: str) -> dict:  # pragma: no cover
    """
    Get publication entry by publication ISSN

    Parameters
    ----------
    publication_issn : str
        A publication ISSN
    api_token : str
        A Scopus API token

    Returns
    -------
    dict (or None)
        publication entry in dict format, or None if the API doesn't return a valid entry
    """

    url = f'{BASE_URL}/content/serial/title/issn/{publication_issn}?apiKey={api_token}'
    headers = {'Accept': 'application/json'}

    response = common_util.try_success(
        lambda: DefaultSession().get(url, headers=headers).json().get(
            'serial-metadata-response', None), 2)

    if response is not None and 'entry' in response and len(response.get('entry')) > 0:
        return response.get('entry')[0]
def _get_search_results(search: Search, api_token: str, url: Optional[str] = None) -> dict:  # pragma: no cover
    """
    This method fetches papers from the Scopus database using the provided search parameters

    Parameters
    ----------
    search : Search
        A search instance
    api_token : str
        The API key used to fetch data from the Scopus database
    url : Optional[str]
        A predefined URL to be used for the search execution,
        this is usually used to make the next recursive call on a result pagination

    Returns
    -------
    dict
        A result from the Scopus database
    """

    # if url is not None, this is probably a recursive call to the next URL of a pagination
    if url is None:
        query = _get_query(search)
        url = f'{BASE_URL}/content/search/scopus?&sort=coverDate&apiKey={api_token}&query={query}'

    headers = {'Accept': 'application/json'}

    return common_util.try_success(
        lambda: DefaultSession().get(url, headers=headers).json()[
            'search-results'], 2)
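
# Illustrative sketch (not part of the original module): Scopus paginates its
# results, and a 'next' link inside the 'search-results' payload usually points to
# the following page. The '@ref'/'@href' keys are assumptions about that payload,
# shown only to illustrate how the url parameter above would be fed back in.
def _example_get_next_page_url(search_results: dict):
    for link in search_results.get('link', []):
        if link.get('@ref') == 'next':
            return link.get('@href')
    return None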
def _get_paper_page(url: str) -> html.HtmlElement:  # pragma: no cover
    """
    Get a paper page element from a provided URL

    Parameters
    ----------
    url : str
        The paper URL

    Returns
    -------
    html.HtmlElement
        An HTML element representing the paper given by the provided URL
    """

    response = common_util.try_success(lambda: DefaultSession().get(url), 2)
    return html.fromstring(response.content)
def _get_result(url: str) -> html.HtmlElement:  # pragma: no cover
    """
    This method returns results from the medRxiv/bioRxiv database using the provided search parameters

    Parameters
    ----------
    url : str
        A URL to search for results

    Returns
    -------
    html.HtmlElement
        A page from the medRxiv/bioRxiv database
    """

    response = common_util.try_success(lambda: DefaultSession().get(url), 2)
    return html.fromstring(response.content)
def _get_paper_entry(pubmed_id: str) -> dict:  # pragma: no cover
    """
    This method returns paper data from the PubMed database using the provided PubMed ID

    Parameters
    ----------
    pubmed_id : str
        A PubMed ID

    Returns
    -------
    dict
        A paper entry from the PubMed database
    """

    url = f'{BASE_URL}/entrez/eutils/efetch.fcgi?db=pubmed&id={pubmed_id}&rettype=abstract'

    return common_util.try_success(lambda: xmltodict.parse(DefaultSession().get(url).content), 2, pre_delay=1)
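
# Illustrative sketch (not part of the original module): the efetch XML parsed by
# xmltodict becomes a nested dict; the PubmedArticleSet/PubmedArticle/MedlineCitation
# path below reflects the usual efetch layout and is an assumption here.
def _example_get_pubmed_title(pubmed_id: str):
    entry = _get_paper_entry(pubmed_id)
    article = entry['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['Article']
    return article.get('ArticleTitle')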
def _get_result(search: Search, start_record: Optional[int] = 0) -> html.HtmlElement:  # pragma: no cover
    """
    This method returns results from the ACM database using the provided search parameters

    Parameters
    ----------
    search : Search
        A search instance
    start_record : Optional[int]
        Sequence number of the first record to fetch, by default 0

    Returns
    -------
    html.HtmlElement
        A result page from the ACM database
    """

    url = _get_search_url(search, start_record)

    response = common_util.try_success(lambda: DefaultSession().get(url), 2)
    return html.fromstring(response.content)
def _get_api_result(search: Search, api_token: str, start_record: Optional[int] = 1) -> dict:  # pragma: no cover
    """
    This method returns results from the IEEE database using the provided search parameters

    Parameters
    ----------
    search : Search
        A search instance
    api_token : str
        The API key used to fetch data from the IEEE database
    start_record : Optional[int]
        Sequence number of the first record to fetch, by default 1

    Returns
    -------
    dict
        A result from the IEEE database
    """

    url = _get_search_url(search, api_token, start_record)

    return common_util.try_success(lambda: DefaultSession().get(url).json(), 2)
def _get_paper_metadata(doi: str, database: str) -> dict:  # pragma: no cover
    """
    Get a paper's metadata from a provided DOI

    Parameters
    ----------
    doi : str
        The paper DOI
    database : str
        The database name (medRxiv or bioRxiv)

    Returns
    -------
    dict
        The medRxiv/bioRxiv paper metadata, or None if there's no metadata available
    """

    url = f'{API_BASE_URL}/details/{database.lower()}/{doi}'
    response = common_util.try_success(lambda: DefaultSession().get(url).json(), 2)

    if response is not None and response.get('collection', None) is not None and len(response.get('collection')) > 0:
        return response.get('collection')[0]
def _flag_potentially_predatory_publications(search: Search):
    """
    Flag all the potentially predatory publications

    Parameters
    ----------
    search : Search
        A search instance
    """

    for i, paper in enumerate(search.papers):
        logging.info(f'({i+1}/{len(search.papers)}) Checking paper: {paper.title}')

        try:
            if paper.publication is not None:
                publication_name = paper.publication.title.lower()
                publisher_name = paper.publication.publisher.lower() if paper.publication.publisher is not None else None
                publisher_host = None

                if paper.doi is not None:
                    url = f'http://doi.org/{paper.doi}'
                    response = common_util.try_success(lambda url=url: DefaultSession().get(url), 2)

                    if response is not None:
                        publisher_host = urlparse(response.url).netloc.replace("www.", "")

                if publication_name in publication_util.POTENTIAL_PREDATORY_JOURNALS_NAMES \
                        or publisher_name in publication_util.POTENTIAL_PREDATORY_PUBLISHERS_NAMES \
                        or publisher_host in publication_util.POTENTIAL_PREDATORY_PUBLISHERS_HOSTS:
                    paper.publication.is_potentially_predatory = True

        except Exception:
            pass
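
# Illustrative sketch (not part of the original module): runs the predatory check
# over a search previously saved to disk, using the same persistence_util.load
# helper that the download function below relies on. The file path is a placeholder.
def _example_flag_saved_search(search_path: str):
    search = persistence_util.load(search_path)
    _flag_potentially_predatory_publications(search)
    return [paper.title for paper in search.papers
            if paper.publication is not None and paper.publication.is_potentially_predatory]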
def download(search_path: str, output_directory: str, only_selected_papers: Optional[bool] = False, categories_filter: Optional[dict] = None, proxy: Optional[str] = None):
    """
    If you've done your search (and probably refined it too) and want to download the papers,
    this is the method you need to call. It will try to download the PDF version of the papers
    to the output directory path.

    We use some heuristics to do our job, but sometimes they won't work properly and we won't
    be able to download a paper. Downloads and failures are logged in a download.log file placed
    in the output directory, so you can check the log to find which papers could not be downloaded
    and try to get them manually later.

    Note: Some papers are behind a paywall and can't be downloaded by this method. However, if the
    institution where you study or work provides a proxy that lets you get past this paywall, you can
    use that proxy configuration here by setting the environment variables FINDPAPERS_HTTP_PROXY
    and FINDPAPERS_HTTPS_PROXY.

    Parameters
    ----------
    search_path : str
        A valid file path containing a JSON representation of the search results
    output_directory : str
        A valid file path of the directory where the downloaded papers will be placed
    only_selected_papers : bool, False by default
        If True, only the selected papers will be downloaded
    categories_filter : dict, None by default
        A dict of categories to be used to filter which papers will be downloaded
    proxy : Optional[str], optional
        Proxy URL that can be used during requests. This can also be defined by the
        environment variable FINDPAPERS_PROXY. By default None
    """

    if proxy is not None:
        os.environ['FINDPAPERS_PROXY'] = proxy

    search = persistence_util.load(search_path)

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    log_filepath = os.path.join(output_directory, 'download.log')

    common_util.check_write_access(log_filepath)

    with open(log_filepath, 'a' if os.path.exists(log_filepath) else 'w') as fp:
        now = datetime.datetime.now()
        fp.write(f"------- A new download process started at: {datetime.datetime.strftime(now, '%Y-%m-%d %H:%M:%S')} \n")

    for i, paper in enumerate(search.papers):
        logging.info(f'({i+1}/{len(search.papers)}) {paper.title}')

        if (only_selected_papers and not paper.selected) or \
                (categories_filter is not None and (paper.categories is None or not paper.has_category_match(categories_filter))):
            continue

        downloaded = False
        output_filename = f'{paper.publication_date.year}-{paper.title}'
        output_filename = re.sub(r'[^\w\d-]', '_', output_filename)  # sanitize filename
        output_filename += '.pdf'
        output_filepath = os.path.join(output_directory, output_filename)

        if os.path.exists(output_filepath):  # PDF already collected
            logging.info(f'Paper\'s PDF file has already been collected')
            continue

        if paper.doi is not None:
            paper.urls.add(f'http://doi.org/{paper.doi}')

        for url in paper.urls:  # we'll try to download the PDF file of the paper by its URLs
            try:
                logging.info(f'Fetching data from: {url}')

                response = common_util.try_success(
                    lambda url=url: DefaultSession().get(url), 2)

                if response is None:
                    continue

                if 'text/html' in response.headers.get('content-type').lower():

                    response_url = urllib.parse.urlsplit(response.url)
                    response_query_string = urllib.parse.parse_qs(
                        urllib.parse.urlparse(response.url).query)
                    response_url_path = response_url.path
                    host_url = f'{response_url.scheme}://{response_url.hostname}'
                    pdf_url = None

                    if response_url_path.endswith('/'):
                        response_url_path = response_url_path[:-1]
                    response_url_path = response_url_path.split('?')[0]

                    if host_url in ['https://dl.acm.org']:

                        doi = paper.doi
                        if doi is None and response_url_path.startswith('/doi/') and '/doi/pdf/' not in response_url_path:
                            doi = response_url_path[4:]
                        elif doi is None:
                            continue

                        pdf_url = f'https://dl.acm.org/doi/pdf/{doi}'

                    elif host_url in ['https://ieeexplore.ieee.org']:

                        if response_url_path.startswith('/document/'):
                            document_id = response_url_path[10:]
                        elif response_query_string.get('arnumber', None) is not None:
                            document_id = response_query_string.get('arnumber')[0]
                        else:
                            continue

                        pdf_url = f'{host_url}/stampPDF/getPDF.jsp?tp=&arnumber={document_id}'

                    elif host_url in ['https://www.sciencedirect.com', 'https://linkinghub.elsevier.com']:

                        paper_id = response_url_path.split('/')[-1]
                        pdf_url = f'https://www.sciencedirect.com/science/article/pii/{paper_id}/pdfft?isDTMRedir=true&download=true'

                    elif host_url in ['https://pubs.rsc.org']:

                        pdf_url = response.url.replace('/articlelanding/', '/articlepdf/')

                    elif host_url in ['https://www.tandfonline.com', 'https://www.frontiersin.org']:

                        pdf_url = response.url.replace('/full', '/pdf')

                    elif host_url in ['https://pubs.acs.org', 'https://journals.sagepub.com', 'https://royalsocietypublishing.org']:

                        pdf_url = response.url.replace('/doi', '/doi/pdf')

                    elif host_url in ['https://link.springer.com']:

                        pdf_url = response.url.replace('/article/', '/content/pdf/').replace('%2F', '/') + '.pdf'

                    elif host_url in ['https://www.isca-speech.org']:

                        pdf_url = response.url.replace('/abstracts/', '/pdfs/').replace('.html', '.pdf')

                    elif host_url in ['https://onlinelibrary.wiley.com']:

                        pdf_url = response.url.replace('/full/', '/pdfdirect/').replace('/abs/', '/pdfdirect/')

                    elif host_url in ['https://www.jmir.org', 'https://www.mdpi.com']:

                        pdf_url = response.url + '/pdf'

                    elif host_url in ['https://www.pnas.org']:

                        pdf_url = response.url.replace('/content/', '/content/pnas/') + '.full.pdf'

                    elif host_url in ['https://www.jneurosci.org']:

                        pdf_url = response.url.replace('/content/', '/content/jneuro/') + '.full.pdf'

                    elif host_url in ['https://www.ijcai.org']:

                        paper_id = response.url.split('/')[-1].zfill(4)
                        pdf_url = '/'.join(response.url.split('/')[:-1]) + '/' + paper_id + '.pdf'

                    elif host_url in ['https://asmp-eurasipjournals.springeropen.com']:

                        pdf_url = response.url.replace('/articles/', '/track/pdf/')

                    if pdf_url is not None:
                        response = common_util.try_success(
                            lambda url=pdf_url: DefaultSession().get(url), 2)

                if 'application/pdf' in response.headers.get('content-type').lower():
                    with open(output_filepath, 'wb') as fp:
                        fp.write(response.content)
                    downloaded = True
                    break

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)

        if downloaded:
            with open(log_filepath, 'a') as fp:
                fp.write(f'[DOWNLOADED] {paper.title}\n')
        else:
            with open(log_filepath, 'a') as fp:
                fp.write(f'[FAILED] {paper.title}\n')
                if len(paper.urls) == 0:
                    fp.write(f'Empty URL list\n')
                else:
                    for url in paper.urls:
                        fp.write(f'{url}\n')
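
# Illustrative usage (not part of the original module); the paths below are
# placeholders for a saved search file and an output directory:
#
#   download('/some/path/search.json', '/some/path/papers', only_selected_papers=True)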