Example #1
def _get_api_result(
        search: Search,
        start_record: Optional[int] = 0) -> dict:  # pragma: no cover
    """
    This method returns results from the arXiv database using the provided search parameters

    Parameters
    ----------
    search : Search
        A search instance
    start_record : Optional[int]
        Sequence number of the first record to fetch, by default 0

    Returns
    -------
    dict
        A result from the arXiv database
    """

    url = _get_search_url(search, start_record)

    return common_util.try_success(
        lambda: xmltodict.parse(DefaultSession().get(url).content),
        2,
        pre_delay=1)
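
All of the helpers in these examples lean on common_util.try_success to retry flaky network calls. Its implementation isn't shown on this page; the sketch below is a minimal assumed version whose semantics match the call sites above (run the operation up to a given number of attempts, optionally sleeping before each try, and return None if every attempt fails).

import time
from typing import Any, Callable, Optional


def try_success(operation: Callable[[], Any], attempts: int,
                pre_delay: float = 0) -> Optional[Any]:
    # Hypothetical re-implementation for illustration only; the real
    # common_util.try_success may add logging, backoff, etc.
    for _ in range(attempts):
        try:
            time.sleep(pre_delay)  # optional pause before each attempt
            return operation()
        except Exception:
            continue  # swallow the error and retry
    return None  # every attempt failed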
Example #2
def _get_paper_metadata(doi: str) -> Optional[dict]:  # pragma: no cover
    """
    Get paper metadata for a provided DOI

    Parameters
    ----------
    doi : str
        The paper DOI

    Returns
    -------
    dict (or None)
        The ACM paper metadata, or None if no metadata is available
    """

    form = {
        'dois': doi,
        'targetFile': 'custom-bibtex',
        'format': 'bibTex'
    }

    response = common_util.try_success(lambda: DefaultSession().post(
        f'{BASE_URL}/action/exportCiteProcCitation', data=form).json(), 2)

    if response is not None and response.get('items'):
        return response['items'][0][doi]
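
A minimal usage sketch for the helper above; the DOI is a placeholder, and the 'title' field name is an assumption about the CSL-JSON-style payload the ACM export endpoint returns.

# Hypothetical usage; the DOI below is a placeholder, not a real ACM paper.
metadata = _get_paper_metadata('10.1145/0000000.0000000')
if metadata is not None:
    print(metadata.get('title'))  # assumed field name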
Example #3
def _get_publication_entry(publication_issn: str,
                           api_token: str) -> Optional[dict]:  # pragma: no cover
    """
    Get publication entry by publication ISSN

    Parameters
    ----------
    publication_issn : str
        A publication ISSN
    api_token : str
        A Scopus API token

    Returns
    -------
    dict (or None)
        The publication entry as a dict, or None if the API doesn't return a valid entry
    """

    url = f'{BASE_URL}/content/serial/title/issn/{publication_issn}?apiKey={api_token}'
    headers = {'Accept': 'application/json'}
    response = common_util.try_success(
        lambda: DefaultSession().get(url, headers=headers).json().get(
            'serial-metadata-response', None), 2)

    if response is not None and response.get('entry'):
        return response['entry'][0]
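
A usage sketch; the ISSN and token are placeholders, and the 'dc:title'/'dc:publisher' field names are assumptions based on typical Scopus serial-metadata responses.

# Hypothetical usage; ISSN and token are placeholders.
entry = _get_publication_entry('1234-5678', api_token='MY_SCOPUS_API_TOKEN')
if entry is not None:
    print(entry.get('dc:title'), entry.get('dc:publisher'))  # assumed field names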
Example #4
def _get_search_results(search: Search,
                        api_token: str,
                        url: Optional[str] = None) -> dict:  # pragma: no cover
    """
    This method fetch papers from Scopus database using the provided search parameters

    Parameters
    ----------
    search : Search
        A search instance
    api_token : str
        The API key used to fetch data from Scopus database,
    url : Optional[str]
        A predefined URL to be used for the search execution, 
        this is usually used for make the next recursive call on a result pagination
    """

    # if url is not None, this is probably a recursive call to the next page of a paginated result
    if url is None:
        query = _get_query(search)
        url = f'{BASE_URL}/content/search/scopus?&sort=coverDate&apiKey={api_token}&query={query}'

    headers = {'Accept': 'application/json'}

    return common_util.try_success(
        lambda: DefaultSession().get(url, headers=headers).json()[
            'search-results'], 2)
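
Given a Search instance, pagination might be driven like the sketch below; the 'entry', 'link', '@ref', and '@href' keys follow typical Scopus search-results payloads but should be treated as assumptions here.

# Hypothetical pagination loop over Scopus results; token is a placeholder.
results = _get_search_results(search, api_token='MY_SCOPUS_API_TOKEN')
while results is not None:
    for entry in results.get('entry', []):
        print(entry.get('dc:title'))  # assumed field name
    # follow the 'next' link if the API provides one (assumed response shape)
    next_url = next((link.get('@href') for link in results.get('link', [])
                     if link.get('@ref') == 'next'), None)
    if next_url is None:
        break
    results = _get_search_results(search, 'MY_SCOPUS_API_TOKEN', url=next_url)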
Example #5
def _get_paper_page(url: str) -> html.HtmlElement:  # pragma: no cover
    """
    Get a paper page element from a provided URL

    Parameters
    ----------
    url : str
        The paper URL

    Returns
    -------
    html.HtmlElement
        An HTML element representing the paper page at the provided URL
    """

    response = common_util.try_success(lambda: DefaultSession().get(url), 2)
    return html.fromstring(response.content)
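
A usage sketch; lxml's xpath method is real, but the URL and the assumption that the page exposes a title element are illustrative.

# Hypothetical usage with a placeholder URL.
page = _get_paper_page('https://example.org/some-paper')
titles = page.xpath('//title/text()')  # lxml XPath query
if titles:
    print(titles[0].strip())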
Example #6
def _get_result(url: str) -> html.HtmlElement:  # pragma: no cover
    """
    This method returns a result page from the medRxiv/bioRxiv database for the provided URL

    Parameters
    ----------
    url : str
        A URL to search for results

    Returns
    -------
    html.HtmlElement
        A page from the medRxiv/bioRxiv database
    """

    response = common_util.try_success(lambda: DefaultSession().get(url), 2)
    return html.fromstring(response.content)
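
Note that try_success returns None when every attempt fails, so response.content on the line above would raise AttributeError in that case. A defensive variant (a sketch only, and a behavior change relative to the original) might look like this:

# Hypothetical defensive variant; names from the surrounding module are assumed.
def _get_result_safe(url: str) -> Optional[html.HtmlElement]:
    response = common_util.try_success(lambda: DefaultSession().get(url), 2)
    if response is None:
        return None  # every retry attempt failed
    return html.fromstring(response.content)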
Example #7
def _get_paper_entry(pubmed_id: str) -> dict:  # pragma: no cover
    """
    This method returns paper data from the PubMed database using the provided PubMed ID

    Parameters
    ----------
    pubmed_id : str
        A PubMed ID

    Returns
    -------
    dict
        A paper entry from the PubMed database
    """

    url = f'{BASE_URL}/entrez/eutils/efetch.fcgi?db=pubmed&id={pubmed_id}&rettype=abstract'

    return common_util.try_success(lambda: xmltodict.parse(DefaultSession().get(url).content), 2, pre_delay=1)
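
A sketch of reading the parsed efetch response; the PubmedArticleSet -> PubmedArticle -> MedlineCitation key path matches PubMed's efetch XML for a single ID, but treat it as an assumption.

# Hypothetical usage with a placeholder PubMed ID.
entry = _get_paper_entry('00000000')
if entry is not None:
    article = entry['PubmedArticleSet']['PubmedArticle']  # assumed key path
    title = article['MedlineCitation']['Article']['ArticleTitle']
    print(title)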
Example #8
def _get_result(search: Search, start_record: Optional[int] = 0) -> html.HtmlElement:  # pragma: no cover
    """
    This method returns results from the ACM database using the provided search parameters

    Parameters
    ----------
    search : Search
        A search instance
    start_record : Optional[int]
        Sequence number of the first record to fetch, by default 0

    Returns
    -------
    html.HtmlElement
        A result page from the ACM database
    """

    url = _get_search_url(search, start_record)

    response = common_util.try_success(lambda: DefaultSession().get(url), 2)
    return html.fromstring(response.content)
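
Given a Search instance, paging might look like the sketch below; the page size of 20 and the fixed three-page loop are illustrative assumptions.

# Hypothetical paging over ACM result pages; page size is an assumption.
start_record = 0
for _ in range(3):  # fetch the first three result pages
    page = _get_result(search, start_record)
    # ... extract paper links from the parsed page with page.xpath(...) ...
    start_record += 20  # assumed number of results per page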
Example #9
def _get_api_result(search: Search, api_token: str, start_record: Optional[int] = 1) -> dict:  # pragma: no cover
    """
    This method return results from IEEE database using the provided search parameters

    Parameters
    ----------
    search : Search
        A search instance
    api_token : str
        The API key used to fetch data from IEEE database,
    start_record : str
        Sequence number of first record to fetch, by default 1

    Returns
    -------
    dict
        a result from IEEE database
    """

    url = _get_search_url(search, api_token, start_record)

    return common_util.try_success(lambda: DefaultSession().get(url).json(), 2)
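
A usage sketch; 'total_records' and 'articles' follow the IEEE Xplore API's usual response shape but should be verified against the live API, and the token is a placeholder.

# Hypothetical usage; the token is a placeholder.
result = _get_api_result(search, api_token='MY_IEEE_API_TOKEN')
if result is not None:
    print(result.get('total_records'))          # assumed field name
    for article in result.get('articles', []):  # assumed field name
        print(article.get('title'))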
Example #10
def _get_paper_metadata(doi: str, database: str) -> Optional[dict]:  # pragma: no cover
    """
    Get paper metadata for a provided DOI

    Parameters
    ----------
    doi : str
        The paper DOI
    database : str
        The database name (medRxiv or bioRxiv)

    Returns
    -------
    dict
        The medRxiv/bioRxiv paper metadata, or None if there's no metadata available
    """

    url = f'{API_BASE_URL}/details/{database.lower()}/{doi}'

    response = common_util.try_success(lambda: DefaultSession().get(url).json(), 2)
    if response is not None and response.get('collection'):
        return response['collection'][0]
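
A usage sketch; the DOI is a placeholder, and 'title'/'date' are assumed field names from the public bioRxiv/medRxiv details API.

# Hypothetical usage with a placeholder DOI.
metadata = _get_paper_metadata('10.1101/2020.01.01.000000', 'bioRxiv')
if metadata is not None:
    print(metadata.get('title'), metadata.get('date'))  # assumed field names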
Example #11
def _flag_potentially_predatory_publications(search: Search):
    """
    Flag all the potentially predatory publications

    Parameters
    ----------
    search : Search
        A search instance
    """

    for i, paper in enumerate(search.papers):

        logging.info(f'({i+1}/{len(search.papers)}) Checking paper: {paper.title}')

        try:

            if paper.publication is not None:
                publication_name = paper.publication.title.lower()
                publisher_name = paper.publication.publisher.lower() if paper.publication.publisher is not None else None
                publisher_host = None
            
                if paper.doi is not None:
                    url = f'http://doi.org/{paper.doi}'
                    response = common_util.try_success(lambda url=url: DefaultSession().get(url), 2)

                    if response is not None:
                        publisher_host = urlparse(response.url).netloc.replace("www.", "")

                if publication_name in publication_util.POTENTIAL_PREDATORY_JOURNALS_NAMES \
                    or publisher_name in publication_util.POTENTIAL_PREDATORY_PUBLISHERS_NAMES \
                    or publisher_host in publication_util.POTENTIAL_PREDATORY_PUBLISHERS_HOSTS:

                    paper.publication.is_potentially_predatory = True

        except Exception:
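            # ignore failures for individual papers and keep checking the rest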
            pass
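
The three POTENTIAL_PREDATORY_* lookups live elsewhere in publication_util; the sketch below shows an assumed shape for them (sets of lowercase names and hosts) together with the host normalization used above.

from urllib.parse import urlparse

# Assumed shape: lowercase names/hosts held in sets for fast membership tests.
POTENTIAL_PREDATORY_JOURNALS_NAMES = {'journal of advanced whatever'}  # placeholder entry
POTENTIAL_PREDATORY_PUBLISHERS_NAMES = {'some predatory press'}        # placeholder entry
POTENTIAL_PREDATORY_PUBLISHERS_HOSTS = {'predatory-press.example'}     # placeholder entry

# Host normalization mirroring the function above:
host = urlparse('https://www.predatory-press.example/paper/123').netloc.replace('www.', '')
print(host in POTENTIAL_PREDATORY_PUBLISHERS_HOSTS)  # True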
Example #12
def download(search_path: str,
             output_directory: str,
             only_selected_papers: Optional[bool] = False,
             categories_filter: Optional[dict] = None,
             proxy: Optional[str] = None):
    """
    If you've done your search, (probably made the search refinement too) and wanna download the papers, 
    this is the method that you need to call. This method will try to download the PDF version of the papers to
    the output directory path.

    We use some heuristics to do our job, but sometime they won't work properly, and we cannot be able
    to download the papers, but we logging the downloads or failures in a file download.log
    placed on the output directory, you can check out the log to find what papers cannot be downloaded
    and try to get them manually later. 

    Note: Some papers are behind a paywall and won't be able to be downloaded by this method. 
    However, if you have a proxy provided for the institution where you study or work that permit you 
    to "break" this paywall. You can use this proxy configuration here
    by setting the environment variables FINDPAPERS_HTTP_PROXY and FINDPAPERS_HTTPS_PROXY.

    Parameters
    ----------
    search_path : str
        A valid file path containing a JSON representation of the search results
    output_directory : str
        A valid file path of the directory where the downloaded papers will be placed
    only_selected_papers : bool, False by default
        If only the selected papers will be downloaded
    categories_filter : dict, None by default
        A dict of categories to be used to filter which papers will be downloaded
    proxy : Optional[str], optional
        proxy URL that can be used during requests. This can be also defined by an environment variable FINDPAPERS_PROXY. By default None
    """

    if proxy is not None:
        os.environ['FINDPAPERS_PROXY'] = proxy

    search = persistence_util.load(search_path)

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    log_filepath = os.path.join(output_directory, 'download.log')

    common_util.check_write_access(log_filepath)

    with open(log_filepath, 'a') as fp:  # append mode creates the file if needed
        now = datetime.datetime.now()
        fp.write(
            f"------- A new download process started at: {datetime.datetime.strftime(now, '%Y-%m-%d %H:%M:%S')} \n"
        )

    for i, paper in enumerate(search.papers):

        logging.info(f'({i+1}/{len(search.papers)}) {paper.title}')

        if (only_selected_papers and not paper.selected) or \
        (categories_filter is not None and (paper.categories is None or not paper.has_category_match(categories_filter))):
            continue

        downloaded = False
        output_filename = f'{paper.publication_date.year}-{paper.title}'
        output_filename = re.sub(r'[^\w\d-]', '_',
                                 output_filename)  # sanitize filename
        output_filename += '.pdf'
        output_filepath = os.path.join(output_directory, output_filename)

        if os.path.exists(output_filepath):  # PDF already collected
            logging.info("The paper's PDF file has already been collected")
            continue

        if paper.doi is not None:
            paper.urls.add(f'http://doi.org/{paper.doi}')

        for url in paper.urls:  # we'll try to download the PDF file of the paper by its URLs
            try:
                logging.info(f'Fetching data from: {url}')

                response = common_util.try_success(
                    lambda url=url: DefaultSession().get(url), 2)

                if response is None:
                    continue

                if 'text/html' in response.headers.get('content-type', '').lower():

                    response_url = urllib.parse.urlsplit(response.url)
                    response_query_string = urllib.parse.parse_qs(
                        urllib.parse.urlparse(response.url).query)
                    response_url_path = response_url.path
                    host_url = f'{response_url.scheme}://{response_url.hostname}'
                    pdf_url = None

                    if response_url_path.endswith('/'):
                        response_url_path = response_url_path[:-1]

                    response_url_path = response_url_path.split('?')[0]

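                    # Per-publisher heuristics: rewrite the landing-page URL
                    # into a direct PDF URL for hosts we know how to handle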
                    if host_url in ['https://dl.acm.org']:

                        doi = paper.doi
                        if doi is None and response_url_path.startswith(
                                '/doi/'
                        ) and '/doi/pdf/' not in response_url_path:
                            doi = response_url_path[4:]
                        elif doi is None:
                            continue

                        pdf_url = f'https://dl.acm.org/doi/pdf/{doi}'

                    elif host_url in ['https://ieeexplore.ieee.org']:

                        if response_url_path.startswith('/document/'):
                            document_id = response_url_path[10:]
                        elif response_query_string.get('arnumber',
                                                       None) is not None:
                            document_id = response_query_string.get(
                                'arnumber')[0]
                        else:
                            continue

                        pdf_url = f'{host_url}/stampPDF/getPDF.jsp?tp=&arnumber={document_id}'

                    elif host_url in [
                            'https://www.sciencedirect.com',
                            'https://linkinghub.elsevier.com'
                    ]:

                        paper_id = response_url_path.split('/')[-1]
                        pdf_url = f'https://www.sciencedirect.com/science/article/pii/{paper_id}/pdfft?isDTMRedir=true&download=true'

                    elif host_url in ['https://pubs.rsc.org']:

                        pdf_url = response.url.replace('/articlelanding/',
                                                       '/articlepdf/')

                    elif host_url in [
                            'https://www.tandfonline.com',
                            'https://www.frontiersin.org'
                    ]:

                        pdf_url = response.url.replace('/full', '/pdf')

                    elif host_url in [
                            'https://pubs.acs.org',
                            'https://journals.sagepub.com',
                            'https://royalsocietypublishing.org'
                    ]:

                        pdf_url = response.url.replace('/doi', '/doi/pdf')

                    elif host_url in ['https://link.springer.com']:

                        pdf_url = response.url.replace(
                            '/article/', '/content/pdf/').replace('%2F',
                                                                  '/') + '.pdf'

                    elif host_url in ['https://www.isca-speech.org']:

                        pdf_url = response.url.replace('/abstracts/',
                                                       '/pdfs/').replace(
                                                           '.html', '.pdf')

                    elif host_url in ['https://onlinelibrary.wiley.com']:

                        pdf_url = response.url.replace('/full/',
                                                       '/pdfdirect/').replace(
                                                           '/abs/',
                                                           '/pdfdirect/')

                    elif host_url in [
                            'https://www.jmir.org', 'https://www.mdpi.com'
                    ]:

                        pdf_url = response.url + '/pdf'

                    elif host_url in ['https://www.pnas.org']:

                        pdf_url = response.url.replace(
                            '/content/', '/content/pnas/') + '.full.pdf'

                    elif host_url in ['https://www.jneurosci.org']:

                        pdf_url = response.url.replace(
                            '/content/', '/content/jneuro/') + '.full.pdf'

                    elif host_url in ['https://www.ijcai.org']:

                        paper_id = response.url.split('/')[-1].zfill(4)
                        pdf_url = '/'.join(response.url.split('/')
                                           [:-1]) + '/' + paper_id + '.pdf'

                    elif host_url in [
                            'https://asmp-eurasipjournals.springeropen.com'
                    ]:

                        pdf_url = response.url.replace('/articles/',
                                                       '/track/pdf/')

                    if pdf_url is not None:

                        response = common_util.try_success(
                            lambda url=pdf_url: DefaultSession().get(url), 2)

                if response is not None and 'application/pdf' in response.headers.get(
                        'content-type', '').lower():
                    with open(output_filepath, 'wb') as fp:
                        fp.write(response.content)
                    downloaded = True
                    break

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)

        if downloaded:
            with open(log_filepath, 'a') as fp:
                fp.write(f'[DOWNLOADED] {paper.title}\n')
        else:
            with open(log_filepath, 'a') as fp:
                fp.write(f'[FAILED] {paper.title}\n')
                if len(paper.urls) == 0:
                    fp.write('Empty URL list\n')
                else:
                    for url in paper.urls:
                        fp.write(f'{url}\n')
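
A usage sketch for the whole download flow; the paths and the categories_filter shape (a facet name mapped to a list of accepted category names) are illustrative assumptions.

# Hypothetical call; paths and filter values are placeholders.
download(
    search_path='/tmp/search_results.json',
    output_directory='/tmp/papers',
    only_selected_papers=True,
    categories_filter={'Contribution': ['Tool', 'Method']},  # assumed facet/category names
)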