Example #1
def run(search: Search, api_token: str):
    """
    This method fetches papers from the IEEE database using the provided search parameters.
    After fetching the data from IEEE, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance
    api_token : str
        The API key used to fetch data from the IEEE database

    Raises
    ------
    AttributeError
        - The API token cannot be null
    """

    if api_token is None or len(api_token.strip()) == 0:
        raise AttributeError('The API token cannot be null')

    papers_count = 0
    result = _get_api_result(search, api_token)
    total_papers = result.get('total_records')

    logging.info(f'IEEE: {total_papers} papers to fetch')

    while papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):

        for paper_entry in result.get('articles'):

            if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL):
                break
            
            papers_count += 1

            try:

                logging.info(f'({papers_count}/{total_papers}) Fetching IEEE paper: {paper_entry.get("title")}')

                publication = _get_publication(paper_entry)
                paper = _get_paper(paper_entry, publication)

                if paper is not None:
                    paper.add_database(DATABASE_LABEL)
                    search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)

        if papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):
            result = _get_api_result(search, api_token, papers_count+1)
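As a usage sketch (not part of the original snippet), the connector above could be called roughly as follows; the Search positional arguments mirror the test in Example #8 (query, since, until, limit), and IEEE_API_TOKEN is a hypothetical environment variable name:

import datetime
import os

# hypothetical usage; Search(query, since, until, limit) as seen in Example #8
search = Search('machine learning AND privacy',
                datetime.date(2019, 1, 1), datetime.date(2020, 12, 31), 100)
run(search, os.environ['IEEE_API_TOKEN'])  # token variable name is an assumption

Example #2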
def run(search: Search, database: str):
    """
    This method fetches papers from the medRxiv/bioRxiv database using the provided search parameters.
    After fetching the data from medRxiv/bioRxiv, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance
    database : str
        The database name (medRxiv or bioRxiv)
    """

    urls = _get_search_urls(search, database)

    for i, url in enumerate(urls):

        if search.reached_its_limit(database):
            break

        logging.info(f'{database}: Requesting for papers...')

        data = _get_data(url)

        total_papers = 0
        if len(data) > 0:
            total_papers = data[0].get('total_papers')

        logging.info(f'{database}: {total_papers} papers to fetch from request {i+1}/{len(urls)}')

        papers_count = 0
        dois = sum([d.get('dois') for d in data], [])

        for doi in dois:
            if papers_count >= total_papers or search.reached_its_limit(database):
                break
            try:
                papers_count += 1
                paper_metadata = _get_paper_metadata(doi, database)

                paper_title = paper_metadata.get('title')
                
                logging.info(f'({papers_count}/{total_papers}) Fetching {database} paper: {paper_title}')
                
                paper = _get_paper(paper_metadata)
                
                paper.add_database(database)

                search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)
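The dois list above is built by flattening one 'dois' list per response chunk using sum(..., []); a small illustration of that flattening with made-up data:

# made-up response chunks, only to illustrate the flattening above
data = [{'dois': ['10.1101/111'], 'total_papers': 3},
        {'dois': ['10.1101/222', '10.1101/333'], 'total_papers': 3}]
dois = sum([d.get('dois') for d in data], [])
# dois == ['10.1101/111', '10.1101/222', '10.1101/333']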
Example #3
def test_save_and_load(search: Search, paper: Paper):
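    # round-trip: saving a search to JSON and loading it back should preserve its fields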

    temp_dirpath = tempfile.mkdtemp()
    temp_filepath = os.path.join(temp_dirpath, 'output.json')

    search.add_paper(paper)

    findpapers.save(search, temp_filepath)

    loaded_search = findpapers.load(temp_filepath)

    assert loaded_search.query == search.query
    assert loaded_search.since == search.since
    assert loaded_search.until == search.until
    assert loaded_search.limit == search.limit
    assert loaded_search.limit_per_database == search.limit_per_database
    assert loaded_search.processed_at.strftime(
        '%Y-%m-%d %H:%M:%S') == search.processed_at.strftime(
            '%Y-%m-%d %H:%M:%S')
    assert len(loaded_search.papers) == len(search.papers)
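The search and paper arguments here are pytest fixtures defined elsewhere in the test suite. A minimal sketch of a search fixture, assuming the Search constructor signature used in test_search at the end of Example #8 (the paper fixture is omitted because the Publication constructor it relies on does not appear in these snippets):

import datetime
import pytest

@pytest.fixture
def search():
    # Search(query, since, until, limit), mirroring test_search in Example #8;
    # assumes Search has been imported as in the surrounding tests
    return Search('this AND that', datetime.date(1969, 1, 30),
                  datetime.date(1970, 4, 8), 2)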
Example #4
def run(search: Search):
    """
    This method fetch papers from IEEE database using the provided search parameters
    After fetch the data from IEEE, the collected papers are added to the provided search instance

    Parameters
    ----------
    search : Search
        A search instance
    api_token : str
        The API key used to fetch data from IEEE database,

    Raises
    ------
    AttributeError
        - The API token cannot be null
    """

    if search.publication_types is not None and 'journal' not in search.publication_types:
        logging.info('Skipping PubMed search, journal publication type not in filters. Currently, PubMed only retrieves papers published in journals.')
        return

    papers_count = 0
    result = _get_api_result(search)

    if result.get('eSearchResult').get('ErrorList', None) is not None:
        total_papers = 0
    else:
        total_papers = int(result.get('eSearchResult').get('Count'))
    
    logging.info(f'PubMed: {total_papers} papers to fetch')

    while papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):

        for pubmed_id in result.get('eSearchResult').get('IdList').get('Id'):

            if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL):
                break
            
            papers_count += 1
            
            try:

                paper_entry = _get_paper_entry(pubmed_id)

                if paper_entry is not None:

                    paper_title = paper_entry.get('PubmedArticleSet').get('PubmedArticle').get(
                        'MedlineCitation').get('Article').get('ArticleTitle')

                    logging.info(f'({papers_count}/{total_papers}) Fetching PubMed paper: {paper_title}')

                    publication = _get_publication(paper_entry)
                    paper = _get_paper(paper_entry, publication)

                    if paper is not None:
                        paper.add_database(DATABASE_LABEL)
                        search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)

        if papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):
            result = _get_api_result(search, papers_count)
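The nested .get() chain above assumes a fixed response structure and will raise if any level is missing (which the surrounding try/except then swallows). Purely as an illustration, a hypothetical helper (not part of findpapers) that walks such a path defensively could look like this:

def _dig(entry, *keys):
    # hypothetical helper: walk nested dicts, returning None on any missing key
    for key in keys:
        if not isinstance(entry, dict):
            return None
        entry = entry.get(key)
    return entry

# equivalent to the chained .get() lookup above
paper_title = _dig(paper_entry, 'PubmedArticleSet', 'PubmedArticle',
                   'MedlineCitation', 'Article', 'ArticleTitle')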
Example #5
def run(search: Search):
    """
    This method fetches papers from the ACM database using the provided search parameters.
    After fetching the data from ACM, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance
    """

    papers_count = 0
    result = _get_result(search)

    try:
        total_papers = int(result.xpath(
            '//*[@class="hitsLength"]')[0].text.strip())
    except Exception:  # pragma: no cover
        total_papers = 0

    logging.info(f'ACM: {total_papers} papers to fetch')

    page_index = 0
    while papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):

        papers_urls = [BASE_URL+x.attrib['href']
                       for x in result.xpath('//*[@class="hlFld-Title"]/a')]

        for paper_url in papers_urls:

            if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL):
                break

            try:
                papers_count += 1

                paper_page = _get_paper_page(paper_url)

                paper_title = paper_page.xpath('//*[@class="citation__title"]')[0].text

                logging.info(f'({papers_count}/{total_papers}) Fetching ACM paper: {paper_title}')
                
                paper_doi = None
                if '/abs/' in paper_url:
                    paper_doi = paper_url.split('/abs/')[1]
                elif '/book/' in paper_url:
                    paper_doi = paper_url.split('/book/')[1]
                else:
                    paper_doi = paper_url.split('/doi/')[1]

                paper = _get_paper(paper_page, paper_doi, paper_url)

                if paper is None:
                    continue
                
                paper.add_database(DATABASE_LABEL)

                search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)

        if papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):
            page_index += 1
            result = _get_result(search, page_index)
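To make the DOI extraction above concrete, this is how the split behaves on a made-up ACM URL (the URL is illustrative only):

paper_url = 'https://dl.acm.org/doi/10.1145/1122445.1122456'  # made-up URL
paper_doi = paper_url.split('/doi/')[1]
# paper_doi == '10.1145/1122445.1122456'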
Example #6
def run(search: Search, api_token: str, url: Optional[str] = None, papers_count: Optional[int] = 0):
    """
    This method fetches papers from the Scopus database using the provided search parameters.
    After fetching the data from Scopus, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance
    api_token : str
        The API key used to fetch data from the Scopus database
    url : Optional[str]
        A predefined URL to be used for the search execution;
        this is usually used to make the next recursive call when paginating results
    papers_count : Optional[int]
        Paper count used in recursive calls

    Raises
    ------
    AttributeError
        - The API token cannot be null
    """

    if api_token is None or len(api_token.strip()) == 0:
        raise AttributeError('The API token cannot be null')

    search_results = _get_search_results(search, api_token, url)

    total_papers = int(search_results.get('opensearch:totalResults', 0))

    logging.info(f'Scopus: {total_papers} papers to fetch')

    for paper_entry in search_results.get('entry', []):

        if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL):
            break

        papers_count += 1

        try:

            paper_title = paper_entry.get("dc:title")
            logging.info(f'({papers_count}/{total_papers}) Fetching Scopus paper: {paper_title}')

            publication = _get_publication(paper_entry, api_token)
            paper = _get_paper(paper_entry, publication)

            if paper is not None:
                paper.add_database(DATABASE_LABEL)
                search.add_paper(paper)

        except Exception as e:  # pragma: no cover
            logging.debug(e, exc_info=True)

    next_url = None
    for link in search_results['link']:
        if link['@ref'] == 'next':
            next_url = link['@href']
            break

    # If there is a next url, the API provided response was paginated and we need to process the next url
    # We'll make a recursive call for it
    if papers_count < total_papers and next_url is not None and not search.reached_its_limit(DATABASE_LABEL):
        run(search, api_token, next_url, papers_count)
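The pagination handling above expects the Scopus response to carry an OpenSearch-style 'link' list; a made-up fragment showing the shape this code relies on (all values are illustrative, not real API output):

search_results = {
    'opensearch:totalResults': '250',
    'entry': [],  # one dict per paper, omitted here
    'link': [
        {'@ref': 'self', '@href': 'https://example.org/search?start=0'},
        {'@ref': 'next', '@href': 'https://example.org/search?start=25'},
    ],
}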
Example #7
def run(search: Search):
    """
    This method fetches papers from the arXiv database using the provided search parameters.
    After fetching the data from arXiv, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance

    """

    papers_count = 0
    result = _get_api_result(search)

    total_papers = int(
        result.get('feed').get('opensearch:totalResults').get('#text'))

    logging.info(f'arXiv: {total_papers} papers to fetch')

    while (papers_count < total_papers
           and not search.reached_its_limit(DATABASE_LABEL)):

        entries = result.get('feed', {}).get('entry', [])
        # if there's only one entry, the result is a dict instead of a list
        if not isinstance(entries, list):
            entries = [entries]

        for paper_entry in entries:

            if papers_count >= total_papers or search.reached_its_limit(
                    DATABASE_LABEL):
                break

            papers_count += 1

            try:

                paper_title = paper_entry.get("title")
                logging.info(
                    f'({papers_count}/{total_papers}) Fetching arXiv paper: {paper_title}'
                )

                published_date = datetime.datetime.strptime(
                    paper_entry.get('published')[:10], '%Y-%m-%d').date()

                # the arXiv API currently has no date filter, so we need to filter by date ourselves
                if search.since is not None and published_date < search.since:
                    logging.info(
                        'Skipping paper due to "since" date constraint')
                    continue
                elif search.until is not None and published_date > search.until:
                    logging.info(
                        'Skipping paper due to "until" date constraint')
                    continue

                publication = _get_publication(paper_entry)
                paper = _get_paper(paper_entry, published_date, publication)

                if paper is not None:
                    paper.add_database(DATABASE_LABEL)
                    search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)

        if papers_count < total_papers and not search.reached_its_limit(
                DATABASE_LABEL):
            time.sleep(1)  # sleep for 1 second to avoid server blocking
            result = _get_api_result(search, papers_count)
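The single-entry normalization above is needed because the dict conversion of the Atom feed yields a plain dict when only one entry element is present; a small illustration with made-up parsed data:

# made-up parsed feeds, only to illustrate the normalization above
single = {'feed': {'entry': {'title': 'Only paper'}}}
many = {'feed': {'entry': [{'title': 'Paper A'}, {'title': 'Paper B'}]}}

for result in (single, many):
    entries = result.get('feed', {}).get('entry', [])
    if not isinstance(entries, list):
        entries = [entries]
    print([e['title'] for e in entries])  # always a list of entries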
Example #8
def test_output(search: Search, paper: Paper):
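    # generate_bibtex should emit the expected entry types and honor the
    # selected-only and categories filters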

    paper.publication.category = 'Journal'
    paper.categories = {'Facet A': ['Category A', 'Category B']}
    paper.selected = False
    search.add_paper(paper)

    other_paper = copy.deepcopy(paper)
    other_paper.publication.issn = 'ISSN-CONF'
    other_paper.publication.category = 'Conference Proceedings'
    other_paper.title = 'Conference paper title'
    other_paper.doi = 'fake-doi-conference-paper'
    other_paper.selected = True
    other_paper.categories = {
        'Facet A': ['Category C'],
        'Facet B': ['Category 1']
    }
    search.add_paper(other_paper)

    other_paper = copy.deepcopy(paper)
    other_paper.publication.issn = 'ISSN-BOOK'
    other_paper.publication.category = 'Book'
    other_paper.title = 'Book paper title'
    other_paper.doi = 'fake-doi-book-paper'
    other_paper.categories = None
    search.add_paper(other_paper)

    other_paper = copy.deepcopy(paper)
    other_paper.publication = None
    other_paper.title = 'Unpublished paper title'
    other_paper.doi = None
    other_paper.selected = True
    other_paper.categories = {'Facet A': ['Category A']}
    search.add_paper(other_paper)

    search_path = tempfile.NamedTemporaryFile().name
    outputpath = tempfile.NamedTemporaryFile().name

    persistence_util.save(search, search_path)

    findpapers.generate_bibtex(search_path, outputpath)
    with open(outputpath) as fp:
        generated_bibtex = fp.read()

    article_header = '@article{drpaul1969awesome'
    inproceedings_header = '@inproceedings{drpaul1969conference'
    book_header = '@book{drpaul1969book'
    unpublished = '@unpublished{drpaul1969unpublished'

    assert article_header in generated_bibtex
    assert inproceedings_header in generated_bibtex
    assert book_header in generated_bibtex
    assert unpublished in generated_bibtex

    findpapers.generate_bibtex(search_path,
                               outputpath,
                               only_selected_papers=True)
    with open(outputpath) as fp:
        generated_bibtex = fp.read()

    assert article_header not in generated_bibtex
    assert inproceedings_header in generated_bibtex
    assert book_header not in generated_bibtex
    assert unpublished in generated_bibtex

    findpapers.generate_bibtex(search_path,
                               outputpath,
                               categories_filter={
                                   'Facet A': ['Category A'],
                                   'Facet B': ['Category 1']
                               })
    with open(outputpath) as fp:
        generated_bibtex = fp.read()

    assert article_header in generated_bibtex
    assert inproceedings_header in generated_bibtex
    assert book_header not in generated_bibtex
    assert unpublished in generated_bibtex

    findpapers.generate_bibtex(
        search_path,
        outputpath,
        categories_filter={'Facet A': ['Category B', 'Category C']})
    with open(outputpath) as fp:
        generated_bibtex = fp.read()

    assert article_header in generated_bibtex
    assert inproceedings_header in generated_bibtex
    assert book_header not in generated_bibtex
    assert unpublished not in generated_bibtex
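The assertions above imply a mapping from publication category to BibTeX entry type (Journal -> article, Conference Proceedings -> inproceedings, Book -> book, no publication -> unpublished); a sketch consistent with the test, where the fallback value is an assumption:

# sketch only: category -> BibTeX entry type, as implied by the assertions above
ENTRY_TYPE_BY_CATEGORY = {
    'Journal': 'article',
    'Conference Proceedings': 'inproceedings',
    'Book': 'book',
}

def get_entry_type(paper):
    if paper.publication is None:
        return 'unpublished'
    # the 'misc' fallback is an assumption, not taken from the original code
    return ENTRY_TYPE_BY_CATEGORY.get(paper.publication.category, 'misc')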
def test_search(paper: Paper):
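    # exercises paper addition, duplicate handling, limits, and publication key precedence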

    paper.doi = None

    search = Search('this AND that', datetime.date(1969, 1, 30),
                    datetime.date(1970, 4, 8), 2)

    assert len(search.papers) == 0

    search.add_paper(paper)
    assert len(search.papers) == 1
    search.add_paper(paper)
    assert len(search.papers) == 1

    another_paper = Paper('awesome paper title 2', 'a long abstract',
                          paper.authors, paper.publication,
                          paper.publication_date, paper.urls)
    another_paper.add_database('arXiv')

    search.add_paper(another_paper)
    assert len(search.papers) == 2

    assert paper == search.get_paper(paper.title, paper.publication_date,
                                     paper.doi)
    assert paper.publication == search.get_publication(paper.publication.title,
                                                       paper.publication.issn,
                                                       paper.publication.isbn)

    search.remove_paper(another_paper)
    assert len(search.papers) == 1
    assert paper in search.papers

    search.limit_per_database = 1
    with pytest.raises(OverflowError):
        search.add_paper(another_paper)
    search.limit_per_database = 2

    search.add_paper(another_paper)
    assert len(search.papers) == 2

    another_paper_2 = copy.deepcopy(paper)
    another_paper_2.title = 'awesome paper title 3'
    another_paper_2.abstract = 'a long abstract'
    another_paper_2.databases = set()

    with pytest.raises(ValueError):
        search.add_paper(another_paper_2)

    another_paper_2.add_database('arXiv')

    with pytest.raises(OverflowError):
        search.add_paper(another_paper_2)

    search.merge_duplications()
    assert len(search.papers) == 1

    publication_title = 'FAKE-TITLE'
    publication_issn = 'FAKE-ISSN'
    publication_isbn = 'FAKE-ISBN'
    assert search.get_publication_key(
        publication_title, publication_issn,
        publication_isbn) == f'ISBN-{publication_isbn.lower()}'
    assert search.get_publication_key(
        publication_title,
        publication_issn) == f'ISSN-{publication_issn.lower()}'
    assert search.get_publication_key(
        publication_title) == f'TITLE-{publication_title.lower()}'
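The last three assertions pin down a key precedence of ISBN over ISSN over title; a minimal sketch of a get_publication_key consistent with them, assuming it is an instance method on Search with optional issn/isbn arguments (the real implementation may differ):

def get_publication_key(self, publication_title, publication_issn=None,
                        publication_isbn=None):
    # sketch only: prefer ISBN, then ISSN, then the lowercased title
    if publication_isbn is not None:
        return f'ISBN-{publication_isbn.lower()}'
    if publication_issn is not None:
        return f'ISSN-{publication_issn.lower()}'
    return f'TITLE-{publication_title.lower()}'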