Example #1
0
def _get_publication(paper_entry: dict) -> Publication:
    """
    Using a paper entry provided, this method builds a publication instance

    The entry is walked defensively: when any level of the nested structure
    (or the journal title itself) is missing, None is returned instead of
    raising an AttributeError on a None intermediate value.

    Parameters
    ----------
    paper_entry : dict
        A paper entry retrieved from PubMed API

    Returns
    -------
    Publication
        A publication instance, or None when the entry has no journal title
    """

    # descend PubmedArticleSet -> PubmedArticle -> MedlineCitation -> Article,
    # giving up as soon as a level is absent or is not a mapping
    article = paper_entry
    for key in ('PubmedArticleSet', 'PubmedArticle', 'MedlineCitation', 'Article'):
        if not isinstance(article, dict):
            return None
        article = article.get(key)

    if not isinstance(article, dict):
        return None

    journal = article.get('Journal')
    if not isinstance(journal, dict):
        return None

    publication_title = journal.get('Title')

    if publication_title is None or len(publication_title) == 0:
        return None

    # the ISSN is optional: a journal without it still yields a publication.
    # The value is normally a mapping holding the number under '#text'; a
    # plain string is accepted as well (presumably what the XML parser emits
    # for an attribute-less element - TODO confirm)
    issn_node = journal.get('ISSN')
    if isinstance(issn_node, dict):
        publication_issn = issn_node.get('#text')
    elif isinstance(issn_node, str):
        publication_issn = issn_node
    else:
        publication_issn = None

    return Publication(publication_title, None,
                       publication_issn, None, 'Journal')
Example #2
0
def _get_publication(paper_entry: dict) -> Publication:
    """
    Build the publication that a given paper entry belongs to

    Parameters
    ----------
    paper_entry : dict
        A paper entry retrieved from IEEE API

    Returns
    -------
    Publication
        The publication built from the entry, or None when the entry has
        no (or an empty) 'publication_title' value
    """

    title = paper_entry.get('publication_title')
    if title is None or not len(title):
        return None

    # the remaining fields are optional: an absent key yields None
    isbn, issn, publisher, category = (
        paper_entry.get(key)
        for key in ('isbn', 'issn', 'publisher', 'content_type'))

    return Publication(title, isbn, issn, publisher, category)
Example #3
0
    def to_dict(paper: Paper) -> dict:
        """
        Serialize a Paper instance into a plain dict

        The publication (when present) is serialized through
        Publication.to_dict, the publication date is rendered as a
        'YYYY-MM-DD' string, and the urls, keywords and databases
        collections are converted to lists.

        Parameters
        ----------
        paper : Paper
            A Paper instance

        Returns
        -------
        dict
            A dict that represents a Paper instance
        """

        paper_dict = {
            'title': paper.title,
            'abstract': paper.abstract,
            'authors': paper.authors,
        }

        publication = paper.publication
        paper_dict['publication'] = (
            None if publication is None else Publication.to_dict(publication))

        paper_dict['publication_date'] = paper.publication_date.strftime('%Y-%m-%d')
        paper_dict['urls'] = list(paper.urls)
        paper_dict['doi'] = paper.doi
        paper_dict['citations'] = paper.citations
        paper_dict['keywords'] = list(paper.keywords)

        # these attributes are copied as they are
        for attribute in ('comments', 'number_of_pages', 'pages'):
            paper_dict[attribute] = getattr(paper, attribute)

        paper_dict['databases'] = list(paper.databases)
        paper_dict['selected'] = paper.selected
        paper_dict['categories'] = paper.categories

        return paper_dict
Example #4
0
def _get_publication(paper_entry: dict) -> Publication:
    """
    Build the publication referenced by a given paper entry

    The publication title is read from the '#text' value of the
    'arxiv:journal_ref' field. Every '@term' found in the 'category' field
    (a single item or a list of items) is translated through
    SUBJECT_AREA_BY_KEY, and the known ones become the subject areas.

    Parameters
    ----------
    paper_entry : dict
        A paper entry retrieved from arXiv API

    Returns
    -------
    Publication, or None
        A publication instance, or None when the entry has no journal
        reference or the reference has no (or an empty) text
    """

    if 'arxiv:journal_ref' not in paper_entry:
        return None

    journal_title = paper_entry.get('arxiv:journal_ref').get('#text')
    if journal_title is None or not len(journal_title):
        return None

    areas = set()

    if 'category' in paper_entry:
        # a lone category is handled exactly like a one-item list
        categories = paper_entry.get('category')
        if not isinstance(categories, list):
            categories = [categories]

        for entry_category in categories:
            area = SUBJECT_AREA_BY_KEY.get(entry_category.get('@term'), None)
            if area is not None:
                areas.add(area)

    return Publication(journal_title, subject_areas=areas)
Example #5
0
def _get_publication(paper_entry: dict, api_token: str) -> Publication:
    """
    Build the publication that a given paper entry belongs to

    Parameters
    ----------
    paper_entry : dict
        A paper entry retrieved from scopus API
    api_token : str
        A Scopus API token (not read by this function's body)

    Returns
    -------
    Publication
        A publication instance, or None when the entry has no (or an empty)
        'prism:publicationName' value
    """

    def _unwrap(raw_value):
        # a list-valued field keeps its value under the '$' key of the first item
        if isinstance(raw_value, list):
            return raw_value[0].get('$')
        return raw_value

    title = paper_entry.get('prism:publicationName')
    if title is None or not len(title):
        return None

    isbn = _unwrap(paper_entry.get('prism:isbn'))
    issn = _unwrap(paper_entry.get('prism:issn'))
    category = paper_entry.get('prism:aggregationType')

    return Publication(title, isbn, issn, None, category)
Example #6
0
    def from_dict(cls, paper_dict: dict) -> Paper:
        """
        Rebuild a Paper instance from its dict representation

        The 'publication' value (when not None) is rebuilt through
        Publication.from_dict, 'publication_date' is parsed from a
        'YYYY-MM-DD' string into a date, and the 'urls', 'keywords' and
        'databases' values are converted to sets.

        Parameters
        ----------
        paper_dict : dict
            A dict that represents a Paper instance

        Returns
        -------
        Paper
            A Paper instance based on the provided dict object
        """

        publication = None
        publication_dict = paper_dict.get('publication')
        if publication_dict is not None:
            publication = Publication.from_dict(publication_dict)

        publication_date = datetime.datetime.strptime(
            paper_dict.get('publication_date'), '%Y-%m-%d').date()

        urls, keywords, databases = (
            set(paper_dict.get(key)) for key in ('urls', 'keywords', 'databases'))

        return cls(
            paper_dict.get('title'),
            paper_dict.get('abstract'),
            paper_dict.get('authors'),
            publication,
            publication_date,
            urls,
            paper_dict.get('doi'),
            paper_dict.get('citations'),
            keywords,
            paper_dict.get('comments'),
            paper_dict.get('number_of_pages'),
            paper_dict.get('pages'),
            databases,
            paper_dict.get('selected'),
            paper_dict.get('categories'))
Example #7
0
def _get_paper(paper_page: html.HtmlElement, paper_doi: str, paper_url: str) -> Paper:
    """
    Using a paper entry provided, this method builds a paper instance

    The abstract and the citation count are scraped from the page, every
    other field comes from the metadata returned by _get_paper_metadata.

    Parameters
    ----------
    paper_page : html.HtmlElement
        A paper page retrieved from ACM
    paper_doi : str
        The paper DOI
    paper_url : str
        The ACM paper URL

    Returns
    -------
    Paper
        A paper instance, or None when the metadata, the paper title or the
        publication date can't be obtained

    Raises
    ------
    IndexError
        When the page has no abstract section paragraph
    """

    paper_abstract = paper_page.xpath(
        '//*[contains(@class, "abstractSection")]/p')[0].text

    # the citation count is only trusted when exactly one element matches
    citation_elements = paper_page.xpath(
        '//*[contains(@class, "article-metric citation")]//span')
    paper_citations = None
    if len(citation_elements) == 1:
        paper_citations = int(citation_elements[0].text)

    paper_metadata = _get_paper_metadata(paper_doi)

    if paper_metadata is None:
        return None

    publication = None
    publication_title = paper_metadata.get('container-title', None)

    if publication_title is not None and len(publication_title) > 0:

        publication_isbn = paper_metadata.get('ISBN', None)
        publication_issn = paper_metadata.get('ISSN', None)
        publication_publisher = paper_metadata.get('publisher', None)
        publication_category = paper_metadata.get('type', None)

        publication = Publication(publication_title, publication_isbn,
                                  publication_issn, publication_publisher, publication_category)

    paper_title = paper_metadata.get('title', None)

    if paper_title is None or len(paper_title) == 0:
        return None

    # join only the name parts that are present, so that an author without a
    # given (or family) name isn't rendered with a literal "None" in it
    paper_authors = []
    for author in paper_metadata.get('author', None) or []:
        author_name = ' '.join(
            part for part in (author.get('given'), author.get('family')) if part)
        if author_name:
            paper_authors.append(author_name)

    paper_publication_date = None
    issued = paper_metadata.get('issued', None)
    if issued is not None:
        all_date_parts = issued.get('date-parts') or []
        date_parts = all_date_parts[0] if len(all_date_parts) > 0 else []
        if len(date_parts) > 0:
            # a date may be partial: [year], [year, month] or
            # [year, month, day]. Missing month/day default to 1
            year, month, day = (list(date_parts) + [1, 1])[:3]
            paper_publication_date = datetime.date(
                int(year), int(month), int(day))

    if paper_publication_date is None:
        return None

    paper_keywords = set()
    if paper_metadata.get('keyword', None) is not None:
        paper_keywords = {x.strip()
                          for x in paper_metadata['keyword'].split(',')}

    paper_pages = paper_metadata.get('page', None)
    if paper_pages is not None:
        # normalize the en dash used as range separator to a plain hyphen
        paper_pages = paper_pages.replace('\u2013', '-')

    paper_number_of_pages = paper_metadata.get('number-of-pages', None)
    if paper_number_of_pages is not None:
        paper_number_of_pages = int(paper_number_of_pages)

    if paper_doi is None:
        paper_doi = paper_metadata.get('DOI')

    paper = Paper(paper_title, paper_abstract, paper_authors, publication,
                  paper_publication_date, {paper_url}, paper_doi,
                  paper_citations, paper_keywords, None, paper_number_of_pages, paper_pages)

    return paper
Example #8
0
def publication():
    """
    Build the Publication instance shared by the tests

    Returns
    -------
    Publication
        A publication built from a fixed title, ISBN, ISSN, publisher and
        the 'Journal' category (positional arguments, in that order)
    """

    constructor_args = (
        'awesome publication title',
        'isbn-X',
        'issn-X',
        'that publisher',
        'Journal',
    )

    return Publication(*constructor_args)
def _enrich(search: Search, scopus_api_token: Optional[str] = None):
    """
    Private method that enriches the search results based on paper metadata

    For every paper, the metadata found on the page behind its DOI (or, when
    there is no DOI, behind each of its non-PDF URLs) is used to overwrite the
    paper title, DOI, abstract, authors, keywords and publication, and to add
    the PDF URL. The enrichment is best-effort: an error raised while handling
    a paper is logged at debug level and the next paper is processed.

    Parameters
    ----------
    search : Search
        A search instance
    scopus_api_token : Optional[str], optional
        A API token used to fetch data from Scopus database. If you don't have one go to https://dev.elsevier.com and get it, by default None
    """

    for i, paper in enumerate(search.papers):

        logging.info(f'({i+1}/{len(search.papers)}) Enriching paper: {paper.title}')

        try:

            # the DOI resolver URL is preferred over the collected URLs. A copy
            # of paper.urls is iterated because paper.add_url() is called
            # inside the loop (presumably it mutates paper.urls)
            if paper.doi is not None:
                urls = {f'http://doi.org/{paper.doi}'}
            else:
                urls = copy.copy(paper.urls)

            for url in urls:

                if 'pdf' in url:  # trying to skip PDF links
                    continue

                paper_metadata, paper_url = _get_paper_metadata_by_url(url)

                if paper_metadata is None or 'citation_title' not in paper_metadata:
                    continue

                # when some paper data is present on page's metadata, force to use it.
                # In most of the cases this data is more reliable

                paper_title = _force_single_metadata_value_by_key(paper_metadata, 'citation_title')

                if paper_title is None or len(paper_title.strip()) == 0:
                    continue

                paper.title = paper_title

                paper_doi = _force_single_metadata_value_by_key(paper_metadata, 'citation_doi')
                if paper_doi is not None and len(paper_doi.strip()) > 0:
                    paper.doi = paper_doi

                # the abstract may be exposed under several metadata keys
                paper_abstract = _force_single_metadata_value_by_key(paper_metadata, 'citation_abstract')
                if paper_abstract is None:
                    paper_abstract = _force_single_metadata_value_by_key(paper_metadata, 'DC.Description')
                if paper_abstract is None:
                    paper_abstract = _force_single_metadata_value_by_key(paper_metadata, 'description')

                if paper_abstract is not None and len(paper_abstract.strip()) > 0:
                    paper.abstract = paper_abstract

                paper_authors = paper_metadata.get('citation_author', None)
                if paper_authors is not None and not isinstance(paper_authors, list):  # there is only one author
                    paper_authors = [paper_authors]

                if paper_authors is not None and len(paper_authors) > 0:
                    paper.authors = paper_authors

                # fall back to the generic 'keywords' key only when
                # 'citation_keywords' is missing or blank
                raw_keywords = _force_single_metadata_value_by_key(paper_metadata, 'citation_keywords')
                if raw_keywords is None or len(raw_keywords.strip()) == 0:
                    raw_keywords = _force_single_metadata_value_by_key(paper_metadata, 'keywords')

                if raw_keywords is not None and len(raw_keywords.strip()) > 0:
                    # a value holding neither separator is a single keyword
                    # (splitting by an absent separator yields the whole value)
                    separator = ',' if ',' in raw_keywords else ';'
                    paper_keywords = {x.strip() for x in raw_keywords.split(separator)}
                    paper_keywords.discard('')  # e.g. produced by a trailing separator

                    if len(paper_keywords) > 0:
                        paper.keywords = paper_keywords

                publication = None
                publication_title = None
                publication_category = None
                if 'citation_journal_title' in paper_metadata:
                    publication_title = _force_single_metadata_value_by_key(paper_metadata, 'citation_journal_title')
                    publication_category = 'Journal'
                elif 'citation_conference_title' in paper_metadata:
                    publication_title = _force_single_metadata_value_by_key(paper_metadata, 'citation_conference_title')
                    publication_category = 'Conference Proceedings'
                elif 'citation_book_title' in paper_metadata:
                    publication_title = _force_single_metadata_value_by_key(paper_metadata, 'citation_book_title')
                    publication_category = 'Book'

                # these titles are skipped and don't produce a publication
                if publication_title is not None and len(publication_title) > 0 and publication_title.lower() not in ['biorxiv', 'medrxiv', 'arxiv']:

                    publication_issn = _force_single_metadata_value_by_key(paper_metadata, 'citation_issn')
                    publication_isbn = _force_single_metadata_value_by_key(paper_metadata, 'citation_isbn')
                    publication_publisher = _force_single_metadata_value_by_key(paper_metadata, 'citation_publisher')

                    publication = Publication(publication_title, publication_isbn, publication_issn, publication_publisher, publication_category)

                    if paper.publication is None:
                        paper.publication = publication
                    else:
                        paper.publication.enrich(publication)

                paper_pdf_url = _force_single_metadata_value_by_key(paper_metadata, 'citation_pdf_url')

                if paper_pdf_url is not None:
                    paper.add_url(paper_pdf_url)

        except Exception:  # pragma: no cover
            # best-effort: keep going, but leave a trace of what went wrong
            logging.debug(
                f'Error while enriching paper: {paper.title}', exc_info=True)

    if scopus_api_token is not None:

        try:
            scopus_searcher.enrich_publication_data(search, scopus_api_token)
        except Exception:  # pragma: no cover
            logging.debug(
                'Error while fetching data from Scopus database', exc_info=True)
Example #10
0
def test_publication(publication: Publication):
    """
    Check the Publication attributes, the values read back after assigning
    several categories, and the result of enriching a publication with the
    data of another one

    Parameters
    ----------
    publication : Publication
        The publication under test
    """

    # values the provided publication is expected to start with
    initial_state = (
        ('title', 'awesome publication title'),
        ('isbn', 'isbn-X'),
        ('issn', 'issn-X'),
        ('publisher', 'that publisher'),
        ('category', 'Journal'),
    )
    for attribute, expected in initial_state:
        assert getattr(publication, attribute) == expected

    # (assigned category, category expected to be read back)
    category_cases = (
        ('book series', 'Book'),
        ('journal article', 'Journal'),
        ('Conference', 'Conference Proceedings'),
        ('newspaper article', None),
    )
    for assigned_category, expected_category in category_cases:
        publication.category = assigned_category
        assert publication.category == expected_category

    another_publication = Publication('awesome publication title 2')
    another_publication.cite_score = 1.0
    another_publication.sjr = 2.0
    another_publication.snip = 3.0
    another_publication.subject_areas = {'area A'}

    # clear the fields before enriching from the other publication
    for attribute in ('issn', 'isbn', 'publisher', 'category'):
        setattr(publication, attribute, None)
    publication.subject_areas = set()

    publication.enrich(another_publication)

    enriched_attributes = ('cite_score', 'sjr', 'snip', 'issn', 'isbn',
                           'publisher', 'category')
    for attribute in enriched_attributes:
        assert getattr(publication, attribute) == getattr(another_publication, attribute)

    assert 'area A' in publication.subject_areas