def _get_publication(paper_entry: dict) -> Publication:
    """
    Using a paper entry provided, this method builds a publication instance

    Parameters
    ----------
    paper_entry : dict
        A paper entry retrieved from PubMed API

    Returns
    -------
    Publication
        A publication instance, or None when the entry carries no journal
        title (including when any level of the expected nesting is missing)
    """

    # Walk the nested structure one level at a time so a missing level
    # yields "no publication" instead of an AttributeError on None
    article = paper_entry
    for key in ('PubmedArticleSet', 'PubmedArticle', 'MedlineCitation', 'Article'):
        if not isinstance(article, dict):
            return None
        article = article.get(key)

    journal = article.get('Journal') if isinstance(article, dict) else None
    if not isinstance(journal, dict):
        return None

    publication_title = journal.get('Title')
    if publication_title is None or len(publication_title) == 0:
        return None

    # The ISSN is optional: a journal without one is still a valid publication.
    # NOTE(review): the '#text' key suggests an xmltodict-style mapping where an
    # element without attributes may be a plain string - confirm against the parser
    issn_node = journal.get('ISSN')
    if isinstance(issn_node, dict):
        publication_issn = issn_node.get('#text')
    elif isinstance(issn_node, str):
        publication_issn = issn_node
    else:
        publication_issn = None

    return Publication(publication_title, None, publication_issn, None, 'Journal')
def _get_publication(paper_entry: dict) -> Publication:
    """
    Builds a publication instance out of the fields of a paper entry

    Parameters
    ----------
    paper_entry : dict
        A paper entry retrieved from IEEE API

    Returns
    -------
    Publication
        A publication instance, or None when the entry has no
        (or an empty) 'publication_title'
    """

    title = paper_entry.get('publication_title', None)

    if title is not None and len(title) > 0:
        # positional order: title, isbn, issn, publisher, category
        return Publication(
            title,
            paper_entry.get('isbn', None),
            paper_entry.get('issn', None),
            paper_entry.get('publisher', None),
            paper_entry.get('content_type', None))

    return None
def to_dict(paper: Paper) -> dict:
    """
    Converts the provided Paper instance into a plain dict

    Parameters
    ----------
    paper : Paper
        A Paper instance

    Returns
    -------
    dict
        A dict that represents a Paper instance. Set-like attributes
        (urls, keywords, databases) are emitted as lists and the
        publication date as a 'YYYY-MM-DD' string
    """

    serialized = {
        'title': paper.title,
        'abstract': paper.abstract,
        'authors': paper.authors,
    }

    # the publication is optional and is serialized by its own class
    if paper.publication is None:
        serialized['publication'] = None
    else:
        serialized['publication'] = Publication.to_dict(paper.publication)

    serialized['publication_date'] = paper.publication_date.strftime('%Y-%m-%d')

    serialized.update({
        'urls': list(paper.urls),
        'doi': paper.doi,
        'citations': paper.citations,
        'keywords': list(paper.keywords),
        'comments': paper.comments,
        'number_of_pages': paper.number_of_pages,
        'pages': paper.pages,
        'databases': list(paper.databases),
        'selected': paper.selected,
        'categories': paper.categories,
    })

    return serialized
def _get_publication(paper_entry: dict) -> Publication:
    """
    Using a paper entry provided, this method builds a publication instance

    Parameters
    ----------
    paper_entry : dict
        A paper entry retrieved from arXiv API

    Returns
    -------
    Publication, or None
        A publication instance, or None when the entry has no
        (or an empty) 'arxiv:journal_ref'
    """

    journal_ref = paper_entry.get('arxiv:journal_ref')
    if journal_ref is None:
        # no journal reference means the paper has no known publication
        return None

    # NOTE(review): the '#text' key suggests an xmltodict-style mapping; an
    # element without attributes would then be a plain string - confirm
    if isinstance(journal_ref, dict):
        publication_title = journal_ref.get('#text')
    else:
        publication_title = journal_ref

    if publication_title is None or len(publication_title) == 0:
        return None

    # 'category' holds either one mapping or a list of them: normalize to a list
    categories = paper_entry.get('category')
    if categories is None:
        categories = []
    elif not isinstance(categories, list):
        categories = [categories]

    subject_areas = set()
    for category in categories:
        subject_area = SUBJECT_AREA_BY_KEY.get(category.get('@term'), None)
        if subject_area is not None:
            subject_areas.add(subject_area)

    return Publication(publication_title, subject_areas=subject_areas)
def _get_publication(paper_entry: dict, api_token: str) -> Publication:
    """
    Builds a publication instance out of the fields of a paper entry

    Parameters
    ----------
    paper_entry : dict
        A paper entry retrieved from scopus API
    api_token : str
        A Scopus API token (not used by this function)

    Returns
    -------
    Publication
        A publication instance, or None when the entry has no
        (or an empty) 'prism:publicationName'
    """

    title = paper_entry.get('prism:publicationName', None)

    if title is not None and len(title) > 0:

        isbn = paper_entry.get('prism:isbn', None)
        issn = paper_entry.get('prism:issn', None)
        category = paper_entry.get('prism:aggregationType', None)

        # when a list is given, only the '$' value of its first item is used
        isbn = isbn[0].get('$') if isinstance(isbn, list) else isbn
        issn = issn[0].get('$') if isinstance(issn, list) else issn

        return Publication(title, isbn, issn, None, category)

    return None
def from_dict(cls, paper_dict: dict) -> Paper:
    """
    Builds a Paper instance from its dict representation

    Parameters
    ----------
    paper_dict : dict
        A dict that represents a Paper instance

    Returns
    -------
    Paper
        A Paper instance based on the provided dict object
    """

    read = paper_dict.get

    title = read('title')
    abstract = read('abstract')
    authors = read('authors')

    # the publication is optional and is rebuilt by its own class
    publication_dict = read('publication')
    if publication_dict is None:
        publication = None
    else:
        publication = Publication.from_dict(read('publication'))

    # dates are stored as 'YYYY-MM-DD' strings
    parsed_datetime = datetime.datetime.strptime(read('publication_date'), '%Y-%m-%d')
    publication_date = parsed_datetime.date()

    # list values are turned into sets
    urls = set(read('urls'))
    doi = read('doi')
    citations = read('citations')
    keywords = set(read('keywords'))
    comments = read('comments')
    number_of_pages = read('number_of_pages')
    pages = read('pages')
    databases = set(read('databases'))
    selected = read('selected')
    categories = read('categories')

    return cls(
        title, abstract, authors, publication, publication_date,
        urls, doi, citations, keywords, comments,
        number_of_pages, pages, databases, selected, categories)
def _get_paper(paper_page: html.HtmlElement, paper_doi: str, paper_url: str) -> Paper:
    """
    Using a paper entry provided, this method builds a paper instance

    Parameters
    ----------
    paper_page : html.HtmlElement
        A paper page retrieved from ACM
    paper_doi : str
        The paper DOI
    paper_url : str
        The ACM paper URL

    Returns
    -------
    Paper
        A paper instance, or None when the metadata cannot be fetched or
        lacks a title or a usable publication date

    Raises
    ------
    IndexError
        When the page has no abstract paragraph
    """

    paper_abstract = paper_page.xpath(
        '//*[contains(@class, "abstractSection")]/p')[0].text

    citation_elements = paper_page.xpath(
        '//*[contains(@class, "article-metric citation")]//span')
    paper_citations = None
    if len(citation_elements) == 1:
        paper_citations = int(citation_elements[0].text)

    paper_metadata = _get_paper_metadata(paper_doi)

    if paper_metadata is None:
        return None

    publication = None
    publication_title = paper_metadata.get('container-title', None)

    if publication_title is not None and len(publication_title) > 0:

        publication_isbn = paper_metadata.get('ISBN', None)
        publication_issn = paper_metadata.get('ISSN', None)
        publication_publisher = paper_metadata.get('publisher', None)
        publication_category = paper_metadata.get('type', None)

        publication = Publication(publication_title, publication_isbn,
                                  publication_issn, publication_publisher, publication_category)

    paper_title = paper_metadata.get('title', None)

    if paper_title is None or len(paper_title) == 0:
        return None

    paper_authors = paper_metadata.get('author', [])
    paper_authors = ['{} {}'.format(
        x.get('given'), x.get('family')) for x in paper_authors]

    paper_publication_date = None
    issued = paper_metadata.get('issued', None)
    if issued is not None:
        all_date_parts = issued.get('date-parts') or []
        date_parts = all_date_parts[0] if len(all_date_parts) > 0 else []
        if len(date_parts) > 0:
            # date-parts may hold only a year, or a year and a month:
            # every missing component falls back to 1
            year, month, day = (list(date_parts) + [1, 1])[:3]
            paper_publication_date = datetime.date(int(year), int(month), int(day))

    if paper_publication_date is None:
        return None

    paper_keywords = set()
    if paper_metadata.get('keyword', None) is not None:
        paper_keywords = {x.strip() for x in paper_metadata['keyword'].split(',')}

    paper_pages = paper_metadata.get('page', None)
    if paper_pages is not None:
        # use a plain hyphen instead of an en dash in page ranges
        paper_pages = paper_pages.replace('\u2013', '-')

    paper_number_of_pages = paper_metadata.get('number-of-pages', None)
    if paper_number_of_pages is not None:
        paper_number_of_pages = int(paper_number_of_pages)

    if paper_doi is None:
        paper_doi = paper_metadata.get('DOI')

    paper = Paper(paper_title, paper_abstract, paper_authors, publication,
                  paper_publication_date, {paper_url}, paper_doi, paper_citations, paper_keywords,
                  None, paper_number_of_pages, paper_pages)

    return paper
def publication():
    """
    Provides the Publication instance used by the tests

    Returns
    -------
    Publication
        A publication built from fixed title, isbn, issn, publisher
        and category values
    """
    init_args = (
        'awesome publication title',
        'isbn-X',
        'issn-X',
        'that publisher',
        'Journal',
    )
    return Publication(*init_args)
def _enrich(search: Search, scopus_api_token: Optional[str] = None):
    """
    Private method that enriches the search results based on paper metadata

    Parameters
    ----------
    search : Search
        A search instance
    scopus_api_token : Optional[str], optional
        A API token used to fetch data from Scopus database. If you don't have one go to https://dev.elsevier.com and get it, by default None
    """

    for i, paper in enumerate(search.papers):

        logging.info(f'({i+1}/{len(search.papers)}) Enriching paper: {paper.title}')

        try:

            # prefer the DOI URL; otherwise try every known URL of the paper
            if paper.doi is not None:
                urls = {f'http://doi.org/{paper.doi}'}
            else:
                urls = copy.copy(paper.urls)

            for url in urls:

                if 'pdf' in url:  # trying to skip PDF links
                    continue

                paper_metadata, _ = _get_paper_metadata_by_url(url)

                if paper_metadata is None or 'citation_title' not in paper_metadata:
                    continue

                # when some paper data is present on page's metadata, force to use it.
                # In most of the cases this data is more reliable

                paper_title = _force_single_metadata_value_by_key(paper_metadata, 'citation_title')
                if paper_title is None or len(paper_title.strip()) == 0:
                    continue
                paper.title = paper_title

                paper_doi = _force_single_metadata_value_by_key(paper_metadata, 'citation_doi')
                if paper_doi is not None and len(paper_doi.strip()) > 0:
                    paper.doi = paper_doi

                # the first abstract-like meta tag that is present wins
                paper_abstract = None
                for abstract_key in ('citation_abstract', 'DC.Description', 'description'):
                    paper_abstract = _force_single_metadata_value_by_key(paper_metadata, abstract_key)
                    if paper_abstract is not None:
                        break
                if paper_abstract is not None and len(paper_abstract.strip()) > 0:
                    paper.abstract = paper_abstract

                paper_authors = paper_metadata.get('citation_author', None)
                if paper_authors is not None and not isinstance(paper_authors, list):  # there is only one author
                    paper_authors = [paper_authors]
                if paper_authors is not None and len(paper_authors) > 0:
                    paper.authors = paper_authors

                # fall back to the generic 'keywords' tag only when
                # 'citation_keywords' is missing or blank
                raw_keywords = _force_single_metadata_value_by_key(paper_metadata, 'citation_keywords')
                if raw_keywords is None or len(raw_keywords.strip()) == 0:
                    raw_keywords = _force_single_metadata_value_by_key(paper_metadata, 'keywords')
                paper_keywords = _parse_keywords(raw_keywords)
                if len(paper_keywords) > 0:
                    paper.keywords = paper_keywords

                # the first publication title tag found defines the category
                publication_title = None
                publication_category = None
                for title_key, category in (
                        ('citation_journal_title', 'Journal'),
                        ('citation_conference_title', 'Conference Proceedings'),
                        ('citation_book_title', 'Book')):
                    if title_key in paper_metadata:
                        publication_title = _force_single_metadata_value_by_key(paper_metadata, title_key)
                        publication_category = category
                        break

                # preprint servers are not treated as publications
                if publication_title is not None and len(publication_title) > 0 \
                        and publication_title.lower() not in ['biorxiv', 'medrxiv', 'arxiv']:

                    publication_issn = _force_single_metadata_value_by_key(paper_metadata, 'citation_issn')
                    publication_isbn = _force_single_metadata_value_by_key(paper_metadata, 'citation_isbn')
                    publication_publisher = _force_single_metadata_value_by_key(paper_metadata, 'citation_publisher')

                    publication = Publication(publication_title, publication_isbn,
                                              publication_issn, publication_publisher, publication_category)

                    # only reached with a real publication, so enrich() never receives None
                    if paper.publication is None:
                        paper.publication = publication
                    else:
                        paper.publication.enrich(publication)

                paper_pdf_url = _force_single_metadata_value_by_key(paper_metadata, 'citation_pdf_url')
                if paper_pdf_url is not None:
                    paper.add_url(paper_pdf_url)

        except Exception:  # pragma: no cover
            # enrichment is best effort: keep going, but leave a trace of the failure
            logging.debug('Error while enriching paper', exc_info=True)

    if scopus_api_token is not None:

        try:
            scopus_searcher.enrich_publication_data(search, scopus_api_token)
        except Exception:  # pragma: no cover
            logging.debug(
                'Error while fetching data from Scopus database', exc_info=True)


def _parse_keywords(raw_keywords: Optional[str]) -> set:
    """
    Splits a raw keywords string into a set of stripped, non-empty keywords

    Parameters
    ----------
    raw_keywords : Optional[str]
        A keywords string separated by commas or, when it has no comma,
        by semicolons. A string without separators is a single keyword

    Returns
    -------
    set
        The keywords found, or an empty set for a None or blank input
    """

    if raw_keywords is None or len(raw_keywords.strip()) == 0:
        return set()

    if ',' in raw_keywords:
        parts = raw_keywords.split(',')
    elif ';' in raw_keywords:
        parts = raw_keywords.split(';')
    else:
        # a single keyword must not be iterated character by character
        parts = [raw_keywords]

    return {part.strip() for part in parts if len(part.strip()) > 0}
def test_publication(publication: Publication):
    """
    Checks the attributes, the category normalization and the enrichment
    of a Publication instance

    Parameters
    ----------
    publication : Publication
        The publication instance under test
    """

    # values given at construction time are readable as attributes
    assert publication.title == 'awesome publication title'
    assert publication.isbn == 'isbn-X'
    assert publication.issn == 'issn-X'
    assert publication.publisher == 'that publisher'
    assert publication.category == 'Journal'

    # assigned categories are read back as canonical names
    publication.category = 'book series'
    assert publication.category == 'Book'

    publication.category = 'journal article'
    assert publication.category == 'Journal'

    publication.category = 'Conference'
    assert publication.category == 'Conference Proceedings'

    # a category that is not recognized is read back as None
    publication.category = 'newspaper article'
    assert publication.category is None

    another_publication = Publication('awesome publication title 2')
    another_publication.cite_score = 1.0
    another_publication.sjr = 2.0
    another_publication.snip = 3.0
    another_publication.subject_areas = {'area A'}

    # clear the fields so that enrich() has something to fill in
    publication.issn = None
    publication.isbn = None
    publication.publisher = None
    publication.category = None
    publication.subject_areas = set()

    publication.enrich(another_publication)

    assert publication.cite_score == another_publication.cite_score
    assert publication.sjr == another_publication.sjr
    assert publication.snip == another_publication.snip
    assert publication.issn == another_publication.issn
    assert publication.isbn == another_publication.isbn
    assert publication.publisher == another_publication.publisher
    assert publication.category == another_publication.category
    assert 'area A' in publication.subject_areas