def _get_paper(paper_metadata: dict) -> Paper:
    """
    Get Paper object from metadata

    Parameters
    ----------
    paper_metadata : dict
        Paper metadata. The keys read here are 'title', 'abstract',
        'authors' (a ';'-separated string), 'date' ('%Y-%m-%d'),
        'doi' and 'published'.

    Returns
    -------
    Paper
        Paper object

    Raises
    ------
    TypeError
        If 'date' is missing
    ValueError
        If 'date' does not match the '%Y-%m-%d' format
    """

    paper_title = paper_metadata.get('title')
    paper_abstract = paper_metadata.get('abstract')

    # A missing 'authors' field yields no authors instead of an AttributeError,
    # and blanks left by a trailing ';' are not reported as author names.
    raw_authors = paper_metadata.get('authors') or ''
    paper_authors = [name.strip() for name in raw_authors.split(';') if name.strip()]

    publication = None
    paper_publication_date = datetime.datetime.strptime(
        paper_metadata.get('date'), '%Y-%m-%d').date()

    # The URL is always built from 'doi', even when 'published' overrides the DOI below.
    paper_doi = paper_metadata.get('doi')
    paper_url = f'https://doi.org/{paper_doi}'

    paper_citations = None
    paper_keywords = None
    paper_comments = None
    paper_number_of_pages = None
    paper_pages = None

    # Any 'published' value other than 'NA' (case-insensitive) replaces the DOI,
    # with backslashes stripped. A missing value is treated like 'NA'.
    published = paper_metadata.get('published')
    if published is not None and published.lower() != 'na':
        paper_doi = published.replace('\\', '')

    return Paper(paper_title, paper_abstract, paper_authors, publication, paper_publication_date,
                 {paper_url}, paper_doi, paper_citations, paper_keywords, paper_comments,
                 paper_number_of_pages, paper_pages)
def _get_paper(paper_entry: dict, paper_publication_date: datetime.date, publication: Publication) -> Paper:
    """
    Build a paper instance from a single arXiv entry

    Parameters
    ----------
    paper_entry : dict
        A paper entry retrieved from arXiv API
    paper_publication_date : datetime.date
        The paper publication date
    publication : Publication
        A publication instance that will be associated with the paper

    Returns
    -------
    Paper
        A paper instance, or None when the entry has no title
    """

    raw_title = paper_entry.get('title', None)
    if raw_title is None or len(raw_title) == 0:
        return None

    # drop line breaks first, then squeeze the runs of spaces they leave behind
    title = re.sub(' +', ' ', raw_title.replace('\n', ''))

    doi = None
    if 'arxiv:doi' in paper_entry:
        doi = paper_entry.get('arxiv:doi').get('#text')

    abstract = paper_entry.get('summary', None)

    # 'link' and 'author' hold either one mapping or a list of mappings;
    # a lone mapping is wrapped so both shapes go through the same loop
    urls = set()
    if 'link' in paper_entry:
        links = paper_entry.get('link')
        if not isinstance(links, list):
            links = [links]
        for link in links:
            urls.add(link.get('@href'))

    authors = []
    if 'author' in paper_entry:
        author_entries = paper_entry.get('author')
        if not isinstance(author_entries, list):
            author_entries = [author_entries]
        for author_entry in author_entries:
            authors.append(author_entry.get('name'))

    comments = paper_entry.get('arxiv:comment', {}).get('#text', None)

    return Paper(title, abstract, authors, publication,
                 paper_publication_date, urls, doi, comments=comments)
def paper(publication):
    """Return a Paper built from fixed sample values and the given publication."""
    rooftop_concert_url = "https://en.wikipedia.org/wiki/The_Beatles'_rooftop_concert"

    # Arguments are positional, so each one is labelled with the value it carries.
    return Paper(
        'awesome paper title',                                # title
        'a long abstract',                                    # abstract
        ['Dr Paul', 'Dr John', 'Dr George', 'Dr Ringo'],      # authors
        publication,                                          # publication
        datetime.date(1969, 1, 30),                           # publication date
        {rooftop_concert_url},                                # urls
        'fake-doi',                                           # doi
        25,                                                   # citations
        {'term A', 'term B'},                                 # keywords
        'some comments',                                      # comments
        4,                                                    # number of pages
        '1-4',                                                # pages
        {'arXiv', 'ACM', 'IEEE', 'PubMed', 'Scopus'},         # databases
        True,                                                 # selected
        {'Facet A': ['Category A', 'Category B']},            # categories
    )
def _as_list(value):
    """Return ``[]`` for None, the value itself if it is a list, else ``[value]``."""
    if value is None:
        return []
    if isinstance(value, list):
        return value
    return [value]


def _get_paper(paper_entry: dict, publication: Publication) -> Paper:
    """
    Using a paper entry provided, this method builds a paper instance

    Parameters
    ----------
    paper_entry : dict
        A paper entry retrieved from PubMed API
        (read through its 'PubmedArticleSet' -> 'PubmedArticle' keys)
    publication : Publication
        A publication instance that will be associated with the paper

    Returns
    -------
    Paper
        A paper instance, or None when the entry has no title or no usable
        publication year

    Raises
    ------
    ValueError
        If the entry has no abstract text
    """

    pubmed_article = paper_entry.get('PubmedArticleSet').get('PubmedArticle')
    medline_citation = pubmed_article.get('MedlineCitation')
    article = medline_citation.get('Article')

    paper_title = article.get('ArticleTitle', None)
    if paper_title is None or len(paper_title) == 0:
        return None
    # the title is either a plain string or a mapping carrying it under '#text'
    paper_title = paper_title if isinstance(paper_title, str) else paper_title.get('#text')

    if 'ArticleDate' in article:
        article_date = article.get('ArticleDate')
        paper_publication_date_day = article_date.get('Day')
        paper_publication_date_month = article_date.get('Month')
        paper_publication_date_year = article_date.get('Year')
    else:
        # the journal issue date is used without a day, so the first of the month is assumed
        issue_date = article.get('Journal').get('JournalIssue').get('PubDate')
        paper_publication_date_day = 1
        paper_publication_date_month = common_util.get_numeric_month_by_string(
            issue_date.get('Month'))
        paper_publication_date_year = issue_date.get('Year')

    # A single ArticleId arrives as one mapping rather than a list (the same
    # shape difference handled for authors below). Iterating the bare mapping
    # would yield its keys, so it is normalized first.
    paper_doi = None
    paper_ids = pubmed_article.get('PubmedData').get('ArticleIdList').get('ArticleId')
    for paper_id in _as_list(paper_ids):
        if isinstance(paper_id, dict) and paper_id.get('@IdType') == 'doi':
            paper_doi = paper_id.get('#text')
            break

    paper_abstract_entry = article.get('Abstract', {}).get('AbstractText', None)
    if paper_abstract_entry is None:
        raise ValueError('Paper abstract is empty')

    if isinstance(paper_abstract_entry, list):
        # sections may be mappings (text under '#text') or plain strings; empty ones are skipped
        sections = [x.get('#text') if isinstance(x, dict) else x for x in paper_abstract_entry]
        paper_abstract = '\n'.join(text for text in sections if text is not None)
    elif isinstance(paper_abstract_entry, str):
        paper_abstract = paper_abstract_entry
    else:
        paper_abstract = paper_abstract_entry.get('#text')

    # keywords are best-effort: any malformed entry yields an empty set
    try:
        keyword_entries = _as_list(medline_citation.get('KeywordList').get('Keyword'))
        paper_keywords = {x.get('#text').strip() for x in keyword_entries}
    except (AttributeError, TypeError):
        paper_keywords = set()

    try:
        paper_publication_date = datetime.date(
            int(paper_publication_date_year),
            int(paper_publication_date_month),
            int(paper_publication_date_day))
    except (TypeError, ValueError, OverflowError):
        # fall back to January 1st of the year; without a usable year there is no date at all
        try:
            paper_publication_date = datetime.date(int(paper_publication_date_year), 1, 1)
        except (TypeError, ValueError, OverflowError):
            return None

    paper_authors = []
    author_list = article.get('AuthorList') or {}
    for author in _as_list(author_list.get('Author')):
        if isinstance(author, str):
            paper_authors.append(author)
        elif isinstance(author, dict):
            # join only the name parts that are present, so a missing part is never rendered as "None"
            name_parts = [author.get('ForeName'), author.get('LastName')]
            full_name = ' '.join(part for part in name_parts if part)
            if not full_name and isinstance(author.get('CollectiveName'), str):
                # NOTE(review): assumes group authors carry their name under 'CollectiveName' - confirm
                full_name = author.get('CollectiveName')
            if full_name:
                paper_authors.append(full_name)

    paper_pages = None
    paper_number_of_pages = None
    try:
        paper_pages = article.get('Pagination').get('MedlinePgn')
        if not paper_pages.isdigit():  # if it's a digit, the paper pages range is invalid
            pages_split = paper_pages.split('-')
            paper_number_of_pages = abs(int(pages_split[0]) - int(pages_split[1])) + 1
    except (AttributeError, IndexError, TypeError, ValueError):  # pragma: no cover
        # page information is optional; keep whatever was read before the failure
        pass

    paper = Paper(paper_title, paper_abstract, paper_authors, publication,
                  paper_publication_date, set(), paper_doi, None, paper_keywords, None,
                  paper_number_of_pages, paper_pages)

    return paper
def _get_paper(paper_entry: dict, publication: Publication) -> Paper:
    """
    Using a paper entry provided, this method builds a paper instance

    Parameters
    ----------
    paper_entry : dict
        A paper entry retrieved from IEEE API
    publication : Publication
        A publication instance that will be associated with the paper

    Returns
    -------
    Paper
        A paper instance, or None when the entry has no title or no usable
        publication date/year
    """

    paper_title = paper_entry.get('title', None)
    if paper_title is None or len(paper_title) == 0:
        return None

    paper_publication_date = paper_entry.get('publication_date', None)
    paper_doi = paper_entry.get('doi', None)
    paper_citations = paper_entry.get('citing_paper_count', None)
    paper_abstract = paper_entry.get('abstract', None)

    # never put None in the URL set when the entry has no 'pdf_url'
    pdf_url = paper_entry.get('pdf_url')
    paper_urls = {pdf_url} if pdf_url is not None else set()

    paper_pages = None
    paper_number_of_pages = None

    # keywords are best-effort: any missing level of nesting yields an empty set
    try:
        paper_keywords = {
            x.strip() for x in paper_entry.get('index_terms').get('author_terms').get('terms')}
    except (AttributeError, TypeError):
        paper_keywords = set()

    if paper_publication_date is not None:
        try:
            # parsed as '<day>[-<day>] <month name> <year>'; only the first day of a range is used
            paper_publication_date_split = paper_publication_date.split(' ')
            day = int(paper_publication_date_split[0].split('-')[0])
            month = int(common_util.get_numeric_month_by_string(
                paper_publication_date_split[1]))
            year = int(paper_publication_date_split[2])
            paper_publication_date = datetime.date(year, month, day)
        except Exception:
            # broad on purpose: the month helper's failure modes are not visible here;
            # the year-only fallback below takes over
            pass

    if not isinstance(paper_publication_date, datetime.date):
        # fall back to January 1st of 'publication_year'; the year is cast because
        # datetime.date() rejects strings, and a missing/invalid year means no date
        try:
            paper_publication_date = datetime.date(
                int(paper_entry.get('publication_year')), 1, 1)
        except (TypeError, ValueError, OverflowError):
            return None

    author_entries = (paper_entry.get('authors') or {}).get('authors') or []
    paper_authors = [author.get('full_name') for author in author_entries]

    start_page = paper_entry.get('start_page', None)
    end_page = paper_entry.get('end_page', None)

    if start_page is not None and end_page is not None:
        paper_pages = f"{start_page}-{end_page}"
        try:
            paper_number_of_pages = abs(int(start_page) - int(end_page)) + 1
        except (TypeError, ValueError):  # pragma: no cover
            # non-numeric page labels: keep the range text, leave the count unset
            pass

    paper = Paper(paper_title, paper_abstract, paper_authors, publication,
                  paper_publication_date, paper_urls, paper_doi, paper_citations,
                  paper_keywords, None, paper_number_of_pages, paper_pages)

    return paper
def _get_paper(paper_page: html.HtmlElement, paper_doi: str, paper_url: str) -> Paper:
    """
    Using a paper entry provided, this method builds a paper instance

    Parameters
    ----------
    paper_page : html.HtmlElement
        A paper page retrieved from ACM
    paper_doi : str
        The paper DOI
    paper_url : str
        The ACM paper URL

    Returns
    -------
    Paper
        A paper instance, or None when no metadata, title or publication
        date can be obtained
    """

    # the abstract is optional: a page without the abstract section yields None
    abstract_elements = paper_page.xpath(
        '//*[contains(@class, "abstractSection")]/p')
    paper_abstract = abstract_elements[0].text if len(abstract_elements) > 0 else None

    citation_elements = paper_page.xpath(
        '//*[contains(@class, "article-metric citation")]//span')
    paper_citations = None
    if len(citation_elements) == 1:
        try:
            paper_citations = int(citation_elements[0].text)
        except (TypeError, ValueError):
            # empty or non-numeric counter text: leave the citation count unknown
            paper_citations = None

    paper_metadata = _get_paper_metadata(paper_doi)

    if paper_metadata is None:
        return None

    publication = None
    publication_title = paper_metadata.get('container-title', None)

    if publication_title is not None and len(publication_title) > 0:
        publication_isbn = paper_metadata.get('ISBN', None)
        publication_issn = paper_metadata.get('ISSN', None)
        publication_publisher = paper_metadata.get('publisher', None)
        publication_category = paper_metadata.get('type', None)

        publication = Publication(publication_title, publication_isbn,
                                  publication_issn, publication_publisher, publication_category)

    paper_title = paper_metadata.get('title', None)

    if paper_title is None or len(paper_title) == 0:
        return None

    paper_authors = paper_metadata.get('author', [])
    paper_authors = ['{} {}'.format(
        x.get('given'), x.get('family')) for x in paper_authors]

    paper_publication_date = None
    issued = paper_metadata.get('issued', None)
    if issued is not None:
        try:
            # 'date-parts' may hold [year], [year, month] or [year, month, day];
            # missing month/day default to 1 instead of raising IndexError
            date_parts = [int(part) for part in issued['date-parts'][0]]
            year, month, day = (date_parts + [1, 1])[:3]
            paper_publication_date = datetime.date(year, month, day)
        except (KeyError, IndexError, TypeError, ValueError, OverflowError):
            # empty or malformed date parts are treated as "no date"
            paper_publication_date = None

    if paper_publication_date is None:
        return None

    paper_keywords = set()
    if paper_metadata.get('keyword', None) is not None:
        paper_keywords = {x.strip() for x in paper_metadata['keyword'].split(',')}

    paper_pages = paper_metadata.get('page', None)
    if paper_pages is not None:
        # normalize the en dash (U+2013) used in page ranges to a plain hyphen
        paper_pages = paper_pages.replace('\u2013', '-')

    paper_number_of_pages = paper_metadata.get('number-of-pages', None)
    if paper_number_of_pages is not None:
        paper_number_of_pages = int(paper_number_of_pages)

    if paper_doi is None:
        paper_doi = paper_metadata.get('DOI')

    paper = Paper(paper_title, paper_abstract, paper_authors, publication,
                  paper_publication_date, {paper_url}, paper_doi, paper_citations,
                  paper_keywords, None, paper_number_of_pages, paper_pages)

    return paper
def _get_paper(paper_entry: dict, publication: Publication) -> Paper:
    """
    Using a paper entry provided, this method builds a paper instance

    Parameters
    ----------
    paper_entry : dict
        A paper entry retrieved from scopus API
    publication : Publication
        A publication instance that will be associated with the paper

    Returns
    -------
    Paper
        A paper instance, or None when the entry has no title or no
        'prism:coverDate'

    Raises
    ------
    ValueError, IndexError
        If 'prism:coverDate' is present but is not a valid
        '<year>-<month>-<day>' date
    """

    # getting data
    paper_title = paper_entry.get('dc:title', None)
    if paper_title is None or len(paper_title) == 0:
        return None

    paper_publication_date = paper_entry.get('prism:coverDate', None)
    paper_doi = paper_entry.get('prism:doi', None)
    paper_citations = paper_entry.get('citedby-count', None)
    paper_first_author = paper_entry.get('dc:creator', None)
    paper_abstract = None
    paper_authors = []
    paper_urls = set()
    paper_keywords = set()
    paper_pages = None
    paper_number_of_pages = None

    # post processing data
    if paper_first_author is not None:
        paper_authors.append(paper_first_author)

    if paper_publication_date is None:
        return None
    date_split = paper_publication_date.split('-')
    paper_publication_date = datetime.date(
        int(date_split[0]), int(date_split[1]), int(date_split[2]))

    if paper_citations is not None:
        paper_citations = int(paper_citations)

    # enriching data
    paper_scopus_link = None
    for link in paper_entry.get('link', []):
        if link.get('@ref') == 'scopus':
            paper_scopus_link = link.get('@href')
            break

    if paper_scopus_link is not None:

        paper_urls.add(paper_scopus_link)

        # Everything scraped from the paper page is best-effort: on any failure
        # the values gathered so far are kept and the error is only logged.
        try:

            paper_page = _get_paper_page(paper_scopus_link)

            # Only overwrite the abstract when text was found, so an empty
            # xpath result never leaks through as a list.
            abstract_fragments = paper_page.xpath(
                '//section[@id="abstractSection"]//p//text()[normalize-space()]')
            if len(abstract_fragments) > 0:
                paper_abstract = ''.join(abstract_fragments).replace('\xa0', ' ').strip()

            # Replace the 'dc:creator' author only when the page provides an
            # author list; otherwise keep what the entry gave us.
            page_authors = [author.text.strip() for author in paper_page.xpath(
                '//*[@id="authorlist"]/ul/li/span[@class="previewTxt"]')]
            if len(page_authors) > 0:
                paper_authors = page_authors

            for keyword in paper_page.xpath('//*[@id="authorKeywords"]/span'):
                paper_keywords.add(keyword.text.strip())

            try:
                # the page range is whatever follows the word 'Pages' in the journal info text
                paper_pages = paper_page.xpath(
                    '//span[@id="journalInfo"]')[0].text.split('Pages')[1].strip()
                if paper_pages.isdigit():  # pragma: no cover
                    paper_number_of_pages = 1
                else:
                    pages_split = paper_pages.split('-')
                    paper_number_of_pages = abs(
                        int(pages_split[0]) - int(pages_split[1])) + 1
            except (AttributeError, IndexError, TypeError, ValueError):  # pragma: no cover
                # page information is optional
                pass

        except Exception as e:
            logging.debug(e, exc_info=True)

    paper = Paper(paper_title, paper_abstract, paper_authors, publication,
                  paper_publication_date, paper_urls, paper_doi, paper_citations,
                  paper_keywords, None, paper_number_of_pages, paper_pages)

    return paper
def test_paper(paper: Paper):
    """Check Paper attributes, database/URL bookkeeping and enrichment from another paper."""

    # values the supplied paper is expected to carry
    assert paper.title == 'awesome paper title'
    assert paper.abstract == 'a long abstract'
    assert paper.authors == ['Dr Paul', 'Dr John', 'Dr George', 'Dr Ringo']
    assert len(paper.urls) == 1
    assert len(paper.databases) == 5

    # databases: an unknown name is rejected, a repeated name is counted once
    paper.databases = set()

    with pytest.raises(ValueError):
        paper.add_database('INVALID DATABASE')

    paper.add_database('Scopus')
    paper.add_database('Scopus')
    assert len(paper.databases) == 1

    paper.add_database('ACM')
    assert len(paper.databases) == 2

    # urls: re-adding a known URL changes nothing, a new one is appended
    assert len(paper.urls) == 1
    known_url = next(iter(paper.urls))
    paper.add_url(known_url)
    assert len(paper.urls) == 1

    paper.add_url('another://url')
    assert len(paper.urls) == 2

    # a second paper that will donate its values through enrich()
    donor_citations = 10
    donor_doi = 'DOI-X'
    donor_keywords = {'key-A', 'key-B', 'key-C'}
    donor_comments = 'some comments'

    donor = Paper('another awesome title paper', 'a long abstract', paper.authors,
                  paper.publication, paper.publication_date, paper.urls,
                  donor_doi, donor_citations, donor_keywords, donor_comments)
    donor.add_database('arXiv')

    # blank out the original so every value has to come from the donor
    paper.publication_date = None
    paper.abstract = None
    paper.authors = None
    paper.keywords = None
    paper.publication = None
    paper.doi = None
    paper.citations = 0
    paper.comments = None
    paper.number_of_pages = None
    paper.pages = None

    paper.enrich(donor)

    assert paper.publication_date == donor.publication_date
    assert paper.abstract == donor.abstract
    assert paper.authors == donor.authors
    assert paper.keywords == donor.keywords
    assert 'arXiv' in paper.databases
    assert len(paper.databases) == 3
    assert paper.doi == donor_doi
    # 'cause the donor citation count was higher than the paper's
    assert paper.citations == donor_citations
    assert paper.keywords == donor_keywords
    assert paper.comments == donor_comments
def test_search(paper: Paper):
    """Check adding, fetching, removing, limiting and merging papers in a Search."""

    paper.doi = None

    since = datetime.date(1969, 1, 30)
    until = datetime.date(1970, 4, 8)
    search = Search('this AND that', since, until, 2)

    # adding the same paper twice keeps a single entry
    assert len(search.papers) == 0

    search.add_paper(paper)
    assert len(search.papers) == 1
    search.add_paper(paper)
    assert len(search.papers) == 1

    second_paper = Paper('awesome paper title 2', 'a long abstract', paper.authors,
                         paper.publication, paper.publication_date, paper.urls)
    second_paper.add_database('arXiv')

    search.add_paper(second_paper)
    assert len(search.papers) == 2

    # lookups by paper identity and by publication identity
    found_paper = search.get_paper(paper.title, paper.publication_date, paper.doi)
    assert paper == found_paper

    found_publication = search.get_publication(
        paper.publication.title, paper.publication.issn, paper.publication.isbn)
    assert paper.publication == found_publication

    search.remove_paper(second_paper)
    assert len(search.papers) == 1
    assert paper in search.papers

    # with a limit of 1 the second paper is refused; with 2 it is accepted
    search.limit_per_database = 1
    with pytest.raises(OverflowError):
        search.add_paper(second_paper)
    search.limit_per_database = 2

    search.add_paper(second_paper)
    assert len(search.papers) == 2

    # a copy with no databases is rejected; once it has one, the limit applies
    third_paper = copy.deepcopy(paper)
    third_paper.title = 'awesome paper title 3'
    third_paper.abstract = 'a long abstract'
    third_paper.databases = set()

    with pytest.raises(ValueError):
        search.add_paper(third_paper)

    third_paper.add_database('arXiv')

    with pytest.raises(OverflowError):
        search.add_paper(third_paper)

    search.merge_duplications()
    assert len(search.papers) == 1

    # publication keys: ISBN is used when given, then ISSN, then the title
    publication_title = 'FAKE-TITLE'
    publication_issn = 'FAKE-ISSN'
    publication_isbn = 'FAKE-ISBN'

    key_with_isbn = search.get_publication_key(
        publication_title, publication_issn, publication_isbn)
    assert key_with_isbn == f'ISBN-{publication_isbn.lower()}'

    key_with_issn = search.get_publication_key(publication_title, publication_issn)
    assert key_with_issn == f'ISSN-{publication_issn.lower()}'

    key_with_title = search.get_publication_key(publication_title)
    assert key_with_title == f'TITLE-{publication_title.lower()}'