def test_query_by_pid_without_cited_by(self):
    # query_by_pid() must still return the article metadata, with
    # 'cited_by' set to None, when the citations lookup yields nothing.
    article = Article(fixtures.article)
    # Replace controller.load_article so the controller resolves the pid
    # to our fixture article without touching a real collection.
    mock_load_article_title_keys = self.mocker.replace(
        controller.load_article)
    mock_load_article_title_keys(ANY, ANY)
    self.mocker.result(article)
    # The collection mock simulates a find() that matches no citations.
    mock_coll = self.mocker.mock()
    mock_coll.find(ANY, ANY)
    self.mocker.result(None)
    # mocker record/replay: expectations above are checked in call order.
    self.mocker.replay()
    expected = {
        'article': {
            'code': u'S0101-31222002000100038',
            'title': u'Estratégias de luta das enfermeiras da Maternidade Leila Diniz para implantação de um modelo humanizado de assistência ao parto',
            'issn': u'0101-3122',
            'source': u'Revista Brasileira de Sementes',
            'url': u'http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0101-31222002000100038&lng=en&tlng=en'
        },
        'cited_by': None
    }
    self.assertEqual(
        controller.query_by_pid(mock_coll, 'S0101-31222002000100038'),
        expected)
def load_documents(articlemeta_db, collection, all_records=False):
    """Yield an Article for every document of *collection*.

    When ``all_records`` is False only documents still missing the
    ``license`` field are selected; otherwise every record is returned.

    The pids are materialized up front and each document is then
    re-fetched one at a time, so no long-lived cursor is held open while
    the (potentially slow) consumer iterates.
    """
    fltr = {'collection': collection}
    if all_records is False:
        fltr['license'] = {'$exists': 0}

    documents = articlemeta_db['articles'].find(fltr, {'code': 1})
    try:
        pids = [document['code'] for document in documents]
    finally:
        # Release the cursor as soon as the pids are collected. The
        # original closed it only after the generator was fully
        # consumed, leaking the cursor whenever a caller stopped early.
        documents.close()

    # Drop the license restriction so each pid lookup sees the full record.
    fltr.pop('license', None)
    for pid in pids:
        fltr['code'] = pid
        document = articlemeta_db['articles'].find_one(
            fltr, {'_id': 0, 'citations': 0})
        yield Article(document)
def load_documents(collection, all_records=False):
    """Yield an Article for every document of *collection*.

    Uses the module-level ``articlemeta_db``. When ``all_records`` is
    False only documents still missing the ``fulltexts`` field are
    selected; otherwise every record is returned.

    The pids are materialized up front and each document is then
    re-fetched one at a time, so no long-lived cursor is held open while
    the (potentially slow) consumer iterates.
    """
    fltr = {'collection': collection}
    if all_records is False:
        fltr['fulltexts'] = {'$exists': 0}

    documents = articlemeta_db['articles'].find(fltr, {'code': 1})
    try:
        pids = [document['code'] for document in documents]
    finally:
        # Release the cursor as soon as the pids are collected. The
        # original closed it only after the generator was fully
        # consumed, leaking the cursor whenever a caller stopped early.
        documents.close()

    # Drop the fulltexts restriction so each pid lookup sees the full record.
    fltr.pop('fulltexts', None)
    for pid in pids:
        fltr['code'] = pid
        document = articlemeta_db['articles'].find_one(
            fltr, {'_id': 0, 'citations': 0})
        yield Article(document)
def test_should_return_none_if_no_document_dates(self):
    """Only the issue date (v65) exists: document pubdate must be None."""
    article = Article({"article": {"v65": [{"_": "19970300"}]}})
    doc_date, _issue_date = conversion.get_article_dates(article)
    self.assertIsNone(doc_date)
def test_xmlarticle_meta_keywords_without_data_pipe(self):
    """Without keyword data the DOAJ pipe must not create <keywords>."""
    raw_doc = Article({
        'article': {'v40': [{'_': 'pt'}]},
        'title': {}
    })
    root = ET.Element('records')
    root.append(ET.Element('record'))
    raw, xml = export_doaj.XMLArticleMetaKeywordsPipe().transform(
        [raw_doc, root])
    # Assert the element's absence directly instead of the original
    # try/except-AttributeError trick, which could mask real failures.
    self.assertIsNone(xml.find('./record/keywords'))
def document(self, code, collection, replace_journal_metadata=True, fmt='xylose'):
    """Retrieve one document from the articlemeta client.

    Returns a xylose ``Article`` when ``fmt == 'xylose'`` (or None when
    the document is not found); for any other ``fmt`` the raw payload is
    returned. Raises ``ServerError`` on client or JSON failures.
    """
    try:
        article = self.client.get_article(
            code=code,
            collection=collection,
            # Bug fix: the parameter was previously ignored (hard-coded True).
            replace_journal_metadata=replace_journal_metadata,
            fmt=fmt)
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are no longer swallowed.
        msg = 'Error retrieving document: %s_%s' % (collection, code)
        raise ServerError(msg)

    if fmt == 'xylose':
        try:
            jarticle = json.loads(article)
        except ValueError:
            # json.loads raises ValueError (JSONDecodeError subclasses it).
            # Also fixes the "retrienving" typo in the message.
            msg = 'Fail to load JSON when retrieving document: %s_%s' % (
                collection, code)
            raise ServerError(msg)

        if not jarticle:
            logger.warning('Document not found for : %s_%s' % (collection, code))
            return None

        xarticle = Article(jarticle)
        logger.info('Document loaded: %s_%s' % (collection, code))
        return xarticle

    logger.info('Document loaded: %s_%s' % (collection, code))
    return article
def test_should_return_document_publication_date_if_it_is_presente(self):
    """When v223 is present it becomes the document publication date."""
    meta = {
        "article": {
            "v65": [{"_": "19970300"}],
            "v223": [{"_": "20200124"}],
        },
    }
    doc_date, _issue_date = conversion.get_article_dates(Article(meta))
    self.assertEqual(doc_date, ("2020", "01", "24"))
def test_xml_article_body_without_data_pipe(self):
    """Without body data XMLBodyPipe must not create <body><p>."""
    raw_doc = Article({
        'article': {'v40': [{'_': 'pt'}]},
        'title': {}
    })
    root = ET.Element('article')
    raw, xml = export_rsps.XMLBodyPipe().transform([raw_doc, root])
    # Assert the element's absence directly instead of the original
    # try/except-AttributeError trick, which could mask real failures.
    self.assertIsNone(xml.find('./body/p'))
def test_xmlarticle_meta_article_categories_pipe(self):
    """The heading subject must carry the section label for 'pt'."""
    raw_doc = Article({
        'article': {'v40': [{'_': 'pt'}]},
        'section': {u'pt': u'label pt', u'es': u'label es'}
    })
    root = ET.Element('article')
    root.append(ET.Element('front'))
    root.find('front').append(ET.Element('article-meta'))
    raw, xml = export_rsps.XMLArticleMetaArticleCategoriesPipe().transform(
        [raw_doc, root])
    subject = xml.find(
        './front/article-meta/article-categories/subj-group[@subj-group-type="heading"]/subject'
    )
    self.assertEqual(u'label pt', subject.text)
def test_xml_article_meta_article_id_doi_without_data_pipe(self):
    """Without DOI data no article-id[@pub-id-type="doi"] may be created."""
    raw_doc = Article({'article': {}, 'title': {}})
    root = ET.Element('article')
    root.append(ET.Element('front'))
    root.find('front').append(ET.Element('article-meta'))
    raw, xml = export_rsps.XMLArticleMetaArticleIdDOIPipe().transform(
        [raw_doc, root])
    # The original try/except-AttributeError trick only proved that
    # `.text` raised on a None result; asserting the element's absence
    # directly is equivalent and far clearer.
    self.assertIsNone(
        xml.find('./front/article-meta/article-id[@pub-id-type="doi"]'))
def test_xml_article_body_without_data_pipe(self):
    """Body text for the original language ('pt') lands in <body><p>."""
    raw_doc = Article({
        'article': {'v40': [{'_': 'pt'}]},
        'title': {},
        'body': {'pt': 'body pt', 'es': 'body es'}
    })
    root = ET.Element('article')
    raw, xml = export_rsps.XMLBodyPipe().transform([raw_doc, root])
    self.assertEqual('body pt', xml.find('./body/p').text)
def test_xml_citation_date_with_year_and_month_and_day_pipe(self):
    """A full v65 date (YYYYMMDD) is split into year/month/day elements."""
    citation = Article({
        'article': {},
        'title': {},
        'citations': [{'v65': [{'_': '20060430'}]}]
    }).citations[0]
    root = ET.Element('ref')
    root.append(ET.Element('element-citation'))
    raw, xml = self._xmlcitation.DatePipe().transform([citation, root])
    date_el = xml.find('./element-citation/date')
    self.assertEqual(u'2006', date_el.find('year').text)
    self.assertEqual(u'04', date_el.find('month').text)
    self.assertEqual(u'30', date_el.find('day').text)
def test_xmlarticle_meta_keywords_without_data_pipe(self):
    """Without keyword data no kwd-group element may be created."""
    raw_doc = Article({
        'article': {'v40': [{'_': 'pt'}]},
        'title': {}
    })
    root = ET.Element('article')
    root.append(ET.Element('front'))
    root.find('front').append(ET.Element('article-meta'))
    raw, xml = export_rsps.XMLArticleMetaKeywordsPipe().transform(
        [raw_doc, root])
    self.assertEqual(None, xml.find('./front/article-meta/kwd-group'))
def test_xmlarticle_meta_translated_abstract_without_data_pipe(self):
    """Without abstract data no trans-abstract element may be created."""
    raw_doc = Article({
        'article': {'v40': [{'_': 'pt'}]},
        'title': {}
    })
    root = ET.Element('article')
    root.append(ET.Element('front'))
    root.find('front').append(ET.Element('article-meta'))
    raw, xml = export_rsps.XMLArticleMetaAbstractsPipe().transform(
        [raw_doc, root])
    self.assertEqual(None, xml.find('./front/article-meta/trans-abstract/p'))
def pipeline_rsps(self):
    """Run the sample article through the complete RSPS export pipeline."""
    pipe_classes = (
        export_rsps.SetupArticlePipe,
        export_rsps.XMLArticlePipe,
        export_rsps.XMLFrontPipe,
        export_rsps.XMLJournalMetaJournalIdPipe,
        export_rsps.XMLJournalMetaJournalTitleGroupPipe,
        export_rsps.XMLJournalMetaISSNPipe,
        export_rsps.XMLJournalMetaPublisherPipe,
        export_rsps.XMLArticleMetaArticleIdPublisherPipe,
        export_rsps.XMLArticleMetaArticleIdDOIPipe,
        export_rsps.XMLArticleMetaArticleCategoriesPipe,
        export_rsps.XMLArticleMetaTitleGroupPipe,
        export_rsps.XMLArticleMetaTranslatedTitleGroupPipe,
        export_rsps.XMLArticleMetaContribGroupPipe,
        export_rsps.XMLArticleMetaAffiliationPipe,
        export_rsps.XMLArticleMetaDatesInfoPipe,
        export_rsps.XMLArticleMetaIssueInfoPipe,
        export_rsps.XMLArticleMetaElocationInfoPipe,
        export_rsps.XMLArticleMetaPagesInfoPipe,
        export_rsps.XMLArticleMetaHistoryPipe,
        export_rsps.XMLArticleMetaPermissionPipe,
        export_rsps.XMLArticleMetaAbstractsPipe,
        export_rsps.XMLArticleMetaKeywordsPipe,
        export_rsps.XMLArticleMetaCountsPipe,
        export_rsps.XMLBodyPipe,
        export_rsps.XMLArticleMetaCitationsPipe,
        export_rsps.XMLSubArticlePipe,
        export_rsps.XMLClosePipe,
    )
    ppl = plumber.Pipeline(*(cls() for cls in pipe_classes))
    transformed = ppl.run(Article(self._article), rewrap=True)
    return next(transformed)
def test_xmlarticle_meta_contrib_group_author_without_xrefs_pipe(self):
    # Remove field v71 from the raw fixture before building the article.
    del (self._raw_json['article']['v71'])
    # NOTE(review): `article_meta` is built from the mutated json but is
    # never used below — the pipe still receives `self._article_meta`
    # (created in setUp). It looks like `data` was meant to be
    # [article_meta, pxml]; confirm intent before changing, since
    # setUp's Article may share the same underlying dict.
    article_meta = Article(self._raw_json)
    pxml = ET.Element('article')
    pxml.append(ET.Element('front'))
    front = pxml.find('front')
    front.append(ET.Element('article-meta'))
    data = [self._article_meta, pxml]
    xmlarticle = export_rsps.XMLArticleMetaContribGroupPipe()
    raw, xml = xmlarticle.transform(data)
    # Collect the affiliation rids carried by each contrib's xref.
    fullnames = [
        i.get('rid') for i in xml.findall(
            './front/article-meta/contrib-group/contrib/xref')
    ]
    self.assertEqual([
        u'aff01', u'aff01', u'aff01', u'aff01', u'aff01', u'aff01',
        u'aff02', u'aff01', u'aff02', u'aff01', u'aff03'
    ], fullnames)
def pipeline_doaj(self):
    """Run the sample article through the complete DOAJ export pipeline."""
    pipe_classes = (
        export_doaj.SetupArticlePipe,
        export_doaj.XMLArticlePipe,
        export_doaj.XMLJournalMetaPublisherPipe,
        export_doaj.XMLJournalMetaJournalTitlePipe,
        export_doaj.XMLJournalMetaISSNPipe,
        export_doaj.XMLArticleMetaPublicationDatePipe,
        export_doaj.XMLArticleMetaVolumePipe,
        export_doaj.XMLArticleMetaIssuePipe,
        export_doaj.XMLArticleMetaStartPagePipe,
        export_doaj.XMLArticleMetaEndPagePipe,
        export_doaj.XMLArticleMetaArticleIdDOIPipe,
        export_doaj.XMLArticleMetaIdPipe,
        export_doaj.XMLArticleMetaDocumentTypePipe,
        export_doaj.XMLArticleMetaTitlePipe,
        export_doaj.XMLArticleMetaAuthorsPipe,
        export_doaj.XMLArticleMetaAffiliationPipe,
        export_doaj.XMLArticleMetaAbstractsPipe,
        export_doaj.XMLArticleMetaFullTextUrlPipe,
        export_doaj.XMLArticleMetaKeywordsPipe,
        export_doaj.XMLClosePipe,
    )
    ppl = plumber.Pipeline(*(cls() for cls in pipe_classes))
    transformed = ppl.run(Article(self._article, iso_format='iso 639-2'),
                          rewrap=True)
    return next(transformed)
def test_xml_document_multiple_wok_subject_categories_pipe(self):
    """Every v854 entry must become a wok_subject_categories field."""
    raw_doc = Article({
        'article': {},
        'title': {'v854': [{'_': 'Cat 1'}, {'_': 'Cat 2'}]}
    })
    root = ET.Element('doc')
    raw, xml = pipeline_xml.WOKSC().transform([raw_doc, root])
    fields = xml.findall('./field[@name="wok_subject_categories"]')
    self.assertEqual(u'Cat 1, Cat 2', ', '.join(f.text for f in fields))
def test_should_return_issue_publication_date_if_it_is_presente(self):
    """v65 (issue date) is returned even when v223 is also present."""
    meta = {
        "article": {
            "v65": [{"_": "19970300"}],
            "v223": [{"_": "20200124"}],
        },
    }
    _doc_date, issue_date = conversion.get_article_dates(Article(meta))
    self.assertEqual(issue_date, ("1997", "03", ""))
def test_xml_journal_title_pipe(self):
    # Article carrying v100 — presumably the journal title; TODO confirm.
    fakexylosearticle = Article({
        'article': {},
        'title': {
            "v100": [{
                "_": "Revista de Sa\u00fade P\u00fablica"
            }]
        }
    })
    pxml = ET.Element('doc')
    data = [fakexylosearticle, pxml]
    xmlarticle = pipeline_xml.JournalTitle()
    raw, xml = xmlarticle.transform(data)
    # This try except is a trick to test the expected result of the
    # piped XML, once the precond method don't raise an exception
    # we try to check if the preconditioned pipe was called or not.
    # NOTE(review): the XPath predicate is missing '@' —
    # './field[name="journal"]' matches <field> elements that have a
    # <name> CHILD whose text is "journal", not the name attribute, so
    # find() always returns None and the test passes regardless of the
    # pipe's output. Probably should be './field[@name="journal"]' with
    # an equality assertion on its text — confirm against the pipe.
    try:
        xml.find('./field[name="journal"]').text
    except AttributeError:
        self.assertTrue(True)
    else:
        self.assertTrue(False)
def test_should_return_creation_date_if_no_document_publication_date(self):
    """Without v223, the v93 date backs the document pubdate."""
    meta = {
        "article": {
            "v65": [{"_": "19970300"}],
            "v93": [{"_": "20000401"}],
        },
    }
    doc_date, _issue_date = conversion.get_article_dates(Article(meta))
    self.assertEqual(doc_date, ("2000", "04", "01"))
def test_xml_start_page_pipe(self):
    # Article carrying v14 pagination — presumably f=first page,
    # l=last page; TODO confirm field semantics.
    fakexylosearticle = Article({
        'article': {
            "v14": [{
                "l": "649",
                "_": "",
                "f": "639"
            }]
        },
        'title': {}
    })
    pxml = ET.Element('doc')
    data = [fakexylosearticle, pxml]
    xmlarticle = pipeline_xml.StartPage()
    raw, xml = xmlarticle.transform(data)
    # This try except is a trick to test the expected result of the
    # piped XML, once the precond method don't raise an exception
    # we try to check if the preconditioned pipe was called or not.
    # NOTE(review): the XPath predicate is missing '@' —
    # './field[name="start_page"]' matches <field> elements having a
    # <name> CHILD whose text is "start_page", not the name attribute,
    # so find() always returns None and the test passes regardless of
    # the pipe's output. Probably should be
    # './field[@name="start_page"]' with an equality check — confirm.
    try:
        xml.find('./field[name="start_page"]').text
    except AttributeError:
        self.assertTrue(True)
    else:
        self.assertTrue(False)
def test_xmlarticle_meta_general_info_fulltext_uri_without_data_pipe(self):
    """Without url data the pipe must not create the checked element."""
    raw_doc = Article({
        'article': {'v65': [{'_': '201008'}]},
        'title': {}
    })
    root = ET.Element('records')
    root.append(ET.Element('record'))
    raw, xml = export_doaj.XMLArticleMetaFullTextUrlPipe().transform(
        [raw_doc, root])
    # Assert absence directly instead of the try/except-AttributeError
    # trick. NOTE(review): './record/issue' looks copy-pasted from an
    # issue test; a FullTextUrl pipe would more plausibly emit a url
    # element — confirm the intended XPath.
    self.assertIsNone(xml.find('./record/issue'))
def query_by_meta(coll, title='', author='', year=''):
    """Find documents citing the work identified by (title, author, year).

    Returns None when no citation key can be built from the inputs;
    otherwise a dict echoing the query under 'article' and the matching
    citing-document metas under 'cited_by'.
    """
    article_meta = {'title': title, 'author': author, 'year': year}

    title_key = preparing_key(title, author, year)
    if not title_key:
        return None

    query = coll.find({'citations_keys': title_key},
                      {'article': 1, 'title': 1, 'collection': 1})

    citations = None
    # NOTE(review): pymongo cursors are generally always truthy, so this
    # guard may never skip the branch — confirm against the driver used.
    if query:
        citations = [load_document_meta(Article(doc)) for doc in query]

    return {'article': article_meta, 'cited_by': citations}
def document(self, code, collection=None, replace_journal_metadata=True, fmt='xylose'):
    """Fetch one document from the articlemeta client.

    Returns a xylose ``Article`` when ``fmt == 'xylose'``, otherwise the
    raw payload. Raises ``ServerError`` when the client call fails.
    """
    query = {
        'code': code,
        'replace_journal_metadata': replace_journal_metadata,
        'fmt': fmt
    }
    if collection:
        query['collection'] = collection

    try:
        article = self.client.get_article(**query)
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are no longer swallowed.
        msg = 'Error retrieving document: %s_%s' % (collection, code)
        raise ServerError(msg)

    if fmt == 'xylose':
        xarticle = Article(json.loads(article))
        logger.info('Document loaded: %s_%s' % (collection, code))
        return xarticle

    logger.info('Document loaded: %s_%s' % (collection, code))
    return article
def query_by_pid(coll, pid):
    """Return the article identified by *pid* plus the documents citing it.

    Returns None when the pid does not resolve to an article. 'cited_by'
    holds the citing-document metas (possibly an empty list), or None if
    the citations query is falsy.
    """
    article = load_article(coll, pid)
    if not article:
        return None

    title_keys = load_article_title_keys(article)
    query = coll.find({'citations_keys': {'$in': title_keys}},
                      {'article': 1, 'title': 1, 'collection': 1})

    citations = None
    # NOTE(review): pymongo cursors are generally always truthy, so this
    # guard may never skip the branch — confirm against the driver used.
    if query:
        citations = [load_document_meta(Article(doc)) for doc in query]

    return {'article': load_document_meta(article), 'cited_by': citations}
def document(self, code, collection, replace_journal_metadata=True, fmt='xylose', body=False):
    """Fetch one document via the dispatcher.

    Returns a xylose ``Article`` when ``fmt`` is 'xylose' or 'opac',
    the raw payload for other formats, or None when the document is not
    found. Raises ``ValueError`` when the payload is not valid JSON.
    """
    article = self.dispatcher(
        'get_article',
        code=code,
        collection=collection,
        # Bug fix: the parameter was previously ignored (hard-coded True).
        replace_journal_metadata=replace_journal_metadata,
        fmt=fmt,
        body=body)

    if not article:
        logger.warning('Document not found for: %s_%s', collection, code)
        return None

    if fmt in ['xylose', 'opac']:
        try:
            jarticle = json.loads(article)
        except ValueError:
            # json.loads raises ValueError (JSONDecodeError subclasses it);
            # narrowed from a bare `except:`. Also fixes the "retrienving"
            # typo in the message.
            msg = 'Fail to load JSON when retrieving document: %s_%s' % (
                collection, code)
            raise ValueError(msg)

        xarticle = Article(jarticle)
        logger.info('Document loaded: %s_%s', collection, code)
        return xarticle

    logger.info('Document loaded: %s_%s', collection, code)
    return article
def pipeline_pubmed(self):
    """Run the sample article through the complete PubMed export pipeline."""
    pipe_classes = (
        export_pubmed.SetupArticleSetPipe,
        export_pubmed.XMLArticlePipe,
        export_pubmed.XMLJournalPipe,
        export_pubmed.XMLPublisherNamePipe,
        export_pubmed.XMLJournalTitlePipe,
        export_pubmed.XMLISSNPipe,
        export_pubmed.XMLVolumePipe,
        export_pubmed.XMLIssuePipe,
        export_pubmed.XMLPubDatePipe,
        export_pubmed.XMLReplacesPipe,
        export_pubmed.XMLArticleTitlePipe,
        export_pubmed.XMLFirstPagePipe,
        export_pubmed.XMLLastPagePipe,
        export_pubmed.XMLElocationIDPipe,
        export_pubmed.XMLLanguagePipe,
        export_pubmed.XMLAuthorListPipe,
        export_pubmed.XMLPublicationTypePipe,
        export_pubmed.XMLArticleIDListPipe,
        export_pubmed.XMLHistoryPipe,
        export_pubmed.XMLAbstractPipe,
        export_pubmed.XMLClosePipe,
    )
    ppl = plumber.Pipeline(*(cls() for cls in pipe_classes))
    transformed = ppl.run(Article(self._article, iso_format='iso 639-2'),
                          rewrap=True)
    return next(transformed)
def pipeline_sci(self):
    """Run the sample article through the complete SciELO export pipeline."""
    pipe_classes = (
        export_sci.SetupArticlePipe,
        export_sci.XMLArticlePipe,
        export_sci.XMLFrontPipe,
        export_sci.XMLJournalMetaJournalIdPipe,
        export_sci.XMLJournalMetaJournalTitleGroupPipe,
        export_sci.XMLJournalMetaISSNPipe,
        export_sci.XMLJournalMetaCollectionPipe,
        export_sci.XMLJournalMetaPublisherPipe,
        export_sci.XMLArticleMetaUniqueArticleIdPipe,
        export_sci.XMLArticleMetaArticleIdPublisherPipe,
        export_sci.XMLArticleMetaArticleIdDOIPipe,
        export_sci.XMLArticleMetaArticleCategoriesPipe,
        export_sci.XMLArticleMetaTitleGroupPipe,
        export_sci.XMLArticleMetaTranslatedTitleGroupPipe,
        export_sci.XMLArticleMetaContribGroupPipe,
        export_sci.XMLArticleMetaAffiliationPipe,
        export_sci.XMLArticleMetaGeneralInfoPipe,
        export_sci.XMLArticleMetaAbstractsPipe,
        export_sci.XMLArticleMetaKeywordsPipe,
        export_sci.XMLArticleMetaCitationsPipe,
        export_sci.XMLClosePipe,
    )
    ppl = plumber.Pipeline(*(cls() for cls in pipe_classes))
    transformed = ppl.run(Article(self._article), rewrap=True)
    return next(transformed)
def setUp(self):
    """Load the article_meta fixture and prepare a citation + XMLCitation.

    Fix: the original opened the fixture file without ever closing it;
    the context manager guarantees the handle is released.
    """
    fixture_path = os.path.join(
        os.path.dirname(__file__), 'fixtures', 'article_meta.json')
    with open(fixture_path) as fixture:
        self._raw_json = json.load(fixture)
    self._citation_meta = Article(self._raw_json).citations[0]
    self._xmlcitation = export_rsps.XMLCitation()