def __init__(self, collection, issns=None, output_file=None, from_date=FROM,
             user=None, password=None, api_token=None):
    """Store the run configuration and prepare the DOAJ helpers.

    The credentials are stored before ``authenticated_session`` is called;
    after that the schema is parsed and the ``Articles`` client is created
    with ``api_token`` as its ``usertoken``.

    ``output_file`` is accepted but not used by this method.
    ``issns`` falls back to ``[None]`` when it is empty or omitted.
    """
    self._articlemeta = utils.articlemeta_server()

    # Credentials go first: they must be in place before the login below.
    self.user = user
    self.password = password

    self.collection = collection
    self.from_date = from_date
    self.issns = issns if issns else [None]

    self.session = self.authenticated_session()
    self.parse_schema()
    self.doaj_articles = Articles(usertoken=api_token)
class ArticlesTest(unittest.TestCase):
    """Exercise ``Articles.get`` and ``Articles.search``.

    Every test builds a fresh ``Articles`` client with default arguments and
    queries it with fixed article ids / ISSNs.
    """

    def setUp(self):
        self.ac = Articles()

    def test_get_article(self):
        response = self.ac.get('255723f2f2374f1fbb8865eeb044c9d2')

        # assertIn reports both operands on failure, unlike assertTrue.
        self.assertIn('bibjson', response)

    def test_get_article_not_available(self):
        response = self.ac.get('not available')

        self.assertEqual(response['status'], 'not_found')

    def test_search_invalid_page_size(self):
        # The result is consumed with list() so the ValueError is raised
        # whether search() fails eagerly or on first iteration.
        with self.assertRaises(ValueError):
            list(self.ac.search('issn:1806-9940', pagesize='invalid'))

    def test_search_invalid_page_size_1(self):
        """
        not between 10 and 100.
        """
        with self.assertRaises(ValueError):
            list(self.ac.search('issn:1806-9940', pagesize=9))

    def test_search_invalid_page_size_2(self):
        """
        not between 10 and 100.
        """
        with self.assertRaises(ValueError):
            list(self.ac.search('issn:1806-9940', pagesize=101))

    def test_search(self):
        articles = list(self.ac.search('id:255723f2f2374f1fbb8865eeb044c9d2'))

        self.assertEqual(1, len(articles))
def setUp(self):
    """Create the ``Articles`` client that the test methods use as ``self.ac``."""
    client = Articles()
    self.ac = client
class Dumper(object):
    """Upload documents that DOAJ does not have yet as DOAJ XML files.

    ``run`` walks the documents returned by the Article Meta client. For each
    one it first searches the DOAJ API; when exactly one match is found, the
    DOAJ id is stored back through ``set_doaj_id``. Otherwise the document's
    ``xmldoaj`` rendering is validated against the DOAJ schema and posted to
    the DOAJ upload page through an authenticated ``requests`` session.
    """

    # Characters that get a backslash prefix before a title is placed in a
    # DOAJ search query.
    _QUERY_SPECIAL_CHARS = frozenset('+-&|!(){}[]^"~*?:\\')

    def __init__(self, collection, issns=None, output_file=None, from_date=FROM,
                 user=None, password=None, api_token=None):
        """Store the configuration, log in to DOAJ and load the XML schema.

        Args:
            collection: passed as ``collection`` when listing documents.
            issns: ISSNs to iterate over in ``run``; ``[None]`` is used when
                empty or omitted (presumably "no ISSN filter" -- confirm
                against the Article Meta client).
            output_file: accepted for compatibility; not used by this class.
            from_date: passed as ``from_date`` when listing documents.
            user: username posted to the DOAJ login form.
            password: password posted to the DOAJ login form.
            api_token: passed to ``Articles`` as ``usertoken``.
        """
        self._articlemeta = utils.articlemeta_server()
        self.collection = collection
        self.from_date = from_date
        self.user = user
        self.password = password
        self.issns = issns or [None]
        self.session = self.authenticated_session()
        self.parse_schema()
        self.doaj_articles = Articles(usertoken=api_token)

    def _single_search_id(self, query, failure_message):
        """Run ``query`` on the DOAJ API and return the id of a unique hit.

        Returns ``None`` when the search fails, or when it yields zero or more
        than one result. ``failure_message`` is a ``%s`` template that is
        logged with the query when the search raises.
        """
        try:
            result = list(self.doaj_articles.search(query))
        except Exception:
            # Best-effort lookup: a failed search is treated as "not found".
            # ``Exception`` (not a bare except) lets Ctrl-C / SystemExit through.
            logger.debug(failure_message % query)
            return None

        if len(result) == 1:
            return result[0].get('id', None)

        return None

    def _doaj_id_by_meta(self, issn, publication_year, title):
        """Return the DOAJ id matching ISSN, year and title, or ``None``.

        Query-syntax characters in ``title`` are backslash-escaped first.
        """
        escaped_title = ''.join(
            u'\\' + char if char in self._QUERY_SPECIAL_CHARS else char
            for char in title
        )

        query = 'issn:%s AND year:%s AND title:%s' % (
            issn, publication_year, escaped_title
        )

        return self._single_search_id(
            query, 'Fail to query DOAJ API using metadata: %s')

    def _doaj_id_by_doi(self, doi):
        """Return the DOAJ id of the only article with this DOI, or ``None``."""
        query = 'doi:%s' % (doi)

        return self._single_search_id(
            query, 'Fail to query DOAJ API using DOI: %s')

    def _doaj_id(self, document):
        """Look the document up on DOAJ, by metadata first and then by DOI.

        Returns the DOAJ id, or ``None`` when neither lookup gives a unique
        match.
        """
        title = document.original_title()

        if title:
            doaj_id = self._doaj_id_by_meta(
                document.scielo_issn,
                document.publication_date[0:4],
                title
            )
            if doaj_id:
                return doaj_id

        if document.doi:
            return self._doaj_id_by_doi(document.doi)

        return None

    def parse_schema(self):
        """Compile ``DOAJ_XSD`` into ``self.doaj_schema``.

        Returns ``False`` when the schema cannot be parsed, ``None`` otherwise.
        """
        # Always define the attribute so later reads never hit a missing name.
        self.doaj_schema = None

        xsd = BytesIO(DOAJ_XSD.encode('utf-8'))

        try:
            sch_doc = etree.parse(xsd)
            sch = etree.XMLSchema(sch_doc)
        except Exception as e:
            logger.exception(e)
            logger.error('Fail to parse XML')
            return False

        self.doaj_schema = sch

    def authenticated_session(self):
        """Log in to DOAJ and return the ``requests`` session, or ``None``.

        ``None`` is returned when the login response is not HTTP 200 or its
        body contains ``Incorrect``.
        """
        auth_url = 'https://doaj.org/account/login'
        login = {'username': self.user, 'password': self.password}

        session = requests.Session()

        try:
            response = session.post(auth_url, data=login)
        except requests.exceptions.SSLError:
            # SECURITY NOTE(review): this retry posts the credentials with
            # certificate verification disabled. Kept for compatibility;
            # confirm it is still required.
            logger.debug('Authentication without SSL validation')
            response = session.post(auth_url, data=login, verify=False)

        if response.status_code != 200:
            logger.debug('Authentication attempt done')
            return None

        if u'Incorrect' in response.text:
            logger.debug('Incorrect username or password')
            return None

        logger.debug('Authenticated successfully')
        return session

    def xml_is_valid(self, xml):
        """Return ``True`` when ``xml`` is well formed and schema-valid.

        Any parsing or validation error is logged and reported as ``False``.
        """
        try:
            xml_doc = etree.parse(StringIO(xml))
            logger.debug('XML is well formed')
        except Exception as e:
            logger.exception(e)
            logger.error('Fail to parse XML')
            return False

        try:
            # assertValid raises when the document does not match the schema
            # (and also when the schema failed to load and is None).
            self.doaj_schema.assertValid(xml_doc)
            logger.debug('XML is valid')
            return True
        except Exception as e:
            logger.exception(e)
            logger.error('Fail to parse XML')
            return False

    def send_xml(self, file_name, file_data):
        """Post one XML file to the DOAJ upload page.

        Returns ``True`` when the response body contains the success marker,
        ``False`` on a connection error or any other response.
        """
        files = {'file': (file_name, file_data)}

        try:
            response = self.session.post(
                'https://doaj.org/publisher/uploadfile',
                data={'schema': 'doaj'},
                files=files
            )
        except requests.ConnectionError:
            logger.debug('Fail to send document to DOAJ')
            return False

        if u'successfully uploaded' in response.text:
            logger.info('Document Sent')
            return True

        # Upload not acknowledged: log in again and keep the new session so
        # later uploads use it. The previous session stays in place when the
        # new login fails, so ``self.session`` never becomes None here.
        refreshed = self.authenticated_session()
        if refreshed is not None:
            self.session = refreshed

        logger.error('Document not Sent: %s' % response.status_code)
        return False

    def _process_document(self, document):
        """Handle one document: skip it, record its DOAJ id, or upload it."""
        pid = document.publisher_id
        acronym = document.collection_acronym
        doc_ref = '%s_%s' % (pid, acronym)

        logger.info('Reading document: %s', doc_ref)

        if document.data.get('doaj_id', None):
            logger.debug('Document already available in DOAJ: %s', doc_ref)
            return

        doaj_id = self._doaj_id(document)
        if doaj_id:
            logger.debug(
                'Document already available in DOAJ, setting id on Article Meta for: %s',
                doc_ref)
            self._articlemeta.set_doaj_id(pid, acronym, doaj_id)
            return

        try:
            xml = self._articlemeta.document(pid, acronym, fmt='xmldoaj')
        except Exception as e:
            logger.exception(e)
            logger.error('Fail to read document: %s', doc_ref)
            # An empty document fails validation below and is skipped.
            xml = u''

        if not self.xml_is_valid(xml):
            logger.error('Fail to parse xml document: %s', doc_ref)
            return

        logger.info('Sending document: %s', doc_ref)
        self.send_xml('%s.xml' % doc_ref, xml)

    def run(self):
        """Process every document of every configured ISSN.

        Does nothing (returns ``None``) when there is no authenticated session.
        """
        if not self.session:
            return None

        for issn in self.issns:
            documents = self._articlemeta.documents(
                collection=self.collection,
                issn=issn,
                from_date=self.from_date
            )
            for document in documents:
                self._process_document(document)
class Dumper(object):
    """Upload documents that DOAJ does not have yet as DOAJ XML files.

    ``run`` walks the documents returned by the Article Meta client, asking
    it only for documents without a ``doaj_id``. For each one it first
    searches the DOAJ API; when exactly one match is found, the DOAJ id is
    stored back through ``set_doaj_id``. Otherwise the document's ``xmldoaj``
    rendering is validated against the DOAJ schema and posted to the DOAJ
    upload page through an authenticated ``requests`` session.
    """

    # Characters that get a backslash prefix before a title is placed in a
    # DOAJ search query.
    _QUERY_SPECIAL_CHARS = frozenset('+-&|!(){}[]^"~*?:\\')

    def __init__(self, collection, issns=None, output_file=None, from_date=FROM,
                 user=None, password=None, api_token=None):
        """Store the configuration, log in to DOAJ and load the XML schema.

        Args:
            collection: passed as ``collection`` when listing documents.
            issns: ISSNs to iterate over in ``run``; ``[None]`` is used when
                empty or omitted (presumably "no ISSN filter" -- confirm
                against the Article Meta client).
            output_file: accepted for compatibility; not used by this class.
            from_date: passed as ``from_date`` when listing documents.
            user: username posted to the DOAJ login form.
            password: password posted to the DOAJ login form.
            api_token: passed to ``Articles`` as ``usertoken``.
        """
        self._articlemeta = utils.articlemeta_server()
        self.collection = collection
        self.from_date = from_date
        self.user = user
        self.password = password
        self.issns = issns or [None]
        self.session = self.authenticated_session()
        self.parse_schema()
        self.doaj_articles = Articles(usertoken=api_token)

    def _single_search_id(self, query, failure_message):
        """Run ``query`` on the DOAJ API and return the id of a unique hit.

        Returns ``None`` when the search fails, or when it yields zero or more
        than one result. ``failure_message`` is a ``%s`` template that is
        logged with the query when the search raises.
        """
        try:
            result = list(self.doaj_articles.search(query))
        except Exception:
            # Best-effort lookup: a failed search is treated as "not found".
            # ``Exception`` (not a bare except) lets Ctrl-C / SystemExit through.
            logger.debug(failure_message % query)
            return None

        if len(result) == 1:
            return result[0].get('id', None)

        return None

    def _doaj_id_by_meta(self, issn, publication_year, title):
        """Return the DOAJ id matching ISSN, year and title, or ``None``.

        Query-syntax characters in ``title`` are backslash-escaped first.
        """
        escaped_title = ''.join(
            u'\\' + char if char in self._QUERY_SPECIAL_CHARS else char
            for char in title
        )

        query = 'issn:%s AND year:%s AND title:%s' % (issn, publication_year,
                                                      escaped_title)

        return self._single_search_id(
            query, 'Fail to query DOAJ API using metadata: %s')

    def _doaj_id_by_doi(self, doi):
        """Return the DOAJ id of the only article with this DOI, or ``None``."""
        query = 'doi:%s' % (doi)

        return self._single_search_id(
            query, 'Fail to query DOAJ API using DOI: %s')

    def _doaj_id(self, document):
        """Look the document up on DOAJ, by metadata first and then by DOI.

        Returns the DOAJ id, or ``None`` when neither lookup gives a unique
        match.
        """
        title = document.original_title()

        if title:
            doaj_id = self._doaj_id_by_meta(document.journal.scielo_issn,
                                            document.publication_date[0:4],
                                            title)
            if doaj_id:
                return doaj_id

        if document.doi:
            return self._doaj_id_by_doi(document.doi)

        return None

    def parse_schema(self):
        """Compile ``DOAJ_XSD`` into ``self.doaj_schema``.

        Returns ``False`` when the schema cannot be parsed, ``None`` otherwise.
        """
        # Always define the attribute so later reads never hit a missing name.
        self.doaj_schema = None

        xsd = BytesIO(DOAJ_XSD.encode('utf-8'))

        try:
            sch_doc = etree.parse(xsd)
            sch = etree.XMLSchema(sch_doc)
        except Exception as e:
            logger.exception(e)
            logger.error('Fail to parse XML')
            return False

        self.doaj_schema = sch

    def authenticated_session(self):
        """Log in to DOAJ and return the ``requests`` session, or ``None``.

        ``None`` is returned when the login response is not HTTP 200 or its
        body contains ``Incorrect``.
        """
        auth_url = 'https://doaj.org/account/login'
        login = {'username': self.user, 'password': self.password}

        session = requests.Session()

        try:
            response = session.post(auth_url, data=login)
        except requests.exceptions.SSLError:
            # SECURITY NOTE(review): this retry posts the credentials with
            # certificate verification disabled. Kept for compatibility;
            # confirm it is still required.
            logger.debug('Authentication without SSL validation')
            response = session.post(auth_url, data=login, verify=False)

        if response.status_code != 200:
            logger.debug('Authentication attempt done')
            return None

        if u'Incorrect' in response.text:
            logger.debug('Incorrect username or password')
            return None

        logger.debug('Authenticated successfully')
        return session

    def xml_is_valid(self, xml):
        """Return ``True`` when ``xml`` is well formed and schema-valid.

        Any parsing or validation error is logged and reported as ``False``.
        """
        try:
            xml_doc = etree.parse(StringIO(xml))
            logger.debug('XML is well formed')
        except Exception as e:
            logger.exception(e)
            logger.error('Fail to parse XML')
            return False

        try:
            # assertValid raises when the document does not match the schema
            # (and also when the schema failed to load and is None).
            self.doaj_schema.assertValid(xml_doc)
            logger.debug('XML is valid')
            return True
        except Exception as e:
            logger.exception(e)
            logger.error('Fail to parse XML')
            return False

    def send_xml(self, file_name, file_data):
        """Post one XML file to the DOAJ upload page.

        Returns ``True`` when the response body contains the success marker,
        ``False`` on a connection error or any other response.
        """
        files = {'file': (file_name, file_data)}

        try:
            response = self.session.post(
                'https://doaj.org/publisher/uploadfile',
                data={'schema': 'doaj'},
                files=files)
        except requests.ConnectionError:
            logger.debug('Fail to send document to DOAJ')
            return False

        if u'File uploaded and waiting to be processed' in response.text:
            logger.info('Document Sent')
            return True

        # Upload not acknowledged: log in again and keep the new session so
        # later uploads use it. The previous session stays in place when the
        # new login fails, so ``self.session`` never becomes None here.
        refreshed = self.authenticated_session()
        if refreshed is not None:
            self.session = refreshed

        logger.error('Document not Sent: %s' % response.status_code)
        return False

    def _process_document(self, document):
        """Handle one document: skip it, record its DOAJ id, or upload it."""
        pid = document.publisher_id
        acronym = document.collection_acronym
        doc_ref = '%s_%s' % (pid, acronym)

        logger.info('Reading document: %s', doc_ref)

        # Kept as a second guard even though run() already asks only for
        # documents without a doaj_id.
        if document.data.get('doaj_id', None):
            logger.debug('Document already available in DOAJ: %s', doc_ref)
            return

        doaj_id = self._doaj_id(document)
        if doaj_id:
            logger.debug(
                'Document already available in DOAJ, setting id on Article Meta for: %s',
                doc_ref)
            self._articlemeta.set_doaj_id(pid, acronym, doaj_id)
            return

        try:
            xml = self._articlemeta.document(pid, acronym, fmt='xmldoaj')
        except Exception as e:
            logger.exception(e)
            logger.error('Fail to read document: %s', doc_ref)
            # An empty document fails validation below and is skipped.
            xml = u''

        if not self.xml_is_valid(xml):
            logger.error('Fail to parse xml document: %s', doc_ref)
            return

        logger.info('Sending document: %s', doc_ref)
        self.send_xml('%s.xml' % doc_ref, xml)

    def run(self):
        """Process every document of every configured ISSN.

        Does nothing (returns ``None``) when there is no authenticated session.
        """
        if not self.session:
            return None

        # Sent to the Article Meta client so that only documents without a
        # doaj_id field are listed.
        extra_filter = json.dumps({'doaj_id': {'$exists': 0}})

        for issn in self.issns:
            documents = self._articlemeta.documents(
                collection=self.collection,
                issn=issn,
                from_date=self.from_date,
                extra_filter=extra_filter)
            for document in documents:
                self._process_document(document)