Beispiel #1
0
class ArticlesTest(unittest.TestCase):
    """Exercise ``Articles.get`` and ``Articles.search``.

    NOTE(review): the fixed article id and ISSN suggest these tests talk to
    a live service rather than a stub -- confirm before running offline.
    """

    def setUp(self):
        """Create a fresh ``Articles`` client for every test."""
        self.ac = Articles()

    def test_get_article(self):
        """Fetching a known id yields a response containing ``bibjson``."""
        response = self.ac.get('255723f2f2374f1fbb8865eeb044c9d2')

        # assertIn reports the container on failure, unlike assertTrue(x in y).
        self.assertIn('bibjson', response)

    def test_get_article_not_available(self):
        """Fetching an unknown id yields a ``not_found`` status."""
        response = self.ac.get('not available')

        self.assertEqual(response['status'], 'not_found')

    def test_search_invalid_page_size(self):
        """A non-numeric page size is rejected with ``ValueError``."""
        with self.assertRaises(ValueError):
            # list() drains the iterator so the error is raised even if
            # search() is lazy.
            list(self.ac.search('issn:1806-9940', pagesize='invalid'))

    def test_search_invalid_page_size_1(self):
        """A page size below the accepted 10..100 range is rejected."""
        with self.assertRaises(ValueError):
            list(self.ac.search('issn:1806-9940', pagesize=9))

    def test_search_invalid_page_size_2(self):
        """A page size above the accepted 10..100 range is rejected."""
        with self.assertRaises(ValueError):
            list(self.ac.search('issn:1806-9940', pagesize=101))

    def test_search(self):
        """Searching by a known id returns exactly one article."""
        articles = list(self.ac.search('id:255723f2f2374f1fbb8865eeb044c9d2'))

        self.assertEqual(1, len(articles))
Beispiel #2
0
class Dumper(object):
    """Upload Article Meta documents that are not yet registered in DOAJ.

    For each document of the configured collection/ISSNs the dumper first
    tries to locate the record in DOAJ (by metadata, then by DOI).  When a
    record is found its DOAJ id is stored back on Article Meta; otherwise
    the document is fetched in the ``xmldoaj`` format, validated against
    the DOAJ XSD and posted to the DOAJ upload endpoint.
    """

    # Characters that receive a backslash prefix before a title is embedded
    # in a DOAJ search query.
    _QUERY_SPECIAL_CHARS = frozenset(u'+-&|!(){}[]^"~*?:\\')

    def __init__(self, collection, issns=None, output_file=None, from_date=FROM,
        user=None, password=None, api_token=None):
        """Set up the Article Meta client, the DOAJ session and the schema.

        :param collection: collection identifier passed to Article Meta.
        :param issns: ISSNs to process; ``None``/empty means a single pass
            with ``issn=None``.
        :param output_file: accepted for interface compatibility; not used
            by this class.
        :param from_date: lower date bound passed to Article Meta.
        :param user: DOAJ account user name.
        :param password: DOAJ account password.
        :param api_token: token handed to the ``Articles`` search client.
        """
        self._articlemeta = utils.articlemeta_server()
        self.collection = collection
        self.from_date = from_date
        self.user = user
        self.password = password
        self.issns = issns or [None]
        # None when the login failed; run() checks this before doing work.
        self.session = self.authenticated_session()
        self.parse_schema()
        self.doaj_articles = Articles(usertoken=api_token)

    def _search_single_id(self, query, label):
        """Return the DOAJ id when ``query`` matches exactly one article.

        :param query: query string handed to ``self.doaj_articles.search``.
        :param label: short description of the query kind, used in the
            failure log message.
        :returns: the ``id`` of the single hit, or ``None`` when the search
            fails or does not return exactly one result.
        """
        try:
            hits = list(self.doaj_articles.search(query))
        except Exception:
            # Best effort: a failed lookup is treated as "not found".
            # ``Exception`` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            logger.debug('Fail to query DOAJ API using %s: %s', label, query)
            return None

        if len(hits) == 1:
            return hits[0].get('id')

        return None

    def _doaj_id_by_meta(self, issn, publication_year, title):
        """Look up a DOAJ id by ISSN, publication year and title.

        Special query characters in ``title`` are backslash-escaped first.
        Returns the id of the single match, otherwise ``None``.
        """
        special = self._QUERY_SPECIAL_CHARS
        escaped_title = ''.join(
            u'\\' + char if char in special else char for char in title
        )

        query = 'issn:%s AND year:%s AND title:%s' % (
            issn,
            publication_year,
            escaped_title
        )

        return self._search_single_id(query, 'metadata')

    def _doaj_id_by_doi(self, doi):
        """Look up a DOAJ id by DOI; ``None`` unless exactly one match."""
        return self._search_single_id('doi:%s' % doi, 'DOI')

    def _doaj_id(self, document):
        """Return the DOAJ id of ``document`` or ``None`` when not found.

        The metadata lookup is tried first (only when the document has an
        original title); the DOI lookup is the fallback.
        """
        doaj_id = None

        title = document.original_title()
        if title:
            doaj_id = self._doaj_id_by_meta(
                document.scielo_issn,
                document.publication_date[0:4],
                title
            )

        if doaj_id:
            return doaj_id

        if document.doi:
            return self._doaj_id_by_doi(document.doi)

        return None

    def parse_schema(self):
        """Compile ``DOAJ_XSD`` into ``self.doaj_schema``.

        Returns ``False`` when the schema cannot be parsed; in that case
        ``self.doaj_schema`` is left as ``None``.
        """
        # Always define the attribute so later access never depends on
        # whether parsing succeeded.
        self.doaj_schema = None

        xsd = BytesIO(DOAJ_XSD.encode('utf-8'))
        try:
            sch_doc = etree.parse(xsd)
            sch = etree.XMLSchema(sch_doc)
        except Exception as e:
            logger.exception(e)
            logger.error('Fail to parse XML')
            return False

        self.doaj_schema = sch

    def authenticated_session(self):
        """Log in to DOAJ and return the authenticated session.

        :returns: a ``requests.Session`` on success, ``None`` when the
            response status is not 200 or the page reports incorrect
            credentials.
        """
        auth_url = 'https://doaj.org/account/login'
        login = {'username': self.user, 'password': self.password}

        session = requests.Session()
        try:
            request = session.post(auth_url, data=login)
        except requests.exceptions.SSLError:
            # NOTE(security): the credentials are re-sent with certificate
            # verification disabled. Kept for backward compatibility, but
            # this exposes the login to interception -- review.
            logger.debug('Authentication without SSL validation')
            request = session.post(auth_url, data=login, verify=False)

        if request.status_code != 200:
            logger.debug('Authentication attempt done')
            return None

        if u'Incorrect' in request.text:
            logger.debug('Incorrect username or password')
            return None

        logger.debug('Authenticated successfully')

        return session

    def xml_is_valid(self, xml):
        """Return ``True`` when ``xml`` is well formed and schema-valid.

        :param xml: XML document as text.
        :returns: ``False`` when the text cannot be parsed or does not
            validate against ``self.doaj_schema`` (including the case where
            the schema itself is unavailable).
        """
        try:
            xml_doc = etree.parse(StringIO(xml))
        except Exception as e:
            logger.exception(e)
            logger.error('Fail to parse XML')
            return False

        logger.debug('XML is well formed')

        try:
            # assertValid raises when the document does not conform.
            self.doaj_schema.assertValid(xml_doc)
        except Exception as e:
            logger.exception(e)
            logger.error('Fail to validate XML')
            return False

        logger.debug('XML is valid')
        return True

    def send_xml(self, file_name, file_data):
        """Upload one XML file to DOAJ.

        :param file_name: name reported for the uploaded file.
        :param file_data: file content.
        :returns: ``True`` when the response confirms the upload, ``False``
            on connection errors or any other response.
        """
        files = {'file': (file_name, file_data)}

        try:
            response = self.session.post(
                'https://doaj.org/publisher/uploadfile',
                data={'schema': 'doaj'},
                files=files
            )
        except requests.ConnectionError:
            logger.debug('Fail to send document to DOAJ')
            return False

        if u'successfully uploaded' in response.text:
            logger.info('Document Sent')
            return True

        # Presumably the upload can fail because the login expired, so log
        # in again. The new session must be stored to take effect; the old
        # one is kept when the re-login itself fails.
        fresh_session = self.authenticated_session()
        if fresh_session is not None:
            self.session = fresh_session

        logger.error('Document not Sent: %s', response.status_code)
        return False

    def run(self):
        """Process every document of the configured collection and ISSNs.

        Does nothing (returns ``None``) when there is no authenticated
        session.  Documents already carrying a ``doaj_id`` are skipped,
        documents found in DOAJ get their id stored on Article Meta, and
        the remaining ones are validated and uploaded.
        """
        if not self.session:
            return None

        for issn in self.issns:
            documents = self._articlemeta.documents(
                collection=self.collection,
                issn=issn,
                from_date=self.from_date
            )
            for document in documents:
                pid = document.publisher_id
                acronym = document.collection_acronym
                doc_id = '%s_%s' % (pid, acronym)

                logger.info('Reading document: %s', doc_id)

                if document.data.get('doaj_id', None):
                    logger.debug('Document already available in DOAJ: %s', doc_id)
                    continue

                doaj_id = self._doaj_id(document)

                if doaj_id:
                    logger.debug('Document already available in DOAJ, setting id on Article Meta for: %s', doc_id)
                    self._articlemeta.set_doaj_id(pid, acronym, doaj_id)
                    continue

                try:
                    xml = self._articlemeta.document(pid, acronym, fmt='xmldoaj')
                except Exception as e:
                    logger.exception(e)
                    logger.error('Fail to read document: %s', doc_id)
                    # An empty document fails validation below and is
                    # skipped there.
                    xml = u''

                if not self.xml_is_valid(xml):
                    logger.error('Fail to parse xml document: %s', doc_id)
                    continue

                logger.info('Sending document: %s', doc_id)

                self.send_xml('%s.xml' % doc_id, xml)
Beispiel #3
0
class Dumper(object):
    """Upload Article Meta documents that are not yet registered in DOAJ.

    Only documents without a ``doaj_id`` are requested from Article Meta.
    For each one the dumper tries to locate the record in DOAJ (by
    metadata, then by DOI).  When a record is found its DOAJ id is stored
    back on Article Meta; otherwise the document is fetched in the
    ``xmldoaj`` format, validated against the DOAJ XSD and posted to the
    DOAJ upload endpoint.
    """

    # Characters that receive a backslash prefix before a title is embedded
    # in a DOAJ search query.
    _QUERY_SPECIAL_CHARS = frozenset(u'+-&|!(){}[]^"~*?:\\')

    def __init__(self,
                 collection,
                 issns=None,
                 output_file=None,
                 from_date=FROM,
                 user=None,
                 password=None,
                 api_token=None):
        """Set up the Article Meta client, the DOAJ session and the schema.

        :param collection: collection identifier passed to Article Meta.
        :param issns: ISSNs to process; ``None``/empty means a single pass
            with ``issn=None``.
        :param output_file: accepted for interface compatibility; not used
            by this class.
        :param from_date: lower date bound passed to Article Meta.
        :param user: DOAJ account user name.
        :param password: DOAJ account password.
        :param api_token: token handed to the ``Articles`` search client.
        """
        self._articlemeta = utils.articlemeta_server()
        self.collection = collection
        self.from_date = from_date
        self.user = user
        self.password = password
        self.issns = issns or [None]
        # None when the login failed; run() checks this before doing work.
        self.session = self.authenticated_session()
        self.parse_schema()
        self.doaj_articles = Articles(usertoken=api_token)

    def _search_single_id(self, query, label):
        """Return the DOAJ id when ``query`` matches exactly one article.

        :param query: query string handed to ``self.doaj_articles.search``.
        :param label: short description of the query kind, used in the
            failure log message.
        :returns: the ``id`` of the single hit, or ``None`` when the search
            fails or does not return exactly one result.
        """
        try:
            hits = list(self.doaj_articles.search(query))
        except Exception:
            # Best effort: a failed lookup is treated as "not found".
            # ``Exception`` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            logger.debug('Fail to query DOAJ API using %s: %s', label, query)
            return None

        if len(hits) == 1:
            return hits[0].get('id')

        return None

    def _doaj_id_by_meta(self, issn, publication_year, title):
        """Look up a DOAJ id by ISSN, publication year and title.

        Special query characters in ``title`` are backslash-escaped first.
        Returns the id of the single match, otherwise ``None``.
        """
        special = self._QUERY_SPECIAL_CHARS
        escaped_title = ''.join(
            u'\\' + char if char in special else char for char in title)

        query = 'issn:%s AND year:%s AND title:%s' % (issn, publication_year,
                                                      escaped_title)

        return self._search_single_id(query, 'metadata')

    def _doaj_id_by_doi(self, doi):
        """Look up a DOAJ id by DOI; ``None`` unless exactly one match."""
        return self._search_single_id('doi:%s' % doi, 'DOI')

    def _doaj_id(self, document):
        """Return the DOAJ id of ``document`` or ``None`` when not found.

        The metadata lookup is tried first (only when the document has an
        original title); the DOI lookup is the fallback.
        """
        doaj_id = None

        title = document.original_title()
        if title:
            doaj_id = self._doaj_id_by_meta(document.journal.scielo_issn,
                                            document.publication_date[0:4],
                                            title)

        if doaj_id:
            return doaj_id

        if document.doi:
            return self._doaj_id_by_doi(document.doi)

        return None

    def parse_schema(self):
        """Compile ``DOAJ_XSD`` into ``self.doaj_schema``.

        Returns ``False`` when the schema cannot be parsed; in that case
        ``self.doaj_schema`` is left as ``None``.
        """
        # Always define the attribute so later access never depends on
        # whether parsing succeeded.
        self.doaj_schema = None

        xsd = BytesIO(DOAJ_XSD.encode('utf-8'))
        try:
            sch_doc = etree.parse(xsd)
            sch = etree.XMLSchema(sch_doc)
        except Exception as e:
            logger.exception(e)
            logger.error('Fail to parse XML')
            return False

        self.doaj_schema = sch

    def authenticated_session(self):
        """Log in to DOAJ and return the authenticated session.

        :returns: a ``requests.Session`` on success, ``None`` when the
            response status is not 200 or the page reports incorrect
            credentials.
        """
        auth_url = 'https://doaj.org/account/login'
        login = {'username': self.user, 'password': self.password}

        session = requests.Session()
        try:
            request = session.post(auth_url, data=login)
        except requests.exceptions.SSLError:
            # NOTE(security): the credentials are re-sent with certificate
            # verification disabled. Kept for backward compatibility, but
            # this exposes the login to interception -- review.
            logger.debug('Authentication without SSL validation')
            request = session.post(auth_url, data=login, verify=False)

        if request.status_code != 200:
            logger.debug('Authentication attempt done')
            return None

        if u'Incorrect' in request.text:
            logger.debug('Incorrect username or password')
            return None

        logger.debug('Authenticated successfully')

        return session

    def xml_is_valid(self, xml):
        """Return ``True`` when ``xml`` is well formed and schema-valid.

        :param xml: XML document as text.
        :returns: ``False`` when the text cannot be parsed or does not
            validate against ``self.doaj_schema`` (including the case where
            the schema itself is unavailable).
        """
        try:
            xml_doc = etree.parse(StringIO(xml))
        except Exception as e:
            logger.exception(e)
            logger.error('Fail to parse XML')
            return False

        logger.debug('XML is well formed')

        try:
            # assertValid raises when the document does not conform.
            self.doaj_schema.assertValid(xml_doc)
        except Exception as e:
            logger.exception(e)
            logger.error('Fail to validate XML')
            return False

        logger.debug('XML is valid')
        return True

    def send_xml(self, file_name, file_data):
        """Upload one XML file to DOAJ.

        :param file_name: name reported for the uploaded file.
        :param file_data: file content.
        :returns: ``True`` when the response confirms the upload, ``False``
            on connection errors or any other response.
        """
        files = {'file': (file_name, file_data)}

        try:
            response = self.session.post(
                'https://doaj.org/publisher/uploadfile',
                data={'schema': 'doaj'},
                files=files)
        except requests.ConnectionError:
            logger.debug('Fail to send document to DOAJ')
            return False

        if u'File uploaded and waiting to be processed' in response.text:
            logger.info('Document Sent')
            return True

        # Presumably the upload can fail because the login expired, so log
        # in again. The new session must be stored to take effect; the old
        # one is kept when the re-login itself fails.
        fresh_session = self.authenticated_session()
        if fresh_session is not None:
            self.session = fresh_session

        logger.error('Document not Sent: %s', response.status_code)
        return False

    def run(self):
        """Process every document of the configured collection and ISSNs.

        Does nothing (returns ``None``) when there is no authenticated
        session.  Documents already carrying a ``doaj_id`` are skipped,
        documents found in DOAJ get their id stored on Article Meta, and
        the remaining ones are validated and uploaded.
        """
        if not self.session:
            return None

        # Ask Article Meta only for documents lacking a doaj_id field.
        extra_filter = json.dumps({'doaj_id': {'$exists': 0}})

        for issn in self.issns:
            documents = self._articlemeta.documents(
                collection=self.collection,
                issn=issn,
                from_date=self.from_date,
                extra_filter=extra_filter)
            for document in documents:
                pid = document.publisher_id
                acronym = document.collection_acronym
                doc_id = '%s_%s' % (pid, acronym)

                logger.info('Reading document: %s', doc_id)

                # Checked again here in case the filter did not exclude it.
                if document.data.get('doaj_id', None):
                    logger.debug('Document already available in DOAJ: %s',
                                 doc_id)
                    continue

                doaj_id = self._doaj_id(document)

                if doaj_id:
                    logger.debug(
                        'Document already available in DOAJ, setting id on Article Meta for: %s',
                        doc_id)
                    self._articlemeta.set_doaj_id(pid, acronym, doaj_id)
                    continue

                try:
                    xml = self._articlemeta.document(
                        pid, acronym, fmt='xmldoaj')
                except Exception as e:
                    logger.exception(e)
                    logger.error('Fail to read document: %s', doc_id)
                    # An empty document fails validation below and is
                    # skipped there.
                    xml = u''

                if not self.xml_is_valid(xml):
                    logger.error('Fail to parse xml document: %s', doc_id)
                    continue

                logger.info('Sending document: %s', doc_id)

                self.send_xml('%s.xml' % doc_id, xml)