コード例 #1
0
ファイル: views.py プロジェクト: Huchikoma/internship_MMQ
def getData(request):
    """
    Fetch the raw response of an OAI-PMH request on behalf of the caller.

    POST http://localhost/oai_pmh/api/getdata/
    POST data query='{"url":"value"}'

    The posted ``url`` must contain a query string. Everything before the
    first ``?`` is taken as the registry base URL and probed with an
    ``Identify`` call before the full URL is requested.

    Returns:
        A ``Response`` holding the remote body with HTTP 200, or, when a
        ``requests.HTTPError`` is caught, a ``Response`` holding the labelled
        error message with the status code of the failed remote response.

    Raises:
        OAIAPISerializeLabelledException: the posted data failed validation.
        OAIAPIException: the url has no ``?`` or the remote server answered
            with a status other than 200. These are not caught in this
            function and propagate to the caller.
    """
    try:
        serializer = IdentifySerializer(data=request.DATA)
        if not serializer.is_valid():
            raise OAIAPISerializeLabelledException(errors=serializer.errors, status=status.HTTP_400_BAD_REQUEST)

        url = request.POST['url']
        url_text = str(url)
        if '?' not in url_text:
            raise OAIAPIException(message='An error occurred, url malformed.', status=status.HTTP_400_BAD_REQUEST)

        registryURl = url_text.split('?')[0]
        # Check if the OAI Registry is available
        sickle = Sickle(registryURl)
        sickle.Identify()

        http_response = requests.get(url)
        if http_response.status_code == status.HTTP_200_OK:
            return Response(http_response.text, status=status.HTTP_200_OK)
        raise OAIAPIException(message='An error occurred.', status=http_response.status_code)
    except requests.HTTPError as err:
        # ``as`` is accepted by Python 2.6+ and 3; the comma form only by Python 2.
        # Exceptions lost their ``message`` attribute in Python 3, so fall back
        # to the string form when it is missing.
        message = err.message if hasattr(err, 'message') else str(err)
        content = APIMessage.getMessageLabelled(message)
        return Response(content, status=err.response.status_code)
コード例 #2
0
ファイル: test_harvesting.py プロジェクト: tulibraries/sickle
class TestCase(unittest.TestCase):
    """Tests for the Sickle OAI-PMH verbs, run against a patched ``harvest``.

    ``sickle.app.Sickle.harvest`` is replaced by ``mock_harvest`` while each
    test runs.
    """

    def __init__(self, methodName='runTest'):
        super(TestCase, self).__init__(methodName)
        self.patch = mock.patch('sickle.app.Sickle.harvest', mock_harvest)

    def setUp(self):
        self.patch.start()
        # Register the undo immediately: cleanups run even when the rest of
        # setUp raises, whereas tearDown would be skipped in that case and the
        # patch would stay active for the following tests.
        self.addCleanup(self.patch.stop)
        self.sickle = Sickle('http://localhost')

    def test_OAIResponse(self):
        response = self.sickle.harvest(verb='ListRecords',
                                       metadataPrefix='oai_dc')
        self.assertIsInstance(response.xml, etree._Element)
        self.assertIsInstance(response.raw, string_types)

    def test_broken_XML(self):
        response = self.sickle.harvest(verb='ListRecords',
                                       resumptionToken='ListRecordsBroken.xml')
        # Unparseable XML yields no tree, but the raw text is still available.
        self.assertIsNone(response.xml)
        self.assertIsInstance(response.raw, string_types)

    def test_ListRecords(self):
        records = self.sickle.ListRecords(metadataPrefix='oai_dc')
        self.assertEqual(len(list(records)), 8)

    def test_ListRecords_ignore_deleted(self):
        records = self.sickle.ListRecords(metadataPrefix='oai_dc',
                                          ignore_deleted=True)
        self.assertEqual(len(list(records)), 4)

    def test_ListSets(self):
        sets = list(self.sickle.ListSets())
        self.assertEqual(131, len(sets))
        # Smoke check: a set must be convertible to a dict.
        dict(sets[0])

    def test_ListMetadataFormats(self):
        mdfs = list(self.sickle.ListMetadataFormats())
        self.assertEqual(5, len(mdfs))
        # Smoke check: a metadata format must be convertible to a dict.
        dict(mdfs[0])

    def test_ListIdentifiers(self):
        records = self.sickle.ListIdentifiers(metadataPrefix='oai_dc')
        self.assertEqual(len(list(records)), 4)

    def test_ListIdentifiers_ignore_deleted(self):
        records = self.sickle.ListIdentifiers(metadataPrefix='oai_dc',
                                              ignore_deleted=True)
        # There are 2 deleted headers in the test data
        self.assertEqual(len(list(records)), 2)

    def test_Identify(self):
        identify = self.sickle.Identify()
        expected_attributes = (
            'repositoryName',
            'baseURL',
            'adminEmail',
            'earliestDatestamp',
            'deletedRecord',
            'granularity',
            'description',
            'oai_identifier',
            'sampleIdentifier',
        )
        for attribute in expected_attributes:
            self.assertTrue(hasattr(identify, attribute))
        dict(identify)

    def test_GetRecord(self):
        oai_id = 'oai:test.example.com:1996652'
        record = self.sickle.GetRecord(identifier=oai_id)
        self.assertEqual(record.header.identifier, oai_id)
        self.assertIn(oai_id, record.raw)
        self.assertEqual(record.header.datestamp, '2011-09-05T12:51:52Z')
        self.assertIsInstance(record.xml, etree._Element)
        # Smoke checks: the conversions below must not raise.
        binary_type(record)
        text_type(record)
        dict(record.header)
        self.assertEqual(dict(record), record.metadata)

    # Test OAI-specific exceptions

    @raises(BadArgument)
    def test_badArgument(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc', error='badArgument')

    @raises(CannotDisseminateFormat)
    def test_cannotDisseminateFormat(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='cannotDisseminateFormat')

    @raises(IdDoesNotExist)
    def test_idDoesNotExist(self):
        self.sickle.GetRecord(metadataPrefix='oai_dc', error='idDoesNotExist')

    @raises(NoSetHierarchy)
    def test_noSetHierarchy(self):
        self.sickle.ListSets(metadataPrefix='oai_dc', error='noSetHierarchy')

    @raises(BadResumptionToken)
    def test_badResumptionToken(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='badResumptionToken')

    @raises(NoRecordsMatch)
    def test_noRecordsMatch(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='noRecordsMatch')

    @raises(OAIError)
    def test_undefined_OAI_error_XML(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='undefinedError')

    def test_OAIResponseIterator(self):
        sickle = Sickle('fake_url', iterator=OAIResponseIterator)
        records = list(sickle.ListRecords(metadataPrefix='oai_dc'))
        self.assertEqual(len(records), 4)
コード例 #3
0
    def import_collections(self,
                           resourcesync_sourcedescription,
                           oaipmh_endpoint,
                           collection_keys=None,
                           institution_name=None,
                           resource_dir='resourcesync',
                           overwrite=False):
        '''
        Register an institution's ResourceSync-able collections in the database.

        Every CapabilityList URL found in the SourceDescription is treated as
        one collection. When `collection_keys` is given, only the collections
        whose key appears in it are registered; otherwise all of them are.

        Args:
          resourcesync_sourcedescription: a ResourceSync SourceDescription URL
              see https://www.openarchives.org/rs/1.1/resourcesync#SourceDesc
          oaipmh_endpoint: a OAI-PMH base URL
              see https://www.openarchives.org/OAI/openarchivesprotocol.html#Identify
          collection_keys: optional list of collection keys; when given, it
              is the exclusive list of collections to register
          institution_name: human-readable institution name to use in place
              of the OAI-PMH repositoryName
          resource_dir: local directory for copies of the synced resources,
              relative to the home directory "~"
          overwrite: whether rows in the database matching the
              `collection_key` and `institution_key` are overwritten

        Returns:
          None
        '''
        source_description = BeautifulSoup(
            get(resourcesync_sourcedescription).content, 'xml')
        capabilitylist_urls = [
            loc.string for loc in source_description.find_all('loc')
        ]

        sickle = Sickle(oaipmh_endpoint)
        sets = sickle.ListSets()
        identify = sickle.Identify()

        set_spec_to_name = {item.setSpec: item.setName for item in sets}

        # The endpoint URL with its last path segment dropped, ending in '/'.
        endpoint_parent = oaipmh_endpoint.split('/')[:-1]
        url_map_from = '/'.join(endpoint_parent) + '/'

        if institution_name is not None:
            i_name = institution_name
        else:
            i_name = identify.repositoryName

        def has_capability(capability, tag):
            # True when the tag has an <md> child whose 'capability' attribute
            # equals the requested capability.
            md = tag.md
            return (md is not None and 'capability' in md.attrs
                    and md['capability'] == capability)

        for capabilitylist_url in capabilitylist_urls:

            # For now, the setSpec is read from the path component of the
            # CapabilityList URL (which may have percent-encoded characters).
            url_path = urllib.parse.urlparse(capabilitylist_url).path
            set_spec = urllib.parse.unquote(url_path.split('/')[2])

            # When a subset of collections was requested, skip everything
            # outside of it.
            if collection_keys is not None and set_spec not in collection_keys:
                continue

            capability_list = BeautifulSoup(
                get(capabilitylist_url).content, 'xml')

            # A ResourceList should always exist; if it doesn't, skip this
            # collection.
            try:
                resourcelist_entry = capability_list.find(
                    functools.partial(has_capability, 'resourcelist'))
                resourcelist_url = resourcelist_entry.loc.string
            except AttributeError:
                # TODO: log it
                continue

            # A missing ChangeList is fine; predict what its URL will be.
            try:
                changelist_entry = capability_list.find(
                    functools.partial(has_capability, 'changelist'))
                changelist_url = changelist_entry.loc.string
            except AttributeError:
                sibling_parts = resourcelist_url.split('/')[:-1]
                changelist_url = '/'.join(
                    sibling_parts + ['changelist_0000.xml'])

            repository_identifier = identify.repositoryIdentifier
            set_name = set_spec_to_name[set_spec]

            print(
                self.__collection_identifier(i_name, repository_identifier,
                                             set_name, set_spec))

            # The collection can be added to the database now.
            # TODO: catch exceptions
            self.__insert_or_update(repository_identifier, i_name, set_spec,
                                    set_name, resourcelist_url,
                                    changelist_url, url_map_from,
                                    resource_dir, overwrite)
コード例 #4
0
# Separator placed between the values of a multi-valued metadata field.
_SEPARADOR = "||"

# Placeholder stored when a record does not carry a given field.
_AUSENTE = 'DADO AUSENTE NO PROVEDOR'

# Metadata fields harvested from every record, in output column order.
_CAMPOS_DC = ('title', 'creator', 'contributor', 'subject', 'description', 'coverage', 'date', 'format',
              'identifier', 'language', 'provider', 'publisher', 'relation', 'rights', 'source', 'type')

# Columns of the resulting table: the harvested fields plus the set the record came from.
_COLUNAS = _CAMPOS_DC + ('setSpec',)


def _juntar_valores(valores):
    """Join the values of one metadata field with ``||``.

    ``None`` entries are skipped. When no value remains, the
    'DADO AUSENTE NO PROVEDOR' placeholder is returned.
    """
    presentes = [valor for valor in (valores or ()) if valor is not None]
    if not presentes:
        return _AUSENTE
    return _SEPARADOR.join(presentes)


def _montar_linha(metadados, nome_provedor, conjunto):
    """Build one output row (a list ordered like ``_COLUNAS``) from a record's metadata dict."""
    linha = []
    for campo in _CAMPOS_DC:
        if campo in metadados:
            linha.append(_juntar_valores(metadados[campo]))
        elif campo == 'provider':
            # Records without their own 'provider' field are attributed to
            # the institution being harvested.
            linha.append(nome_provedor)
        else:
            linha.append(_AUSENTE)
    linha.append(conjunto)
    return linha


def coletar_PERIODICO(provedores, nomearquivo):
    """Harvest ``oai_dc`` records from every provider and save them as CSV.

    For each provider, every set returned by ``ListSets`` is harvested with
    ``ListRecords`` starting from ``ano + '-01-01'``. Multi-valued fields are
    joined with ``||``; missing fields get the 'DADO AUSENTE NO PROVEDOR'
    placeholder. Progress is reported through ``st.write``.

    Args:
        provedores: table of providers with a 'titulo' column (institution
            name) and a 'url' column (OAI-PMH endpoint).
        nomearquivo: path of the CSV file to write.

    Returns:
        None. The collected rows are written to ``nomearquivo``.

    Note:
        ``ano`` is not defined in this function; it is presumably a
        module-level string holding the starting year - TODO confirm.
    """
    # Rows are accumulated in a plain list and turned into a DataFrame once
    # at the end; concatenating DataFrames per record copies the whole table
    # on every iteration.
    linhas = []

    st.write('Iniciando a coleta....')
    contadorgeral = 0  # total number of records collected from all providers

    for nome_provedor, url_provider in zip(provedores['titulo'], provedores['url']):

        try:
            st.write('Coletando o provedor : ', nome_provedor)

            # Identify doubles as a check that the endpoint is answering.
            sickle = Sickle(url_provider)
            identify = sickle.Identify()
            if identify:

                for conjuntorevista in sickle.ListSets():
                    conjunto = conjuntorevista.setSpec

                    st.write("Coletando o conjunto: ", conjunto)

                    # Harvesting a set can fail, e.g. when the set has no
                    # records for the requested period.
                    try:
                        registros = sickle.ListRecords(
                            **{'metadataPrefix': 'oai_dc', 'set': conjunto, 'from': ano + '-01-01'})
                        contador = 0

                        for registro in registros:
                            contador += 1
                            linhas.append(_montar_linha(registro.metadata, nome_provedor, conjunto))

                        st.write('Registros coletados : ', contador)

                        contadorgeral += contador

                    except Exception as e:
                        # Print the cause, as the provider-level handler does,
                        # so that real failures are not mistaken for empty sets.
                        print(e)
                        st.write('Sem atualizações no provedor : ', nome_provedor)
                        st.write('**************************************************************')

            st.write('**************************************************************')

        except Exception as e:
            print(e)
            st.write('Erro no provedor')
            st.write('**************************************************************')

    resultado = pd.DataFrame(linhas, columns=list(_COLUNAS))

    st.write(resultado['provider'].value_counts())
    st.write('TOTAL DE REGISTROS COLETADOS DE TODOS OS PROVEDORES: ', contadorgeral)
    st.write('************** FIM DA COLETA **************')

    resultado.to_csv(nomearquivo, index=False)