def getData(request):
    """Proxy a raw OAI-PMH request and return the provider's response.

    POST http://localhost/oai_pmh/api/getdata/
    POST data query='{"url":"value"}'

    Validates the payload, confirms the target registry answers an
    Identify request, then fetches the full URL and returns its body.

    Raises:
        OAIAPIException: malformed URL or non-200 upstream response.
        OAIAPISerializeLabelledException: payload failed validation.
    """
    try:
        serializer = IdentifySerializer(data=request.DATA)
        if not serializer.is_valid():
            raise OAIAPISerializeLabelledException(
                errors=serializer.errors,
                status=status.HTTP_400_BAD_REQUEST)

        url = request.POST['url']
        # A usable OAI request URL must carry a query string (verb=...);
        # '?' in url is the idiomatic form of str.__contains__('?').
        if '?' not in str(url):
            raise OAIAPIException(
                message='An error occurred, url malformed.',
                status=status.HTTP_400_BAD_REQUEST)

        registry_url = str(url).split('?')[0]
        # Check if the OAI Registry is available before proxying the call.
        sickle = Sickle(registry_url)
        sickle.Identify()

        http_response = requests.get(url)
        if http_response.status_code == status.HTTP_200_OK:
            return Response(http_response.text, status=status.HTTP_200_OK)
        raise OAIAPIException(message='An error occurred.',
                              status=http_response.status_code)
    # 'except X, err' is Python-2-only syntax; 'as' works on 2.6+ and 3.x.
    except requests.HTTPError as err:
        # err.message was removed in Python 3; str(err) is portable.
        content = APIMessage.getMessageLabelled(str(err))
        return Response(content, status=err.response.status_code)
class TestCase(unittest.TestCase):
    """Tests for the Sickle OAI-PMH client, run entirely offline:
    ``sickle.app.Sickle.harvest`` is patched with ``mock_harvest`` so every
    request is answered from canned XML fixtures rather than the network.
    The expected record/set counts below match those fixtures.
    """

    def __init__(self, methodName='runTest'):
        super(TestCase, self).__init__(methodName)
        # Build the patch object once; it is started/stopped per test
        # in setUp/tearDown.
        self.patch = mock.patch('sickle.app.Sickle.harvest', mock_harvest)

    def setUp(self):
        # Activate the harvest mock and create a client; the base URL is
        # never contacted because harvest is patched.
        self.patch.start()
        self.sickle = Sickle('http://localhost')

    def tearDown(self):
        self.patch.stop()

    def test_OAIResponse(self):
        # A successful harvest exposes both the parsed XML tree and the
        # raw textual payload.
        response = self.sickle.harvest(verb='ListRecords',
                                       metadataPrefix='oai_dc')
        self.assertIsInstance(response.xml, etree._Element)
        self.assertIsInstance(response.raw, string_types)

    def test_broken_XML(self):
        # Unparseable XML yields xml=None but still exposes the raw text.
        response = self.sickle.harvest(
            verb='ListRecords', resumptionToken='ListRecordsBroken.xml')
        self.assertEqual(response.xml, None)
        self.assertIsInstance(response.raw, string_types)

    def test_ListRecords(self):
        # Fixture contains 8 records in total (including deleted ones).
        records = self.sickle.ListRecords(metadataPrefix='oai_dc')
        assert len([r for r in records]) == 8

    def test_ListRecords_ignore_deleted(self):
        # 4 of the 8 fixture records are deleted and must be skipped.
        records = self.sickle.ListRecords(metadataPrefix='oai_dc',
                                          ignore_deleted=True)
        num_records = len([r for r in records])
        assert num_records == 4

    def test_ListSets(self):
        set_iterator = self.sickle.ListSets()
        sets = [s for s in set_iterator]
        self.assertEqual(131, len(sets))
        # Sets must be convertible to a plain dict.
        dict(sets[0])

    def test_ListMetadataFormats(self):
        mdf_iterator = self.sickle.ListMetadataFormats()
        mdfs = [mdf for mdf in mdf_iterator]
        self.assertEqual(5, len(mdfs))
        # Metadata formats must be convertible to a plain dict.
        dict(mdfs[0])

    def test_ListIdentifiers(self):
        records = self.sickle.ListIdentifiers(metadataPrefix='oai_dc')
        self.assertEqual(len([r for r in records]), 4)

    def test_ListIdentifiers_ignore_deleted(self):
        records = self.sickle.ListIdentifiers(metadataPrefix='oai_dc',
                                              ignore_deleted=True)
        # There are 2 deleted headers in the test data
        num_records = len([r for r in records])
        self.assertEqual(num_records, 2)

    def test_Identify(self):
        # The Identify response must expose every repository property
        # the fixtures declare, and be convertible to a dict.
        identify = self.sickle.Identify()
        self.assertTrue(hasattr(identify, 'repositoryName'))
        self.assertTrue(hasattr(identify, 'baseURL'))
        self.assertTrue(hasattr(identify, 'adminEmail'))
        self.assertTrue(hasattr(identify, 'earliestDatestamp'))
        self.assertTrue(hasattr(identify, 'deletedRecord'))
        self.assertTrue(hasattr(identify, 'granularity'))
        self.assertTrue(hasattr(identify, 'description'))
        self.assertTrue(hasattr(identify, 'oai_identifier'))
        self.assertTrue(hasattr(identify, 'sampleIdentifier'))
        dict(identify)

    def test_GetRecord(self):
        oai_id = 'oai:test.example.com:1996652'
        record = self.sickle.GetRecord(identifier=oai_id)
        self.assertEqual(record.header.identifier, oai_id)
        self.assertIn(oai_id, record.raw)
        self.assertEqual(record.header.datestamp, '2011-09-05T12:51:52Z')
        self.assertIsInstance(record.xml, etree._Element)
        # Records must support bytes/str conversion and dict views.
        binary_type(record)
        text_type(record)
        dict(record.header)
        self.assertEqual(dict(record), record.metadata)

    # Test OAI-specific exceptions

    @raises(BadArgument)
    def test_badArgument(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='badArgument')

    @raises(CannotDisseminateFormat)
    def test_cannotDisseminateFormat(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='cannotDisseminateFormat')

    @raises(IdDoesNotExist)
    def test_idDoesNotExist(self):
        self.sickle.GetRecord(metadataPrefix='oai_dc',
                              error='idDoesNotExist')

    @raises(NoSetHierarchy)
    def test_noSetHierarchy(self):
        self.sickle.ListSets(metadataPrefix='oai_dc',
                             error='noSetHierarchy')

    @raises(BadResumptionToken)
    def test_badResumptionToken(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='badResumptionToken')

    @raises(NoRecordsMatch)
    def test_noRecordsMatch(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='noRecordsMatch')

    @raises(OAIError)
    def test_undefined_OAI_error_XML(self):
        # An error code with no dedicated class maps to the generic
        # OAIError.
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='undefinedError')

    def test_OAIResponseIterator(self):
        # With iterator=OAIResponseIterator, iteration yields whole
        # responses instead of individual records.
        sickle = Sickle('fake_url', iterator=OAIResponseIterator)
        records = [r for r in sickle.ListRecords(metadataPrefix='oai_dc')]
        self.assertEqual(len(records), 4)
def import_collections(self, resourcesync_sourcedescription, oaipmh_endpoint,
                       collection_keys=None, institution_name=None,
                       resource_dir='resourcesync', overwrite=False):
    '''
    Adds an institution's ResourceSync-able collections to the database.

    If `collection_keys` is specified, then add to the database only the
    collections specified by that list. Otherwise, add all collections to
    the database.

    Args:
        resourcesync_sourcedescription: a ResourceSync SourceDescription URL
            see https://www.openarchives.org/rs/1.1/resourcesync#SourceDesc
        oaipmh_endpoint: a OAI-PMH base URL
            see https://www.openarchives.org/OAI/openarchivesprotocol.html#Identify
        collection_keys: a list of collection keys specifying an exclusive
            list of collections to add to the database
        institution_name: human-readable name of the institution which
            should be used instead of its OAI-PMH repositoryName
        resource_dir: path to the local directory to store copies of the
            synced resources to, relative to the home directory "~"
        overwrite: whether or not to overwrite rows in the database that
            match the `collection_key` and `institution_key`
    Returns:
        None
    '''
    rs_soup = BeautifulSoup(
        get(resourcesync_sourcedescription).content, 'xml')
    capabilitylist_urls = [a.string for a in rs_soup.find_all('loc')]

    sickle = Sickle(oaipmh_endpoint)
    sets = sickle.ListSets()
    identify = sickle.Identify()
    set_spec_to_name = {z.setSpec: z.setName for z in sets}
    url_map_from = '/'.join(oaipmh_endpoint.split(sep='/')[:-1]) + '/'
    i_name = (institution_name if institution_name is not None
              else identify.repositoryName)

    # PEP 8 (E731): a named def instead of a lambda assignment.
    # True when `tag` carries an <md> child whose capability attribute
    # equals `c`.
    def has_capability(c, tag):
        return (tag.md is not None
                and 'capability' in tag.md.attrs
                and tag.md['capability'] == c)

    for capabilitylist_url in capabilitylist_urls:
        # For now, get setSpec from the path component of the
        # CapabilityList URL (which may have percent-encoded characters)
        set_spec = urllib.parse.unquote(
            urllib.parse.urlparse(capabilitylist_url).path.split(
                sep='/')[2])

        # If a subset of collections is specified, only add collections
        # that belong to it. Otherwise, add all collections.
        # (The original `collection_keys is not None and ...` clause was
        # redundant and has been dropped.)
        if collection_keys is None or set_spec in collection_keys:
            r_soup = BeautifulSoup(get(capabilitylist_url).content, 'xml')

            # ResourceList should always exist, but if it doesn't, log it
            # and skip this collection
            try:
                resourcelist_url = r_soup.find(
                    functools.partial(has_capability,
                                      'resourcelist')).loc.string
            except AttributeError:
                # TODO: log it
                continue

            # If no ChangeList exists yet, that's ok; predict what its
            # URL will be
            try:
                changelist_url = r_soup.find(
                    functools.partial(has_capability,
                                      'changelist')).loc.string
            except AttributeError:
                changelist_url = '/'.join(
                    resourcelist_url.split(sep='/')[:-1]
                    + ['changelist_0000.xml'])

            print(
                self.__collection_identifier(i_name,
                                             identify.repositoryIdentifier,
                                             set_spec_to_name[set_spec],
                                             set_spec))

            # We can add the collection to the database now
            # TODO: catch exceptions
            self.__insert_or_update(identify.repositoryIdentifier, i_name,
                                    set_spec, set_spec_to_name[set_spec],
                                    resourcelist_url, changelist_url,
                                    url_map_from, resource_dir, overwrite)
def coletar_metadado(metadados, campo, padrao='DADO AUSENTE NO PROVEDOR'):
    """Return the '||'-joined values of a possibly multivalued Dublin Core
    field, or `padrao` when the field is absent or empty."""
    if campo in metadados and metadados[campo]:
        return "||".join(metadados[campo])
    return padrao


def coletar_PERIODICO(provedores, nomearquivo):
    """Harvest oai_dc records from every OAI-PMH provider listed in
    `provedores` and write the combined result to the CSV `nomearquivo`.

    Args:
        provedores: table-like object with positional-indexable columns
            'titulo' (provider name) and 'url' (OAI-PMH endpoint).
        nomearquivo: path of the CSV file to write.

    Relies on module-level names `st` (streamlit), `Sickle` and `ano`
    (harvest start year as a string) — assumed defined elsewhere in the
    file; TODO confirm.

    Bug fixed: for a record with more than one dc:format value the
    original accumulated with the *builtin* `format` instead of the
    `formatm` variable, raising a TypeError that the broad except then
    swallowed, silently dropping the rest of the set. The per-field
    copy-paste accumulation is now a single helper, `coletar_metadado`.
    """
    colunas = ['title', 'creator', 'contributor', 'subject', 'description',
               'coverage', 'date', 'format', 'identifier', 'language',
               'provider', 'publisher', 'relation', 'rights', 'source',
               'type', 'setSpec']
    # Dataframe accumulating the harvest result across all providers.
    resultado = pd.DataFrame(columns=colunas)
    st.write('Iniciando a coleta....')
    contadorgeral = 0  # total records harvested over all providers

    # Walk the provider spreadsheet row by row.
    for n in range(len(provedores['titulo'])):
        try:
            provider = provedores['titulo'][n]    # institution acronym
            url_provider = provedores['url'][n]   # OAI-PMH endpoint URL
            st.write('Coletando o provedor : ', provider)

            # Initialize the client; Identify() doubles as a liveness
            # check of the endpoint.
            sickle = Sickle(url_provider)
            identify = sickle.Identify()
            if identify:
                sets = sickle.ListSets()
                for conjuntorevista in sets:
                    conjunto = conjuntorevista.setSpec
                    st.write("Coletando o conjunto: ", conjunto)
                    # Harvesting a set can raise when it yields 0 records.
                    try:
                        registros = sickle.ListRecords(
                            **{'metadataPrefix': 'oai_dc',
                               'set': conjunto,
                               'from': ano + '-01-01'})
                        contador = 0  # records harvested from this set
                        for registro in registros:
                            contador = contador + 1
                            # Per-record metadata as a dict of lists.
                            metadados = registro.metadata
                            # dc:provider falls back to the spreadsheet
                            # name when the record omits the field.
                            if 'provider' in metadados:
                                provider = coletar_metadado(metadados,
                                                            'provider')
                            else:
                                provider = provedores['titulo'][n]
                            # One row, in the exact `colunas` order.
                            linha = [
                                coletar_metadado(metadados, 'title'),
                                coletar_metadado(metadados, 'creator'),
                                coletar_metadado(metadados, 'contributor'),
                                coletar_metadado(metadados, 'subject'),
                                coletar_metadado(metadados, 'description'),
                                coletar_metadado(metadados, 'coverage'),
                                coletar_metadado(metadados, 'date'),
                                coletar_metadado(metadados, 'format'),
                                coletar_metadado(metadados, 'identifier'),
                                coletar_metadado(metadados, 'language'),
                                provider,
                                coletar_metadado(metadados, 'publisher'),
                                coletar_metadado(metadados, 'relation'),
                                coletar_metadado(metadados, 'rights'),
                                coletar_metadado(metadados, 'source'),
                                coletar_metadado(metadados, 'type'),
                                conjunto,
                            ]
                            dadoscoletados = pd.DataFrame([linha],
                                                          columns=colunas)
                            # Append the row to the running result.
                            resultado = pd.concat(
                                [resultado, dadoscoletados], sort=False)
                        st.write('Registros coletados : ', contador)
                        contadorgeral = contadorgeral + contador
                    except Exception:
                        # Best-effort: an empty/failed set is reported and
                        # skipped, matching the original behavior.
                        st.write('Sem atualizações no provedor : ',
                                 provider)
                        st.write('**************************************************************')
                        continue
                    st.write('**************************************************************')
        except Exception as e:
            # Provider-level failure (bad URL, endpoint down): report and
            # move on to the next provider.
            print(e)
            st.write('Erro no provedor')
            st.write('**************************************************************')
            continue

    st.write(resultado['provider'].value_counts())
    st.write('TOTAL DE REGISTROS COLETADOS DE TODOS OS PROVEDORES: ',
             contadorgeral)
    st.write('************** FIM DA COLETA **************')
    resultado.to_csv(nomearquivo, index=False)