Exemple #1
0
 def test_override_encoding(self):
     """The ``encoding`` given to ``Sickle`` ends up on the HTTP response object."""
     fake_response = Mock(text='<xml/>')
     # Every call to requests.get inside sickle.app hands back the fake response.
     with patch('sickle.app.requests.get', Mock(return_value=fake_response)):
         client = Sickle('url', encoding='encoding')
         client.ListSets()
         self.assertEqual(fake_response.encoding, 'encoding')
Exemple #2
0
    def test_list_oai_collections(self, community):
        """Build a name-sorted list of the collections "owned" by a community.

        Collections are a secondary grouping concept in OAI. The collection
        ids are discovered from the set specs of the community's record
        headers (OAI-PMH verb ListIdentifiers); their human-readable names are
        then looked up in the repository's set list (OAI-PMH verb ListSets).

        Args:
            community: object providing ``repository.base_url`` and
                ``identifier`` (the set spec of the community).

        Returns:
            A list of ``(setSpec, setName)`` tuples sorted by name, which is
            also stored on ``self.collections``. ``setName`` is ``None`` for a
            collection that ListSets did not report; such entries sort first.
        """
        sickle = Sickle(community.repository.base_url)

        # Retrieve the record headers associated with the community.
        record_headers = sickle.ListIdentifiers(metadataPrefix='oai_dc',
                                                set=community.identifier)

        # Register every collection id found in the headers' set specs.
        community_collections = {}
        for header in record_headers:
            for set_spec in header.setSpecs:
                if set_spec.startswith('col'):
                    community_collections[set_spec] = None

        # Map names onto the registered ids: {setSpec: setName}.
        for oai_set in sickle.ListSets():
            # Bug in oai? In the set results a 'collection' carries a 'com'
            # prefix, so the first three characters are swapped for 'col'.
            collection_id = 'col' + oai_set.setSpec[3:]
            if collection_id in community_collections:
                community_collections[collection_id] = oai_set.setName

        # Sort by name. Collections without a name keep None as their value;
        # comparing None with str raises TypeError on Python 3, so the key
        # places them first (the ordering Python 2 produced) without ever
        # comparing None to a string.
        self.collections = sorted(
            community_collections.items(),
            key=lambda item: (item[1] is not None, item[1] or ''))
        return self.collections
Exemple #3
0
 def _get_random_configuration(self):
     """Build a random harvesting configuration from the known test servers.

     Between one and three servers are drawn (with replacement) from
     ``TestDynamicListRecords._get_servers()``. For each one, up to 20 set
     specs are read through ListSets and a random selection of them is
     stored under the server's ``'sets'`` key.

     Returns:
         dict shaped like
         ``{'contexts': {'TEST': {'SERVER<i>': {'url': ..., 'sets': [...]}}}}``;
         ``'sets'`` is only present when at least one set was selected.
     """
     self.servers = TestDynamicListRecords._get_servers()
     test_key = 'TEST'
     configuration = {'contexts': {test_key: {}}}
     context = configuration['contexts'][test_key]
     servers = random.choices(self.servers,
                              k=random.randint(1, min(3,
                                                      len(self.servers))))
     i = 0
     for server in servers:
         server_key = 'SERVER' + str(i)
         entry = {'url': server}
         context[server_key] = entry
         set_names = []
         server_sickle = Sickle(server)
         try:
             for oai_set in server_sickle.ListSets():
                 if len(set_names) == 20:
                     break
                 set_names.append(oai_set.setSpec)
         except Exception:
             # Best effort: a server whose sets cannot be listed is skipped.
             # The index is deliberately not advanced, so the next server
             # reuses (and overwrites) this key. Catching Exception rather
             # than everything lets KeyboardInterrupt/SystemExit propagate.
             continue
         sets = set(
             random.choices(set_names,
                            k=random.randint(0, min(20, len(set_names)))))
         if sets:
             entry['sets'] = list(sets)
         i += 1
     return configuration
Exemple #4
0
def list_sets_with_counts(repository_url):
    """Return up to 500 sets of a repository together with their identifiers.

    Each entry is a dict with the keys ``'setSpec'``, ``'setName'`` and
    ``'set_identifiers'`` (the result of ``list_identifiers`` for that set).
    Any error other than the iterator running dry is turned into
    ``abort(400, e)``.
    """
    set_iterator = Sickle(repository_url).ListSets()
    collected = []
    remaining = 500

    try:
        while remaining > 0:
            current = set_iterator.next()
            identifiers = list_identifiers(current.setSpec)
            collected.append(dict(setSpec=current.setSpec,
                                  setName=current.setName,
                                  set_identifiers=identifiers))
            remaining -= 1
    except StopIteration:
        # Fewer than 500 sets: the repository has been read completely.
        pass
    except Exception as e:
        abort(400, e)

    return collected
Exemple #5
0
 def test_override_encoding(self):
     """ListSets issues exactly one GET for the base URL with the ListSets verb."""
     fake_response = Mock(text='<xml/>', content='<xml/>', status_code=200)
     fake_get = Mock(return_value=fake_response)
     with patch.object(Session, 'get', fake_get):
         Sickle('url', encoding='encoding').ListSets()
         expected_params = {'verb': 'ListSets'}
         fake_get.assert_called_once_with('url', params=expected_params)
Exemple #6
0
    def list_oai_community_sets(self, repository):
        """Collect the communities (a grouping concept in OAI) of a repository.

        Uses the OAI-PMH verb ListSets. Every set whose spec starts with
        ``'com'`` is appended to ``self.communities`` as a
        ``(setSpec, setName)`` tuple, and the list is then sorted by name.

        Args:
            repository: object providing ``base_url``.

        Returns:
            None. If the client cannot be created or the set listing cannot
            be started, ``self.communities`` is left untouched.
        """
        try:
            sickle = Sickle(repository.base_url)
            sets = sickle.ListSets()
        except Exception:
            # Best effort: an unreachable or misbehaving repository simply
            # contributes no communities. Exception (not a bare except) keeps
            # KeyboardInterrupt/SystemExit from being swallowed.
            return

        # Filter the set list down to community tuples (id, readable name).
        for oai_set in sets:
            if oai_set.setSpec.startswith('com'):
                self.communities.append((oai_set.setSpec, oai_set.setName))

        self.communities = sorted(self.communities, key=lambda item: item[1])
Exemple #7
0
def list_sets(repository_url=None):
    """Return up to 500 sets of a repository as ``{'setSpec', 'setName'}`` dicts.

    When ``repository_url`` is falsy the URL comes from
    ``admin.get_repository_url()``. Any error other than the iterator running
    dry is turned into ``abort(400, e)``.
    """
    if not repository_url:
        repository_url = admin.get_repository_url()
    set_iterator = Sickle(repository_url).ListSets()
    collected = []
    fetched = 0

    try:
        while fetched < 500:
            current = set_iterator.next()
            collected.append(dict(setSpec=current.setSpec,
                                  setName=current.setName))
            fetched += 1
    except StopIteration:
        # Fewer than 500 sets: the repository has been read completely.
        pass
    except Exception as e:
        abort(400, e)

    return collected
class TestCase(unittest.TestCase):
    """Exercise the Sickle client with ``Sickle.harvest`` replaced by ``mock_harvest``."""

    def __init__(self, methodName='runTest'):
        """Prepare (but do not start) the patch that swaps in ``mock_harvest``."""
        super(TestCase, self).__init__(methodName)
        self.patch = mock.patch('sickle.app.Sickle.harvest', mock_harvest)

    def setUp(self):
        """Activate the harvest patch and create a fresh client."""
        self.patch.start()
        self.sickle = Sickle('http://localhost')

    def tearDown(self):
        """Deactivate the harvest patch."""
        self.patch.stop()

    def test_OAIResponse(self):
        """A harvested response exposes parsed XML and the raw text."""
        response = self.sickle.harvest(verb='ListRecords',
                                       metadataPrefix='oai_dc')
        self.assertIsInstance(response.xml, etree._Element)
        self.assertIsInstance(response.raw, string_types)

    def test_broken_XML(self):
        """For the broken fixture ``xml`` equals None while ``raw`` is still text."""
        response = self.sickle.harvest(verb='ListRecords',
                                       resumptionToken='ListRecordsBroken.xml')
        self.assertEqual(response.xml, None)
        self.assertIsInstance(response.raw, string_types)

    def test_ListRecords(self):
        """ListRecords yields 8 records."""
        harvested = list(self.sickle.ListRecords(metadataPrefix='oai_dc'))
        assert len(harvested) == 8

    def test_ListRecords_ignore_deleted(self):
        """ListRecords with ``ignore_deleted`` yields 4 records."""
        harvested = list(self.sickle.ListRecords(metadataPrefix='oai_dc',
                                                 ignore_deleted=True))
        num_records = len(harvested)
        assert num_records == 4

    def test_ListSets(self):
        """ListSets yields 131 sets and a set converts to a dict."""
        sets = list(self.sickle.ListSets())
        self.assertEqual(131, len(sets))
        dict(sets[0])

    def test_ListMetadataFormats(self):
        """ListMetadataFormats yields 5 formats and a format converts to a dict."""
        mdfs = list(self.sickle.ListMetadataFormats())
        self.assertEqual(5, len(mdfs))
        dict(mdfs[0])

    def test_ListIdentifiers(self):
        """ListIdentifiers yields 4 headers."""
        headers = list(self.sickle.ListIdentifiers(metadataPrefix='oai_dc'))
        self.assertEqual(len(headers), 4)

    def test_ListIdentifiers_ignore_deleted(self):
        """ListIdentifiers with ``ignore_deleted`` yields 2 headers."""
        headers = list(self.sickle.ListIdentifiers(metadataPrefix='oai_dc',
                                                   ignore_deleted=True))
        # There are 2 deleted headers in the test data
        num_records = len(headers)
        self.assertEqual(num_records, 2)

    def test_Identify(self):
        """The Identify result carries the expected attributes and converts to a dict."""
        identify = self.sickle.Identify()
        expected_attributes = (
            'repositoryName',
            'baseURL',
            'adminEmail',
            'earliestDatestamp',
            'deletedRecord',
            'granularity',
            'description',
            'oai_identifier',
            'sampleIdentifier',
        )
        for attribute in expected_attributes:
            self.assertTrue(hasattr(identify, attribute))
        dict(identify)

    def test_GetRecord(self):
        """GetRecord returns the requested record with header, raw text, XML and metadata."""
        oai_id = 'oai:test.example.com:1996652'
        record = self.sickle.GetRecord(identifier=oai_id)
        header = record.header
        self.assertEqual(header.identifier, oai_id)
        self.assertIn(oai_id, record.raw)
        self.assertEqual(record.header.datestamp, '2011-09-05T12:51:52Z')
        self.assertIsInstance(record.xml, etree._Element)
        # The conversions below only have to succeed; results are not checked.
        binary_type(record)
        text_type(record)
        dict(record.header)
        self.assertEqual(dict(record), record.metadata)

    # Test OAI-specific exceptions

    @raises(BadArgument)
    def test_badArgument(self):
        """A badArgument error response raises ``BadArgument``."""
        self.sickle.ListRecords(metadataPrefix='oai_dc', error='badArgument')

    @raises(CannotDisseminateFormat)
    def test_cannotDisseminateFormat(self):
        """A cannotDisseminateFormat error response raises ``CannotDisseminateFormat``."""
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='cannotDisseminateFormat')

    @raises(IdDoesNotExist)
    def test_idDoesNotExist(self):
        """An idDoesNotExist error response raises ``IdDoesNotExist``."""
        self.sickle.GetRecord(metadataPrefix='oai_dc', error='idDoesNotExist')

    @raises(NoSetHierarchy)
    def test_noSetHierarchy(self):
        """A noSetHierarchy error response raises ``NoSetHierarchy``."""
        self.sickle.ListSets(metadataPrefix='oai_dc', error='noSetHierarchy')

    @raises(BadResumptionToken)
    def test_badResumptionToken(self):
        """A badResumptionToken error response raises ``BadResumptionToken``."""
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='badResumptionToken')

    @raises(NoRecordsMatch)
    def test_noRecordsMatch(self):
        """A noRecordsMatch error response raises ``NoRecordsMatch``."""
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='noRecordsMatch')

    @raises(OAIError)
    def test_undefined_OAI_error_XML(self):
        """An unknown error code raises the generic ``OAIError``."""
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='undefinedError')

    def test_OAIResponseIterator(self):
        """With ``OAIResponseIterator`` ListRecords yields 4 items."""
        sickle = Sickle('fake_url', iterator=OAIResponseIterator)
        records = list(sickle.ListRecords(metadataPrefix='oai_dc'))
        self.assertEqual(len(records), 4)
Exemple #9
0
    def import_collections(self,
                           resourcesync_sourcedescription,
                           oaipmh_endpoint,
                           collection_keys=None,
                           institution_name=None,
                           resource_dir='resourcesync',
                           overwrite=False):
        '''
        Register an institution's ResourceSync-able collections in the database.

        When `collection_keys` is given, only the collections named in that
        list are added; otherwise every collection is added.

        Args:
          resourcesync_sourcedescription: a ResourceSync SourceDescription URL
              see https://www.openarchives.org/rs/1.1/resourcesync#SourceDesc
          oaipmh_endpoint: a OAI-PMH base URL
              see https://www.openarchives.org/OAI/openarchivesprotocol.html#Identify
          collection_keys: a list of collection keys specifying an
              exclusive list of collections to add to the database
          institution_name: human-readable name of the institution which
              should be used instead of its OAI-PMH repositoryName
          resource_dir: path to the local directory to store copies of the
              synced resources to, relative to the home directory "~"
          overwrite: whether or not to overwrite rows in the database that
              match the `collection_key` and `institution_key`

        Returns:
          None
        '''

        def tag_has_capability(capability, tag):
            # True-ish when the tag's <md> child declares the given capability.
            return (tag.md is not None and 'capability' in tag.md.attrs
                    and tag.md['capability'] == capability)

        source_soup = BeautifulSoup(
            get(resourcesync_sourcedescription).content, 'xml')
        capabilitylist_urls = [loc.string for loc in source_soup.find_all('loc')]

        sickle = Sickle(oaipmh_endpoint)
        sets = sickle.ListSets()
        identify = sickle.Identify()

        set_spec_to_name = {}
        for oai_set in sets:
            set_spec_to_name[oai_set.setSpec] = oai_set.setName

        # Endpoint URL with its last path segment removed, ending in '/'.
        endpoint_parent_parts = oaipmh_endpoint.split(sep='/')[:-1]
        url_map_from = '/'.join(endpoint_parent_parts) + '/'

        if institution_name is None:
            i_name = identify.repositoryName
        else:
            i_name = institution_name

        for capabilitylist_url in capabilitylist_urls:

            # For now, get setSpec from the path component of the
            # CapabilityList URL (which may have percent-encoded characters)
            url_path = urllib.parse.urlparse(capabilitylist_url).path
            set_spec = urllib.parse.unquote(url_path.split(sep='/')[2])

            # If a subset of collections is specified, skip everything
            # outside of it. Otherwise, every collection is processed.
            if collection_keys is not None and set_spec not in collection_keys:
                continue

            capability_soup = BeautifulSoup(
                get(capabilitylist_url).content, 'xml')

            # ResourceList should always exist, but if it doesn't, skip this
            # collection
            try:
                resourcelist_tag = capability_soup.find(
                    functools.partial(tag_has_capability, 'resourcelist'))
                resourcelist_url = resourcelist_tag.loc.string
            except AttributeError:
                # TODO: log it
                continue

            # If no ChangeList exists yet, that's ok; predict what its URL
            # will be
            try:
                changelist_tag = capability_soup.find(
                    functools.partial(tag_has_capability, 'changelist'))
                changelist_url = changelist_tag.loc.string
            except AttributeError:
                resourcelist_parent = resourcelist_url.split(sep='/')[:-1]
                changelist_url = '/'.join(
                    resourcelist_parent + ['changelist_0000.xml'])

            repository_id = identify.repositoryIdentifier
            set_name = set_spec_to_name[set_spec]

            print(
                self.__collection_identifier(i_name, repository_id, set_name,
                                             set_spec))

            # We can add the collection to the database now
            # TODO: catch exceptions
            self.__insert_or_update(repository_id, i_name, set_spec, set_name,
                                    resourcelist_url, changelist_url,
                                    url_map_from, resource_dir, overwrite)
Exemple #10
0
class OAIHarvester(object):
    """Downloads files from a OAI-PMH 2.0 API and stores them as xml."""

    def __init__(self, base_url: str, metadata_prefix: str, path: str,
                 base_file_name='harvest-result', user='', password='',
                 logger=logging.getLogger('oai'), encoding='iso-8859-1'):
        """
        Configure a basic connection to the OAI-Server. Sets up the sickle instance with appropriate settings
        and checks if the metadata prefix is valid. Creates a directory at path if no such path exists.

        :param base_url:        Base url for the oai request without the scheme; 'https://' is prepended.
        :param metadata_prefix:  Metadata-Prefix for the api_response to be harvested.
        :param path:            Directory path where the files should be stored.
        :param base_file_name:  Downloads are saved in this file. If several downloads are made the resumption token
                                or a random number is added.
        :param user:            User name for basic http authentication (unescaped)
        :param password:        Password for basic http authentication (unescaped)
        :param logger:          Logger used to log all actions and errors of this class.
        :param encoding:        The encoding used to store elements

        :raises InvalidPrefixError if the given prefix is not valid.
        """
        self.encoding = encoding
        self.logger = logger
        self.use_authentication = False
        if user != '':
            # NOTE(review): kept as an assert so the raised type is unchanged;
            # it is skipped under ``python -O``.
            assert password != ''
            # safe='' also escapes '/', which would otherwise end the
            # authority part of the URL in the middle of the credentials.
            self.user = urllib.parse.quote(user, safe='')
            self.encoded_password = urllib.parse.quote(password, safe='')
            self.use_authentication = True
            # The password is deliberately kept out of the log.
            self.logger.info('Uses authentication with credentials: user: %s.', self.user)
        else:
            self.logger.info('No authentication given.')

        self.url = base_url
        self.path = path
        self.base_file_name = base_file_name
        self.metadataPrefix = metadata_prefix
        self.api_response = None
        self.data = list()

        if self.use_authentication:
            credentials = self.user + ':' + self.encoded_password + '@'
        else:
            credentials = ''
        self.sickle = Sickle('https://' + credentials + self.url, iterator=OAIResponseIterator)

        self._verify_metadata_prefix()

        if not os.path.exists(self.path):
            self.logger.info('Create directory at %s.', self.path)
            os.makedirs(self.path)

    def _verify_metadata_prefix(self):
        """
        Verifies that the used metadata prefix is valid for this OAI repository.

        :raises InvalidPrefixError  if the given prefix is not valid.
        """
        # changes the sickle iterator to item to easily access metadata prefix.
        self.sickle.iterator = OAIItemIterator
        valid_prefix_list = [metadata_format.metadataPrefix
                             for metadata_format in self.sickle.ListMetadataFormats()]

        if self.metadataPrefix not in valid_prefix_list:
            self.logger.critical('Given metadata prefix (%s) was not valid. Select one of these: %s',
                                 self.metadataPrefix, str(valid_prefix_list))
            raise InvalidPrefixError('Invalid metadataPrefix: ' + self.metadataPrefix + '.\n' +
                                     ' A list of the available prefixes: ' + str(valid_prefix_list))
        self.logger.info('The prefix given is valid.')

    def store_records(self, set_id=None, date=None, ignore_deleted=False):
        """
        Downloads all records found on the OAI-API or all records from a given set.

        :param set_id:          determine what set to download if a given set should be downloaded (default None)
        :type set_id:           str
        :param date:            Only records added/changed after this date will be downloaded (default None)
        :type date:             str 'YYYY-MM-DD'
        :param ignore_deleted:  When true ignores all deleted records. This may not be a
                                feature available in all OAI archives.
        :type ignore_deleted    bool
        """
        self.sickle.iterator = OAIResponseIterator
        params = {'metadataPrefix': self.metadataPrefix, 'from': date, 'set': set_id, 'ignore_deleted': ignore_deleted}
        self.api_response = self.sickle.ListRecords(**params)
        self._write_all_records()

    def store_record(self, identifier: int):
        """
        Downloads a single record with the given id and stores it in a file at the given place.

        The file is named ``<base_file_name><identifier>.xml`` inside ``path``.

        :param identifier: the id which should be retrieved.
        """
        self.sickle.iterator = OAIResponseIterator
        record = self.sickle.GetRecord(identifier=identifier, metadataPrefix=self.metadataPrefix)
        self._write_xml(self.base_file_name + str(identifier) + '.xml', record.raw)

    def iterate_sets(self):
        """Iterate through all sets available at the OAI repository.

        :return List of all sets as tupels (id, name)
        :rtype: iterator tuple (str, str)
        :raises NoSetHierarchy if the repository reports that it has no sets.
        """
        self.sickle.iterator = OAIItemIterator
        try:
            sets = self.sickle.ListSets()
            for s in sets:
                yield (s.setSpec, s.setName)
        except NoSetHierarchy as error:
            self.logger.warning(str(error))
            raise NoSetHierarchy(error)

    def _write_xml(self, file_name, xml_text):
        """Write ``xml_text`` to ``file_name`` inside ``self.path`` using ``self.encoding``."""
        # os.path.join supplies the separator when ``path`` has no trailing
        # slash; plain concatenation would write next to the directory.
        with open(os.path.join(self.path, file_name), 'w', encoding=self.encoding) as file:
            file.write(xml_text)

    @staticmethod
    def _find_resumption_token(container):
        """Return the trailing resumptionToken element of ``container``, or None."""
        if len(container) == 0:
            return None
        last = container[-1]
        # The tag may be namespace-qualified ('{...}resumptionToken').
        if isinstance(last.tag, str) and last.tag.endswith('resumptionToken'):
            return last
        return None

    def _write_all_records(self):
        """Writes all downloaded api_response into xml files.

        Every response page is parsed, appended to ``self.data`` and written
        to its own file: ``<base_file_name>-<token>.xml`` while the page
        carries a resumption token, ``<base_file_name>-<random>.xml`` for the
        page without one (which ends the download).

        :raises Exception if no response has been loaded.
        """
        if self.api_response is None:
            self.logger.critical('No response loaded.')
            raise Exception('No response loaded.')
        record = self.api_response.next()
        last_count = 0
        while record:
            temp_xml = record.raw
            if not isinstance(temp_xml, str):
                # Nothing below could advance the iterator for such a
                # response, so stop instead of looping forever.
                self.logger.critical('Stopped Download: "%s"', 'response body is not text')
                break

            root = ElementTree.fromstring(temp_xml)
            self.data.append(root)

            # Assumes the third child of the root is the ListRecords element
            # (after responseDate and request) -- TODO confirm for error pages.
            container = root[2]
            token = self._find_resumption_token(container)
            download_count = len(container) - (0 if token is None else 1)
            last_count += download_count

            token_text = token.text if token is not None else None
            if token_text is None or not token_text.strip():
                # Last page: there is no token to name the file after.
                self._write_xml(self.base_file_name + '-' + str(random.randrange(100000)) + '.xml', temp_xml)
                self.logger.info('No resumption token found. Stopping Download. '
                                 'Downloaded %s from this repository.', last_count)
                break

            self._write_xml(self.base_file_name + '-' + token_text + '.xml', temp_xml)

            # completeListSize is optional; without it only the page count
            # is reported and the download still continues.
            try:
                total = int(token.get('completeListSize'))
            except (TypeError, ValueError):
                total = None
            if total is None:
                self.logger.info('Downloaded %s records from repository.', download_count)
            else:
                self.logger.info('Downloaded %s records from repository. Still %s to go.',
                                 download_count, total - last_count)

            try:
                record = self.api_response.next()
            except StopIteration:
                record = None
            except (BadArgument, BadResumptionToken) as error:
                self.logger.critical('Stopped Download: "%s"', str(error))
                record = None
Exemple #11
0
# Scratch script: maps the collection ids of one community to their set names
# by querying the repository directly. The commented lines show the
# corresponding call through the project helper.
# com = Community.objects.all()[0]

# oai = OAIUtils()
# oai.list_oai_collections(com)

base_url = 'http://scholarspace.manoa.hawaii.edu/dspace-oai/request'
llt_id = 'com_10125_27123'

s = Sickle(base_url)

record_headers = list(s.ListIdentifiers(metadataPrefix='oai_dc', set=llt_id))

community_collections = {}
for i in record_headers:
    # Iterate over associated sets looking for collections
    for j in i.setSpecs:
        if j.startswith('col'):
            community_collections[j] = None  # register id in map

for i in s.ListSets():
    if i.setSpec not in community_collections:
        # Not a collection of this community.
        continue
    # Single-argument print(...) calls behave the same on Python 2 and 3;
    # the former print statements were a SyntaxError on Python 3.
    print(community_collections[i.setSpec])
    community_collections[i.setSpec] = i.setName
    print('%s ==> %s' % (i.setSpec, community_collections[i.setSpec]))
    print(i)

sample = 'oai:scholarspace.manoa.hawaii.edu:10125/54329'
s.GetRecord(identifier=sample, metadataPrefix='oai_dc')
Exemple #12
0
def coletar_PERIODICO(provedores, nomearquivo):
    # cria o dataframe para guardar o resultado da coleta
    resultado = pd.DataFrame(
        columns=['title', 'creator', 'contributor', 'subject', 'description', 'coverage', 'date', 'format',
                 'identifier',
                 'language', 'provider', 'publisher', 'relation', 'rights', 'source', 'type', 'setSpec'])

    st.write('Iniciando a coleta....')
    contadorgeral = 0  # conta o total de registros coletados de todos os provedores

    for n in range(len(provedores['titulo'])):  # percorre a planilha dos provedores

        try:
            provider = provedores['titulo'][n]  # armazena a sigla da instituição
            url_provider = provedores['url'][n]  # armazena a url do provedor

            st.write('Coletando o provedor : ', provider)

            # inicializa o provedor
            sickle = Sickle(url_provider)
            identify = sickle.Identify()  # identifica o provedor e já verifica se está respondendo no endpoint
            if (identify):

                sets = sickle.ListSets()
                for conjuntorevista in sets:
                    conjunto = conjuntorevista.setSpec

                    st.write("Coletando o conjunto: ", conjunto)

                    # Tenta coletar o conjunto de registros especificados
                    # Pode resultar em erro caso o conjunto de registros retorne 0 como resultado
                    try:
                        registros = sickle.ListRecords(
                            **{'metadataPrefix': 'oai_dc', 'set': conjunto, 'from': ano + '-01-01'})
                        contador = 0

                        # itera pelo conjunto de registros identificados
                        for registro in registros:
                            contador = contador + 1

                            # recupera os metadados de cada registro em formato dicionario
                            metadados = registro.metadata

                            # recupera os metadados individualmente.
                            # Os campos podem ser multivalorados. Para isso, é preciso extrair item por item da lista de cada metadado.
                            # Também precisa tratar exceção para caso o metadado não exista no repositório
                            title = 'DADO AUSENTE NO PROVEDOR'
                            creator = 'DADO AUSENTE NO PROVEDOR'
                            contributor = 'DADO AUSENTE NO PROVEDOR'
                            subject = 'DADO AUSENTE NO PROVEDOR'
                            description = 'DADO AUSENTE NO PROVEDOR'
                            coverage = 'DADO AUSENTE NO PROVEDOR'
                            datem = 'DADO AUSENTE NO PROVEDOR'
                            formatm = 'DADO AUSENTE NO PROVEDOR'
                            identifier = 'DADO AUSENTE NO PROVEDOR'
                            language = 'DADO AUSENTE NO PROVEDOR'
                            provider = 'DADO AUSENTE NO PROVEDOR'
                            publisher = 'DADO AUSENTE NO PROVEDOR'
                            relation = 'DADO AUSENTE NO PROVEDOR'
                            rights = 'DADO AUSENTE NO PROVEDOR'
                            source = 'DADO AUSENTE NO PROVEDOR'
                            typem = 'DADO AUSENTE NO PROVEDOR'

                            # METADADO TITLE
                            if 'title' in metadados:
                                i = 0
                                for titulo in metadados['title']:
                                    if i == 0:
                                        title = titulo
                                        i = i + 1
                                    else:
                                        title = title + "||" + titulo

                            # METADADO CREATOR
                            if 'creator' in metadados:
                                i = 0
                                for criador in metadados['creator']:
                                    if i == 0:
                                        creator = criador
                                        i = i + 1
                                    else:
                                        creator = creator + "||" + criador

                            # METADADO CONTRIBUTOR
                            if 'contributor' in metadados:
                                i = 0
                                for contribuidor in metadados['contributor']:
                                    if i == 0:
                                        contributor = contribuidor
                                        i = i + 1
                                    else:
                                        contributor = contributor + "||" + contribuidor

                            # METADADO SUBJECT
                            if 'subject' in metadados:
                                i = 0
                                for assunto in metadados['subject']:
                                    if i == 0:
                                        subject = assunto
                                        i = i + 1
                                    else:
                                        subject = subject + "||" + assunto

                            # METADADO DESCRIPTION
                            if 'description' in metadados:
                                i = 0
                                for descricao in metadados['description']:
                                    if i == 0:
                                        description = descricao
                                        i = i + 1
                                    else:
                                        description = description + "||" + descricao

                            # METADADO COVERAGE
                            if 'coverage' in metadados:
                                i = 0
                                for cobertura in metadados['coverage']:
                                    if i == 0:
                                        coverage = cobertura
                                        i = i + 1
                                    else:
                                        coverage = coverage + "||" + cobertura

                            # METADADO DATE
                            if 'date' in metadados:
                                i = 0
                                for data in metadados['date']:
                                    if i == 0:
                                        datem = data
                                        i = i + 1
                                    else:
                                        datem = datem + "||" + data

                            # METADADO FORMAT
                            if 'format' in metadados:
                                i = 0
                                for formato in metadados['format']:
                                    if i == 0:
                                        formatm = formato
                                        i = i + 1
                                    else:
                                        formatm = format + "||" + formato

                            # METADADO IDENTIFIER
                            if 'identifier' in metadados:
                                i = 0
                                for ide in metadados['identifier']:
                                    if i == 0:
                                        identifier = ide
                                        i = i + 1
                                    else:
                                        identifier = identifier + "||" + ide

                            # METADADO LANGUAGE
                            if 'language' in metadados:
                                i = 0
                                for lingua in metadados['language']:
                                    if i == 0:
                                        language = lingua
                                        i = i + 1
                                    else:
                                        language = language + "||" + lingua

                            # METADADO PROVIDER
                            if 'provider' in metadados:
                                i = 0
                                for provedor in metadados['provider']:
                                    if i == 0:
                                        provider = provedor
                                        i = i + 1
                                    else:
                                        provider = provider + "||" + provedor
                            else:
                                provider = provedores['titulo'][n]

                            # METADADO PUBLISHER
                            if 'publisher' in metadados:
                                i = 0
                                for publicador in metadados['publisher']:
                                    if i == 0:
                                        publisher = publicador
                                        i = i + 1
                                    else:
                                        publisher = publisher + "||" + publicador

                            # METADADO RELATION
                            if 'relation' in metadados:
                                i = 0
                                for relacao in metadados['relation']:
                                    if i == 0:
                                        relation = relacao
                                        i = i + 1
                                    else:
                                        relation = relation + "||" + relacao

                            # METADADO RIGHTS
                            if 'rights' in metadados:
                                i = 0
                                for direitos in metadados['rights']:
                                    if i == 0:
                                        rights = direitos
                                        i = i + 1
                                    else:
                                        rights = rights + "||" + direitos

                            # METADADO SOURCE
                            if 'source' in metadados:
                                i = 0
                                for fonte in metadados['source']:
                                    if i == 0:
                                        source = fonte
                                        i = i + 1
                                    else:
                                        source = source + "||" + fonte

                            # METADADO TYPE
                            if 'type' in metadados:
                                i = 0
                                for tipo in metadados['type']:
                                    if i == 0:
                                        typem = tipo
                                        i = i + 1
                                    else:
                                        typem = typem + "||" + tipo

                            setSpec = conjunto

                            # monta dataframe com os metadados coletados
                            metadadoscoletados = [
                                [title, creator, contributor, subject, description, coverage, datem, formatm,
                                 identifier, language, provider, publisher, relation, rights, source, typem, setSpec]]

                            dadoscoletados = pd.DataFrame(metadadoscoletados,
                                                          columns=['title', 'creator', 'contributor', 'subject',
                                                                   'description', 'coverage', 'date', 'format',
                                                                   'identifier', 'language', 'provider', 'publisher',
                                                                   'relation', 'rights', 'source', 'type', 'setSpec'])

                            # inclui os resultados no dataframe
                            resultado = pd.concat([resultado, dadoscoletados], sort=False)

                        st.write('Registros coletados : ', contador)

                        contadorgeral = contadorgeral + contador

                    except Exception as e:
                        st.write('Sem atualizações no provedor : ', provider)
                        st.write('**************************************************************')
                        continue

            st.write('**************************************************************')

        except Exception as e:
            print(e)
            st.write('Erro no provedor')
            st.write('**************************************************************')
            continue

    st.write(resultado['provider'].value_counts())
    st.write('TOTAL DE REGISTROS COLETADOS DE TODOS OS PROVEDORES: ', contadorgeral)
    st.write('************** FIM DA COLETA **************')

    resultado.to_csv(nomearquivo, index=False)