Ejemplo n.º 1
0
    def create_capabilitylist(self) -> SitemapData:
        """
        ``build step 5`` :samp:`Create a new capabilitylist over sitemaps found in metadata directory`

        For each of the four document types the index file
        (``<type>-index.xml``) is listed when it exists; otherwise every
        ``<type>_*.xml`` file in the metadata directory is listed in sorted
        order.

        :return: the result of ``finish_sitemap`` for the new capabilitylist
        """
        target_path = self.para.abs_metadata_path("capabilitylist.xml")
        # A previous capabilitylist is only removed when sitemaps are saved.
        if os.path.exists(target_path) and self.para.is_saving_sitemaps:
            os.remove(target_path)

        capabilitylist = CapabilityList()
        for capability in ("resourcelist", "changelist", "resourcedump",
                           "changedump"):
            index_file = self.para.abs_metadata_path(capability + "-index.xml")
            if os.path.exists(index_file):
                # The index stands in for all individual documents.
                sources = [index_file]
            else:
                pattern = self.para.abs_metadata_path(capability + "_*.xml")
                sources = sorted(glob(pattern))
            for source in sources:
                capabilitylist.add(
                    Resource(uri=self.para.uri_from_path(source),
                             capability=capability))

        return self.finish_sitemap(-1, capabilitylist)
Ejemplo n.º 2
0
    def get_change_dump_xml(self, from_date):
        """
        Build the change dump document.

        Record changes with status ``'deleted'`` are left out.  A change that
        raises while being processed is reported and skipped.

        :param from_date: forwarded to ``_get_record_changes_with_interval``.
        :return: XML from ``ChangeDump.as_xml()``, or ``None`` when
            ``_validation()`` fails.
        """
        if not self._validation():
            return None

        root = request.url_root
        change_dump = ChangeDump()
        change_dump.up = '{}resync/capability.xml'.format(root)
        change_dump.index = '{}resync/{}/changedump.xml'.format(
            root, self.repository_id)

        record_changes = self._get_record_changes_with_interval(from_date)

        for data in record_changes:
            try:
                following = self._next_change(data, record_changes)
                if data.get('status') == 'deleted':
                    continue

                # "<record_id>.<record_version>" identifies this version in
                # both the content and the manifest URL.
                version_id = '{}.{}'.format(data.get('record_id'),
                                            data.get('record_version'))
                content_url = '{}resync/{}/{}/change_dump_content.zip'.format(
                    root, self.repository_id, version_id)
                now_utc = datetime.datetime.utcnow().replace(
                    tzinfo=datetime.timezone.utc)

                entry = Resource(content_url,
                                 lastmod=data.get("updated"),
                                 mime_type='application/zip',
                                 md_from=data.get('updated'),
                                 md_until=now_utc.isoformat(),
                                 ln=[])
                # The window ends at the next change when one is known.
                if following and following.get('updated'):
                    entry.md_until = following.get('updated')
                if self.change_dump_manifest:
                    manifest_url = (
                        '{}resync/{}/{}/changedump_manifest.xml'.format(
                            root, self.repository_id, version_id))
                    entry.ln.append({
                        'rel': 'contents',
                        'href': manifest_url,
                        'type': 'application/xml'
                    })
                change_dump.add(entry)
            except Exception:
                current_app.logger.error('-' * 60)
                traceback.print_exc(file=sys.stdout)
                current_app.logger.error('-' * 60)

        return change_dump.as_xml()
    def generate(self) -> [Resource]:
        """
        Yield one :class:`Resource` per hit of every page produced by the
        elastic page generator.

        Existing changes are erased once, when the first page arrives.  With
        the ``resourcelist`` strategy each resource carries length, md5, mime
        type and links; otherwise it carries the change type.
        """
        page_source = self.elastic_page_generator()
        changes_erased = False
        for page in page_source():
            if not changes_erased:
                # Only reached on the first scroll.
                self.erase_changes()
                logger.info("Erasing changes")
                changes_erased = True

            for hit in page:
                source = hit['_source']
                params = self.elastic_params
                as_resourcelist = (
                    params.strategy == Strategy.resourcelist.value)

                if as_resourcelist:
                    doc = ResourceDoc.as_resource_doc(source)
                else:
                    doc = ChangeDoc.as_change_doc(source)

                uri = doc.location.uri_from_path(
                    param_url_prefix=params.url_prefix,
                    param_resource_root_dir=params.resource_root_dir)

                if not as_resourcelist:
                    yield Resource(uri=uri,
                                   lastmod=doc.lastmod,
                                   change=doc.change)
                    continue

                links = [{
                    'href': link.href.uri_from_path(
                        param_url_prefix=params.url_prefix,
                        param_resource_root_dir=params.resource_root_dir),
                    'rel': link.rel,
                    'mime': link.mime
                } for link in (doc.ln or [])]

                yield Resource(uri=uri,
                               length=doc.length,
                               lastmod=doc.lastmod,
                               md5=doc.md5,
                               mime_type=doc.mime,
                               ln=links)
Ejemplo n.º 4
0
    def get_resource_dump_manifest(self, record_id):
        """
        Build the resource dump manifest of one record.

        :param record_id: Identifier of record.
        :return: (xml) content of resourcedumpmanifest, or ``None`` when the
            manifest is disabled or validation fails.  When the record is
            not found the manifest contains no resources.
        """
        is_valid = self._validation(record_id)
        if not (self.resource_dump_manifest and is_valid):
            return None

        rdm = ResourceDumpManifest()
        rdm.up = '{}resync/{}/resourcedump.xml'.format(
            request.url_root, self.repository_id)

        record = WekoRecord.get_record_by_pid(record_id)
        if not record:
            return rdm.as_xml()

        for entry in record.files:
            current_app.logger.debug(entry.info())
            info = entry.info()
            dump_path = 'recid_{}/{}'.format(record.get('recid'),
                                             info.get('key'))
            now_utc = datetime.datetime.utcnow().replace(
                tzinfo=datetime.timezone.utc)
            file_url = '{}record/{}/files/{}'.format(
                request.url_root, record.get('recid'), info.get('key'))
            rdm.add(
                Resource(
                    file_url,
                    lastmod=str(now_utc.isoformat()),
                    # checksum is "<algorithm>:<digest>"; keep the digest.
                    sha256=info.get('checksum').split(':')[1],
                    length=str(info.get('size')),
                    path=dump_path))
        return rdm.as_xml()
Ejemplo n.º 5
0
    def get_resource_list_xml(self, from_date=None, to_date=None):
        """
        Get content of resource list.

        :param from_date: optional lower bound; items updated before it are
            skipped.
        :param to_date: optional upper bound; items updated after it are
            skipped.
        :return: (xml) resource list content, or ``None`` when
            ``_validation()`` fails.
        """
        if not self._validation():
            return None
        items = get_items_by_index_tree(self.repository_id)

        rl = ResourceList()
        rl.up = INVENIO_CAPABILITY_URL.format(request.url_root)

        for item in items:
            if not item:
                continue
            source = item.get('_source')
            updated = source.get('_updated')
            resource_date = str_to_datetime(updated)

            too_old = from_date and str_to_datetime(from_date) > resource_date
            if too_old or (to_date
                           and str_to_datetime(to_date) < resource_date):
                continue

            record_url = '{}resync/{}/records/{}'.format(
                request.url_root, str(self.repository_id),
                str(source.get('control_number')))
            rl.add(Resource(record_url, lastmod=updated))
        return rl.as_xml()
    def oaipmh_header_to_resourcesync_resource(self, header):
        """Map an OAI-PMH record header to a ResourceSync Resource.

        The resource URI is the ``GetRecord`` request for the record, built
        from ``self.params['oaipmh_base_url']`` and
        ``self.params['oaipmh_metadataprefix']``.  The record is fetched once
        to obtain its length and md5.

        header              an instance of `sickle.models.Header`
            https://sickle.readthedocs.io/en/latest/api.html#sickle.models.Header
        """
        parsed_header = BeautifulSoup(header.raw.encode('utf-8'), 'xml')
        lastmod = parsed_header.header.datestamp.text
        identifier = parsed_header.identifier.text

        quote = urllib.parse.quote
        query_string = 'verb=GetRecord&identifier={}&metadataPrefix={}'.format(
            quote(identifier, safe=''),
            quote(self.params['oaipmh_metadataprefix'], safe=''))
        # Keep scheme, netloc, path and params of the base URL; replace the
        # query and drop the fragment.
        base = urllib.parse.urlparse(self.params['oaipmh_base_url'])
        uri = urllib.parse.urlunparse(
            (base[0], base[1], base[2], base[3], query_string, ''))

        # One GET per record; the length is that of the whole response body.
        response = get(uri)

        # The md5 covers only the GetRecord element: OAI-PMH responses carry
        # a responseDate, so hashing the whole response would give a
        # different value on every request for the same record.
        record_xml = str(BeautifulSoup(response.content, 'xml').GetRecord)
        digest = md5(record_xml.encode('utf-8')).hexdigest()

        return Resource(uri=uri,
                        lastmod=lastmod,
                        md5=digest,
                        length=len(response.content),
                        mime_type="text/xml")
Ejemplo n.º 7
0
    def update_resource_sync(self, capabilitylist_data):
        """
        ``build step 6`` :samp:`Update description with newly created capabilitylist`

        An existing description file is read first, so the capabilitylist
        entry is added to (or replaced in) what is already there.  Observers
        are informed whether or not the document is saved.

        :param capabilitylist_data: :class:`SitemapData` over the newly created capabilitylist
        :return: :class:`SitemapData` over updated description
        """
        desc_path = self.para.abs_description_path()
        os.makedirs(os.path.dirname(desc_path), exist_ok=True)

        description = SourceDescription()
        if os.path.exists(desc_path):
            description = self.read_sitemap(desc_path, description)

        capabilitylist_entry = Resource(
            uri=capabilitylist_data.uri,
            capability=Capability.capabilitylist.name)
        description.add(capabilitylist_entry, replace=True)

        sitemap_data = SitemapData(len(description), -1,
                                   self.para.description_url(), desc_path,
                                   Capability.description.name)
        if self.para.is_saving_sitemaps:
            self.save_sitemap(description, desc_path)
            sitemap_data.document_saved = True

        self.observers_inform(self,
                              ExecutorEvent.completed_document,
                              document=description,
                              sitemap_data=sitemap_data)
        return sitemap_data
Ejemplo n.º 8
0
    def create_index(self, sitemap_data_iter: iter) -> SitemapData:
        """
        Rebuild ``changelist-index.xml`` from the ``changelist_*.xml`` files
        in the metadata directory.

        Any existing index file is removed first.  An index is only built
        when more than one changelist file exists.  When sitemaps are being
        saved, changelists without an ``index`` link get one and are saved
        again.

        :param sitemap_data_iter: not used by this implementation.
        :return: nothing is returned explicitly, despite the annotation.
        """
        index_path = self.param.abs_metadata_path("changelist-index.xml")
        index_uri = self.param.uri_from_path(index_path)
        if os.path.exists(index_path):
            os.remove(index_path)

        changelist_files = glob(
            self.param.abs_metadata_path("changelist_*.xml"))
        changelist_files.sort()
        if len(changelist_files) <= 1:
            return None

        changelist_index = ChangeList()
        changelist_index.sitemapindex = True
        changelist_index.md_from = self.date_resourcelist_completed
        for cl_file in changelist_files:
            changelist = self.read_sitemap(cl_file, ChangeList())
            entry = Resource(uri=self.param.uri_from_path(cl_file),
                             md_from=changelist.md_from,
                             md_until=changelist.md_until)
            changelist_index.resources.append(entry)

            if not self.param.is_saving_sitemaps:
                continue
            # Point the changelist back at the index if it does not yet.
            if changelist.link("index") is None:
                changelist.link_set(rel="index", href=index_uri)
                self.save_sitemap(changelist, cl_file)

        self.finish_sitemap(-1, changelist_index)
Ejemplo n.º 9
0
    def get_change_dump_index(self):
        """
        Build the change dump index document.

        Starting at ``self.publish_date`` (or the current UTC time when it is
        unset), one ``changedump`` entry is added per window of
        ``self.interval_by_date`` days until the current time is reached; the
        last window is clipped to the current time.

        Returns:
            The XML produced by ``as_xml()``, or ``None`` when
            ``_validation()`` fails.

        """
        if not self._validation():
            return None
        changedump = ListBaseWithIndex(capability_name='changedump', )
        changedump.up = INVENIO_CAPABILITY_URL.format(request.url_root)

        # Take "now" as naive UTC, like the fallback start date: every bound
        # below is stamped with tzinfo=UTC, so local wall-clock time would
        # shift the last window by the server's UTC offset.
        # NOTE(review): assumes self.publish_date is a naive UTC datetime --
        # confirm against the model.
        day_now = datetime.datetime.utcnow()
        change_date = self.publish_date or day_now

        # NOTE(review): a non-positive interval_by_date would never advance
        # change_date; presumably validated elsewhere -- confirm.
        while change_date < day_now:
            until = change_date + timedelta(days=self.interval_by_date)
            if until > day_now:
                until = day_now
            change = Resource(
                '{}/{}/changedump.xml'.format(self.url_path,
                                              change_date.strftime(r"%Y%m%d")),
                capability='changedump',
                md_from=str(
                    change_date.replace(
                        tzinfo=datetime.timezone.utc).isoformat()),
                md_until=str(
                    until.replace(tzinfo=datetime.timezone.utc).isoformat()))
            changedump.add(change)
            change_date = until
        return changedump.as_xml()
Ejemplo n.º 10
0
        def generator(filenames: iter, count=0) -> [int, Resource]:
            """
            Yield ``(count, resource)`` for every file that passes the
            resource gate, descending into directories.

            :param filenames: iterable of file or directory names; values
                that are not ``str`` are logged and converted with ``str()``.
            :param count: number of resources already produced; the running
                total continues from here.
            """
            passes_gate = self.resource_gate()
            for filename in filenames:
                if not isinstance(filename, str):
                    # Hand the value to logging as an argument: with
                    # '"%s" % filename' a tuple would be unpacked as the
                    # format arguments and raise or be mis-formatted.
                    LOG.warning("Not a string: %s", filename)
                    filename = str(filename)

                file = os.path.abspath(filename)
                if not os.path.exists(file):
                    LOG.warning("File does not exist: %s", file)
                elif os.path.isdir(file):
                    for cr, rsc in generator(self.walk_directories(file), count=count):
                        yield cr, rsc
                        # Carry the nested running total back to this level.
                        count = cr
                elif os.path.isfile(file):
                    if passes_gate(file):
                        count += 1
                        path = os.path.relpath(file, self.para.resource_dir)
                        uri = self.para.url_prefix + defaults.sanitize_url_path(path)
                        stat = os.stat(file)
                        resource = Resource(uri=uri, length=stat.st_size,
                                            lastmod=defaults.w3c_datetime(stat.st_ctime),
                                            md5=defaults.md5_for_file(file),
                                            mime_type=defaults.mime_type(file))
                        yield count, resource
                        # Observers are informed after the consumer resumes
                        # the generator.
                        self.observers_inform(self, ExecutorEvent.created_resource, resource=resource,
                                              count=count, file=file)
                    else:
                        self.observers_inform(self, ExecutorEvent.rejected_file, file=file)
                else:
                    LOG.warning("Not a regular file: %s", file)
Ejemplo n.º 11
0
    def get_capability_content(cls):
        """
        Collect the capability entries of all valid resources.

        :return: list of ``Resource`` objects: a ``resourcelist`` and a
            ``resourcedump`` entry for every item of ``get_list_resource()``
            that passes ``_validation()``.
        """
        candidates = cls.get_list_resource()
        caplist = []
        for candidate in candidates:
            if not candidate._validation():
                continue
            caplist.extend([
                Resource('{}/resourcelist.xml'.format(candidate.url_path),
                         capability='resourcelist'),
                Resource('{}/resourcedump.xml'.format(candidate.url_path),
                         capability='resourcedump'),
            ])
        return caplist
Ejemplo n.º 12
0
    def get_capability_content(cls):
        """
        Collect the capability entries of all valid change settings.

        :return: list of ``Resource`` objects: a ``changelist`` and a
            ``changedump`` entry for every item of ``get_all()`` that passes
            ``_validation()``.
        """
        candidates = cls.get_all()
        caplist = []
        for candidate in candidates:
            if not candidate._validation():
                continue
            caplist.extend([
                Resource('{}/changelist.xml'.format(candidate.url_path),
                         capability='changelist'),
                Resource('{}/changedump.xml'.format(candidate.url_path),
                         capability='changedump'),
            ])
        return caplist
Ejemplo n.º 13
0
    def generate(self):
        """Return a one-element list with a fixed :class:`Resource`.

        The md5 is the digest of the UTF-8 encoded URL; lastmod, length and
        mime type are hard-coded.
        """
        url = "http://www.resourcesync.org"
        digest = md5(url.encode("utf8")).hexdigest()
        return [
            Resource(uri=url,
                     lastmod="2016-10-01",
                     md5=digest,
                     length=20,
                     mime_type="application/xml")
        ]
Ejemplo n.º 14
0
def render_well_know_resourcesync():
    """Generate source description xml.

    The description links back to the site root (``describedby``) and lists
    the capability document at ``resync/capability.xml``.
    """
    describedby = {'href': request.url_root, 'rel': 'describedby'}
    description = ListBaseWithIndex(capability_name='description',
                                    ln=[describedby])
    capability_url = '{}resync/capability.xml'.format(request.url_root)
    description.add(Resource(capability_url, capability='capability'))
    return description.as_xml()
Ejemplo n.º 15
0
    def generate(self):
        """Return a one-element list with a fixed, "changed" :class:`Resource`.

        The md5 and length are taken from the URL with ``"new changes"``
        appended; lastmod and mime type are hard-coded.
        """
        url = "http://www.resourcesync.org"
        body = url + "new changes"
        digest = md5(body.encode("utf8")).hexdigest()
        return [
            Resource(uri=url,
                     lastmod="2017-06-14",
                     md5=digest,
                     length=len(body),
                     mime_type="application/xml")
        ]
Ejemplo n.º 16
0
    def create_index(self, sitemap_data_iter: iter) -> SitemapData:
        """
        Rebuild ``changedump-index.xml`` from the ``changedump_*.xml``
        documents and ``cd_*.zip`` packages in the metadata directory.

        Any existing index file is removed first.  An index is only built
        when more than one ``changedump_*.xml`` file exists.  Documents and
        zip packages are paired by their sorted position.  When sitemaps are
        being saved, documents without an ``index`` link get one and are
        saved again.

        :param sitemap_data_iter: not used by this implementation.
        :return: nothing is returned explicitly, despite the annotation.
        """
        changelist_index_path = self.param.abs_metadata_path(
            "changedump-index.xml")
        changelist_index_uri = self.param.uri_from_path(changelist_index_path)
        if os.path.exists(changelist_index_path):
            os.remove(changelist_index_path)

        changelist_files = sorted(
            glob(self.param.abs_metadata_path("changedump_*.xml")))
        changedump_files = sorted(
            glob(self.param.abs_metadata_path("cd_*.zip")))
        if len(changelist_files) > 1:
            changelist_index = ChangeDump()
            changelist_index.modified = defaults.w3c_now()
            # zip() stops at the shorter list, so unpaired files are ignored.
            for cl_file, cd_file in zip(changelist_files, changedump_files):
                changelist = self.read_sitemap(cl_file, ChangeDump())
                lastmod = str(
                    defaults.reformat_datetime(
                        defaults.file_modification_date(cd_file)))
                # NOTE(review): the md5 and mime type of the zip used to be
                # computed here but were never attached to the resource; the
                # unused (and costly) computations have been dropped.  Add
                # md5=/mime_type= below if they should be published.
                cd = Resource(uri=self.param.uri_from_path(cd_file),
                              length=os.path.getsize(cd_file),
                              lastmod=lastmod,
                              ln=[{
                                  'rel': 'contents',
                                  'href': cl_file
                              }])
                changelist_index.add(cd)

                if self.param.is_saving_sitemaps:
                    index_link = changelist.link("index")
                    if index_link is None:
                        changelist.link_set(rel="index",
                                            href=changelist_index_uri)
                        self.save_sitemap(changelist, cl_file)

            self.finish_sitemap(-1, changelist_index)
Ejemplo n.º 17
0
    def get_resource_dump_xml(self, from_date=None, to_date=None):
        """
        Get content of resource dump.

        :param from_date: optional lower bound; items updated before it are
            skipped.
        :param to_date: optional upper bound; items updated after it are
            skipped.
        :return: (xml) resource dump content, or ``None`` when
            ``_validation()`` fails.
        """
        if not self._validation():
            return None

        from .utils import parse_date
        lower_bound = parse_date(from_date) if from_date else from_date
        upper_bound = parse_date(to_date) if to_date else to_date

        items = get_items_by_index_tree(self.repository_id)
        rd = ResourceDump()
        rd.up = INVENIO_CAPABILITY_URL.format(request.url_root)
        for item in items:
            if not item:
                continue
            source = item.get('_source')
            updated = source.get('_updated')
            resource_date = parse_date(updated)
            if (lower_bound and lower_bound > resource_date) or (
                    upper_bound and upper_bound < resource_date):
                continue

            record_id = str(source.get('control_number'))
            content_url = '{}resync/{}/{}/file_content.zip'.format(
                request.url_root, self.repository_id, record_id)
            entry = Resource(content_url, lastmod=updated, ln=[])
            if self.resource_dump_manifest:
                manifest_url = (
                    '{}resync/{}/{}/resourcedump_manifest.xml'.format(
                        request.url_root, self.repository_id, record_id))
                entry.ln.append({
                    'rel': 'contents',
                    'href': manifest_url,
                    'type': 'application/xml'
                })
            rd.add(entry)
        return rd.as_xml()
Ejemplo n.º 18
0
    def create_index(self, sitemap_data_iter: iter):
        """
        Build ``resourcelist-index.xml`` over the given sitemap documents.

        Nothing happens unless there is more than one document.  Documents
        that were saved get their index relation updated.

        :param sitemap_data_iter: sized collection of sitemap data (``len()``
            is applied to it).
        """
        if len(sitemap_data_iter) <= 1:
            return

        index = ResourceList()
        index.sitemapindex = True
        index.md_at = self.date_start_processing
        index.md_completed = self.date_end_processing

        index_path = self.param.abs_metadata_path("resourcelist-index.xml")
        relative_path = os.path.relpath(index_path, self.param.resource_dir)
        index_url = (self.param.url_prefix +
                     defaults.sanitize_url_path(relative_path))
        index.link_set(rel="up", href=self.param.capabilitylist_url())

        for sitemap_data in sitemap_data_iter:
            entry = Resource(uri=sitemap_data.uri,
                             md_at=sitemap_data.doc_start,
                             md_completed=sitemap_data.doc_end)
            index.add(entry)
            if sitemap_data.document_saved:
                self.update_rel_index(index_url, sitemap_data.path)

        self.finish_sitemap(-1, index)
Ejemplo n.º 19
0
    def solr_results_to_resourcesync_resource(self, a_result):
        """Map one Solr result to a ResourceSync Resource.

        The resource URI is ``self.params['metadata_disseminator']`` with
        ``_ID_`` replaced by the result id, or the id itself when no
        disseminator is configured.  The record is fetched once to obtain
        its length and md5.

        a_result            mapping with at least the keys ``'id'`` and
                            ``'timestamp'``
        """
        if self.params['metadata_disseminator'] != '':
            uri = self.params['metadata_disseminator'].replace(
                '_ID_', a_result['id'])
        else:
            uri = a_result['id']
        lastmod = a_result['timestamp']

        # do a GET request for each record to retrieve the 'content-length'
        r = get(uri)
        length = len(r.content)

        # Hash the raw response bytes.  str(r.content) on Python 3 yields the
        # "b'...'" repr, whose digest never matches the actual content.
        m = md5()
        m.update(r.content)

        return Resource(uri=uri,
                        lastmod=lastmod,
                        md5=m.hexdigest(),
                        length=length,
                        mime_type="text/xml")
Ejemplo n.º 20
0
    def get_change_list_content_xml(self,
                                    from_date,
                                    from_date_args=None,
                                    to_date_args=None):
        """
        Build the change list document.

        A change that raises while being processed is reported and skipped.

        :param from_date: forwarded to ``_get_record_changes_with_interval``.
        :param from_date_args: optional lower bound on the ``updated`` value
            of a change.
        :param to_date_args: optional upper bound on the ``updated`` value
            of a change.
        :return: XML from ``ChangeList.as_xml()``, or ``None`` when
            ``_validation()`` fails.
        """
        if not self._validation():
            return None

        from .utils import parse_date
        lower_bound = (parse_date(from_date_args)
                       if from_date_args else from_date_args)
        upper_bound = (parse_date(to_date_args)
                       if to_date_args else to_date_args)

        change_list = ChangeList()
        change_list.up = INVENIO_CAPABILITY_URL.format(request.url_root)
        change_list.index = '{}resync/{}/changelist.xml'.format(
            request.url_root,
            self.repository_id,
        )

        record_changes = self._get_record_changes_with_interval(from_date)

        for data in record_changes:
            try:
                if lower_bound and lower_bound > parse_date(
                        data.get("updated")):
                    continue
                if upper_bound and upper_bound < parse_date(
                        data.get("updated")):
                    continue

                record_id = data.get('record_id')
                versioned_id = "{}.{}".format(record_id,
                                              data.get('record_version'))
                pid_object = PersistentIdentifier.get('recid', record_id)
                latest_pid = PIDVersioning(child=pid_object).last_child
                is_latest = str(latest_pid.pid_value) == versioned_id

                # Older, non-deleted versions are addressed by their
                # versioned id; everything else by the plain record id.
                use_versioned = (not is_latest
                                 and data.get('status') != 'deleted')
                loc = '{}resync/{}/records/{}'.format(
                    request.url_root, self.repository_id,
                    versioned_id if use_versioned else record_id)

                change_list.add(
                    Resource(
                        loc,
                        lastmod=data.get("updated"),
                        change=data.get('status'),
                        md_at=data.get("updated"),
                    ))
            except Exception:
                current_app.logger.error('-' * 60)
                traceback.print_exc(file=sys.stdout)
                current_app.logger.error('-' * 60)

        return change_list.as_xml()
Ejemplo n.º 21
0
    def get_change_dump_manifest_xml(self, record_id):
        """Get change dump manifest xml.

        Files of the given record version are compared by checksum with the
        files of the previous version (``<id>.<version - 1>``): a checksum
        only present in the current version is ``created``, one only present
        in the previous version is ``deleted``; unchanged files are left out.

        :param record_id: Identifier of record, in the form
            ``<id>.<version>``
        :return: xml of the manifest (without entries when the manifest is
            disabled), or ``None`` when the record is not in the index or
            validation fails
        """
        if not self._is_record_in_index(record_id) or not self._validation():
            return None
        cdm = ChangeDumpManifest()
        cdm.up = '{}resync/{}/changedump.xml'.format(request.url_root,
                                                     self.repository_id)
        if self.change_dump_manifest:
            prev_id, prev_ver_id = record_id.split(".")
            current_record = WekoRecord.get_record_by_pid(record_id)
            from .utils import get_pid
            prev_record_pid = get_pid('{}.{}'.format(prev_id,
                                                     str(int(prev_ver_id) -
                                                         1)))
            if prev_record_pid:
                prev_record = WekoRecord.get_record(
                    id_=prev_record_pid.object_uuid)
            else:
                prev_record = None
            if current_record:
                list_file = [file for file in current_record.files]
                current_checksum = [
                    file.info().get('checksum')
                    for file in current_record.files
                ]
                prev_checksum = []
                if prev_record:
                    list_file.extend([file for file in prev_record.files])
                    prev_checksum = [
                        file.info().get('checksum')
                        for file in prev_record.files
                    ]
                for file in list_file:
                    file_info = file.info()
                    checksum = file_info.get('checksum')
                    in_prev = checksum in prev_checksum
                    in_current = checksum in current_checksum
                    if in_prev and not in_current:
                        change = 'deleted'
                    elif in_current and not in_prev:
                        change = 'created'
                    else:
                        # Present in both versions: unchanged, not listed.
                        continue
                    path = 'recid_{}/{}'.format(current_record.get('recid'),
                                                file_info.get('key'))
                    lastmod = str(datetime.datetime.utcnow().replace(
                        tzinfo=datetime.timezone.utc).isoformat())
                    re = Resource(
                        '{}record/{}/files/{}'.format(
                            request.url_root, current_record.get('recid'),
                            file_info.get('key')),
                        lastmod=lastmod,
                        # checksum is "<algorithm>:<digest>"; keep the digest.
                        sha256=checksum.split(':')[1],
                        length=str(file_info.get('size')),
                        # Deleted files get no path.  This used to compare
                        # against 'delete', which never matched the
                        # 'deleted' value assigned above.
                        path=path if change != 'deleted' else '',
                        change=change)
                    cdm.add(re)
        return cdm.as_xml()
#!/usr/bin/env python
# Minimal README example: build a two-entry resource list and print its XML.
if (True):  #keep indentation of README

    from resync import Resource, ResourceList

    rl = ResourceList()
    rl.add(Resource('http://example.com/res1', lastmod='2013-01-01'))
    rl.add(Resource('http://example.com/res2', lastmod='2013-01-02'))
    # Function-call form works on Python 2 and 3; the bare print statement
    # is a SyntaxError on Python 3.
    print(rl.as_xml())