Ejemplo n.º 1
0
 def test01_as_xml(self):
     """Serialize a two-resource ResourceDump and check the XML for the capability and the first entry."""
     dump = ResourceDump()
     for name, stamp in (('a.zip', 1), ('b.zip', 2)):
         dump.add( Resource(name, timestamp=stamp) )
     serialized = dump.as_xml()
     # The <rs:md> element must carry the resourcedump capability.
     capability_match = re.search(r'<rs:md .*capability="resourcedump"', serialized)
     self.assertTrue( capability_match, 'XML has capability' )
     # timestamp=1 is expected to be rendered as one second past the epoch.
     resource_a_match = re.search(r'<url><loc>a.zip</loc><lastmod>1970-01-01T00:00:01Z</lastmod></url>', serialized)
     self.assertTrue( resource_a_match, 'XML has resource a' )
Ejemplo n.º 2
0
    def publish_metadata(self, new_zips, exluded_zip=None):
        """
        Write resource-dump.xml to the publish directory and, if it does not exist yet, capability-list.xml.

        An already published resource dump is loaded first; the entry of exluded_zip (if given) is removed from it
        and every resource in new_zips is added before the document is written back.

        :param new_zips: iterable of newly created zip resources to add to the resource dump
        :param exluded_zip: local path to a zip file whose entry is removed from the previously published
                            resource dump. Nothing is removed when omitted.
        :raises RuntimeError: if exluded_zip is given but has no entry in the resource dump
        """
        dump_url = self.publish_url + RS_RESOURCE_DUMP_XML
        dump_path = os.path.join(self.publish_dir, RS_RESOURCE_DUMP_XML)
        capability_url = self.publish_url + RS_CAPABILITY_LIST_XML
        capability_path = os.path.join(self.publish_dir, RS_CAPABILITY_LIST_XML)

        dump = ResourceDump()

        if not os.path.isfile(dump_path):
            # First publication: record the start time and link up to the capability list.
            dump.md_at = w3cdt.datetime_to_str(no_fractions=True)
            dump.link_set(rel="up", href=capability_url)
        else:
            # Continue from the previously published resource dump.
            with open(dump_path, "r") as existing_dump:
                Sitemap().parse_xml(existing_dump, resources=dump)

        # Drop the entry of the excluded zip; its location is the publish url plus the file name.
        if exluded_zip:
            excluded_loc = self.publish_url + os.path.basename(exluded_zip)
            if excluded_loc not in dump.resources:
                raise RuntimeError("Could not find %s in %s" %
                                   (excluded_loc, dump_path))
            del dump.resources[excluded_loc]

        for zip_resource in new_zips:
            dump.add(zip_resource)

        # Stamp the completion time and write resource-dump.xml.
        dump.md_completed = w3cdt.datetime_to_str(no_fractions=True)
        with open(dump_path, "w") as dump_out:
            dump_out.write(dump.as_xml())

        # The base name of the publish directory is decoded as url-safe base64; the result is reported as the graph.
        encoded_name = os.path.basename(self.publish_dir)
        iri = base64.urlsafe_b64decode(encoded_name).rstrip('\n')

        print("New %s for graph %s" % (RS_RESOURCE_DUMP_XML, iri))
        print("See %s" % dump_url)

        # capability-list.xml is only written once; an existing file is left untouched.
        if os.path.isfile(capability_path):
            return

        capabilities = CapabilityList()
        capabilities.link_set(rel="up", href=self.src_desc_url)
        capabilities.add_capability(dump, dump_url)
        with open(capability_path, "w") as capability_out:
            capability_out.write(capabilities.as_xml())

        print("New %s. See %s" % (RS_CAPABILITY_LIST_XML, capability_url))
Ejemplo n.º 3
0
 def test_build_ex_04(self):
     """Build a one-entry Resource Dump and compare it with the 'resourcesync_ex_4' example document."""
     dump = ResourceDump()
     dump.md_at = '2013-01-03T09:00:00Z'
     zip_entry = Resource(uri='http://example.com/resourcedump.zip',
                          lastmod='2013-01-03T09:00:00Z')
     dump.add( zip_entry )
     # Read the reference document first, then serialize what was built.
     expected_xml = self._open_ex('resourcesync_ex_4').read()
     built_xml = dump.as_xml()
     self._assert_xml_equal( built_xml, expected_xml )
Ejemplo n.º 4
0
    def publish_metadata(self, new_zips, exluded_zip=None):
        """
        (Re)write the published metadata: resource-dump.xml always, capability-list.xml only when missing.

        A resource dump found in the publish directory is parsed and extended; otherwise a new one is started.
        The entry for exluded_zip, when given, is deleted before the resources in new_zips are added.

        :param new_zips: iterable of newly created zip resources
        :param exluded_zip: local path to the zip file whose entry must disappear from the published resource dump
        :raises RuntimeError: if the entry for exluded_zip cannot be found in the resource dump
        """
        publish_url = self.publish_url
        publish_dir = self.publish_dir
        resource_dump_url = publish_url + RS_RESOURCE_DUMP_XML
        resource_dump_path = os.path.join(publish_dir, RS_RESOURCE_DUMP_XML)
        capability_list_url = publish_url + RS_CAPABILITY_LIST_XML
        capability_list_path = os.path.join(publish_dir, RS_CAPABILITY_LIST_XML)

        resource_dump = ResourceDump()

        dump_already_published = os.path.isfile(resource_dump_path)
        if dump_already_published:
            # Parse the published document into resource_dump.
            with open(resource_dump_path, "r") as published:
                parser = Sitemap()
                parser.parse_xml(published, resources=resource_dump)
        else:
            # Nothing published yet: set the start time and the link to the capability list.
            resource_dump.md_at = w3cdt.datetime_to_str(no_fractions=True)
            resource_dump.link_set(rel="up", href=capability_list_url)

        if exluded_zip:
            # Entries are keyed by their public location: publish url + zip file name.
            location = self.publish_url + os.path.basename(exluded_zip)
            if location not in resource_dump.resources:
                raise RuntimeError("Could not find %s in %s" % (location, resource_dump_path))
            del resource_dump.resources[location]

        for added in new_zips:
            resource_dump.add(added)

        # Set the completion time, then write resource-dump.xml.
        resource_dump.md_completed = w3cdt.datetime_to_str(no_fractions=True)
        with open(resource_dump_path, "w") as target:
            target.write(resource_dump.as_xml())

        # Decode the publish directory's base name (url-safe base64) to get the value reported as the graph.
        iri = base64.urlsafe_b64decode(os.path.basename(self.publish_dir)).rstrip("\n")

        print("New %s for graph %s" % (RS_RESOURCE_DUMP_XML, iri))
        print("See %s" % resource_dump_url)

        # Write capability-list.xml only when there is none yet.
        if os.path.isfile(capability_list_path):
            return

        capability_list = CapabilityList()
        capability_list.link_set(rel="up", href=self.src_desc_url)
        capability_list.add_capability(resource_dump, resource_dump_url)
        with open(capability_list_path, "w") as target:
            target.write(capability_list.as_xml())

        print("New %s. See %s" % (RS_CAPABILITY_LIST_XML, capability_list_url))
Ejemplo n.º 5
0
 def test01_as_xml(self):
     """Check that as_xml() of a two-resource ResourceDump contains the capability and resource a.zip."""
     resource_dump = ResourceDump()
     first = Resource('a.zip', timestamp=1)
     second = Resource('b.zip', timestamp=2)
     resource_dump.add(first)
     resource_dump.add(second)
     document = resource_dump.as_xml()

     # Capability attribute on the <rs:md> element.
     capability_pattern = r'<rs:md .*capability="resourcedump"'
     self.assertTrue(re.search(capability_pattern, document), 'XML has capability')

     # Full <url> entry for a.zip, including the lastmod derived from timestamp=1.
     resource_a_pattern = r'<url><loc>a.zip</loc><lastmod>1970-01-01T00:00:01Z</lastmod></url>'
     self.assertTrue(re.search(resource_a_pattern, document), 'XML has resource a')
Ejemplo n.º 6
0
 def test05_write(self):
     """Write a two-resource ResourceDump to a file under self.tmpdir and parse it back."""
     written = ResourceDump()
     for name, stamp in (('aa.zip', 1), ('bb.zip', 2)):
         written.add(Resource(name, timestamp=stamp))
     dump_file = os.path.join(self.tmpdir, "test05_dump.xml")
     written.write(basename=dump_file)
     self.assertTrue(os.path.exists(dump_file))
     # Round trip: a fresh ResourceDump parsed from the file must hold the same two uris.
     reread = ResourceDump()
     reread.parse(dump_file)
     self.assertEqual(len(reread), 2)
     self.assertEqual(reread.uris(), ['aa.zip', 'bb.zip'])
Ejemplo n.º 7
0
    def get_resource_dump_xml(self, from_date=None, to_date=None):
        """
        Build the resource dump document for this repository.

        One zip resource is listed for every item returned by get_items_by_index_tree whose '_updated'
        date is not outside the optional from_date / to_date bounds.

        :param from_date: optional lower bound; items updated before it are skipped
        :param to_date: optional upper bound; items updated after it are skipped
        :return: (xml) resource dump content, or None when self._validation() fails
        """
        if not self._validation():
            return None

        from .utils import parse_date
        lower_bound = parse_date(from_date) if from_date else from_date
        upper_bound = parse_date(to_date) if to_date else to_date

        items = get_items_by_index_tree(self.repository_id)
        dump = ResourceDump()
        dump.up = INVENIO_CAPABILITY_URL.format(request.url_root)
        for item in items:
            if not item:
                continue

            source = item.get('_source')
            updated = source.get('_updated')
            updated_date = parse_date(updated)
            # Skip items that fall outside the requested date window.
            if (lower_bound and lower_bound > updated_date) or \
                    (upper_bound and upper_bound < updated_date):
                continue

            item_id = source.get('control_number')
            zip_url = '{}resync/{}/{}/file_content.zip'.format(
                request.url_root, self.repository_id, str(item_id))
            entry = Resource(zip_url, lastmod=updated, ln=[])
            if self.resource_dump_manifest:
                # Link the zip to its manifest document.
                manifest_url = '{}resync/{}/{}/resourcedump_manifest.xml'.format(
                    request.url_root, self.repository_id, str(item_id))
                manifest_link = {
                    'rel': 'contents',
                    'href': manifest_url,
                    'type': 'application/xml'
                }
                entry.ln.append(manifest_link)
            dump.add(entry)
        return dump.as_xml()
Ejemplo n.º 8
0
 def test_build_ex_17(self):
     """Build a Resource Dump with three zip parts and metadata; compare it with 'resourcesync_ex_17'."""
     dump = ResourceDump()
     dump.up = 'http://example.com/dataset1/capabilitylist.xml'
     dump.md_at = "2013-01-03T09:00:00Z"
     dump.md_completed = "2013-01-03T09:04:00Z"
     # One row per zip part: (uri, length, md_at, md_completed, manifest href)
     parts = (
         ('http://example.com/resourcedump-part1.zip', 4765,
          "2013-01-03T09:00:00Z", "2013-01-03T09:02:00Z",
          "http://example.com/resourcedump_manifest-part1.xml"),
         ('http://example.com/resourcedump-part2.zip', 9875,
          "2013-01-03T09:01:00Z", "2013-01-03T09:03:00Z",
          "http://example.com/resourcedump_manifest-part2.xml"),
         ('http://example.com/resourcedump-part3.zip', 2298,
          "2013-01-03T09:03:00Z", "2013-01-03T09:04:00Z",
          "http://example.com/resourcedump_manifest-part3.xml"),
     )
     for uri, size, started, finished, manifest_href in parts:
         part = Resource( uri=uri,
                          mime_type="application/zip",
                          length=size,
                          md_at=started,
                          md_completed=finished )
         # Each part links to its own manifest.
         part.link_set( rel="contents",
                        href=manifest_href,
                        mime_type="application/xml" )
         dump.add( part )
     expected_xml = self._open_ex('resourcesync_ex_17').read()
     built_xml = dump.as_xml()
     self._assert_xml_equal( built_xml, expected_xml )
Ejemplo n.º 9
0
    def do_publish(self):
        """
        Package the resources found in resource_dir in zip files and publish the accompanying metadata.

        Resources are fetched in chunks. A chunk holding exactly max_files_compressed resources is packaged as a
        completed part; a final chunk that differs from the previously published end part is packaged as a new
        end part (when it is not empty). If anything changed, the metadata is republished and the files of the
        old end part are deleted.

        WARNING: resources packaged in a completed part are taken out of resource_dir: they are moved to
        publish_dir when move_resources is set, otherwise they are deleted.

        :return: (  boolean indicating if the published state changed,
                    amount of resources packaged in completed parts,
                    amount of resources in the new end part minus the amount in the old end part)
        """
        definitive_count = 0
        provisional_diff = 0
        old_end_zip, old_end_list = self.get_state_published()

        new_zips = ResourceDump()
        changed = False
        exhausted = False

        while not exhausted:
            chunk, exhausted = self.list_resources_chunk()

            if len(chunk) == self.max_files_compressed:  # complete zip
                changed = True
                definitive_count += len(chunk)
                completed_zip = self.create_zip(chunk,
                                                PREFIX_COMPLETED_PART, False,
                                                self.write_separate_manifest)
                new_zips.add(completed_zip)
                # take the packaged resources out of resource_dir
                for packaged in chunk:
                    packaged_path = os.path.join(self.resource_dir, packaged.path)
                    if self.move_resources:
                        shutil.move(packaged_path, self.publish_dir)
                    else:
                        os.remove(packaged_path)
                continue

            if self.is_same(chunk, old_end_list):
                # remainder equals what is already published as end part
                continue

            # An incomplete chunk is only expected as the last one.
            assert exhausted
            changed = True
            if len(chunk) > 0:
                provisional_diff += len(chunk)
                end_zip = self.create_zip(
                    chunk, PREFIX_END_PART, True,
                    self.write_separate_manifest)
                new_zips.add(end_zip)

        if changed:
            # publish new metadata. Exclude the old end zip
            self.publish_metadata(new_zips, old_end_zip)

            if old_end_zip:
                # remove old zip end file, resource list and manifest;
                # account for difference of resources provisionally packaged.
                provisional_diff -= len(old_end_list)
                os.remove(old_end_zip)
                os.remove(os.path.splitext(old_end_zip)[0] + ".xml")
                old_end_stem = os.path.splitext(
                    os.path.basename(old_end_zip))[0]
                manifest_name = PREFIX_MANIFEST + old_end_stem + ".xml"
                manifest_path = os.path.join(self.publish_dir, manifest_name)
                if os.path.isfile(manifest_path):
                    os.remove(manifest_path)

        return changed, definitive_count, provisional_diff
Ejemplo n.º 10
0
    def do_publish(self):
        """
        Publish the resources in resource_dir as zip packages together with their metadata.

        Chunks of resources are requested until the source is exhausted. Every chunk with exactly
        max_files_compressed resources becomes a completed part. A last chunk that is not the same as the
        previously published end part marks a state change and, unless empty, becomes the new end part.
        On a state change the metadata is published anew and the old end part's files are removed.

        WARNING: resources that end up in a completed part do not stay in resource_dir; depending on
        move_resources they are moved to publish_dir or deleted.

        :return: (  boolean indicating if the published state changed,
                    amount of resources packaged in completed parts,
                    amount of resources in the new end part minus the amount in the old end part)
        """
        packaged_definitively = 0
        end_part_delta = 0
        previous_end_zip, previous_end_resources = self.get_state_published()

        new_zips = ResourceDump()
        state_changed = False
        exhausted = False

        while not exhausted:
            batch, exhausted = self.list_resources_chunk()

            if len(batch) == self.max_files_compressed:  # complete zip
                state_changed = True
                packaged_definitively += len(batch)
                new_zips.add(
                    self.create_zip(batch, PREFIX_COMPLETED_PART, False, self.write_separate_manifest))
                # move or delete the packaged resources
                for item in batch:
                    source_path = os.path.join(self.resource_dir, item.path)
                    if self.move_resources:
                        shutil.move(source_path, self.publish_dir)
                    else:
                        os.remove(source_path)
                continue

            if self.is_same(batch, previous_end_resources):
                # nothing new compared to the published end part
                continue

            # Only the last chunk may hold fewer resources than a complete zip.
            assert exhausted
            state_changed = True
            if len(batch) > 0:
                end_part_delta += len(batch)
                new_zips.add(
                    self.create_zip(batch, PREFIX_END_PART, True, self.write_separate_manifest))

        if not state_changed:
            return state_changed, packaged_definitively, end_part_delta

        # publish new metadata. Exclude the previous end zip
        self.publish_metadata(new_zips, previous_end_zip)

        # remove old zip end file, resource list and manifest;
        # account for difference of resources provisionally packaged.
        if previous_end_zip:
            end_part_delta -= len(previous_end_resources)
            os.remove(previous_end_zip)
            os.remove(os.path.splitext(previous_end_zip)[0] + ".xml")
            zip_name = os.path.basename(previous_end_zip)
            manifest_file = os.path.join(
                self.publish_dir,
                PREFIX_MANIFEST + os.path.splitext(zip_name)[0] + ".xml")
            if os.path.isfile(manifest_file):
                os.remove(manifest_file)

        return state_changed, packaged_definitively, end_part_delta