Example #1
    def explore_uri(self, uri, caps):
        """Interactive exploration of document at uri

        Will flag warnings if the document is not of a type listed in caps
        """
        s=Sitemap()
        print "Reading %s" % (uri)
        try:
            list = s.parse_xml(urllib.urlopen(uri))
        except IOError as e:
            raise ClientFatalError("Cannot read %s (%s)" % (uri,str(e)))
        num_entries = len(list.resources)
        capability = '(unknown capability)'
        if ('capability' in list.md):
            capability = list.md['capability']
        if (s.parsed_index):
            capability += 'index'
        print "Parsed %s document with %d entries:" % (capability,num_entries)
        if (caps is not None and capability not in caps):
            print "WARNING - expected a %s document" % (','.join(caps))
        to_show = num_entries
        if (num_entries>21):
            to_show = 20
        # What entries are allowed?
        # FIXME - not complete
        entry_caps = []
        if (capability == 'capabilitylistindex'):
            entry_caps = ['capabilitylist']
        elif (capability == 'capabilitylist'):
            entry_caps = ['resourcelist','changelist','resourcedump','changedump','changelistindex']
        elif (capability == 'changelistindex'):
            entry_caps = ['changelist']
        n = 0
        options = {}
        for r in list.resources:
            if (n>=to_show):
                print "(not showing remaining %d entries)" % (num_entries-n)
                break
            n+=1
            options[str(n)]=r
            print "[%d] %s" % (n,r.uri)
            if (r.capability is not None):
                warning = ''
                if (r.capability not in entry_caps):
                    warning = " (EXPECTED %s)" % (' or '.join(entry_caps))
                print "  %s%s" % (r.capability,warning)
            elif (len(entry_caps)==1):
                r.capability=entry_caps[0]
                print "  capability not specified, should be %s" % (r.capability)
        while (True):
            inp = raw_input( "Follow [number or q(uit)]?" )
            if (inp in options.keys()):
                break
            if (inp == 'q'):
                return('','',inp)
        caps = [ options[inp].capability ]
        if (capability == 'capabilitylistindex'):
            # all links should be to capabilitylist documents
            if (caps is None):
                caps = ['capabilitylist']
        return( options[inp].uri, caps, inp )
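A hedged driver sketch for this method: explore_uri returns (uri, caps, inp), with inp == 'q' and empty uri/caps when the user quits, so a caller can keep following links until then. The client instance and the starting URI below are hypothetical.

    # Hypothetical driver loop; 'client' and the starting URI are illustrative only.
    uri, caps = 'http://example.com/capabilitylist.xml', ['capabilitylist']
    while True:
        uri, caps, inp = client.explore_uri(uri, caps)
        if inp == 'q':
            break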
Example #2
def sync_incremental(map, counter, base_url, from_date, to_date):
    """Run resync incremental."""
    # init_logging(verbose=True)
    from .resync import ResourceSyncClient
    client = ResourceSyncClient()
    client.ignore_failures = True
    try:
        single_sync_incremental(map, counter, base_url, from_date, to_date)
        return True
    except MapperError as e:
        current_app.logger.info(e)
        paths = map[0].rsplit('/', 1)
        map[0] = paths[0]
    except Exception as e:
        # maybe the url contains a changelist index instead of a single changelist
        current_app.logger.info(e)
        s = Sitemap()
        try:
            docs = s.parse_xml(url_or_file_open(base_url))
        except IOError as ioerror:
            raise ioerror
        if docs:
            for doc in docs:
                # make sure the sub url is a changelist/changedump
                capability = read_capability(doc.uri)
                if capability is None:
                    raise ValueError('Bad URL, not a changelist/changedump,'
                                     ' cannot sync incremental')
                if capability != 'changelist' and capability != 'changedump':
                    raise ValueError('Bad URL, not a changelist/changedump,'
                                     ' cannot sync incremental')
                single_sync_incremental(map, counter, doc.uri, from_date,
                                        to_date)
            return True
        raise e
Example #3
    def read_reference_sitemap(self, ref_sitemap, name="reference"):
        """Read reference sitemap and return the inventory

        The name parameter is used only in output messages to say what type
        of sitemap is being read.
        """
        sitemap = Sitemap(allow_multifile=self.allow_multifile, mapper=self.mapper)
        self.logger.info("Reading %s sitemap(s) from %s ..." % (name, ref_sitemap))
        i = sitemap.read(ref_sitemap)
        num_entries = len(i)
        self.logger.warning(
            "Read %s sitemap with %d entries in %d sitemaps" % (name, num_entries, sitemap.sitemaps_created)
        )
        if self.verbose:
            to_show = 100
            override_str = " (override with --max-sitemap-entries)"
            if self.max_sitemap_entries:
                to_show = self.max_sitemap_entries
                override_str = ""
            if num_entries > to_show:
                print "Showing first %d entries sorted by URI%s..." % (to_show, override_str)
            n = 0
            for r in i:
                print r
                n += 1
                if n >= to_show:
                    break
        return i
Example #4
 def parse_document(self):
     """Parse any ResourceSync document and show information
     
     Will use sitemap URI taken either from explicit self.sitemap_name
     or derived from the mappings supplied.
     """
     s=Sitemap()
     self.logger.info("Reading sitemap(s) from %s ..." % (self.sitemap))
     try:
         list = s.parse_xml(urllib.urlopen(self.sitemap))
     except IOError as e:
         raise ClientFatalError("Cannot read document (%s)" % str(e))
     num_entries = len(list.resources)
     capability = '(unknown capability)'
     if ('capability' in list.md):
         capability = list.md['capability']
     print "Parsed %s document with %d entries" % (capability,num_entries)
     if (self.verbose):
         to_show = 100
         override_str = ' (override with --max-sitemap-entries)'
         if (self.max_sitemap_entries):
             to_show = self.max_sitemap_entries
             override_str = ''
         if (num_entries>to_show):
             print "Showing first %d entries sorted by URI%s..." % (to_show,override_str)
         n=0
         for resource in list:
             print '[%d] %s' % (n,str(resource))
             n+=1
             if ( n >= to_show ):
                 break
Example #5
    def read_reference_sitemap(self, ref_sitemap, name='reference'):
        """Read reference sitemap and return the inventory

        The name parameter is used only in output messages to say what type
        of sitemap is being read.
        """
        sitemap = Sitemap(allow_multifile=self.allow_multifile,
                          mapper=self.mapper)
        self.logger.info("Reading %s sitemap(s) from %s ..." %
                         (name, ref_sitemap))
        i = sitemap.read(ref_sitemap)
        num_entries = len(i)
        self.logger.warning("Read %s sitemap with %d entries in %d sitemaps" %
                            (name, num_entries, sitemap.sitemaps_created))
        if (self.verbose):
            to_show = 100
            override_str = ' (override with --max-sitemap-entries)'
            if (self.max_sitemap_entries):
                to_show = self.max_sitemap_entries
                override_str = ''
            if (num_entries > to_show):
                print "Showing first %d entries sorted by URI%s..." % (
                    to_show, override_str)
            n = 0
            for r in i:
                print r
                n += 1
                if (n >= to_show):
                    break
        return (i)
 def test2_pretty_output(self):
     ib = InventoryBuilder()
     ib.mapper = Mapper(['http://example.org/t','resync/test/testdata/dir1'])
     i = ib.from_disk()
     s = Sitemap()
     s.pretty_xml=True
     self.assertEqual(s.inventory_as_xml(i),'<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://resourcesync.org/change/0.1">\n<url><loc>http://example.org/t/file_a</loc><lastmod>2012-03-14T17:46:04</lastmod><rs:size>20</rs:size></url>\n<url><loc>http://example.org/t/file_b</loc><lastmod>2012-03-14T17:46:25</lastmod><rs:size>45</rs:size></url>\n</urlset>' )
    def publish(self):
        """
        Try to publish, or clean up the zip end if something went wrong.

        :return: (  boolean indicating if change in sink directory or subdirectories,
                    amount of resources definitively packaged,
                    the difference of resources provisionally packaged)
        """
        if not os.path.isdir(self.resource_dir):
            os.makedirs(self.resource_dir)
            #print "Created %s" % self.resource_dir

        if not os.path.isdir(self.publish_dir):
            os.makedirs(self.publish_dir)
            #print "Created %s" % self.publish_dir

        try:
            return self.do_publish()
        except:
            # Something went wrong. Best we can do is clean up end of zip chain.
            zip_end_files = glob(
                os.path.join(self.publish_dir, PREFIX_END_PART + "*.zip"))
            for ze_file in zip_end_files:
                os.remove(ze_file)
                print "error recovery: removed %s" % ze_file

            zip_end_xmls = glob(
                os.path.join(self.publish_dir, PREFIX_END_PART + "*.xml"))
            for ze_xml in zip_end_xmls:
                os.remove(ze_xml)
                print "error recovery: removed %s" % ze_xml

            zip_end_manis = glob(
                os.path.join(self.publish_dir,
                             PREFIX_MANIFEST + PREFIX_END_PART + "*.xml"))
            for ze_mani in zip_end_manis:
                os.remove(ze_mani)
                print "error recovery: removed %s" % ze_mani

            # remove zip-end entries from resource-dump.xml
            rs_dump_path = os.path.join(self.publish_dir, RS_RESOURCE_DUMP_XML)
            rs_dump = ResourceDump()
            if os.path.isfile(rs_dump_path):
                with open(rs_dump_path, "r") as rs_dump_file:
                    sm = Sitemap()
                    sm.parse_xml(rs_dump_file, resources=rs_dump)

            prefix = self.publish_url + PREFIX_END_PART

            for uri in rs_dump.resources.keys():
                if uri.startswith(prefix):
                    del rs_dump.resources[uri]
                    print "error recovery: removed %s from %s" % (uri,
                                                                  rs_dump_path)

            with open(rs_dump_path, "w") as rs_dump_file:
                rs_dump_file.write(rs_dump.as_xml())

            print "error recovery: walk through error recovery completed. Now raising ..."
            raise
Example #8
    def test_19_parse_with_bad_rs_ln(self):
        xmlstart = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<rs:md capability="resourcelist"/>\
<url><loc>http://example.com/file_a</loc>'

        xmlend = '</url></urlset>'
        s = Sitemap()
        #
        # missing href
        xml = xmlstart + '<rs:ln rel="duplicate"/>' + xmlend
        self.assertRaises(SitemapParseError, s.parse_xml, fh=io.StringIO(xml))
        # missing rel
        xml = xmlstart + '<rs:ln href="http://example.com/"/>' + xmlend
        self.assertRaises(SitemapParseError, s.parse_xml, fh=io.StringIO(xml))
        # bad length
        xml = xmlstart + '<rs:ln rel="duplicate" href="http://example.com/" length="a"/>' + xmlend
        self.assertRaises(SitemapParseError, s.parse_xml, fh=io.StringIO(xml))
        # bad pri
        xml = xmlstart + '<rs:ln rel="duplicate" href="http://example.com/" pri="fff"/>' + xmlend
        self.assertRaises(SitemapParseError, s.parse_xml, fh=io.StringIO(xml))
        xml = xmlstart + '<rs:ln rel="duplicate" href="http://example.com/" pri="0"/>' + xmlend
        self.assertRaises(SitemapParseError, s.parse_xml, fh=io.StringIO(xml))
        xml = xmlstart + '<rs:ln rel="duplicate" href="http://example.com/" pri="1000000"/>' + xmlend
        self.assertRaises(SitemapParseError, s.parse_xml, fh=io.StringIO(xml))
        # and finally OK with errors fixed
        xml = xmlstart + '<rs:ln rel="duplicate" href="http://example.com/" length="12345" pri="1" other="whatever"/>' + xmlend
        rc = s.parse_xml(fh=io.StringIO(xml))
        self.assertEqual(len(rc.resources), 1,
                         'good at last, extra attribute ignored')
Example #9
    def test_18_parse_with_rs_ln_on_resource(self):
        xml='<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<rs:md capability=\"resourcelist\"/>\
<url>\
  <loc>http://example.com/file_a</loc><lastmod>2012-03-14T18:37:36Z</lastmod>\
  <rs:md hash=\"md5:r2d2\" length=\"12345\" />\
  <rs:ln rel=\"duplicate\" href=\"http://mirror1.example.com/res1\" modified=\"2013-01-02\" pri=\"1\" />\
  <rs:ln rel=\"num2\" href=\"http://m2.example.com/res1\"/>\
  <rs:ln rel=\"num3\" href=\"http://m3.example.com/res1\"/>\
</url>\
<url>\
  <loc>http://example.com/file_b</loc><lastmod>2012-03-14T18:37:36Z</lastmod>\
  <rs:md length=\"32\" />\
</url>\
</urlset>'
        s=Sitemap()
        rc=s.parse_xml(fh=io.StringIO(xml))
        self.assertFalse( s.parsed_index, 'was a sitemap')
        self.assertEqual( s.resources_created, 2, 'got 2 resources')
        i = iter(rc)
        r1 = next(i)
        r2 = next(i)
        self.assertEqual( r1.uri, 'http://example.com/file_a' )
        self.assertEqual( r1.ln[0]['rel'], 'duplicate' )
        self.assertEqual( r1.ln[0]['href'], 'http://mirror1.example.com/res1' )
        self.assertEqual( r1.ln[0]['modified'], '2013-01-02' )
        self.assertEqual( r1.ln[0]['pri'], 1 )
        self.assertEqual( r2.uri, 'http://example.com/file_b' )
Example #10
 def parse_document(self):
     """Parse any ResourceSync document and show information
     
     Will use sitemap URI taken either from explicit self.sitemap_name
     or derived from the mappings supplied.
     """
     s = Sitemap()
     self.logger.info("Reading sitemap(s) from %s ..." % (self.sitemap))
     try:
         list = s.parse_xml(urllib.urlopen(self.sitemap))
     except IOError as e:
         raise ClientFatalError("Cannot read document (%s)" % str(e))
     num_entries = len(list.resources)
     capability = '(unknown capability)'
     if ('capability' in list.md):
         capability = list.md['capability']
     print "Parsed %s document with %d entries" % (capability, num_entries)
     if (self.verbose):
         to_show = 100
         override_str = ' (override with --max-sitemap-entries)'
         if (self.max_sitemap_entries):
             to_show = self.max_sitemap_entries
             override_str = ''
         if (num_entries > to_show):
             print "Showing first %d entries sorted by URI%s..." % (
                 to_show, override_str)
         n = 0
         for resource in list:
             print '[%d] %s' % (n, str(resource))
             n += 1
             if (n >= to_show):
                 break
Example #11
    def test_19_parse_with_bad_rs_ln(self):
        xmlstart='<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<rs:md capability="resourcelist"/>\
<url><loc>http://example.com/file_a</loc>'
        xmlend='</url></urlset>'
        s=Sitemap()
        #
        # missing href
        xml=xmlstart+'<rs:ln rel="duplicate"/>'+xmlend
        self.assertRaises( SitemapParseError, s.parse_xml, fh=io.StringIO(xml))
        # missing rel
        xml=xmlstart+'<rs:ln href="http://example.com/"/>'+xmlend
        self.assertRaises( SitemapParseError, s.parse_xml, fh=io.StringIO(xml))
        # bad length
        xml=xmlstart+'<rs:ln rel="duplicate" href="http://example.com/" length="a"/>'+xmlend
        self.assertRaises( SitemapParseError, s.parse_xml, fh=io.StringIO(xml))
        # bad pri
        xml=xmlstart+'<rs:ln rel="duplicate" href="http://example.com/" pri="fff"/>'+xmlend
        self.assertRaises( SitemapParseError, s.parse_xml, fh=io.StringIO(xml))
        xml=xmlstart+'<rs:ln rel="duplicate" href="http://example.com/" pri="0"/>'+xmlend
        self.assertRaises( SitemapParseError, s.parse_xml, fh=io.StringIO(xml))
        xml=xmlstart+'<rs:ln rel="duplicate" href="http://example.com/" pri="1000000"/>'+xmlend
        self.assertRaises( SitemapParseError, s.parse_xml, fh=io.StringIO(xml))
        # and finally OK with errors fixed
        xml=xmlstart+'<rs:ln rel="duplicate" href="http://example.com/" length="12345" pri="1" other="whatever"/>'+xmlend
        rc = s.parse_xml(fh=io.StringIO(xml))
        self.assertEqual( len(rc.resources), 1, 'good at last, extra attribute ignored' )
    def get_state_published(self):
        """
        See if publish_dir has a zip end file. If so, return the path of the zip end file and the resourcelist
        (with local paths) of resources published in the zip end file.
        :return:    - the path to the zip end file or None if there is no zip end file.
                    - the resourcelist of resources published in zip end file or an empty list if there is no zip end file.
        """
        path_zip_end_old = None
        rl_end_old = ResourceList()

        zip_end_files = glob(os.path.join(self.publish_dir, PREFIX_END_PART + "*.zip"))
        if len(zip_end_files) > 1:
            raise RuntimeError(
                "Found more than one %s*.zip files. Inconsistent structure of %s." % (PREFIX_END_PART, self.publish_dir)
            )
        elif len(zip_end_files) == 1:
            path_zip_end_old = zip_end_files[0]

        if path_zip_end_old:
            rl_file = open(os.path.splitext(path_zip_end_old)[0] + ".xml", "r")
            sm = Sitemap()
            sm.parse_xml(rl_file, resources=rl_end_old)
            rl_file.close()

        return path_zip_end_old, rl_end_old
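A hedged sketch of consuming the return value documented above; the calling context and names are illustrative only:

    path_zip_end, rl_end = self.get_state_published()
    if path_zip_end is not None:
        print "resuming %s holding %d resources" % (path_zip_end, len(rl_end.resources))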
 def test2_pretty_output(self):
     ib = InventoryBuilder()
     ib.mapper = Mapper(['http://example.org/t','resync/test/testdata/dir1'])
     i = ib.from_disk()
     s = Sitemap()
     s.pretty_xml=True
     self.assertEqual(s.resources_as_xml(i),'<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\n<url><loc>http://example.org/t/file_a</loc><lastmod>2012-07-25T17:13:46Z</lastmod><rs:size>20</rs:size></url>\n<url><loc>http://example.org/t/file_b</loc><lastmod>2001-09-09T01:46:40Z</lastmod><rs:size>45</rs:size></url>\n</urlset>' )
Example #14
    def parse_xml(self,
                  fh=None,
                  etree=None,
                  resources=None,
                  capability=None,
                  sitemapindex=None):
        """Parse XML Sitemap and add to resources object.

        Reads from fh or etree and adds resources to a resources object
        (which must support the add method). Returns the resources object.

        Also sets self.resources_created to be the number of resources created.
        We adopt a very lax approach here. The parsing is properly namespace
        aware but we search just for the elements wanted and leave everything
        else alone.

        This method will read either sitemap or sitemapindex documents. Behavior
        depends on the sitemapindex parameter:
        - None - will read either
        - False - SitemapIndexError exception if sitemapindex detected
        - True - SitemapIndexError exception if sitemap detected

        Will set self.parsed_index based on whether a sitemap or sitemapindex
        document was read:
        - False - sitemap
        - True - sitemapindex
        """

        sitemap = Sitemap()
        self.res_container = sitemap.parse_xml(fh=fh,
                                               etree=etree,
                                               resources=resources,
                                               capability=capability,
                                               sitemapindex=sitemapindex)
        return self.res_container
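The tri-state sitemapindex parameter documented above is easiest to see with a tiny input. A minimal sketch, assuming Sitemap and SitemapIndexError are importable from resync.sitemap as in the tests elsewhere on this page:

import io
from resync.sitemap import Sitemap, SitemapIndexError

xml = ('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
       '<url><loc>http://example.com/res1</loc></url></urlset>')
s = Sitemap()
rc = s.parse_xml(fh=io.StringIO(xml))  # sitemapindex=None: accepts either form
print(s.parsed_index)                  # False: a plain sitemap was read
try:
    # sitemapindex=True insists on an index, so a plain sitemap raises
    s.parse_xml(fh=io.StringIO(xml), sitemapindex=True)
except SitemapIndexError:
    print('plain sitemap rejected when an index was required')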
    def get_state_published(self):
        """
        See if publish_dir has a zip end file. If so, return the path of the zip end file and the resourcelist
        (with local paths) of resources published in the zip end file.
        :return:    - the path to the zip end file or None if there is no zip end file.
                    - the resourcelist of resources published in zip end file or an empty list if there is no zip end file.
        """
        path_zip_end_old = None
        rl_end_old = ResourceList()

        zip_end_files = glob(
            os.path.join(self.publish_dir, PREFIX_END_PART + "*.zip"))
        if len(zip_end_files) > 1:
            raise RuntimeError(
                "Found more than one %s*.zip files. Inconsistent structure of %s."
                % (PREFIX_END_PART, self.publish_dir))
        elif len(zip_end_files) == 1:
            path_zip_end_old = zip_end_files[0]

        if path_zip_end_old:
            rl_file = open(os.path.splitext(path_zip_end_old)[0] + ".xml", "r")
            sm = Sitemap()
            sm.parse_xml(rl_file, resources=rl_end_old)
            rl_file.close()

        return path_zip_end_old, rl_end_old
Example #16
 def test_20_parse_sitemapindex_empty(self):
     s = Sitemap()
     si = s.sitemapindex_parse_xml(
         fh=StringIO.StringIO('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> </sitemapindex>')
     )
     self.assertEqual(s.sitemaps_created, 0, "0 sitemaps in sitemapindex")
     self.assertEqual(len(si.resources), 0, "0 sitemaps")
    def publish_metadata(self, new_zips, exluded_zip=None):
        """
        (Re)publish metadata with addition of new_zips. An excluded zip will be removed from previously published
        metadata.
        :param new_zips: a resourcelist with newly created zip resources
        :param exluded_zip: local path to zip file that will be removed from previously published metadata.
        """
        rs_dump_url = self.publish_url + RS_RESOURCE_DUMP_XML
        rs_dump_path = os.path.join(self.publish_dir, RS_RESOURCE_DUMP_XML)
        capa_list_url = self.publish_url + RS_CAPABILITY_LIST_XML
        capa_list_path = os.path.join(self.publish_dir, RS_CAPABILITY_LIST_XML)

        rs_dump = ResourceDump()

        # Load existing resource-dump, if any. Else set start time.
        if os.path.isfile(rs_dump_path):
            with open(rs_dump_path, "r") as rs_dump_file:
                sm = Sitemap()
                sm.parse_xml(rs_dump_file, resources=rs_dump)

        else:
            rs_dump.md_at = w3cdt.datetime_to_str(no_fractions=True)
            rs_dump.link_set(rel="up", href=capa_list_url)

        # Remove excluded zip, if any
        if exluded_zip:
            loc = self.publish_url + os.path.basename(exluded_zip)
            if loc in rs_dump.resources:
                del rs_dump.resources[loc]
            else:
                raise RuntimeError("Could not find %s in %s" %
                                   (loc, rs_dump_path))

        # Add new zips
        for resource in new_zips:
            rs_dump.add(resource)

        # Write resource-dump.xml
        rs_dump.md_completed = w3cdt.datetime_to_str(no_fractions=True)
        with open(rs_dump_path, "w") as rs_dump_file:
            rs_dump_file.write(rs_dump.as_xml())

        # There are several ways to decode base64, among them
        # iri = base64.b64decode(os.path.basename(self.publish_dir)).rstrip('\n')
        # iri = base64.b64decode(os.path.basename(self.publish_dir), '-_').rstrip('\n')
        iri = base64.urlsafe_b64decode(os.path.basename(
            self.publish_dir)).rstrip('\n')

        print "New %s for graph %s" % (RS_RESOURCE_DUMP_XML, iri)
        print "See %s" % rs_dump_url

        # Write capability-list.xml
        if not os.path.isfile(capa_list_path):
            capa_list = CapabilityList()
            capa_list.link_set(rel="up", href=self.src_desc_url)
            capa_list.add_capability(rs_dump, rs_dump_url)
            with open(capa_list_path, "w") as capa_list_file:
                capa_list_file.write(capa_list.as_xml())

            print "New %s. See %s" % (RS_CAPABILITY_LIST_XML, capa_list_url)
Example #18
    def test_18_parse_with_rs_ln_on_resource(self):
        xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<rs:md capability=\"resourcelist\"/>\
<url>\
  <loc>http://example.com/file_a</loc><lastmod>2012-03-14T18:37:36Z</lastmod>\
  <rs:md hash=\"md5:r2d2\" length=\"12345\" />\
  <rs:ln rel=\"duplicate\" href=\"http://mirror1.example.com/res1\" modified=\"2013-01-02\" pri=\"1\" />\
  <rs:ln rel=\"num2\" href=\"http://m2.example.com/res1\"/>\
  <rs:ln rel=\"num3\" href=\"http://m3.example.com/res1\"/>\
</url>\
<url>\
  <loc>http://example.com/file_b</loc><lastmod>2012-03-14T18:37:36Z</lastmod>\
  <rs:md length=\"32\" />\
</url>\
</urlset>'

        s = Sitemap()
        rc = s.parse_xml(fh=io.StringIO(xml))
        self.assertFalse(s.parsed_index, 'was a sitemap')
        self.assertEqual(s.resources_created, 2, 'got 2 resources')
        i = iter(rc)
        r1 = next(i)
        r2 = next(i)
        self.assertEqual(r1.uri, 'http://example.com/file_a')
        self.assertEqual(r1.ln[0]['rel'], 'duplicate')
        self.assertEqual(r1.ln[0]['href'], 'http://mirror1.example.com/res1')
        self.assertEqual(r1.ln[0]['modified'], '2013-01-02')
        self.assertEqual(r1.ln[0]['pri'], 1)
        self.assertEqual(r2.uri, 'http://example.com/file_b')
Example #19
 def write_static_inventory(self):
     """Writes the inventory to the filesystem"""
     # Generate sitemap in temp directory
     then = time.time()
     self.ensure_temp_dir(Source.TEMP_FILE_PATH)
     inventory = self.generate()
     basename = Source.TEMP_FILE_PATH + "/sitemap.xml"
     s=Sitemap()
     s.max_sitemap_entries=self.config['max_sitemap_entries']
     s.mapper=Mapper([self.source.base_uri, Source.TEMP_FILE_PATH])
     s.write(inventory, basename)
     # Delete old sitemap files; move the new ones; delete the temp dir
     self.rm_sitemap_files(Source.STATIC_FILE_PATH)
     self.mv_sitemap_files(Source.TEMP_FILE_PATH, Source.STATIC_FILE_PATH)
     shutil.rmtree(Source.TEMP_FILE_PATH)
     now = time.time()
     # Log Sitemap create start event
     sitemap_size = self.compute_sitemap_size(Source.STATIC_FILE_PATH)
     log_data = {'time': (now-then), 
                 'no_resources': self.source.resource_count}
     self.logger.info("Wrote static sitemap inventory. %s" % log_data)
      sm_write_end = ResourceChange(
              resource=ResourceChange(self.uri,
                                      size=sitemap_size,
                                      timestamp=then),
              changetype="UPDATED")
     self.source.notify_observers(sm_write_end)
 def read_sitemap(self, path, sitemap=None):
     if sitemap is None:
         sitemap = ListBaseWithIndex()
     with open(path, "r", encoding="utf-8") as file:
         sm = Sitemap()
         sm.parse_xml(file, resources=sitemap)
     return sitemap
 def test3_with_md5(self):
     ib = InventoryBuilder(do_md5=True)
     i = ib.from_disk('resync/test/testdata/dir1','http://example.org/t')
     s = Sitemap()
     xml = s.inventory_as_xml(i)
     self.assertNotEqual( None, re.search('<loc>http://example.org/t/file_a</loc><lastmod>[\w\:\-]+</lastmod><rs:size>20</rs:size><rs:md5>6bf26fd66601b528d2e0b47eaa87edfd</rs:md5>',xml), 'size/checksum for file_a')
     self.assertNotEqual( None, re.search('<loc>http://example.org/t/file_b</loc><lastmod>[\w\:\-]+</lastmod><rs:size>45</rs:size><rs:md5>452e54bdae1626ac5d6e7be81b39de21</rs:md5>',xml), 'size/checksum for file_b' )
Example #22
 def test_20_parse_sitemapindex_empty(self):
     s = Sitemap()
     si = s.parse_xml(fh=io.StringIO(
         '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> </sitemapindex>'
     ),
                      sitemapindex=True)
     self.assertTrue(s.parsed_index, 'was a sitemapindex')
     self.assertEqual(len(si.resources), 0, '0 sitemaps')
 def test3_with_md5(self):
     ib = InventoryBuilder(do_md5=True)
     ib.mapper = Mapper(['http://example.org/t','resync/test/testdata/dir1'])
     i = ib.from_disk()
     s = Sitemap()
     xml = s.resources_as_xml(i)
     self.assertNotEqual( None, re.search('<loc>http://example.org/t/file_a</loc><lastmod>[\w\:\-]+Z</lastmod><rs:size>20</rs:size><rs:fixity type="md5">a/Jv1mYBtSjS4LR\+qoft/Q==</rs:fixity>',xml) ) #must escape + in md5
     self.assertNotEqual( None, re.search('<loc>http://example.org/t/file_b</loc><lastmod>[\w\:\-]+Z</lastmod><rs:size>45</rs:size><rs:fixity type="md5">RS5Uva4WJqxdbnvoGzneIQ==</rs:fixity>',xml) )
Example #24
 def test_22_parse_sitemapindex_file(self):
     s=Sitemap()
     fh=open('tests/testdata/sitemapindex1/sitemap.xml','r')
     si = s.parse_xml( fh=fh, sitemapindex=True )
     self.assertTrue( s.parsed_index, 'was a sitemapindex')
     self.assertEqual( len(si.resources), 3, '3 sitemaps')
     sms = sorted(si.uris())
     self.assertEqual( sms, ['http://localhost:8888/sitemap00000.xml','http://localhost:8888/sitemap00001.xml','http://localhost:8888/sitemap00002.xml'] )
    def publish_metadata(self, new_zips, exluded_zip=None):
        """
        (Re)publish metadata with addition of new_zips. An excluded zip will be removed from previously published
        metadata.
        :param new_zips: a resourcelist with newly created zip resources
        :param exluded_zip: local path to zip file that will be removed from previously published metadata.
        """
        rs_dump_url = self.publish_url + RS_RESOURCE_DUMP_XML
        rs_dump_path = os.path.join(self.publish_dir, RS_RESOURCE_DUMP_XML)
        capa_list_url = self.publish_url + RS_CAPABILITY_LIST_XML
        capa_list_path = os.path.join(self.publish_dir, RS_CAPABILITY_LIST_XML)

        rs_dump = ResourceDump()

        # Load existing resource-dump, if any. Else set start time.
        if os.path.isfile(rs_dump_path):
            with open(rs_dump_path, "r") as rs_dump_file:
                sm = Sitemap()
                sm.parse_xml(rs_dump_file, resources=rs_dump)

        else:
            rs_dump.md_at = w3cdt.datetime_to_str(no_fractions=True)
            rs_dump.link_set(rel="up", href=capa_list_url)

        # Remove excluded zip, if any
        if exluded_zip:
            loc = self.publish_url + os.path.basename(exluded_zip)
            if loc in rs_dump.resources:
                del rs_dump.resources[loc]
            else:
                raise RuntimeError("Could not find %s in %s" % (loc, rs_dump_path))

        # Add new zips
        for resource in new_zips:
            rs_dump.add(resource)

        # Write resource-dump.xml
        rs_dump.md_completed = w3cdt.datetime_to_str(no_fractions=True)
        with open(rs_dump_path, "w") as rs_dump_file:
            rs_dump_file.write(rs_dump.as_xml())

        # There are several ways to decode base64, among them
        # iri = base64.b64decode(os.path.basename(self.publish_dir)).rstrip('\n')
        # iri = base64.b64decode(os.path.basename(self.publish_dir), '-_').rstrip('\n')
        iri = base64.urlsafe_b64decode(os.path.basename(self.publish_dir)).rstrip("\n")

        print "New %s for graph %s" % (RS_RESOURCE_DUMP_XML, iri)
        print "See %s" % rs_dump_url

        # Write capability-list.xml
        if not os.path.isfile(capa_list_path):
            capa_list = CapabilityList()
            capa_list.link_set(rel="up", href=self.src_desc_url)
            capa_list.add_capability(rs_dump, rs_dump_url)
            with open(capa_list_path, "w") as capa_list_file:
                capa_list_file.write(capa_list.as_xml())

            print "New %s. See %s" % (RS_CAPABILITY_LIST_XML, capa_list_url)
Example #26
 def test2_pretty_output(self):
     ib = InventoryBuilder()
     i = ib.from_disk('resync/test/testdata/dir1', 'http://example.org/t')
     s = Sitemap()
     s.pretty_xml = True
     self.assertEqual(
         s.inventory_as_xml(i),
         '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://resourcesync.org/change/0.1">\n<url><loc>http://example.org/t/file_a</loc><lastmod>2012-03-14T17:46:04</lastmod><rs:size>20</rs:size></url>\n<url><loc>http://example.org/t/file_b</loc><lastmod>2012-03-14T17:46:25</lastmod><rs:size>45</rs:size></url>\n</urlset>'
     )
Example #27
 def test_22_parse_sitemapindex_file(self):
     s=Sitemap()
     fh=open('resync/test/testdata/sitemapindex1/sitemap.xml')
     si = s.sitemapindex_parse_xml( fh=fh )
     self.assertEqual( s.sitemaps_created, 3, '3 sitemaps in sitemapindex')
     self.assertEqual( len(si.resources), 3, '3 sitemaps')
     sms = sorted(si.resources.keys())
     self.assertEqual( sms, ['http://localhost:8888/sitemap00000.xml','http://localhost:8888/sitemap00001.xml','http://localhost:8888/sitemap00002.xml'] )
     self.assertEqual( si.resources['http://localhost:8888/sitemap00000.xml'].lastmod, '2012-06-13T18:09:13Z' )
Example #28
 def save_sitemap(self, sitemap, path):
     # writing the string sitemap.as_xml() to disk results in encoding=ASCII on some systems.
     # due to https://docs.python.org/3.4/library/xml.etree.elementtree.html#write
     sitemap.default_capability()
     with open(path, "wb") as f:
         s = Sitemap(pretty_xml=self.para.is_saving_pretty_xml)
         s.resources_as_xml(sitemap,
                            sitemapindex=sitemap.sitemapindex,
                            fh=f)
Example #29
    def test_11_parse_2(self):
        xml='<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:size>12</rs:size></url>\
<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:size>32</rs:size></url>\
</urlset>'
        s=Sitemap()
        i=s.resourcelist_parse_xml(fh=StringIO.StringIO(xml))
        self.assertEqual( s.resources_created, 2, 'got 2 resources')
Example #30
    def test_11_parse_2(self):
        xml = "<?xml version='1.0' encoding='UTF-8'?>\n\
<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:rs=\"http://resourcesync.org/change/0.1\">\
<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36</lastmod><rs:size>12</rs:size></url>\
<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36</lastmod><rs:size>32</rs:size></url>\
</urlset>"
        s = Sitemap()
        i = s.inventory_parse_xml(fh=StringIO.StringIO(xml))
        self.assertEqual(s.resources_created, 2, "got 2 resources")
Example #31
 def test_22_parse_sitemapindex_file(self):
     s=Sitemap()
     fh=open('resync/test/testdata/sitemapindex1/sitemap.xml')
     si = s.sitemapindex_parse_xml( fh=fh )
     self.assertEqual( s.sitemaps_created, 3, '3 sitemaps in sitemapindex')
     self.assertEqual( len(si.resources), 3, '3 sitemaps')
     sms = sorted(si.resources.keys())
     self.assertEqual( sms, ['http://localhost:8888/sitemap00000.xml','http://localhost:8888/sitemap00001.xml','http://localhost:8888/sitemap00002.xml'] )
     self.assertEqual( si.resources['http://localhost:8888/sitemap00000.xml'].lastmod, '2012-06-13T18:09:13' )
Example #32
    def test_11_parse_2(self):
        xml='<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://resourcesync.org/change/0.1">\
<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36</lastmod><rs:size>12</rs:size></url>\
<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36</lastmod><rs:size>32</rs:size></url>\
</urlset>'
        s=Sitemap()
        i=s.inventory_parse_xml(fh=StringIO.StringIO(xml))
        self.assertEqual( s.resources_created, 2, 'got 2 resources')
 def test_ex2_1(self):
     """ex2_1 is a simple resourcelist with 2 resources, no metadata"""
     s=Sitemap()
     fh=open('resync/test/testdata/examples_from_spec/ex2_1.xml')
     si = s.resourcelist_parse_xml( fh=fh )
     self.assertEqual( len(si.resources), 2, '2 resources')
     sms = sorted(si.resources.keys())
     self.assertEqual( sms, ['http://example.com/res1','http://example.com/res2'] )
     self.assertEqual( si.resources['http://example.com/res1'].lastmod, None )
Example #34
    def read_source(self):
        """
        Read the source_uri and parse it to source_document.
        :return: True if the document was downloaded and parsed without exceptions, False otherwise.
        """
        session = requests.Session()
        try:
            response = session.get(self.source_uri)
            self.source_status = response.status_code
            self.logger.debug("Read %s, status %s" % (self.source_uri, str(self.source_status)))
            assert self.source_status == 200, "Invalid response status: %d" % self.source_status

            text = response.text

            root = ET.fromstring(text)
            self.is_index = root.tag == SITEMAP_INDEX_ROOT

            etree = ET.ElementTree(root)
            sitemap = Sitemap()
            self.source_document = sitemap.parse_xml(etree=etree)
            # the source_document is a resync.resource_container.ResourceContainer
            capability = self.source_document.capability
            assert capability == self.capability, "Capability is not %s but %s" % (self.capability, capability)
            # anyone interested in sitemaps?
            for processor_listener in processor_listeners:
                processor_listener.event_sitemap_received(self.source_uri, capability, text)

            self.describedby_url = self.source_document.describedby
            self.up_url = self.source_document.up  # to a parent non-index document
            self.index_url = self.source_document.index  # to a parent index document
            self.status = Status.document

        except requests.exceptions.ConnectionError as err:
            self.logger.debug("%s No connection: %s" % (self.source_uri, str(err)))
            self.status = Status.read_error
            self.__report__(err)

        except xml.etree.ElementTree.ParseError as err:
            self.logger.debug("%s ParseError: %s" % (self.source_uri, str(err)))
            self.status = Status.read_error
            self.__report__(err)

        except resync.sitemap.SitemapParseError as err:
            self.logger.debug("%s Unreadable source: %s" % (self.source_uri, str(err)))
            self.status = Status.read_error
            self.__report__(err)

        except AssertionError as err:
            self.logger.debug("%s Error: %s" % (self.source_uri, str(err)))
            self.status = Status.read_error
            self.__report__(err)

        finally:
            session.close()

        return self.status == Status.document
Example #35
    def test_11_parse_2(self):
        xml='<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://resourcesync.org/change/0.1">\
<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36</lastmod><rs:size>12</rs:size></url>\
<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36</lastmod><rs:size>32</rs:size></url>\
</urlset>'
        fh=StringIO.StringIO(xml)
        s=Sitemap()
        i=s.inventory_parse_xml(fh)
        self.assertEqual( s.resources_added, 2, 'got 2 resources')
    def synchronize(self):
        """
        Publish the resources found in source_dir to sink_dir in accordance with the ResourceSync Framework.
        """
        if not os.path.isdir(self.source_dir):
            os.makedirs(self.source_dir)
            print "Created %s" % self.source_dir

        if not os.path.isdir(self.sink_dir):
            os.makedirs(self.sink_dir)
            print "Created %s" % self.sink_dir

        self.handshake = self.verify_handshake()
        if self.handshake is None:
            return
        ####################

        # print "Synchronizing state as of %s" % self.handshake

        ### initial resource description
        wellknown = os.path.join(self.sink_dir, RS_WELL_KNOWN)
        if not os.path.isdir(wellknown):
            os.makedirs(wellknown)

        src_desc = SourceDescription()
        new_src_desc = True
        # Load existing resource-description, if any.
        if os.path.isfile(self.src_desc_path):
            new_src_desc = False
            with open(self.src_desc_path, "r") as src_desc_file:
                sm = Sitemap()
                sm.parse_xml(src_desc_file, resources=src_desc)

        count_lists = len(src_desc.resources)

        ### resources in subdirectories or main directory
        ### the existence of FILE_INDEX indicates whether resources reside directly in source_dir or in subdirectories.
        index_file = os.path.join(self.source_dir, FILE_INDEX)
        if os.path.isfile(index_file):
            for dirname in os.walk(self.source_dir).next()[1]:
                source = os.path.join(self.source_dir, dirname)
                sink = os.path.join(self.sink_dir, dirname)
                publish_url = self.publish_url + dirname + "/"
                self.__execute_sync__(source, sink, publish_url, src_desc)
        else:
            self.__execute_sync__(self.source_dir, self.sink_dir,
                                  self.publish_url, src_desc)

        if new_src_desc or count_lists != len(src_desc.resources):
            ### publish resource description
            with open(self.src_desc_path, "w") as src_desc_file:
                src_desc_file.write(src_desc.as_xml())
                print "New resource description. See %s" % self.src_desc_url

        self.report()
    def publish(self):
        """
        Try to publish, or clean up the zip end if something went wrong.

        :return: (  boolean indicating if change in sink directory or subdirectories,
                    amount of resources definitively packaged,
                    the difference of resources provisionally packaged)
        """
        if not os.path.isdir(self.resource_dir):
            os.makedirs(self.resource_dir)
            # print "Created %s" % self.resource_dir

        if not os.path.isdir(self.publish_dir):
            os.makedirs(self.publish_dir)
            # print "Created %s" % self.publish_dir

        try:
            return self.do_publish()
        except:
            # Something went wrong. Best we can do is clean up end of zip chain.
            zip_end_files = glob(os.path.join(self.publish_dir, PREFIX_END_PART + "*.zip"))
            for ze_file in zip_end_files:
                os.remove(ze_file)
                print "error recovery: removed %s" % ze_file

            zip_end_xmls = glob(os.path.join(self.publish_dir, PREFIX_END_PART + "*.xml"))
            for ze_xml in zip_end_xmls:
                os.remove(ze_xml)
                print "error recovery: removed %s" % ze_xml

            zip_end_manis = glob(os.path.join(self.publish_dir, PREFIX_MANIFEST + PREFIX_END_PART + "*.xml"))
            for ze_mani in zip_end_manis:
                os.remove(ze_mani)
                print "error recovery: removed %s" % ze_mani

            # remove zip-end entries from resource-dump.xml
            rs_dump_path = os.path.join(self.publish_dir, RS_RESOURCE_DUMP_XML)
            rs_dump = ResourceDump()
            if os.path.isfile(rs_dump_path):
                with open(rs_dump_path, "r") as rs_dump_file:
                    sm = Sitemap()
                    sm.parse_xml(rs_dump_file, resources=rs_dump)

            prefix = self.publish_url + PREFIX_END_PART

            for uri in rs_dump.resources.keys():
                if uri.startswith(prefix):
                    del rs_dump.resources[uri]
                    print "error recovery: removed %s from %s" % (uri, rs_dump_path)

            with open(rs_dump_path, "w") as rs_dump_file:
                rs_dump_file.write(rs_dump.as_xml())

            print "error recovery: walk through error recovery completed. Now raising ..."
            raise
Example #38
    def test_11_parse_2(self):
        xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length=\"12\" /></url>\
<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length=\"32\" /></url>\
</urlset>'

        s = Sitemap()
        i = s.parse_xml(fh=io.StringIO(xml))
        self.assertFalse(s.parsed_index, 'was a sitemap')
        self.assertEqual(s.resources_created, 2, 'got 2 resources')
Example #39
def read_capability(url):
    """Read capability of an url."""
    s = Sitemap()
    capability = None
    try:
        document = s.parse_xml(url_or_file_open(url))
    except IOError as e:
        raise e
    if 'capability' in document.md:
        capability = document.md['capability']
    return capability
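A short, hypothetical usage sketch for read_capability; the URL is illustrative and the guard mirrors the checks made in sync_incremental above:

capability = read_capability('http://example.com/changelist.xml')  # hypothetical URL
if capability in ('changelist', 'changedump'):
    print('safe to sync incrementally')
else:
    print('unexpected capability: %s' % capability)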
 def test2_pretty_output(self):
     ib = InventoryBuilder()
     ib.mapper = Mapper(
         ['http://example.org/t', 'resync/test/testdata/dir1'])
     i = ib.from_disk()
     s = Sitemap()
     s.pretty_xml = True
     self.assertEqual(
         s.resources_as_xml(i),
         '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\n<url><loc>http://example.org/t/file_a</loc><lastmod>2012-07-25T17:13:46Z</lastmod><rs:size>20</rs:size></url>\n<url><loc>http://example.org/t/file_b</loc><lastmod>2001-09-09T01:46:40Z</lastmod><rs:size>45</rs:size></url>\n</urlset>'
     )
    def synchronize(self):
        """
        Publish the resources found in source_dir to sink_dir in accordance with the ResourceSync Framework.
        """
        if not os.path.isdir(self.source_dir):
            os.makedirs(self.source_dir)
            print "Created %s" % self.source_dir

        if not os.path.isdir(self.sink_dir):
            os.makedirs(self.sink_dir)
            print "Created %s" % self.sink_dir

        self.handshake = self.verify_handshake()
        if self.handshake is None:
            return
        ####################

        # print "Synchronizing state as of %s" % self.handshake

        ### initial resource description
        wellknown = os.path.join(self.sink_dir, RS_WELL_KNOWN)
        if not os.path.isdir(wellknown):
            os.makedirs(wellknown)

        src_desc = SourceDescription()
        new_src_desc = True
        # Load existing resource-description, if any.
        if os.path.isfile(self.src_desc_path):
            new_src_desc = False
            with open(self.src_desc_path, "r") as src_desc_file:
                sm = Sitemap()
                sm.parse_xml(src_desc_file, resources=src_desc)

        count_lists = len(src_desc.resources)

        ### resources in subdirectories or main directory
        ### the existence of FILE_INDEX indicates whether resources reside directly in source_dir or in subdirectories.
        index_file = os.path.join(self.source_dir, FILE_INDEX)
        if os.path.isfile(index_file):
            for dirname in os.walk(self.source_dir).next()[1]:
                source = os.path.join(self.source_dir, dirname)
                sink = os.path.join(self.sink_dir, dirname)
                publish_url = self.publish_url + dirname + "/"
                self.__execute_sync__(source, sink, publish_url, src_desc)
        else:
            self.__execute_sync__(self.source_dir, self.sink_dir, self.publish_url, src_desc)

        if new_src_desc or count_lists != len(src_desc.resources):
            ### publish resource description
            with open(self.src_desc_path, "w") as src_desc_file:
                src_desc_file.write(src_desc.as_xml())
                print "New resource description. See %s" % self.src_desc_url

        self.report()
Example #42
 def write_static_inventory(self):
     """Writes the inventory to the filesystem"""
     self.generate()
     self.delete_sitemap_files()
     basename = Source.STATIC_FILE_PATH + "/sitemap.xml"
     then = time.time()
     s=Sitemap()
     s.max_sitemap_entries=self.config['max_sitemap_entries']
     s.mapper=Mapper([self.source.base_uri, Source.STATIC_FILE_PATH])
     s.write(self, basename)
     now = time.time()
     print "Wrote static sitemap in %s seconds" % str(now-then)
Example #43
 def test_21_parse_sitemapindex(self):
     s=Sitemap()
     si = s.parse_xml( fh=io.StringIO('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"><sitemap><loc>aaa</loc></sitemap><sitemap><loc>bbb</loc></sitemap></sitemapindex>'), sitemapindex=True )
     self.assertEqual( len(si.resources), 2, '2 sitemaps')
     sms = sorted(si.uris())
     self.assertEqual( sms, ['aaa','bbb'] )
     # add a couple more
     s.parse_xml( fh=io.StringIO('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"><sitemap><loc>cc</loc></sitemap><sitemap><loc>dd</loc></sitemap></sitemapindex>'), resources=si )
     self.assertTrue( s.parsed_index, 'was a sitemapindex')
     self.assertEqual( len(si.resources), 4, '4 sitemaps total')
     sms = sorted(si.uris())
     self.assertEqual( sms, ['aaa','bbb', 'cc', 'dd'] )
Example #44
 def write_changeset(self):
     """Writes all cached changes to a file; empties the cache"""
     then = time.time()
     changeset = self.generate()
     basename = Source.STATIC_FILE_PATH + "/" + self.current_changeset_file()
     s=Sitemap()
     s.max_sitemap_entries=self.config['max_sitemap_entries']
     s.mapper=Mapper([self.source.base_uri, Source.STATIC_FILE_PATH])
     s.write(changeset, basename)
     now = time.time()
     self.previous_changeset_id = self.previous_changeset_id + 1
     self.logger.info("Wrote static changeset..")
 def test_ex2_2(self):
     """ex2_2 is a simple resourcelist with 2 resources, some metadata"""
     s=Sitemap()
     fh=open('resync/test/testdata/examples_from_spec/ex2_2.xml')
     si = s.resourcelist_parse_xml( fh=fh )
     self.assertEqual( len(si.resources), 2, '2 resources')
     sms = sorted(si.resources.keys())
     self.assertEqual( sms, ['http://example.com/res1','http://example.com/res2'] )
     self.assertEqual( si.resources['http://example.com/res1'].lastmod, '2013-01-02T14:00:00Z' )
     self.assertEqual( si.resources['http://example.com/res2'].lastmod, '2013-01-02T13:00:00Z' )
     self.assertEqual( si.resources['http://example.com/res1'].md5, '1584abdf8ebdc9802ac0c6a7402c03b6' )
     self.assertEqual( si.resources['http://example.com/res2'].md5, '1e0d5cb8ef6ba40c99b14c0237be735e' )
Example #46
 def test_22_parse_sitemapindex_file(self):
     s = Sitemap()
     fh = open('tests/testdata/sitemapindex1/sitemap.xml', 'r')
     si = s.parse_xml(fh=fh, sitemapindex=True)
     self.assertTrue(s.parsed_index, 'was a sitemapindex')
     self.assertEqual(len(si.resources), 3, '3 sitemaps')
     sms = sorted(si.uris())
     self.assertEqual(sms, [
         'http://localhost:8888/sitemap00000.xml',
         'http://localhost:8888/sitemap00001.xml',
         'http://localhost:8888/sitemap00002.xml'
     ])
 def test_ex2_3(self):
     """ex2_3 is a simple changelist with 2 resources"""
     s=Sitemap()
     fh=open('resync/test/testdata/examples_from_spec/ex2_3.xml')
     si = s.resourcelist_parse_xml( fh=fh )
     self.assertEqual( len(si.resources), 2, '2 resources')
     sms = sorted(si.resources.keys())
     self.assertEqual( sms, ['http://example.com/res2.pdf','http://example.com/res3.tiff'] )
     self.assertEqual( si.resources['http://example.com/res2.pdf'].lastmod, '2013-01-02T18:00:00Z' )
     self.assertEqual( si.resources['http://example.com/res3.tiff'].lastmod, '2013-01-02T13:00:00Z' )
     self.assertEqual( si.resources['http://example.com/res2.pdf'].change, 'updated' )
     self.assertEqual( si.resources['http://example.com/res3.tiff'].change, 'deleted' )
Example #48
 def test_21_parse_sitemapindex(self):
     s=Sitemap()
     si = s.sitemapindex_parse_xml( fh=StringIO.StringIO('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"><sitemap><loc>aaa</loc></sitemap><sitemap><loc>bbb</loc></sitemap></sitemapindex>') )
     self.assertEqual( s.sitemaps_created, 2, '2 sitemaps in sitemapindex')
     self.assertEqual( len(si.resources), 2, '2 sitemaps')
     sms = sorted(si.resources.keys())
     self.assertEqual( sms, ['aaa','bbb'] )
     # add a couple more
     s.sitemapindex_parse_xml( fh=StringIO.StringIO('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"><sitemap><loc>cc</loc></sitemap><sitemap><loc>dd</loc></sitemap></sitemapindex>'), sitemapindex=si )
     self.assertEqual( s.sitemaps_created, 2, '2 sitemaps created to sitemapindex')
     self.assertEqual( len(si.resources), 4, '4 sitemaps total')
     sms = sorted(si.resources.keys())
     self.assertEqual( sms, ['aaa','bbb', 'cc', 'dd'] )
Example #49
 def explore_links_get(self, uri, seen={}):
     # Check we haven't been here before
     if (uri in seen):
         self.logger.warning("Already seen %s, skipping" % (uri))
     s = Sitemap(allow_multifile=self.allow_multifile)
     self.logger.info("Reading sitemap from %s ..." % (uri))
     i = s.read(uri, index_only=True)
     self.logger.warning("Read %s from %s" % (s.read_type, uri))
     links = self.extract_links(i, verbose=True)
     if ('next' in links and links['next'] == uri):
         self.logger.warning("- self reference \"next\" link")
     seen[uri] = links
     return (s.changeset_read, links)
Example #50
 def test_21_parse_sitemapindex(self):
     s=Sitemap()
     si = s.sitemapindex_parse_xml( fh=StringIO.StringIO('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"><sitemap><loc>aaa</loc></sitemap><sitemap><loc>bbb</loc></sitemap></sitemapindex>') )
     self.assertEqual( s.sitemaps_created, 2, '2 sitemaps in sitemapindex')
     self.assertEqual( len(si.resources), 2, '2 sitemaps')
     sms = sorted(si.resources.keys())
     self.assertEqual( sms, ['aaa','bbb'] )
     # add a couple more
     s.sitemapindex_parse_xml( fh=StringIO.StringIO('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"><sitemap><loc>cc</loc></sitemap><sitemap><loc>dd</loc></sitemap></sitemapindex>'), sitemapindex=si )
     self.assertEqual( s.sitemaps_created, 2, '2 sitemaps created to sitemapindex')
     self.assertEqual( len(si.resources), 4, '4 sitemaps total')
     sms = sorted(si.resources.keys())
     self.assertEqual( sms, ['aaa','bbb', 'cc', 'dd'] )
Example #51
 def explore_links_get(self, uri, seen={}):
     # Check we haven't been here before
     if uri in seen:
         self.logger.warning("Already seen %s, skipping" % (uri))
     s = Sitemap(allow_multifile=self.allow_multifile)
     self.logger.info("Reading sitemap from %s ..." % (uri))
     i = s.read(uri, index_only=True)
     self.logger.warning("Read %s from %s" % (s.read_type, uri))
     links = self.extract_links(i, verbose=True)
     if "next" in links and links["next"] == uri:
         self.logger.warning('- self reference "next" link')
     seen[uri] = links
     return (s.changeset_read, links)
Example #52
 def changeset_sitemap(self,
                       outfile=None,
                       ref_sitemap=None,
                       capabilities=None,
                       dump=None):
     # 1. Get and parse reference sitemap
     rs = Sitemap(verbose=self.verbose,
                  allow_multifile=self.allow_multifile,
                  mapper=self.mapper)
     if (self.verbose):
         print "Reading sitemap(s) from %s ..." % (ref_sitemap)
     ri = rs.read(ref_sitemap)
     num_entries = len(ri)
     print "Read reference sitemap with %d entries in %d sitemaps" % (
         num_entries, rs.sitemaps_created)
     if (self.verbose):
         to_show = 100
         override_str = ' (override with --max-sitemap-entries)'
         if (self.max_sitemap_entries):
             to_show = self.max_sitemap_entries
             override_str = ''
         if (num_entries > to_show):
             print "Showing first %d entries sorted by URI%s..." % (
                 to_show, override_str)
         n = 0
         for r in ri.resource_uris():
             print ri.resources[r]
             n += 1
             if (n >= to_show):
                 break
     # 2. Set up base_path->base_uri mappings, get inventory from disk
     disk_inventory = self.inventory
     # 3. Calculate changeset
     (num_same, updated, deleted, created) = ri.compare(disk_inventory)
     changeset = Inventory()
     changeset.capabilities = capabilities
     changeset.add(disk_inventory.changeset(updated, changetype='updated'))
     changeset.add(ri.changeset(deleted, changetype='deleted'))
     changeset.add(disk_inventory.changeset(created, changetype='created'))
     # 4. Write out changeset
     s = Sitemap(verbose=self.verbose,
                 pretty_xml=True,
                 allow_multifile=self.allow_multifile,
                 mapper=self.mapper)
     if (self.max_sitemap_entries is not None):
         s.max_sitemap_entries = self.max_sitemap_entries
     if (outfile is None):
         print s.inventory_as_xml(changeset)
     else:
         s.write(changeset, basename=outfile)
     self.write_dump_if_requested(changeset, dump)
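Steps 3 and 4 are the heart of the method: compare two inventories, then serialize the differences. Reduced to a minimal sketch using only calls that appear above (ref and disk are assumed to be already-populated Inventory instances):

    # Sketch of the compare/changeset flow; same early resync API as above.
    (num_same, updated, deleted, created) = ref.compare(disk)
    changeset = Inventory()
    changeset.add(disk.changeset(updated, changetype='updated'))
    changeset.add(ref.changeset(deleted, changetype='deleted'))
    changeset.add(disk.changeset(created, changetype='created'))
    print Sitemap(pretty_xml=True).inventory_as_xml(changeset)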
Exemple #53
0
    def test_10_sitemap(self):
        xml = "<?xml version='1.0' encoding='UTF-8'?>\n\
<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:rs=\"http://resourcesync.org/change/0.1\">\
<url><loc>http://e.com/a</loc><lastmod>2012-03-14T18:37:36</lastmod><rs:size>12</rs:size><rs:md5>aabbccdd</rs:md5></url>\
</urlset>"
        s = Sitemap()
        i = s.inventory_parse_xml(fh=StringIO.StringIO(xml))
        self.assertEqual(s.resources_created, 1, "got 1 resource")
        r = i.resources["http://e.com/a"]
        self.assertTrue(r is not None, "got the uri expected")
        self.assertEqual(r.uri, "http://e.com/a")
        self.assertEqual(r.lastmod, "2012-03-14T18:37:36")
        self.assertEqual(r.size, 12)
        self.assertEqual(r.md5, "aabbccdd")
Exemple #54
0
def get_from_date_from_url(url):
    """Get the smallest timestamp in the document at url as a YYYY-MM-DD string."""
    s = Sitemap()
    # An IOError from a bad URL propagates to the caller.
    document = s.parse_xml(url_or_file_open(url))
    # 'dt' is an alias for datetime.datetime.
    date_list = [item.timestamp for item in document.resources
                 if item.timestamp]
    if date_list:
        from_date = dt.fromtimestamp(min(date_list))
        return from_date.strftime("%Y-%m-%d")
    return None  # no resource carried a timestamp
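A usage sketch; the changelist URL is a placeholder:

    # Hypothetical call; yields e.g. '2012-03-14', or None when no
    # resource in the document carries a timestamp.
    from_date = get_from_date_from_url('https://example.org/changelist.xml')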
Exemple #56
0
    def all_resources(self):
        all_resources = {}

        # search for resourcelists
        resourcelist_files = sorted(
            glob(self.paras.abs_metadata_path("resourcelist_*.xml")))
        for rl_file_name in resourcelist_files:
            resourcelist = ResourceList()
            with open(rl_file_name, "r", encoding="utf-8") as rl_file:
                sm = Sitemap()
                sm.parse_xml(rl_file, resources=resourcelist)

            all_resources.update({
                resource.uri: resource
                for resource in resourcelist.resources
            })

        # search for changelists
        changelist_files = sorted(
            glob(self.paras.abs_metadata_path("changelist_*.xml")))
        for cl_file_name in changelist_files:
            changelist = ChangeList()
            with open(cl_file_name, "r", encoding="utf-8") as cl_file:
                sm = Sitemap()
                sm.parse_xml(cl_file, resources=changelist)

            for resource in changelist.resources:
                if resource.change in ("created", "updated"):
                    all_resources[resource.uri] = resource
                elif resource.change == "deleted" and resource.uri in all_resources:
                    del all_resources[resource.uri]

        return all_resources
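The resourcelists establish a baseline keyed by URI, and the changelists, applied in sorted filename order, replay creates, updates and deletes over it, so the returned dict reflects the latest known state of every resource. A usage sketch (the instance name is hypothetical):

    # Hypothetical usage; 'executor' stands for an instance of the class
    # above, with self.paras pointing at the metadata directory.
    resources = executor.all_resources()
    print("%d resources after replaying all changelists" % len(resources))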
Exemple #57
0
    def test_30_parse_changeset(self):
        xml='<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://resourcesync.org/change/0.1">\
<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36</lastmod><rs:size>12</rs:size><rs:changetype>UP</rs:changetype></url>\
<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36</lastmod><rs:size>32</rs:size><rs:changeid>123</rs:changeid></url>\
</urlset>'
        s=Sitemap()
        s.resource_class=ResourceChange
        i=s.inventory_parse_xml(fh=StringIO.StringIO(xml))
        self.assertEqual( s.resources_created, 2, 'got 2 resources')
        self.assertEqual( i.resources['/tmp/rs_test/src/file_a'].changetype, 'UP' )
        self.assertEqual( i.resources['/tmp/rs_test/src/file_a'].changeid, None )
        self.assertEqual( i.resources['/tmp/rs_test/src/file_b'].changetype, None )
        self.assertEqual( i.resources['/tmp/rs_test/src/file_b'].changeid, '123' )
Exemple #58
0
    def explore_uri(self, uri, checks, caps, show_back=True):
        """Interactive exploration of document at uri

        Will flag warnings if the document is not of type listed in caps
        """
        s = Sitemap()
        print "Reading %s" % (uri)
        options = {}
        capability = None
        try:
            if (caps == 'resource'):
                self.explore_show_head(uri, check_headers=checks)
            else:
                list = s.parse_xml(urllib.urlopen(uri))
                (options, capability) = self.explore_show_summary(
                    list, s.parsed_index, caps)
        except IOError as e:
            print "Cannot read %s (%s)\nGoing back" % (uri, str(e))
            return ('', '', '', 'b')
        except Exception as e:
            print "Cannot parse %s (%s)\nGoing back" % (uri, str(e))
            return ('', '', '', 'b')
        while (True):
            # don't offer number option for no resources/capabilities
            num_prompt = '' if (len(options) == 0) else 'number, '
            up_prompt = 'b(ack), ' if (show_back) else ''
            inp = raw_input("Follow [%s%sq(uit)]?" % (num_prompt, up_prompt))
            if (inp in options.keys()):
                break
            if (inp == 'q' or inp == 'b'):
                return ('', '', '', inp)
        checks = {}
        if (options[inp].capability is None):
            if (capability == 'capabilitylistindex'):
                # all links should be to capabilitylist documents
                caps = ['capabilitylist']
            elif (capability in [
                    'resourcelist', 'changelist', 'resourcedump', 'changedump'
            ]):
                caps = 'resource'
        else:
            r = options[inp]
            caps = [r.capability]
            if (r.length is not None):
                checks['content-length'] = r.length
            if (r.lastmod is not None):
                checks['last-modified'] = r.lastmod
            # FIXME - could do sanity check here and issue warnings if odd
        return (options[inp].uri, checks, caps, inp)
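A sketch of an outer loop that could drive this explorer, keeping a history stack so the b(ack) option works; the start URI and the loop itself are assumptions, not library code:

    # Hypothetical REPL around explore_uri; 'client' is assumed to be
    # an instance of the class above, the start URI a placeholder.
    (uri, checks, caps) = ('https://example.org/capabilitylist.xml',
                           None, ['capabilitylist'])
    history = []
    while True:
        (next_uri, next_checks, next_caps, inp) = client.explore_uri(
            uri, checks, caps, show_back=(len(history) > 0))
        if (inp == 'q'):
            break
        elif (inp == 'b'):
            (uri, checks, caps) = history.pop()
        else:
            history.append((uri, checks, caps))
            (uri, checks, caps) = (next_uri, next_checks, next_caps)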
Exemple #59
0
 def test3_with_md5(self):
     ib = InventoryBuilder(do_md5=True)
     i = ib.from_disk('resync/test/testdata/dir1', 'http://example.org/t')
     s = Sitemap()
     xml = s.inventory_as_xml(i)
     self.assertNotEqual(
         None,
         re.search(
             r'<loc>http://example.org/t/file_a</loc><lastmod>[\w\:\-]+</lastmod><rs:size>20</rs:size><rs:md5>6bf26fd66601b528d2e0b47eaa87edfd</rs:md5>',
             xml), 'size/checksum for file_a')
     self.assertNotEqual(
         None,
         re.search(
             r'<loc>http://example.org/t/file_b</loc><lastmod>[\w\:\-]+</lastmod><rs:size>45</rs:size><rs:md5>452e54bdae1626ac5d6e7be81b39de21</rs:md5>',
             xml), 'size/checksum for file_b')
Exemple #60
0
 def test_01_print(self):
     i = Inventory()
     i.add( Resource(uri='a',lastmod='2001-01-01',size=1234) )
     i.capabilities['http://example.org/changeset1'] = \
         {"type": "changeset", "attributes": ["self next"]}
     self.assertEqual( len(i.capabilities), 1 )
     self.assertEqual( Sitemap().resources_as_xml(i, capabilities=i.capabilities), '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/" xmlns:xhtml="http://www.w3.org/1999/xhtml"><xhtml:link href="http://example.org/changeset1" rel="self next" type="changeset" /><url><loc>a</loc><lastmod>2001-01-01T00:00:00Z</lastmod><rs:size>1234</rs:size></url></urlset>' )