def get_state_published(self):
    """
    See if publish_dir has a zip end file. If so, return the path of the
    zip end file and the resourcelist (with local paths) of resources
    published in the zip end file.

    :return: - the path to the zip end file or None if there is no zip end file.
             - the resourcelist of resources published in zip end file or an
               empty list if there is no zip end file.
    :raises RuntimeError: if more than one zip end file is found.
    """
    path_zip_end_old = None
    rl_end_old = ResourceList()
    zip_end_files = glob(os.path.join(self.publish_dir, PREFIX_END_PART + "*.zip"))
    if len(zip_end_files) > 1:
        # More than one end-of-chain zip means the publish structure is corrupt.
        raise RuntimeError(
            "Found more than one %s*.zip files. Inconsistent structure of %s."
            % (PREFIX_END_PART, self.publish_dir))
    elif len(zip_end_files) == 1:
        path_zip_end_old = zip_end_files[0]

    if path_zip_end_old:
        # The sitemap describing the zip contents sits next to it as <name>.xml.
        # Use a context manager so the file is closed even when parse_xml raises
        # (the original leaked the handle on a parse error).
        with open(os.path.splitext(path_zip_end_old)[0] + ".xml", "r") as rl_file:
            sm = Sitemap()
            sm.parse_xml(rl_file, resources=rl_end_old)

    return path_zip_end_old, rl_end_old
def publish(self):
    """
    Try and publish or remove zip end if something went wrong.

    :return: ( boolean indicating if change in sink directory or subdirectories,
        amount of resources definitively packaged,
        the difference of resources provisionally packaged)
    """
    if not os.path.isdir(self.resource_dir):
        os.makedirs(self.resource_dir)
    if not os.path.isdir(self.publish_dir):
        os.makedirs(self.publish_dir)
    try:
        return self.do_publish()
    except:  # noqa: E722 -- deliberately broad: clean up, then always re-raise
        # Something went wrong. Best we can do is clean up end of zip chain.
        # Same removal order as before: end zips, end xmls, end manifests.
        for pattern in (PREFIX_END_PART + "*.zip",
                        PREFIX_END_PART + "*.xml",
                        PREFIX_MANIFEST + PREFIX_END_PART + "*.xml"):
            for ze_path in glob(os.path.join(self.publish_dir, pattern)):
                os.remove(ze_path)
                print("error recovery: removed %s" % ze_path)

        # remove zip-end entries from resource-dump.xml
        rs_dump_path = os.path.join(self.publish_dir, RS_RESOURCE_DUMP_XML)
        rs_dump = ResourceDump()
        if os.path.isfile(rs_dump_path):
            with open(rs_dump_path, "r") as rs_dump_file:
                sm = Sitemap()
                sm.parse_xml(rs_dump_file, resources=rs_dump)
            prefix = self.publish_url + PREFIX_END_PART
            # Iterate over a copy of the keys: deleting entries while
            # iterating the live dict view raises RuntimeError on Python 3.
            for uri in list(rs_dump.resources.keys()):
                if uri.startswith(prefix):
                    del rs_dump.resources[uri]
                    print("error recovery: removed %s from %s" % (uri, rs_dump_path))
            with open(rs_dump_path, "w") as rs_dump_file:
                rs_dump_file.write(rs_dump.as_xml())

        print("error recovery: walk through error recovery completed. Now raising ...")
        raise
def test33_write(self):
    # ResourceList: write a plain list, read it back, and check that it is
    # not flagged as a sitemapindex.
    resource_list = ResourceList()
    for suffix in ('a', 'b', 'c'):
        resource_list.add(
            Resource(uri='http://example.com/test/' + suffix, timestamp=1))
    list_path = os.path.join(self.tmpdir, 'test33_write_resourcelist.xml')
    resource_list.write(basename=list_path)
    with open(list_path, 'r') as fh:
        parser = Sitemap()
        parser.parse_xml(fh=fh)
        self.assertFalse(parser.parsed_index)
    # ResourceListIndex: the same round trip with sitemapindex set must be
    # recognized as an index on re-parse.
    index = ResourceList()
    for num in ('00000', '00001', '00002'):
        index.add(Resource(
            uri='http://example.com/test/resourcelist%s.xml' % num,
            timestamp=1))
    index.sitemapindex = True
    index_path = os.path.join(self.tmpdir, 'test33_write_resourcelist-index.xml')
    index.write(basename=index_path)
    with open(index_path, 'r') as fh:
        parser = Sitemap()
        parser.parse_xml(fh=fh)
        self.assertTrue(parser.parsed_index)
def read_sitemap(self, path, sitemap=None):
    """Parse the sitemap file at *path* into *sitemap* and return it.

    When no container is supplied, a fresh ListBaseWithIndex is used.
    """
    target = ListBaseWithIndex() if sitemap is None else sitemap
    with open(path, "r", encoding="utf-8") as source:
        Sitemap().parse_xml(source, resources=target)
    return target
def get_state_published(self):
    """
    See if publish_dir has a zip end file. If so, return the path of the
    zip end file and the resourcelist (with local paths) of resources
    published in the zip end file.

    :return: - the path to the zip end file or None if there is no zip end file.
             - the resourcelist of resources published in zip end file or an
               empty list if there is no zip end file.
    :raises RuntimeError: if more than one zip end file is found.
    """
    path_zip_end_old = None
    rl_end_old = ResourceList()
    zip_end_files = glob(
        os.path.join(self.publish_dir, PREFIX_END_PART + "*.zip"))
    if len(zip_end_files) > 1:
        # More than one end-of-chain zip means the publish structure is corrupt.
        raise RuntimeError(
            "Found more than one %s*.zip files. Inconsistent structure of %s."
            % (PREFIX_END_PART, self.publish_dir))
    elif len(zip_end_files) == 1:
        path_zip_end_old = zip_end_files[0]

    if path_zip_end_old:
        # The sitemap describing the zip contents sits next to it as <name>.xml.
        # Use a context manager so the file is closed even when parse_xml raises
        # (the original leaked the handle on a parse error).
        with open(os.path.splitext(path_zip_end_old)[0] + ".xml", "r") as rl_file:
            sm = Sitemap()
            sm.parse_xml(rl_file, resources=rl_end_old)

    return path_zip_end_old, rl_end_old
def publish_metadata(self, new_zips, exluded_zip=None):
    """
    (Re)publish metadata with addition of new_zips. An excluded zip will be
    removed from previously published metadata.

    :param new_zips: a resourcelist with newly created zip resources
    :param exluded_zip: local path to zip file that will be removed from
        previously published metadata.
    :raises RuntimeError: if exluded_zip is not listed in the current dump.
    """
    rs_dump_url = self.publish_url + RS_RESOURCE_DUMP_XML
    rs_dump_path = os.path.join(self.publish_dir, RS_RESOURCE_DUMP_XML)
    capa_list_url = self.publish_url + RS_CAPABILITY_LIST_XML
    capa_list_path = os.path.join(self.publish_dir, RS_CAPABILITY_LIST_XML)

    rs_dump = ResourceDump()
    # Load existing resource-dump, if any. Else set start time.
    if os.path.isfile(rs_dump_path):
        with open(rs_dump_path, "r") as rs_dump_file:
            sm = Sitemap()
            sm.parse_xml(rs_dump_file, resources=rs_dump)
    else:
        rs_dump.md_at = w3cdt.datetime_to_str(no_fractions=True)
        rs_dump.link_set(rel="up", href=capa_list_url)

    # Remove excluded zip, if any
    if exluded_zip:
        loc = self.publish_url + os.path.basename(exluded_zip)
        if loc in rs_dump.resources:
            del rs_dump.resources[loc]
        else:
            raise RuntimeError("Could not find %s in %s" % (loc, rs_dump_path))

    # Add new zips
    for resource in new_zips:
        rs_dump.add(resource)

    # Write resource-dump.xml
    rs_dump.md_completed = w3cdt.datetime_to_str(no_fractions=True)
    with open(rs_dump_path, "w") as rs_dump_file:
        rs_dump_file.write(rs_dump.as_xml())

    # The publish_dir name is the urlsafe-base64-encoded graph IRI.
    # NOTE(review): on Python 3 urlsafe_b64decode returns bytes, so
    # .rstrip('\n') with a str argument would raise TypeError there --
    # this assumes Python 2 str semantics; confirm before porting.
    iri = base64.urlsafe_b64decode(os.path.basename(
        self.publish_dir)).rstrip('\n')
    print("New %s for graph %s" % (RS_RESOURCE_DUMP_XML, iri))
    print("See %s" % rs_dump_url)

    # Write capability-list.xml only the first time; it does not change.
    if not os.path.isfile(capa_list_path):
        capa_list = CapabilityList()
        capa_list.link_set(rel="up", href=self.src_desc_url)
        capa_list.add_capability(rs_dump, rs_dump_url)
        with open(capa_list_path, "w") as capa_list_file:
            capa_list_file.write(capa_list.as_xml())
        print("New %s. See %s" % (RS_CAPABILITY_LIST_XML, capa_list_url))
def all_resources(self):
    """Return {uri: resource} reflecting the latest known state.

    Resourcelists are replayed first; changelists are then applied on top,
    honoring created/updated/deleted changes in file order.
    """
    state = {}

    def _parse(file_name, container):
        # One sitemap file -> populated container.
        with open(file_name, "r", encoding="utf-8") as xml_file:
            Sitemap().parse_xml(xml_file, resources=container)
        return container

    # search for resourcelists
    for rl_name in sorted(glob(self.paras.abs_metadata_path("resourcelist_*.xml"))):
        for res in _parse(rl_name, ResourceList()).resources:
            state[res.uri] = res

    # search for changelists
    for cl_name in sorted(glob(self.paras.abs_metadata_path("changelist_*.xml"))):
        for res in _parse(cl_name, ChangeList()).resources:
            if res.change in ("created", "updated"):
                state[res.uri] = res
            elif res.change == "deleted" and res.uri in state:
                del state[res.uri]

    return state
def publish_metadata(self, new_zips, exluded_zip=None):
    """
    (Re)publish metadata with addition of new_zips. An excluded zip will be
    removed from previously published metadata.

    :param new_zips: a resourcelist with newly created zip resources
    :param exluded_zip: local path to zip file that will be removed from
        previously published metadata.
    :raises RuntimeError: if exluded_zip is not listed in the current dump.
    """
    rs_dump_url = self.publish_url + RS_RESOURCE_DUMP_XML
    rs_dump_path = os.path.join(self.publish_dir, RS_RESOURCE_DUMP_XML)
    capa_list_url = self.publish_url + RS_CAPABILITY_LIST_XML
    capa_list_path = os.path.join(self.publish_dir, RS_CAPABILITY_LIST_XML)

    rs_dump = ResourceDump()
    # Load existing resource-dump, if any. Else set start time.
    if os.path.isfile(rs_dump_path):
        with open(rs_dump_path, "r") as rs_dump_file:
            sm = Sitemap()
            sm.parse_xml(rs_dump_file, resources=rs_dump)
    else:
        rs_dump.md_at = w3cdt.datetime_to_str(no_fractions=True)
        rs_dump.link_set(rel="up", href=capa_list_url)

    # Remove excluded zip, if any
    if exluded_zip:
        loc = self.publish_url + os.path.basename(exluded_zip)
        if loc in rs_dump.resources:
            del rs_dump.resources[loc]
        else:
            raise RuntimeError("Could not find %s in %s" % (loc, rs_dump_path))

    # Add new zips
    for resource in new_zips:
        rs_dump.add(resource)

    # Write resource-dump.xml
    rs_dump.md_completed = w3cdt.datetime_to_str(no_fractions=True)
    with open(rs_dump_path, "w") as rs_dump_file:
        rs_dump_file.write(rs_dump.as_xml())

    # The publish_dir name is the urlsafe-base64-encoded graph IRI.
    # NOTE(review): on Python 3 urlsafe_b64decode returns bytes, so
    # .rstrip("\n") with a str argument would raise TypeError there --
    # this assumes Python 2 str semantics; confirm before porting.
    iri = base64.urlsafe_b64decode(os.path.basename(self.publish_dir)).rstrip("\n")
    print("New %s for graph %s" % (RS_RESOURCE_DUMP_XML, iri))
    print("See %s" % rs_dump_url)

    # Write capability-list.xml only the first time; it does not change.
    if not os.path.isfile(capa_list_path):
        capa_list = CapabilityList()
        capa_list.link_set(rel="up", href=self.src_desc_url)
        capa_list.add_capability(rs_dump, rs_dump_url)
        with open(capa_list_path, "w") as capa_list_file:
            capa_list_file.write(capa_list.as_xml())
        print("New %s. See %s" % (RS_CAPABILITY_LIST_XML, capa_list_url))
def synchronize(self):
    """
    Publish the resources found in source_dir in accordance with the
    Resourcesync Framework in sink_dir.
    """
    if not os.path.isdir(self.source_dir):
        os.makedirs(self.source_dir)
        print("Created %s" % self.source_dir)
    if not os.path.isdir(self.sink_dir):
        os.makedirs(self.sink_dir)
        print("Created %s" % self.sink_dir)

    self.handshake = self.verify_handshake()
    if self.handshake is None:
        return

    ### initial resource description
    wellknown = os.path.join(self.sink_dir, RS_WELL_KNOWN)
    if not os.path.isdir(wellknown):
        os.makedirs(wellknown)

    src_desc = SourceDescription()
    new_src_desc = True
    # Load existing resource-description, if any.
    if os.path.isfile(self.src_desc_path):
        new_src_desc = False
        with open(self.src_desc_path, "r") as src_desc_file:
            sm = Sitemap()
            sm.parse_xml(src_desc_file, resources=src_desc)
    count_lists = len(src_desc.resources)

    ### resources in subdirectories or main directory
    ### the existence of FILE_INDEX indicates whether resources reside
    ### directly in source_dir or in subdirectories.
    index_file = os.path.join(self.source_dir, FILE_INDEX)
    if os.path.isfile(index_file):
        # next(os.walk(...)) works on Python 2 and 3; the generator method
        # .next() the original used is Python-2-only.
        for dirname in next(os.walk(self.source_dir))[1]:
            source = os.path.join(self.source_dir, dirname)
            sink = os.path.join(self.sink_dir, dirname)
            publish_url = self.publish_url + dirname + "/"
            self.__execute_sync__(source, sink, publish_url, src_desc)
    else:
        self.__execute_sync__(self.source_dir, self.sink_dir,
                              self.publish_url, src_desc)

    if new_src_desc or count_lists != len(src_desc.resources):
        ### publish resource description
        with open(self.src_desc_path, "w") as src_desc_file:
            src_desc_file.write(src_desc.as_xml())
        print("New resource description. See %s" % self.src_desc_url)

    self.report()
def publish(self):
    """
    Try and publish or remove zip end if something went wrong.

    :return: ( boolean indicating if change in sink directory or subdirectories,
        amount of resources definitively packaged,
        the difference of resources provisionally packaged)
    """
    if not os.path.isdir(self.resource_dir):
        os.makedirs(self.resource_dir)
    if not os.path.isdir(self.publish_dir):
        os.makedirs(self.publish_dir)
    try:
        return self.do_publish()
    except:  # noqa: E722 -- deliberately broad: clean up, then always re-raise
        # Something went wrong. Best we can do is clean up end of zip chain.
        # Same removal order as before: end zips, end xmls, end manifests.
        for pattern in (PREFIX_END_PART + "*.zip",
                        PREFIX_END_PART + "*.xml",
                        PREFIX_MANIFEST + PREFIX_END_PART + "*.xml"):
            for ze_path in glob(os.path.join(self.publish_dir, pattern)):
                os.remove(ze_path)
                print("error recovery: removed %s" % ze_path)

        # remove zip-end entries from resource-dump.xml
        rs_dump_path = os.path.join(self.publish_dir, RS_RESOURCE_DUMP_XML)
        rs_dump = ResourceDump()
        if os.path.isfile(rs_dump_path):
            with open(rs_dump_path, "r") as rs_dump_file:
                sm = Sitemap()
                sm.parse_xml(rs_dump_file, resources=rs_dump)
            prefix = self.publish_url + PREFIX_END_PART
            # Iterate over a copy of the keys: deleting entries while
            # iterating the live dict view raises RuntimeError on Python 3.
            for uri in list(rs_dump.resources.keys()):
                if uri.startswith(prefix):
                    del rs_dump.resources[uri]
                    print("error recovery: removed %s from %s" % (uri, rs_dump_path))
            with open(rs_dump_path, "w") as rs_dump_file:
                rs_dump_file.write(rs_dump.as_xml())

        print("error recovery: walk through error recovery completed. Now raising ...")
        raise
def synchronize(self):
    """
    Publish the resources found in source_dir in accordance with the
    Resourcesync Framework in sink_dir.
    """
    if not os.path.isdir(self.source_dir):
        os.makedirs(self.source_dir)
        print("Created %s" % self.source_dir)
    if not os.path.isdir(self.sink_dir):
        os.makedirs(self.sink_dir)
        print("Created %s" % self.sink_dir)

    self.handshake = self.verify_handshake()
    if self.handshake is None:
        return

    ### initial resource description
    wellknown = os.path.join(self.sink_dir, RS_WELL_KNOWN)
    if not os.path.isdir(wellknown):
        os.makedirs(wellknown)

    src_desc = SourceDescription()
    new_src_desc = True
    # Load existing resource-description, if any.
    if os.path.isfile(self.src_desc_path):
        new_src_desc = False
        with open(self.src_desc_path, "r") as src_desc_file:
            sm = Sitemap()
            sm.parse_xml(src_desc_file, resources=src_desc)
    count_lists = len(src_desc.resources)

    ### resources in subdirectories or main directory
    ### the existence of FILE_INDEX indicates whether resources reside
    ### directly in source_dir or in subdirectories.
    index_file = os.path.join(self.source_dir, FILE_INDEX)
    if os.path.isfile(index_file):
        # next(os.walk(...)) works on Python 2 and 3; the generator method
        # .next() the original used is Python-2-only.
        for dirname in next(os.walk(self.source_dir))[1]:
            source = os.path.join(self.source_dir, dirname)
            sink = os.path.join(self.sink_dir, dirname)
            publish_url = self.publish_url + dirname + "/"
            self.__execute_sync__(source, sink, publish_url, src_desc)
    else:
        self.__execute_sync__(self.source_dir, self.sink_dir,
                              self.publish_url, src_desc)

    if new_src_desc or count_lists != len(src_desc.resources):
        ### publish resource description
        with open(self.src_desc_path, "w") as src_desc_file:
            src_desc_file.write(src_desc.as_xml())
        print("New resource description. See %s" % self.src_desc_url)

    self.report()
def test_21_parse_sitemapindex(self):
    parser = Sitemap()
    first_xml = ('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
                 '<sitemap><loc>aaa</loc></sitemap>'
                 '<sitemap><loc>bbb</loc></sitemap></sitemapindex>')
    index = parser.parse_xml(fh=io.StringIO(first_xml), sitemapindex=True)
    self.assertEqual(len(index.resources), 2, '2 sitemaps')
    self.assertEqual(sorted(index.uris()), ['aaa', 'bbb'])
    # add a couple more
    second_xml = ('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
                  '<sitemap><loc>cc</loc></sitemap>'
                  '<sitemap><loc>dd</loc></sitemap></sitemapindex>')
    parser.parse_xml(fh=io.StringIO(second_xml), resources=index)
    self.assertTrue(parser.parsed_index, 'was a sitemapindex')
    self.assertEqual(len(index.resources), 4, '4 sitemaps total')
    self.assertEqual(sorted(index.uris()), ['aaa', 'bbb', 'cc', 'dd'])
def explore_uri(self, uri, caps): """Interactive exploration of document at uri Will flag warnings if the document is not of type listed in caps """ s=Sitemap() print "Reading %s" % (uri) try: list = s.parse_xml(urllib.urlopen(uri)) except IOError as e: raise ClientFatalError("Cannot read %s (%s)" % (uri,str(e))) num_entries = len(list.resources) capability = '(unknown capability)' if ('capability' in list.md): capability = list.md['capability'] if (s.parsed_index): capability += 'index' print "Parsed %s document with %d entries:" % (capability,num_entries) if (caps is not None and capability not in caps): print "WARNING - expected a %s document" % (','.join(caps)) to_show = num_entries if (num_entries>21): to_show = 20 # What entries are allowed? # FIXME - not complete if (capability == 'capabilitylistindex'): entry_caps = ['capabilitylist'] elif (capability == 'capabilitylist'): entry_caps = ['resourcelist','changelist','resourcedump','changedump','changelistindex'] elif (capability == 'changelistindex'): entry_caps = ['changelist'] n = 0 options = {} for r in list.resources: if (n>=to_show): print "(not showing remaining %d entries)" % (num_entries-n) last n+=1 options[str(n)]=r print "[%d] %s" % (n,r.uri) if (r.capability is not None): warning = '' if (r.capability not in entry_caps): warning = " (EXPECTED %s)" % (' or '.join(entry_caps)) print " %s%s" % (r.capability,warning) elif (len(entry_caps)==1): r.capability=entry_caps[0] print " capability not specified, should be %s" % (r.capability) while (True): inp = raw_input( "Follow [number or q(uit)]?" ) if (inp in options.keys()): break if (inp == 'q'): return('','',inp) caps = [ options[inp].capability ] if (capability == 'capabilitylistindex'): # all links should be to capabilitylist documents if (caps is None): caps = ['capabilitylist'] return( options[inp].uri, caps, inp )
def parse_document(self): """Parse any ResourceSync document and show information Will use sitemap URI taken either from explicit self.sitemap_name or derived from the mappings supplied. """ s = Sitemap() self.logger.info("Reading sitemap(s) from %s ..." % (self.sitemap)) try: list = s.parse_xml(urllib.urlopen(self.sitemap)) except IOError as e: raise ClientFatalError("Cannot read document (%s)" % str(e)) num_entries = len(list.resources) capability = '(unknown capability)' if ('capability' in list.md): capability = list.md['capability'] print "Parsed %s document with %d entries" % (capability, num_entries) if (self.verbose): to_show = 100 override_str = ' (override with --max-sitemap-entries)' if (self.max_sitemap_entries): to_show = self.max_sitemap_entries override_str = '' if (num_entries > to_show): print "Showing first %d entries sorted by URI%s..." % ( to_show, override_str) n = 0 for resource in list: print '[%d] %s' % (n, str(resource)) n += 1 if (n >= to_show): break
def test_19_parse_with_bad_rs_ln(self):
    xmlstart = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<rs:md capability="resourcelist"/>\
<url><loc>http://example.com/file_a</loc>'
    xmlend = '</url></urlset>'
    s = Sitemap()
    # Every malformed <rs:ln> variant must raise SitemapParseError.
    bad_ln_elements = [
        '<rs:ln rel="duplicate"/>',                                        # missing href
        '<rs:ln href="http://example.com/"/>',                             # missing rel
        '<rs:ln rel="duplicate" href="http://example.com/" length="a"/>',  # bad length
        '<rs:ln rel="duplicate" href="http://example.com/" pri="fff"/>',   # bad pri
        '<rs:ln rel="duplicate" href="http://example.com/" pri="0"/>',
        '<rs:ln rel="duplicate" href="http://example.com/" pri="1000000"/>',
    ]
    for ln in bad_ln_elements:
        self.assertRaises(SitemapParseError, s.parse_xml,
                          fh=io.StringIO(xmlstart + ln + xmlend))
    # and finally OK with errors fixes
    good = '<rs:ln rel="duplicate" href="http://example.com/" length="12345" pri="1" other="whatever"/>'
    rc = s.parse_xml(fh=io.StringIO(xmlstart + good + xmlend))
    self.assertEqual(len(rc.resources), 1, 'good at last, extra attribute ignored')
def sync_incremental(map, counter, base_url, from_date, to_date):
    """Run resync incremental.

    First tries a direct incremental sync; on MapperError retries with the
    parent path; on any other failure assumes base_url points at a list of
    changelists/changedumps and syncs each entry.
    """
    from .resync import ResourceSyncClient
    client = ResourceSyncClient()
    client.ignore_failures = True
    try:
        single_sync_incremental(map, counter, base_url, from_date, to_date)
        return True
    except MapperError as e:
        current_app.logger.info(e)
        paths = map[0].rsplit('/', 1)
        map[0] = paths[0]
    except Exception as e:
        # maybe url contain a list of changelist, instead of changelist
        current_app.logger.info(e)
        s = Sitemap()
        docs = s.parse_xml(url_or_file_open(base_url))
        if docs:
            for doc in docs:
                # make sure sub url is a changelist/ changedump
                capability = read_capability(doc.uri)
                # The original executed `raise ('...')`, i.e. raised a plain
                # string -- itself a TypeError on Python 3. Raise a proper
                # exception instead.
                if capability is None or capability not in ('changelist',
                                                            'changedump'):
                    raise ValueError('Bad URL, not a changelist/changedump,'
                                     ' cannot sync incremental')
                single_sync_incremental(map, counter, doc.uri,
                                        from_date, to_date)
            return True
        raise e
def parse_document(self): """Parse any ResourceSync document and show information Will use sitemap URI taken either from explicit self.sitemap_name or derived from the mappings supplied. """ s=Sitemap() self.logger.info("Reading sitemap(s) from %s ..." % (self.sitemap)) try: list = s.parse_xml(urllib.urlopen(self.sitemap)) except IOError as e: raise ClientFatalError("Cannot read document (%s)" % str(e)) num_entries = len(list.resources) capability = '(unknown capability)' if ('capability' in list.md): capability = list.md['capability'] print "Parsed %s document with %d entries" % (capability,num_entries) if (self.verbose): to_show = 100 override_str = ' (override with --max-sitemap-entries)' if (self.max_sitemap_entries): to_show = self.max_sitemap_entries override_str = '' if (num_entries>to_show): print "Showing first %d entries sorted by URI%s..." % (to_show,override_str) n=0 for resource in list: print '[%d] %s' % (n,str(resource)) n+=1 if ( n >= to_show ): break
def parse_xml(self, fh=None, etree=None, resources=None, capability=None, sitemapindex=None):
    """Parse XML Sitemap and add to resources object.

    Reads from fh or etree and adds resources to a resorces object
    (which must support the add method). Returns the resources object.

    Also sets self.resources_created to be the number of resources created.
    We adopt a very lax approach here. The parsing is properly namespace
    aware but we search just for the elements wanted and leave everything
    else alone.

    This method will read either sitemap or sitemapindex documents. Behavior
    depends on the sitemapindex parameter:
    - None - will read either
    - False - SitemapIndexError exception if sitemapindex detected
    - True - SitemapIndexError exception if sitemap detected

    Will set self.parsed_index based on whether a sitemap or sitemapindex
    document was read:
    - False - sitemap
    - True - sitemapindex
    """
    # Delegate to a fresh Sitemap instance and cache its result on
    # self.res_container so the last parsed container stays accessible.
    sitemap = Sitemap()
    self.res_container = sitemap.parse_xml(fh=fh, etree=etree, resources=resources, capability=capability, sitemapindex=sitemapindex)
    return self.res_container
def test_18_parse_with_rs_ln_on_resource(self):
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<rs:md capability="resourcelist"/>\
<url>\
<loc>http://example.com/file_a</loc><lastmod>2012-03-14T18:37:36Z</lastmod>\
<rs:md hash="md5:r2d2" length="12345" />\
<rs:ln rel="duplicate" href="http://mirror1.example.com/res1" modified="2013-01-02" pri="1" />\
<rs:ln rel="num2" href="http://m2.example.com/res1"/>\
<rs:ln rel="num3" href="http://m3.example.com/res1"/>\
</url>\
<url>\
<loc>http://example.com/file_b</loc><lastmod>2012-03-14T18:37:36Z</lastmod>\
<rs:md length="32" />\
</url>\
</urlset>'
    parser = Sitemap()
    parsed = parser.parse_xml(fh=io.StringIO(xml))
    self.assertFalse(parser.parsed_index, 'was a sitemap')
    self.assertEqual(parser.resources_created, 2, 'got 2 resources')
    first, second = list(parsed)[:2]
    self.assertEqual(first.uri, 'http://example.com/file_a')
    link = first.ln[0]
    self.assertEqual(link['rel'], 'duplicate')
    self.assertEqual(link['href'], 'http://mirror1.example.com/res1')
    self.assertEqual(link['modified'], '2013-01-02')
    self.assertEqual(link['pri'], 1)
    self.assertEqual(second.uri, 'http://example.com/file_b')
def test_19_parse_with_bad_rs_ln(self):
    xmlstart = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<rs:md capability="resourcelist"/>\
<url><loc>http://example.com/file_a</loc>'
    xmlend = '</url></urlset>'
    s = Sitemap()
    # Every malformed <rs:ln> variant must raise SitemapParseError.
    bad_ln_elements = [
        '<rs:ln rel="duplicate"/>',                                        # missing href
        '<rs:ln href="http://example.com/"/>',                             # missing rel
        '<rs:ln rel="duplicate" href="http://example.com/" length="a"/>',  # bad length
        '<rs:ln rel="duplicate" href="http://example.com/" pri="fff"/>',   # bad pri
        '<rs:ln rel="duplicate" href="http://example.com/" pri="0"/>',
        '<rs:ln rel="duplicate" href="http://example.com/" pri="1000000"/>',
    ]
    for ln in bad_ln_elements:
        self.assertRaises(SitemapParseError, s.parse_xml,
                          fh=io.StringIO(xmlstart + ln + xmlend))
    # and finally OK with errors fixes
    good = '<rs:ln rel="duplicate" href="http://example.com/" length="12345" pri="1" other="whatever"/>'
    rc = s.parse_xml(fh=io.StringIO(xmlstart + good + xmlend))
    self.assertEqual(len(rc.resources), 1,
                     'good at last, extra attribute ignored')
def test_18_parse_with_rs_ln_on_resource(self):
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<rs:md capability="resourcelist"/>\
<url>\
<loc>http://example.com/file_a</loc><lastmod>2012-03-14T18:37:36Z</lastmod>\
<rs:md hash="md5:r2d2" length="12345" />\
<rs:ln rel="duplicate" href="http://mirror1.example.com/res1" modified="2013-01-02" pri="1" />\
<rs:ln rel="num2" href="http://m2.example.com/res1"/>\
<rs:ln rel="num3" href="http://m3.example.com/res1"/>\
</url>\
<url>\
<loc>http://example.com/file_b</loc><lastmod>2012-03-14T18:37:36Z</lastmod>\
<rs:md length="32" />\
</url>\
</urlset>'
    parser = Sitemap()
    parsed = parser.parse_xml(fh=io.StringIO(xml))
    self.assertFalse(parser.parsed_index, 'was a sitemap')
    self.assertEqual(parser.resources_created, 2, 'got 2 resources')
    first, second = list(parsed)[:2]
    self.assertEqual(first.uri, 'http://example.com/file_a')
    link = first.ln[0]
    self.assertEqual(link['rel'], 'duplicate')
    self.assertEqual(link['href'], 'http://mirror1.example.com/res1')
    self.assertEqual(link['modified'], '2013-01-02')
    self.assertEqual(link['pri'], 1)
    self.assertEqual(second.uri, 'http://example.com/file_b')
def test_20_parse_sitemapindex_empty(self):
    # An index with no <sitemap> children parses to zero resources but is
    # still flagged as an index.
    parser = Sitemap()
    empty_index = ('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
                   ' </sitemapindex>')
    result = parser.parse_xml(fh=io.StringIO(empty_index), sitemapindex=True)
    self.assertTrue(parser.parsed_index, 'was a sitemapindex')
    self.assertEqual(len(result.resources), 0, '0 sitemaps')
def test_22_parse_sitemapindex_file(self):
    # Read a sitemapindex from disk and verify the three member sitemaps.
    s = Sitemap()
    # Use a context manager: the original opened the file and never closed it.
    with open('tests/testdata/sitemapindex1/sitemap.xml', 'r') as fh:
        si = s.parse_xml(fh=fh, sitemapindex=True)
    self.assertTrue(s.parsed_index, 'was a sitemapindex')
    self.assertEqual(len(si.resources), 3, '3 sitemaps')
    sms = sorted(si.uris())
    self.assertEqual(sms, ['http://localhost:8888/sitemap00000.xml',
                           'http://localhost:8888/sitemap00001.xml',
                           'http://localhost:8888/sitemap00002.xml'])
def test_21_parse_sitemapindex(self):
    parser = Sitemap()
    first_xml = ('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
                 '<sitemap><loc>aaa</loc></sitemap>'
                 '<sitemap><loc>bbb</loc></sitemap></sitemapindex>')
    index = parser.parse_xml(fh=io.StringIO(first_xml), sitemapindex=True)
    self.assertEqual(len(index.resources), 2, '2 sitemaps')
    self.assertEqual(sorted(index.uris()), ['aaa', 'bbb'])
    # add a couple more
    second_xml = ('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
                  '<sitemap><loc>cc</loc></sitemap>'
                  '<sitemap><loc>dd</loc></sitemap></sitemapindex>')
    parser.parse_xml(fh=io.StringIO(second_xml), resources=index)
    self.assertTrue(parser.parsed_index, 'was a sitemapindex')
    self.assertEqual(len(index.resources), 4, '4 sitemaps total')
    self.assertEqual(sorted(index.uris()), ['aaa', 'bbb', 'cc', 'dd'])
def generator():
    # Walk the most recent sitemaps and yield every resource whose change
    # is not "deleted", together with its extracted local paths.
    for sitemap_path in self.paras.last_sitemaps:
        if not os.path.exists(sitemap_path):
            LOG.warning("Unable to read sitemap: %s" % sitemap_path)
            self.count_errors += 1
            self.observers_inform(
                self, ResourceAuditorEvent.site_map_not_found,
                file=sitemap_path)
            continue
        contents = ListBaseWithIndex()
        with open(sitemap_path, "r", encoding="utf-8") as sm_file:
            Sitemap().parse_xml(sm_file, resources=contents)
        for resource in contents.resources:
            # A change of None counts as "still present".
            if resource.change != "deleted":
                path, relpath = self.extract_paths(resource.uri)
                yield resource, path, relpath
def read_source(self):
    """
    Read the source_uri and parse it to source_document.

    Side effects on self: sets source_status, is_index, source_document,
    describedby_url, up_url, index_url and status.

    :return: True if the document was downloaded and parsed without exceptions, False otherwise.
    """
    session = requests.Session()
    try:
        response = session.get(self.source_uri)
        self.source_status = response.status_code
        self.logger.debug("Read %s, status %s" % (self.source_uri, str(self.source_status)))
        # NOTE(review): assert is stripped under `python -O`; these checks
        # rely on the AssertionError handler below firing.
        assert self.source_status == 200, "Invalid response status: %d" % self.source_status

        text = response.text
        root = ET.fromstring(text)
        # An index document has the sitemapindex root element.
        self.is_index = root.tag == SITEMAP_INDEX_ROOT
        etree = ET.ElementTree(root)
        sitemap = Sitemap()
        self.source_document = sitemap.parse_xml(etree=etree)
        # the source_document is a resync.resource_container.ResourceContainer
        capability = self.source_document.capability
        assert capability == self.capability, "Capability is not %s but %s" % (self.capability, capability)
        # anyone interested in sitemaps?
        for processor_listener in processor_listeners:
            processor_listener.event_sitemap_received(self.source_uri, capability, text)
        self.describedby_url = self.source_document.describedby
        self.up_url = self.source_document.up  # to a parent non-index document
        self.index_url = self.source_document.index  # to a parent index document
        self.status = Status.document
    # Each failure mode logs, marks read_error and reports -- the method
    # never raises; the boolean return reflects success.
    except requests.exceptions.ConnectionError as err:
        self.logger.debug("%s No connection: %s" % (self.source_uri, str(err)))
        self.status = Status.read_error
        self.__report__(err)
    except xml.etree.ElementTree.ParseError as err:
        self.logger.debug("%s ParseError: %s" % (self.source_uri, str(err)))
        self.status = Status.read_error
        self.__report__(err)
    except resync.sitemap.SitemapParseError as err:
        self.logger.debug("%s Unreadable source: %s" % (self.source_uri, str(err)))
        self.status = Status.read_error
        self.__report__(err)
    except AssertionError as err:
        self.logger.debug("%s Error: %s" % (self.source_uri, str(err)))
        self.status = Status.read_error
        self.__report__(err)
    finally:
        # Always release the HTTP session, success or failure.
        session.close()
    return self.status == Status.document
def test_11_parse_2(self):
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="12" /></url>\
<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="32" /></url>\
</urlset>'
    parser = Sitemap()
    parser.parse_xml(fh=io.StringIO(xml))
    self.assertFalse(parser.parsed_index, 'was a sitemap')
    self.assertEqual(parser.resources_created, 2, 'got 2 resources')
def read_capability(url):
    """Read the ResourceSync capability of the document at an url.

    :param url: URL or file path of a sitemap-based document.
    :return: the value of the 'capability' metadata attribute,
        or None if the document carries no capability.
    :raises IOError: if the url cannot be opened (propagated unchanged;
        the original wrapped this in a pointless ``except IOError: raise e``).
    """
    document = Sitemap().parse_xml(url_or_file_open(url))
    # 'capability' lives in the document's rs:md metadata dict;
    # dict.get returns None when the key is absent, matching the
    # original's explicit membership test.
    return document.md.get('capability')
def test_11_parse_2(self):
    """Parse a urlset with two resources; expect 2 resources and no index."""
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length=\"12\" /></url>\
<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length=\"32\" /></url>\
</urlset>'
    s = Sitemap()
    i = s.parse_xml(fh=io.StringIO(xml))
    # a <urlset> root must not be flagged as a sitemapindex
    self.assertFalse(s.parsed_index, 'was a sitemap')
    self.assertEqual(s.resources_created, 2, 'got 2 resources')
def test_22_parse_sitemapindex_file(self):
    """Parse a sitemapindex from a file and verify the three listed sitemaps."""
    s = Sitemap()
    # use a context manager so the test data file is closed even on
    # assertion failure (original leaked the handle)
    with open('tests/testdata/sitemapindex1/sitemap.xml', 'r') as fh:
        si = s.parse_xml(fh=fh, sitemapindex=True)
    self.assertTrue(s.parsed_index, 'was a sitemapindex')
    self.assertEqual(len(si.resources), 3, '3 sitemaps')
    sms = sorted(si.uris())
    self.assertEqual(sms, [
        'http://localhost:8888/sitemap00000.xml',
        'http://localhost:8888/sitemap00001.xml',
        'http://localhost:8888/sitemap00002.xml'
    ])
def test_13_parse_multi_lastmod(self):
    """Two <lastmod> elements raise; empty <lastmod> parses to lastmod None."""
    xml_start='<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<url><loc>uri:a</loc>'
    xml_end='</url></urlset>'
    s=Sitemap()
    # two <lastmod> elements for one <url> are ambiguous -> parse error
    two_lastmod='<lastmod>2013-01-01</lastmod><lastmod>2013-01-02</lastmod>'
    self.assertRaises( SitemapParseError, s.parse_xml, io.StringIO(xml_start+two_lastmod+xml_end))
    # While it not ideal to omit, <lastmod> is not required and
    # thus either empty lastmod or lastmod with just an attribute
    # and no content are not ambiguous and thus should be accepted
    # with resulting None for resource.lastmod
    mt_lastmod='<lastmod></lastmod>'
    i=s.parse_xml(fh=io.StringIO(xml_start+mt_lastmod+xml_end))
    self.assertEqual( s.resources_created, 1 )
    self.assertEqual( i.resources[0].lastmod, None )
    mt_lastmod_att='<lastmod att="value"/>'
    i=s.parse_xml(fh=io.StringIO(xml_start+mt_lastmod_att+xml_end))
    self.assertEqual( s.resources_created, 1 )
    self.assertEqual( i.resources[0].lastmod, None )
def test33_write(self):
    """Round-trip a ResourceList and a ResourceListIndex through disk."""
    # Plain ResourceList: the written file must NOT parse as an index.
    rl = ResourceList()
    for letter in ('a', 'b', 'c'):
        rl.add(Resource(uri='http://example.com/test/' + letter, timestamp=1))
    rl_filename = os.path.join(self.tmpdir, 'test33_write_resourcelist.xml')
    rl.write(basename=rl_filename)
    with open(rl_filename, 'r') as f:
        s = Sitemap()
        s.parse_xml(fh=f)
        self.assertFalse(s.parsed_index)
    # ResourceListIndex: same round trip, but parsed_index must be True.
    rli = ResourceList()
    for num in ('00000', '00001', '00002'):
        rli.add(Resource(uri='http://example.com/test/resourcelist%s.xml' % num,
                         timestamp=1))
    rli.sitemapindex = True
    rli_filename = os.path.join(self.tmpdir,
                                'test33_write_resourcelist-index.xml')
    rli.write(basename=rli_filename)
    with open(rli_filename, 'r') as f:
        s = Sitemap()
        s.parse_xml(fh=f)
        self.assertTrue(s.parsed_index)
def test_13_parse_multi_lastmod(self):
    """Two <lastmod> elements raise; empty <lastmod> parses to lastmod None."""
    xml_start = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<url><loc>uri:a</loc>'
    xml_end = '</url></urlset>'
    s = Sitemap()
    # two <lastmod> elements for one <url> are ambiguous -> parse error
    two_lastmod = '<lastmod>2013-01-01</lastmod><lastmod>2013-01-02</lastmod>'
    self.assertRaises(SitemapParseError, s.parse_xml,
                      io.StringIO(xml_start + two_lastmod + xml_end))
    # While it not ideal to omit, <lastmod> is not required and
    # thus either empty lastmod or lastmod with just an attribute
    # and no content are not ambiguous and thus should be accepted
    # with resulting None for resource.lastmod
    mt_lastmod = '<lastmod></lastmod>'
    i = s.parse_xml(fh=io.StringIO(xml_start + mt_lastmod + xml_end))
    self.assertEqual(s.resources_created, 1)
    self.assertEqual(i.resources[0].lastmod, None)
    mt_lastmod_att = '<lastmod att="value"/>'
    i = s.parse_xml(fh=io.StringIO(xml_start + mt_lastmod_att + xml_end))
    self.assertEqual(s.resources_created, 1)
    self.assertEqual(i.resources[0].lastmod, None)
def get_from_date_from_url(url):
    """Get the smallest resource timestamp from url, formatted as a date string.

    :param url: URL or file path of a sitemap-based document.
    :return: 'YYYY-MM-DD' string for the earliest timestamp found,
        or None when no resource carries a timestamp.
    :raises IOError: if the url cannot be opened (propagated unchanged;
        the original wrapped this in a pointless ``except IOError: raise e``).
    """
    document = Sitemap().parse_xml(url_or_file_open(url))
    # collect truthy timestamps only, as the original loop did
    timestamps = [item.timestamp for item in document.resources if item.timestamp]
    if timestamps:
        return dt.fromtimestamp(min(timestamps)).strftime("%Y-%m-%d")
    # explicit None where the original fell off the end of the function
    return None
def update_previous_state(self):
    """Rebuild the uri -> resource map of previously published resources.

    Reads all resourcelist files first, then replays the changedump files
    on top of them: 'created'/'updated' entries add or replace a resource,
    'deleted' entries remove it. Also records when the resourcelists were
    completed (md_completed, falling back to md_at).

    Runs only once: a non-None self.previous_resources is left untouched.
    """
    if self.previous_resources is None:
        self.previous_resources = {}

        # search for resourcelists
        # BUG FIX: the original globbed "changedump_*.xml" here (a
        # copy-paste from the changedump section below), so the actual
        # resourcelist files were never read.
        self.resourcelist_files = sorted(
            glob(self.param.abs_metadata_path("resourcelist_*.xml")))
        for rl_file_name in self.resourcelist_files:
            resourcelist = ResourceList()
            with open(rl_file_name, "r", encoding="utf-8") as rl_file:
                sm = Sitemap()
                sm.parse_xml(rl_file, resources=resourcelist)

            # prefer md_completed; fall back to md_at when absent
            self.date_resourcelist_completed = resourcelist.md_completed
            if self.date_resourcelist_completed is None:
                self.date_resourcelist_completed = resourcelist.md_at

            self.previous_resources.update({
                resource.uri: resource
                for resource in resourcelist.resources
            })

        # search for changedumps
        self.changedump_files = sorted(
            glob(self.param.abs_metadata_path("changedump_*.xml")))
        for cl_file_name in self.changedump_files:
            changedump = ChangeDump()
            with open(cl_file_name, "r", encoding="utf-8") as cl_file:
                sm = Sitemap()
                sm.parse_xml(cl_file, resources=changedump)

            for resource in changedump.resources:
                if resource.change in ("created", "updated"):
                    self.previous_resources[resource.uri] = resource
                elif resource.change == "deleted" and resource.uri in self.previous_resources:
                    del self.previous_resources[resource.uri]
def explore_uri(self, uri, checks, caps, show_back=True):
    """Interactive exploration of document at uri

    Will flag warnings if the document is not of type listed in caps

    NOTE: legacy Python 2 code (print statement, raw_input, urllib.urlopen).

    :param uri: URI of the document to explore
    :param checks: dict of expected HTTP headers (content-length,
        last-modified) applied when uri is a plain resource
    :param caps: the string 'resource' for a plain resource, otherwise
        a list of acceptable capability names
    :param show_back: include the b(ack) option in the prompt when True
    :return: tuple (next_uri, checks, caps, input_char); the first three
        are empty strings when the user backs out or quits
    """
    s = Sitemap()
    print "Reading %s" % (uri)
    options = {}
    capability = None
    try:
        if (caps == 'resource'):
            # plain resource: just show/check the HTTP headers
            self.explore_show_head(uri, check_headers=checks)
        else:
            list = s.parse_xml(urllib.urlopen(uri))
            (options, capability) = self.explore_show_summary(
                list, s.parsed_index, caps)
    except IOError as e:
        print "Cannot read %s (%s)\nGoing back" % (uri, str(e))
        return ('', '', '', 'b')
    except Exception as e:
        print "Cannot parse %s (%s)\nGoing back" % (uri, str(e))
        return ('', '', '', 'b')
    # loop until the user picks a listed option, goes back, or quits
    while (True):
        # don't offer number option for no resources/capabilities
        num_prompt = '' if (len(options) == 0) else 'number, '
        up_prompt = 'b(ack), ' if (show_back) else ''
        inp = raw_input("Follow [%s%sq(uit)]?" % (num_prompt, up_prompt))
        if (inp in options.keys()):
            break
        if (inp == 'q' or inp == 'b'):
            return ('', '', '', inp)
    checks = {}
    if (options[inp].capability is None):
        if (capability == 'capabilitylistindex'):
            # all links should be to capabilitylist documents
            caps = ['capabilitylist']
        elif (capability in [
                'resourcelist', 'changelist', 'resourcedump', 'changedump'
        ]):
            # entries of list/dump documents are plain resources
            caps = 'resource'
    else:
        r = options[inp]
        caps = [r.capability]
        # carry known metadata forward so the next step can verify headers
        if (r.length is not None):
            checks['content-length'] = r.length
        if (r.lastmod is not None):
            checks['last-modified'] = r.lastmod
    # FIXME - could do sanity check here and issue warnings if odd
    return (options[inp].uri, checks, caps, inp)
def test_10_sitemap(self):
    """Parse a one-resource urlset; verify uri, lastmod, length and md5."""
    xml='<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<url><loc>http://e.com/a</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md hash="md5:Q2hlY2sgSW50ZWdyaXR5IQ==" length=\"12\" /></url>\
</urlset>'
    s=Sitemap()
    i=s.parse_xml(fh=io.StringIO(xml))
    # a <urlset> root must not be flagged as a sitemapindex
    self.assertFalse( s.parsed_index, 'was a sitemap')
    self.assertEqual( s.resources_created, 1, 'got 1 resources')
    for r in i.resources:
        self.assertTrue( r is not None, 'got the uri expected')
        self.assertEqual( r.uri, 'http://e.com/a' )
        self.assertEqual( r.lastmod, '2012-03-14T18:37:36Z' )
        self.assertEqual( r.length, 12 )
        # md5 is taken from the rs:md hash attribute, prefix stripped
        self.assertEqual( r.md5, 'Q2hlY2sgSW50ZWdyaXR5IQ==' )
def test_all_simple(self):
    """Just try to read each example file from the specification."""
    for ex in ("ex_2_1.xml", "ex_2_2.xml", "ex_2_3.xml", "ex_2_4.xml",
               "ex_2_5.xml", "ex_2_6.xml", "ex_2_7.xml",
               "ex_4_1.xml", "ex_4_2.xml", "ex_4_3.xml",
               "ex_5_1.xml", "ex_5_2.xml", "ex_5_3.xml",
               "ex_6_1.xml",
               "ex_7_1.xml", "ex_7_2.xml", "ex_7_3.xml",
               "ex_8_1.xml", "ex_8_2.xml", "ex_8_3.xml", "ex_8_4.xml",
               "ex_8_5.xml", "ex_8_6.xml", "ex_8_7.xml", "ex_8_8.xml",
               "ex_8_9.xml",
               "ex_9_1.xml", "ex_9_2.xml", "ex_9_3.xml",
               "ex_10_1.xml", "ex_10_2.xml"):
        s = Sitemap()
        # close each example file promptly; the original leaked one
        # open handle per iteration
        with open('resync/test/testdata/examples_from_spec/%s' % (ex), 'r') as fh:
            si = s.parse_xml(fh=fh)
def test_10_sitemap(self):
    """Parse a one-resource urlset; verify uri, lastmod, length and md5."""
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<url><loc>http://e.com/a</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md hash="md5:Q2hlY2sgSW50ZWdyaXR5IQ==" length=\"12\" /></url>\
</urlset>'
    s = Sitemap()
    i = s.parse_xml(fh=io.StringIO(xml))
    # a <urlset> root must not be flagged as a sitemapindex
    self.assertFalse(s.parsed_index, 'was a sitemap')
    self.assertEqual(s.resources_created, 1, 'got 1 resources')
    for r in i.resources:
        self.assertTrue(r is not None, 'got the uri expected')
        self.assertEqual(r.uri, 'http://e.com/a')
        self.assertEqual(r.lastmod, '2012-03-14T18:37:36Z')
        self.assertEqual(r.length, 12)
        # md5 is taken from the rs:md hash attribute, prefix stripped
        self.assertEqual(r.md5, 'Q2hlY2sgSW50ZWdyaXR5IQ==')
def explore_uri(self, uri, checks, caps, show_back=True):
    """Interactive exploration of document at uri

    Will flag warnings if the document is not of type listed in caps

    NOTE: legacy Python 2 code (print statement, raw_input, urllib.urlopen).

    :param uri: URI of the document to explore
    :param checks: dict of expected HTTP headers (content-length,
        last-modified) applied when uri is a plain resource
    :param caps: the string 'resource' for a plain resource, otherwise
        a list of acceptable capability names
    :param show_back: include the b(ack) option in the prompt when True
    :return: tuple (next_uri, checks, caps, input_char); the first three
        are empty strings when the user backs out or quits
    """
    s=Sitemap()
    print "Reading %s" % (uri)
    options={}
    capability=None
    try:
        if (caps=='resource'):
            # plain resource: just show/check the HTTP headers
            self.explore_show_head(uri,check_headers=checks)
        else:
            list = s.parse_xml(urllib.urlopen(uri))
            (options,capability)=self.explore_show_summary(list,s.parsed_index,caps)
    except IOError as e:
        print "Cannot read %s (%s)\nGoing back" % (uri,str(e))
        return('','','','b')
    except Exception as e:
        print "Cannot parse %s (%s)\nGoing back" % (uri,str(e))
        return('','','','b')
    # loop until the user picks a listed option, goes back, or quits
    while (True):
        # don't offer number option for no resources/capabilities
        num_prompt = '' if (len(options)==0) else 'number, '
        up_prompt = 'b(ack), ' if (show_back) else ''
        inp = raw_input( "Follow [%s%sq(uit)]?" % (num_prompt,up_prompt) )
        if (inp in options.keys()):
            break
        if (inp == 'q' or inp == 'b'):
            return('','','',inp)
    checks = {}
    if ( options[inp].capability is None ):
        if (capability == 'capabilitylistindex'):
            # all links should be to capabilitylist documents
            caps = ['capabilitylist']
        elif (capability in ['resourcelist','changelist',
                             'resourcedump','changedump']):
            # entries of list/dump documents are plain resources
            caps = 'resource'
    else:
        r = options[inp]
        caps = [r.capability]
        # carry known metadata forward so the next step can verify headers
        if (r.length is not None):
            checks['content-length']=r.length
        if (r.lastmod is not None):
            checks['last-modified']=r.lastmod
    # FIXME - could do sanity check here and issue warnings if odd
    return( options[inp].uri, checks, caps, inp )
def test_30_parse_change_list(self):
    """Change attribute is read per resource; missing change yields None."""
    xml='<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md change="updated" length="12" /></url>\
<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="32" /></url>\
</urlset>'
    s=Sitemap()
    s.resource_class=Resource
    c=s.parse_xml(fh=io.StringIO(xml))
    self.assertEqual( s.resources_created, 2, 'got 2 resources')
    i = iter(c)
    r1 = next(i)
    self.assertEqual( r1.uri, '/tmp/rs_test/src/file_a' )
    # first entry carries change="updated" in its rs:md
    self.assertEqual( r1.change, 'updated' )
    r2 = next(i)
    self.assertEqual( r2.uri, '/tmp/rs_test/src/file_b' )
    # second entry has no change attribute -> None
    self.assertEqual( r2.change, None )
def test_30_parse_change_list(self):
    """Change attribute is read per resource; missing change yields None."""
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md change="updated" length="12" /></url>\
<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="32" /></url>\
</urlset>'
    s = Sitemap()
    s.resource_class = Resource
    c = s.parse_xml(fh=io.StringIO(xml))
    self.assertEqual(s.resources_created, 2, 'got 2 resources')
    i = iter(c)
    r1 = next(i)
    self.assertEqual(r1.uri, '/tmp/rs_test/src/file_a')
    # first entry carries change="updated" in its rs:md
    self.assertEqual(r1.change, 'updated')
    r2 = next(i)
    self.assertEqual(r2.uri, '/tmp/rs_test/src/file_b')
    # second entry has no change attribute -> None
    self.assertEqual(r2.change, None)
def test_all_simple_read(self):
    """Just try to read each one"""
    for ex in ('archives_ex_2_1', 'archives_ex_2_2',
               'archives_ex_3_1', 'archives_ex_3_2',
               'archives_ex_4_1',
               'archives_ex_5_1',
               'archives_ex_6_1',
               'resourcesync_ex_1', 'resourcesync_ex_2', 'resourcesync_ex_3',
               'resourcesync_ex_4', 'resourcesync_ex_5', 'resourcesync_ex_6',
               'resourcesync_ex_7', 'resourcesync_ex_8', 'resourcesync_ex_12',
               'resourcesync_ex_13', 'resourcesync_ex_14', 'resourcesync_ex_15',
               'resourcesync_ex_16', 'resourcesync_ex_17', 'resourcesync_ex_18',
               'resourcesync_ex_19', 'resourcesync_ex_20', 'resourcesync_ex_21',
               'resourcesync_ex_22', 'resourcesync_ex_23', 'resourcesync_ex_24',
               'resourcesync_ex_25', 'resourcesync_ex_26', 'resourcesync_ex_27',
               'resourcesync_ex_28', 'resourcesync_ex_29', 'resourcesync_ex_30',
               'resourcesync_ex_31', 'resourcesync_ex_32', 'resourcesync_ex_33'):
        s = Sitemap()
        # _open_ex returns an open file handle; close it after parsing
        # instead of leaking one handle per example (original never closed)
        fh = self._open_ex(ex)
        try:
            si = s.parse_xml(fh=fh)
        finally:
            fh.close()
def test_21_parse_multi_sitemapindex(self):
    """Parse a sitemapindex file and count the listed sitemaps."""
    s = Sitemap()
    # use a context manager so the test data file is closed even on
    # assertion failure (original leaked the handle)
    with open('tests/testdata/sitemapindex2/sitemap.xml', 'r') as fh:
        si = s.parse_xml(fh=fh, sitemapindex=True)
    self.assertTrue(s.parsed_index, 'was a sitemapindex')
    self.assertEqual(len(si.resources), 3, '3 sitemaps listed')
def read_source(self):
    """
    Read the source_uri and parse it to source_document.

    Downloads the document at self.source_uri, parses it as a sitemap or
    sitemapindex, and records the capability and the describedby/up/index
    links on self. Any failure (connection error, XML or sitemap parse
    error, unexpected HTTP status or capability) sets Status.read_error
    and is routed to self.__report__.

    :return: True if the document was downloaded and parsed without
        exceptions, False otherwise.
    """
    session = requests.Session()
    try:
        response = session.get(self.source_uri)
        self.source_status = response.status_code
        self.logger.debug("Read %s, status %s"
                          % (self.source_uri, str(self.source_status)))
        # NOTE(review): assert is stripped under `python -O`; the
        # AssertionError handler below relies on these firing in normal runs.
        assert self.source_status == 200, \
            "Invalid response status: %d" % self.source_status

        text = response.text

        root = ET.fromstring(text)
        # root tag tells index vs plain document before full sitemap parsing
        self.is_index = root.tag == SITEMAP_INDEX_ROOT

        etree = ET.ElementTree(root)
        sitemap = Sitemap()
        self.source_document = sitemap.parse_xml(etree=etree)
        # the source_document is a resync.resource_container.ResourceContainer
        capability = self.source_document.capability
        assert capability == self.capability, \
            "Capability is not %s but %s" % (self.capability, capability)
        # anyone interested in sitemaps?
        for processor_listener in processor_listeners:
            processor_listener.event_sitemap_received(
                self.source_uri, capability, text)

        self.describedby_url = self.source_document.describedby
        self.up_url = self.source_document.up  # to a parent non-index document
        self.index_url = self.source_document.index  # to a parent index document
        self.status = Status.document
    except requests.exceptions.ConnectionError as err:
        self.logger.debug("%s No connection: %s"
                          % (self.source_uri, str(err)))
        self.status = Status.read_error
        self.__report__(err)
    except xml.etree.ElementTree.ParseError as err:
        self.logger.debug("%s ParseError: %s" % (self.source_uri, str(err)))
        self.status = Status.read_error
        self.__report__(err)
    except resync.sitemap.SitemapParseError as err:
        self.logger.debug("%s Unreadable source: %s"
                          % (self.source_uri, str(err)))
        self.status = Status.read_error
        self.__report__(err)
    except AssertionError as err:
        self.logger.debug("%s Error: %s" % (self.source_uri, str(err)))
        self.status = Status.read_error
        self.__report__(err)
    finally:
        # always release the HTTP session, success or failure
        session.close()

    return self.status == Status.document
def read_sitemap(self, path, sitemap_instance):
    """Parse the sitemap file at *path* into *sitemap_instance*.

    :param path: path of the sitemap xml file to read
    :param sitemap_instance: resource container to fill with parsed resources
    :return: the filled sitemap_instance
    """
    parser = Sitemap()
    with open(path, "r", encoding="utf-8") as source:
        parser.parse_xml(source, resources=sitemap_instance)
    return sitemap_instance
def base_line(self, unzipdir):
    """
    Synchronize the unzipped contents of a resource dump with the local resources

    NOTE(review): this method deliberately raises NotImplementedError after
    computing the compare; everything after that raise is unreachable sketch
    code kept for future implementation.

    :param unzipdir: the directory of the unzipped packed contents.
    :return:
    """
    manifest_file_name = os.path.join(unzipdir, "manifest.xml")
    try:
        sitemap = Sitemap()
        manifest_doc = sitemap.parse_xml(fh=manifest_file_name)
        # the manifest_doc is a resync.resource_container.ResourceContainer
        capability = manifest_doc.capability
        assert capability == CAPA_RESOURCEDUMP_MANIFEST, "Capability is not %s but %s" % (
            CAPA_RESOURCEDUMP_MANIFEST, capability)
        self.status = Status.parsed
        self.__inform_sitemap_received__(capability, manifest_file_name)

        config = Config()
        netloc = config.boolean_prop(Config.key_use_netloc, False)
        # map the pack uri to a local destination folder
        base_uri, destination = DestinationMap().find_destination(
            self.pack_uri, netloc=netloc)
        assert destination is not None, "Found no destination folder in DestinationMap"
        mapper = Mapper((base_uri, destination))
        rlb = ResourceListBuilder(mapper=mapper)
        dst_resource_list = rlb.from_disk()
        # Compares on uri
        same, updated, deleted, created = dst_resource_list.compare(
            manifest_doc)
        raise NotImplementedError("This class is not fully implemented.")
        # --- unreachable below this line: kept as implementation sketch ---
        print(len(same), len(updated), len(deleted), len(created))
        print("same")
        for resource in same:
            print(resource)
        print("updated")
        for resource in updated:
            print(resource)
        print("deleted")
        for resource in deleted:
            print(resource)
        print("created")
        for resource in created:
            print(resource)
            base_uri, local_path = DestinationMap().find_local_path(
                resource.uri)
            print(base_uri, local_path)
    except AssertionError as err:
        self.logger.debug("%s Error: %s" % (self.pack_uri, str(err)))
        self.status = Status.parse_error
        self.exceptions.append(err)
    except SitemapParseError as err:
        # NOTE(review): logs self.source_uri here while the handler above
        # logs self.pack_uri — probably should be pack_uri; confirm before
        # changing.
        self.logger.debug("%s Unreadable source: %s" %
                          (self.source_uri, str(err)))
        self.status = Status.parse_error
        self.exceptions.append(err)

    self.status = Status.processed_with_exceptions if self.has_exceptions(
    ) else Status.processed
def test_20_parse_sitemapindex_empty(self):
    """An empty <sitemapindex> parses as an index with zero sitemaps."""
    parser = Sitemap()
    index_doc = parser.parse_xml(
        fh=io.StringIO('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> </sitemapindex>'),
        sitemapindex=True)
    self.assertTrue(parser.parsed_index, 'was a sitemapindex')
    self.assertEqual(len(index_doc.resources), 0, '0 sitemaps')
def test_21_parse_multi_sitemapindex(self):
    """Parse a sitemapindex file and count the listed sitemaps."""
    s = Sitemap()
    # use a context manager so the test data file is closed even on
    # assertion failure (original leaked the handle)
    with open('tests/testdata/sitemapindex2/sitemap.xml', 'r') as fh:
        si = s.parse_xml(fh=fh, sitemapindex=True)
    self.assertTrue(s.parsed_index, 'was a sitemapindex')
    self.assertEqual(len(si.resources), 3, '3 sitemaps listed')
def explore_uri(self, explorer_resource, show_back=True): """INTERACTIVE exploration of capabilities document(s) starting at a given URI Will flag warnings if the document is not of type listed in caps """ uri = explorer_resource.uri caps = explorer_resource.acceptable_capabilities checks = explorer_resource.checks print "Reading %s" % (uri) options={} capability=None try: if (caps=='resource'): # Not expecting a capability document self.explore_show_head(uri,check_headers=checks) else: s=Sitemap() list = s.parse_xml(urllib.urlopen(uri)) (options,capability)=self.explore_show_summary(list,s.parsed_index,caps,context=uri) except IOError as e: print "Cannot read %s (%s)" % (uri,str(e)) except Exception as e: print "Cannot parse %s (%s)" % (uri,str(e)) # # Loop until we have some valide input # while (True): # don't offer number option for no resources/capabilities num_prompt = '' if (len(options)==0) else 'number, ' up_prompt = 'b(ack), ' if (show_back) else '' input = raw_input( "Follow [%s%sq(uit)]?" % (num_prompt,up_prompt) ) if (input in options.keys()): break if (input == 'q'): raise ExplorerQuit() if (input == 'b'): return(None) # # Got input that is one of the options # checks = {} r = options[input] if ( r.capability is None ): if (capability in ['resourcelist','changelist', 'resourcedump','changedump']): caps = 'resource' else: caps = self.allowed_entries(capability) elif (r.capability is 'resource'): caps = r.capability else: caps = [r.capability] # Record anything we know about the resource to check if (r.length is not None): checks['content-length']=r.length if (r.lastmod is not None): checks['last-modified']=r.lastmod if (r.mime_type is not None): checks['content-type']=r.mime_type # FIXME - could add fixity checks here too return( XResource(options[input].uri, caps, checks) )