Example #1
    def connect(self, args):
        """Connect to the server"""
        self.path = os.path.abspath(args['path'])
        self.loadconfig(self.path)

        if args['credentials'] is not None:
            cred = args['credentials']
        elif 'user' in self.config and 'pass' in self.config:
            cred = self.config['user'] + ":" + self.config['pass']
        else:
            cred = None

        if cred is None:
            raise RuntimeError("No credentials specified and none found "
                               "in the configuration")

        try:
            # split on the first colon only, so passwords may contain ":"
            adminuser, adminpass = cred.split(":", 1)
        except ValueError:
            raise RuntimeError("Invalid credentials (must be user:pass): {}" \
                                   .format(cred))

        if args['debug']:
            logging.basicConfig(level=logging.WARNING)
            logging.getLogger("requests").setLevel(logging.INFO)
            logging.getLogger("marklogic").setLevel(logging.DEBUG)

        self.batchsize = args['batchsize']
        self.database = args['database']
        self.dryrun = args['dryrun']
        self.list = args['list']
        self.mirror = args['mirror']
        self.regex = args['regex']
        self.root = args['root']
        self.threshold = args['threshold']
        self.verbose = args['verbose']

        if self.list and self.regex:
            raise RuntimeError("You must not specify both --regex and --list")

        if self.root.endswith("/"):
            self.root = self.root[:-1]

        if args['hostname'] is None:
            if 'host' in self.config:
                self.hostname = self.config['host']
                if 'port' in self.config:
                    self.port = self.config['port']
                else:
                    self.port = 8000
                if 'management-port' in self.config:
                    self.management_port = self.config['management-port']
                else:
                    self.management_port = 8002
        else:
            parts = args['hostname'].split(":")
            self.hostname = parts.pop(0)
            self.management_port = 8002
            self.port = 8000
            if parts:
                self.management_port = int(parts.pop(0))
            if parts:
                self.port = int(parts.pop(0))

        self.connection = Connection(
            self.hostname, HTTPDigestAuth(adminuser, adminpass),
            port=self.port, management_port=self.management_port)

        self.utils = ClientUtils(self.connection)
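
    # Hypothetical usage sketch: the key names below are inferred from the
    # dict lookups in connect() above; the values are illustrative only.
    #
    #   client = MarkLogicDatabaseMirror()
    #   client.connect({'path': '/tmp/mirror', 'credentials': 'admin:admin',
    #                   'debug': False, 'batchsize': 100,
    #                   'database': 'Documents', 'dryrun': True, 'list': None,
    #                   'mirror': False, 'regex': [], 'root': '/',
    #                   'threshold': 10000000, 'hostname': 'localhost',
    #                   'verbose': True})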
Example #2
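
# Imports needed to make this example self-contained. The standard-library
# and requests imports follow directly from the names used below; the
# marklogic module paths and the three constants are assumptions.
import io
import json
import logging
import math
import os
import re
import sys
import uuid
import xml.etree.ElementTree as ET
from datetime import datetime

from requests.auth import HTTPDigestAuth
from requests_toolbelt.multipart.decoder import MultipartDecoder

# Module paths below are assumptions based on the class names:
from marklogic.connection import Connection
from marklogic.documents import Documents, BulkLoader
from marklogic.transactions import Transactions
from marklogic.clientutils import ClientUtils

BATCHSIZE = 100                   # assumed default batch size
BULKTHRESHOLD = 10 * 1000 * 1000  # assumed bulk threshold, in bytes
CONFIGFILE = "mldbmirror.cfg"     # assumed config file name
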
class MarkLogicDatabaseMirror:
    def __init__(self):
        self.batchsize = BATCHSIZE
        self.cdir = None
        self.config = None
        self.connection = None
        self.database = None
        self.dryrun = False
        self.hostname = None
        self.list = None
        self.mdir = None
        self.mirror = False
        self.path = None
        self.port = None
        self.management_port = None
        self.regex = []
        self.root = None
        self.threshold = BULKTHRESHOLD
        self.ucdir = None
        self.umdir = None
        self.utils = None
        self.verbose = False
        self.logger = logging.getLogger("marklogic.examples.mldbmirror")

    def connect(self, args):
        """Connect to the server"""
        self.path = os.path.abspath(args['path'])
        self.loadconfig(self.path)

        if args['credentials'] is not None:
            cred = args['credentials']
        elif 'user' in self.config and 'pass' in self.config:
            cred = self.config['user'] + ":" + self.config['pass']
        else:
            cred = None

        if cred is None:
            raise RuntimeError("No credentials specified and none found "
                               "in the configuration")

        try:
            # split on the first colon only, so passwords may contain ":"
            adminuser, adminpass = cred.split(":", 1)
        except ValueError:
            raise RuntimeError("Invalid credentials (must be user:pass): {}" \
                                   .format(cred))

        if args['debug']:
            logging.basicConfig(level=logging.WARNING)
            logging.getLogger("requests").setLevel(logging.INFO)
            logging.getLogger("marklogic").setLevel(logging.DEBUG)

        self.batchsize = args['batchsize']
        self.database = args['database']
        self.dryrun = args['dryrun']
        self.list = args['list']
        self.mirror = args['mirror']
        self.regex = args['regex']
        self.root = args['root']
        self.threshold = args['threshold']
        self.verbose = args['verbose']

        if self.list and self.regex:
            raise RuntimeError("You must not specify both --regex and --list")

        if self.root.endswith("/"):
            self.root = self.root[:-1]

        if args['hostname'] is None:
            if 'host' in self.config:
                self.hostname = self.config['host']
                if 'port' in self.config:
                    self.port = self.config['port']
                else:
                    self.port = 8000
                if 'management-port' in self.config:
                    self.management_port = self.config['management-port']
                else:
                    self.management_port = 8002
        else:
            parts = args['hostname'].split(":")
            self.hostname = parts.pop(0)
            self.management_port = 8002
            self.port = 8000
            if parts:
                self.management_port = int(parts.pop(0))
            if parts:
                self.port = int(parts.pop(0))

        self.connection = Connection(
            self.hostname, HTTPDigestAuth(adminuser, adminpass),
            port=self.port, management_port=self.management_port)

        self.utils = ClientUtils(self.connection)

    def upload(self):
        """Upload data"""
        trans = Transactions(self.connection)
        if not self.dryrun:
            trans.set_database(self.database)
            trans.set_timeLimit(trans.max_timeLimit())
            trans.create()

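        # Heuristic: treat the path as a mirror only if every top-level entry
        # is one of the four mirror subdirectories.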
        mirror = True
        for name in os.listdir(self.path):
            if name not in ['content', 'metadata', 'ucontent', 'umetadata']:
                mirror = False

        if self.mirror and not mirror:
            raise RuntimeError("Path doesn't point to a mirror directory")

        try:
            if mirror:
                self._upload_mirror(trans)
            else:
                self._upload_directory(trans)
        except KeyboardInterrupt:
            if not self.dryrun:
                trans.rollback()
        except:
            if not self.dryrun:
                trans.rollback()
            raise
        else:
            if not self.dryrun:
                trans.commit()

    def _upload_mirror(self, trans):
        """Internal method for uploading a mirror."""

        upload_map = {}

        # Before we start, make sure the mirror isn't corrupted.
        # We check that every content file has a corresponding metadata file.
        # We don't check the other way around; if you delete some content, you
        # don't have to delete the corresponding metadata.
        print("Reading files from filesystem...")

        cpath = "{}/content".format(self.path)
        mpath = "{}/metadata".format(self.path)
        missing = []
        if os.path.exists(cpath):
            files = self.scan(cpath, root=cpath)
            for check in files:
                if os.path.exists(mpath + check):
                    upload_map[check] = {"content": cpath + check, \
                                             "metadata": mpath + check}
                else:
                    missing.append(mpath + check)

        cpath = "{}/ucontent".format(self.path)
        mpath = "{}/umetadata".format(self.path)
        if os.path.exists(cpath):
            files = self.scan(cpath, root=cpath)
            for check in files:
                if os.path.exists(mpath + check):
                    upload_map[check] = {"content": cpath + check, \
                                             "metadata": mpath + check, \
                                             "uuid": True}
                else:
                    missing.append(mpath + check)

        if missing:
            print("Missing files:", missing)
            raise RuntimeError("Mirror corrupt")

        self._upload_map(trans, upload_map)

    def _upload_directory(self, trans):
        """Internal method for uploading a directory."""

        print("Reading files from filesystem...")
        allfiles = self.scan(self.path)

        if self.regex:
            files = self.regex_filter(allfiles)
        elif self.list:
            files = self.list_filter(allfiles)
        else:
            files = allfiles

        if len(files) != len(allfiles):
            print("Selected {} of {} files from filesystem..."\
                      .format(len(files), len(allfiles)))

        upload_map = {}
        for check in files:
            upload_map[check] = {"content": self.path + check}

        self._upload_map(trans, upload_map)

    def _upload_map(self, trans, upload_map):
        """Upload from an internally constructed map."""
        print("Reading URIs from server...")
        uris = self.utils.uris(self.database)
        urihash = {}
        for uri in uris:
            urihash[uri] = 1

        print("Getting timestamps from server...")
        stamps = self.get_timestamps(list(upload_map))
        if not stamps:
            print("No timestamps, assuming all files newer.")
        else:
            uptodate = []
            for key in upload_map:
                if key in stamps:
                    source = upload_map[key]['content']
                    statinfo = os.stat(source)
                    stamp = self._convert_timestamp(stamps[key])
                    if statinfo.st_mtime < stamp.timestamp():
                        uptodate.append(key)

            if uptodate:
                print("{} documents are up-to-date...".format(len(uptodate)))
                for key in uptodate:
                    del upload_map[key]
                    del urihash[key]  # also remove it from the deletion list

        upload_count = len(upload_map)
        print("Uploading {} files...".format(upload_count))

        docs = Documents(self.connection)

        bulk = BulkLoader(self.connection)
        bulk.set_database(self.database)
        bulk.set_txid(trans.txid())

        upload_size = 0
        ulcount = 0
        for doc in list(upload_map):

            docs.clear()

            source = upload_map[doc]['content']
            target = self.root + doc

            body_content_type = "application/octet-stream"

            if 'metadata' in upload_map[doc]:
                metasource = upload_map[doc]['metadata']
                metaxml = ET.parse(metasource)
                root = metaxml.getroot()

                txml = root \
                  .find("{http://marklogic.com/ns/mldbmirror/}content-type")
                if txml is not None:
                    body_content_type = txml.text
                    root.remove(txml)

                if 'uuid' in upload_map[doc] and upload_map[doc]['uuid']:
                    txml = root \
                      .find("{http://marklogic.com/ns/mldbmirror/}uri")
                    if txml is None:
                        raise RuntimeError("No URI provided in metadata.")
                    else:
                        target = txml.text
                        root.remove(txml)

                text = ET.tostring(root, encoding="unicode", method="xml")
                docs.set_metadata(text, "application/xml")
            else:
                metasource = None
                collections = []
                permissions = []

                for cfg in self.config["config"]:
                    if type(cfg["match"]) is list:
                        matches = cfg["match"]
                    else:
                        matches = [cfg["match"]]

                    for match in matches:
                        if re.match(match, target):
                            if "content-type" in cfg:
                                body_content_type = cfg["content-type"]
                            if "permissions" in cfg:
                                permissions = cfg["permissions"]
                            if "permissions+" in cfg:
                                permissions = permissions + cfg["permissions+"]
                            if "collections" in cfg:
                                collections = cfg["collections"]
                            if "collections+" in cfg:
                                collections = collections + cfg["collections+"]

                docs.set_collections(collections)
                docs.set_permissions(None)
                for perm in permissions:
                    for key in perm:
                        docs.add_permission(key, perm[key])

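            # urihash started as every URI in the database; whatever is left
            # in it after this loop becomes a deletion candidate below.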
            if target in urihash:
                del urihash[target]

            ulcount += 1
            statinfo = os.stat(source)
            upload_size += statinfo.st_size

            docs.set_uri(target)

            with open(source, "rb") as datafile:
                docs.set_content(datafile.read(), body_content_type)

            bulk.add(docs)

            if self.verbose:
                print("-> {}".format(target))

            if upload_size > self.threshold:
                perc = (float(ulcount) / upload_count) * 100.0
                print("{0:.0f}% ... {1} files, {2} bytes" \
                          .format(perc, bulk.size(), upload_size))
                if self.dryrun:
                    bulk.clear_content()
                else:
                    bulk.post()
                upload_size = 0

        if bulk.size() > 0:
            perc = (float(ulcount) / upload_count) * 100.0
            print("{0:.0f}% ... {1} files, {2} bytes" \
                      .format(perc, bulk.size(), upload_size))
            if self.dryrun:
                bulk.clear_content()
            else:
                bulk.post()

        docs.clear()
        docs.set_txid(trans.txid())
        docs.set_database(self.database)
        delcount = 0
        for uri in urihash:
            if uri.startswith(self.root):
                if self.verbose:
                    print("DEL {}".format(uri))
                docs.add_uri(uri)
                delcount += 1

        if delcount > 0:
            if self.regex or self.list:
                print("Limited download, not deleting {} files..." \
                          .format(delcount))
            else:
                print("Deleting {} URIs...".format(delcount))
                if not self.dryrun:
                    docs.delete()

    def download(self):
        """Download data"""
        trans = Transactions(self.connection)
        if not self.dryrun:
            trans.set_database(self.database)
            trans.set_timeLimit(trans.max_timeLimit())
            trans.create()

        try:
            if self.mirror:
                self._download_mirror(trans)
            else:
                self._download_directory(trans)
        except KeyboardInterrupt:
            if not self.dryrun:
                trans.rollback()
        except:
            if not self.dryrun:
                trans.rollback()
            raise
        else:
            if not self.dryrun:
                trans.commit()

    def _download_mirror(self, trans):
        """Download mirror"""
        if not os.path.isdir(self.path):
            print("Target directory must exist: {}".format(self.path))
            sys.exit(1)

        if os.listdir(self.path):
            print("Target directory must be empty: {}".format(self.path))
            sys.exit(1)

        self.logger.debug("Starting download_mirror")

        print("Reading URIs from server...")
        alluris = self.utils.uris(self.database, self.root)

        if self.regex:
            uris = self.regex_filter(alluris, download=True)
        elif self.list:
            uris = self.list_filter(alluris, download=True)
        else:
            uris = alluris

        cdir = "{}/content".format(self.path)
        mdir = "{}/metadata".format(self.path)
        ucdir = "{}/ucontent".format(self.path)
        umdir = "{}/umetadata".format(self.path)
        down_map = {}

        for uri in uris:
            if self.can_store_on_filesystem(uri):
                down_map[uri] = {"content": cdir + uri, \
                                     "metadata": mdir + uri}
            else:
                pos = uri.rfind(".")
                if pos > 0:
                    ext = uri[pos:]
                else:
                    ext = ""

                # Put some path segments at the front so that we don't wind up
                # with a single directory containing a gazillion files
                fnuuid = str(uuid.uuid4())
                fnuuid = fnuuid[0:2] + "/" + fnuuid[2:4] + "/" + fnuuid[4:]
                filename = "/{}{}".format(fnuuid, ext)

                down_map[uri] = {"content": ucdir + filename, \
                                     "metadata": umdir + filename, \
                                     "uuid": True}

        self._download_map(trans, down_map)

    def _download_directory(self, trans):
        """Download directory"""
        if not os.path.isdir(self.path):
            print("Target directory must exist: {}".format(self.path))
            sys.exit(1)

        self.logger.debug("Starting download_directory")

        print("Reading URIs from server...")
        alluris = self.utils.uris(self.database, self.root)

        if self.regex:
            uris = self.regex_filter(alluris, download=True)
        elif self.list:
            uris = self.list_filter(alluris, download=True)
        else:
            uris = alluris

        print("Getting timestamps from server...")
        stamps = self.get_timestamps(alluris)

        down_map = {}
        skip_list = []
        for uri in uris:
            if not self.can_store_on_filesystem(uri):
                raise RuntimeError("Cannot save URI:", uri)

            localfile = self.path + uri
            skip = False
            if uri in stamps and os.path.exists(localfile):
                statinfo = os.stat(localfile)
                stamp = self._convert_timestamp(stamps[uri])
                skip = statinfo.st_mtime >= stamp.timestamp()

            if skip:
                skip_list.append(localfile)
            else:
                down_map[uri] = {"content": localfile}
                if uri in stamps:
                    down_map[uri]['timestamp'] = stamps[uri]

        if len(skip_list) > 0:
            print("Skipping {} locally up-to-date files".format(
                len(skip_list)))

        self._download_map(trans, down_map, skip_list)

    def _download_map(self, trans, down_map, skip_list=None):
        """Download from an internally constructed map."""
        # avoid a shared mutable default argument
        if skip_list is None:
            skip_list = []
        filehash = {}
        if not self.mirror:
            print("Reading files from filesystem...")
            for path in self.scan(self.path):
                filehash[path] = 1

        self.logger.debug("Downloading map")
        self.logger.debug(down_map)

        download_count = len(down_map)

        print("Downloading {} documents...".format(download_count))

        docs = Documents(self.connection)
        docs.set_database(self.database)
        docs.set_txid(trans.txid())
        docs.set_format('xml')
        docs.set_accept("multipart/mixed")
        if self.mirror:
            docs.set_categories(['content', 'metadata'])
        else:
            docs.set_category('content')

        dlprog = 0
        dlcount = 0
        for uri in down_map.keys():
            dlcount += 1
            docs.add_uri(uri)

            if uri in filehash:
                del filehash[uri]

            if dlcount >= self.batchsize:
                dlprog += dlcount
                perc = (float(dlprog) / download_count) * 100.0
                print("{0:.0f}% ... {1}/{2} files" \
                          .format(perc, dlprog, download_count))

                self._download_batch(docs, down_map)
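                # docs.clear() also wipes the database/txid/format settings,
                # so re-prime the Documents object for the next batch.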
                docs.clear()
                docs.set_database(self.database)
                docs.set_txid(trans.txid())
                docs.set_format('xml')
                docs.set_accept("multipart/mixed")
                if self.mirror:
                    docs.set_categories(['content', 'metadata'])
                else:
                    docs.set_category('content')
                dlcount = 0

        if dlcount > 0:
            dlprog += dlcount
            perc = (float(dlprog) / download_count) * 100.0
            print("{0:.0f}% ... {1}/{2} files" \
                      .format(perc, dlprog, download_count))
            self._download_batch(docs, down_map)

        delfiles = []
        for path in filehash.keys():
            localfile = self.path + path
            if localfile not in skip_list:
                delfiles.append(localfile)

        if not self.mirror and delfiles:
            if self.regex or self.list:
                print("Limited download, not deleting {} files..." \
                          .format(len(delfiles)))
            else:
                print("Deleting {} files...".format(len(delfiles)))
                if not self.dryrun:
                    for path in delfiles:
                        os.remove(path)
                    self._remove_empty_dirs(self.path)

    def _download_batch(self, docs, down_map):
        """Download a batch of files"""
        if self.dryrun:
            return

        self.logger.debug("Downloading batch")
        self.logger.debug(docs)

        resp = docs.get()
        decoder = MultipartDecoder.from_response(resp)

        self.logger.debug("Downloaded {} bytes in {} parts" \
                              .format(len(resp.text), len(decoder.parts)))

        meta_part = None
        content_part = None
        splitregex = re.compile(r';\s*')

        if not decoder.parts:
            raise RuntimeError("FAILED TO GET ANY PARTS!?")

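        # This loop assumes the server emits a document's metadata part
        # immediately before its content part, so a saved meta_part always
        # belongs to the next content part seen.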
        for mimepart in decoder.parts:
            disp = mimepart.headers[b'Content-Disposition'].decode('utf-8')
            if 'category=metadata' in disp:
                if meta_part is not None:
                    raise RuntimeError("More than one metadata part!?")
                meta_part = mimepart
            else:
                content_part = mimepart

                disp = content_part.headers[b'Content-Disposition'].decode(
                    'utf-8')
                dispositions = splitregex.split(disp)
                filename = None
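                # A disposition looks like: attachment; filename="/doc.xml";
                # the slice strips the 'filename="' prefix (10 characters)
                # and the trailing quote.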
                for disp in dispositions:
                    if disp.startswith("filename="):
                        filename = disp[10:-1]

                body_content_type = content_part.headers[
                    b'Content-Type'].decode('utf-8')

                if filename is None:
                    raise RuntimeError("Multipart without filename!?")

                #print("FN:",filename)

                last_modified = None
                stanza = down_map[filename]
                if meta_part is not None:
                    last_modified = self._store_metadata(meta_part, down_map[filename], \
                                                             body_content_type, filename)
                    if last_modified is not None:
                        stanza['timestamp'] = last_modified
                self._store_content(content_part, stanza)
                meta_part = None
                content_part = None

    def _store_metadata(self, meta_part, stanza, body_content_type, uri):
        # fromstring() doesn't return an xml.etree.ElementTree and
        # doesn't have the write() method that we need later
        fakeio = io.StringIO(meta_part.content.decode('utf-8'))
        metaxml = ET.parse(fakeio)
        root = metaxml.getroot()
        fakeio.close()

        last_mod = None
        properties = root.find(
            '{http://marklogic.com/xdmp/property}properties')
        if properties is not None:
            last_mod = properties.find(
                '{http://marklogic.com/xdmp/property}last-modified')
            if last_mod is not None:
                last_mod = last_mod.text

        txml = ET.Element("{http://marklogic.com/ns/mldbmirror/}content-type")
        txml.text = body_content_type
        root.insert(0, txml)

        metafn = stanza['metadata']
        if 'uuid' in stanza:
            txml = ET.Element("{http://marklogic.com/ns/mldbmirror/}uri")
            txml.text = uri
            root.insert(1, txml)

        if not self.dryrun:
            if not os.path.exists(os.path.dirname(metafn)):
                os.makedirs(os.path.dirname(metafn))

            metaxml.write(metafn)
        else:
            if self.verbose:
                print("Meta:", metafn)

        return last_mod

    def _store_content(self, content_part, stanza):
        contfn = stanza['content']

        #print(stanza)

        if 'timestamp' in stanza:
            stamp = self._convert_timestamp(stanza['timestamp'])
        else:
            stamp = None

        if not self.dryrun:
            if not os.path.exists(os.path.dirname(contfn)):
                os.makedirs(os.path.dirname(contfn))

            with open(contfn, 'wb') as dataf:
                dataf.write(content_part.content)
            if stamp is not None:
                os.utime(contfn, (stamp.timestamp(), stamp.timestamp()))
        else:
            if self.verbose:
                print("Data:", contfn)

    def can_store_on_filesystem(self, filename):
        """Returns true if the filename can be stored.

        Many MarkLogic URIs (e.g., /path/to/file.xml) can be stored on the
        filesystem. Many others (e.g., http://...) cannot. This method
        returns true iff the filename is a legitimate filesystem name.

        N.B. If a filename does not begin with "/", it cannot be stored on the
        filesystem because if it's ever uploaded, it'll get a leading /.
        """
        if (not filename.startswith("/")) or ("//" in filename) \
          or (":" in filename) or ('"' in filename) or ("'" in filename) \
          or ("\\" in filename):
            return False
        else:
            return True

    def list_filter(self, alluris, download=False):
        if self.list:
            uris = []
            print("Reading list from {}...".format(self.list))
            with open(self.list, "r") as listfile:
                for line in listfile:
                    line = line.rstrip()
                    if line in alluris:
                        if self.dryrun and self.verbose:
                            print("INCL:", line)
                        uris.append(line)
                    else:
                        print("Not available:", line)

            if download:
                cat = "URIs"
            else:
                cat = "files"

            print("File list reduced {} {} to {}." \
                      .format(len(alluris), cat, len(uris)))

            return uris
        else:
            return alluris

    def get_timestamps(self, uris):
        """Get the database timestamp for these URIs"""
        chunksize = 500
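        # ceiling division: how many chunks are needed to cover all URIs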
        chunkcount = math.floor((len(uris) + chunksize - 1) / chunksize)
        chunk = 0

        stamps = []
        for seg in self._chunks(uris, chunksize):
            chunk += 1
            if self.verbose:
                print("\t-> chunk {} of {} ...".format(chunk, chunkcount))
            stamps += self.utils.last_modified(self.database, seg)

        # FIXME: make the XQuery return a single object
        stamp_hash = {}
        for stamp in stamps:
            stamp_hash[stamp["uri"]] = stamp["dt"]
        return stamp_hash

    def _chunks(self, uris, n):
        """Chop a long list into a list of lists"""
        for index in range(0, len(uris), n):
            yield uris[index:index + n]

    def regex_filter(self, alluris, download=False):
        if self.regex:
            uris = []
            cregex = []
            for exp in self.regex:
                cregex.append(re.compile(exp))

            for uri in alluris:
                match = any(exp.match(uri) for exp in cregex)

                if match:
                    if self.dryrun and self.verbose:
                        print("INCL:", uri)
                    uris.append(uri)
                else:
                    if self.dryrun and self.verbose:
                        print("EXCL:", uri)

            if download:
                cat = "URIs"
            else:
                cat = "files"

            if len(self.regex) == 1:
                print("Regex filter reduced {} {} to {}." \
                          .format(len(alluris), cat, len(uris)))
            elif len(self.regex) > 1:
                print("{} regex filters reduced {} {} to {}." \
                          .format(len(self.regex), len(alluris), cat, len(uris)))

            return uris
        else:
            return alluris

    def _remove_empty_dirs(self, directory):
        """Remove empty directories recursively"""
        for name in os.listdir(directory):
            path = os.path.join(directory, name)
            if os.path.isdir(path):
                self._remove_empty_dirs(path)

        if not os.listdir(directory):
            os.rmdir(directory)

    def scan(self, directory, root=None, files=None):
        """Scan a directory recursively, returning all of the files"""
        if files is None:
            files = []
        if root is None:
            root = directory
        skip = "/" + CONFIGFILE

        for name in os.listdir(directory):
            path = os.path.join(directory, name)
            if os.path.isfile(path):
                if not path.endswith(skip):
                    path = path[len(root):]
                    files.append(path)
            else:
                self.scan(path, root, files)
        return files

    def loadconfig(self, path):
        """Load the configuration file that determines document properties"""
        config = {}
        localconfig = {}

        home = os.path.expanduser("~")
        cfgfile = home + "/" + CONFIGFILE
        if os.path.isfile(cfgfile):
            with open(cfgfile) as cfg:
                config = json.load(cfg)

        cfgfile = path + "/" + CONFIGFILE
        if os.path.isfile(cfgfile):
            with open(cfgfile) as cfg:
                localconfig = json.load(cfg)

        for key in localconfig:
            if key != 'config':
                config[key] = localconfig[key]

        if 'config' not in config:
            config['config'] = []

        if 'config' in localconfig:
            config['config'] = config["config"] + localconfig['config']

        self.config = config

    def _convert_timestamp(self, stamp):
        """Convert an ISO 8601 dateTime string to a datetime"""
        if stamp.endswith("Z"):
            stamp = stamp[:-1] + "+0000"
        else:
            # Convert ...+05:00 to ...+0500 so %z can parse it
            stamp = stamp[:-3] + stamp[-2:]
        stamp = datetime.strptime(stamp, "%Y-%m-%dT%H:%M:%S%z")
        return stamp