def _download_map(self, trans, down_map, skip_list=None):
    """Download from an internally constructed map.

    Fetches every URI in ``down_map`` from the server in batches of
    ``self.batchsize``, handing each filled batch to ``_download_batch``.
    When not mirroring, any local file that was scanned but not present
    in ``down_map`` (and not in ``skip_list``) is deleted afterwards,
    unless the run was limited by ``self.regex``/``self.list`` or is a
    dry run.

    :param trans: open server transaction; ``trans.txid()`` is attached
        to every request.
    :param down_map: mapping of URI -> download details, consumed by
        ``_download_batch``.
    :param skip_list: optional list of local paths that must never be
        deleted. Defaults to an empty list (``None`` sentinel avoids the
        shared-mutable-default bug of ``skip_list=[]``).
    """
    if skip_list is None:
        skip_list = []
    filehash = {}
    if not self.mirror:
        print("Reading files from filesystem...")
        for path in self.scan(self.path):
            filehash[path] = 1
    self.logger.debug("Downloading map")
    self.logger.debug(down_map)
    download_count = len(down_map)
    print("Downloading {} documents...".format(download_count))

    def _arm(accessor):
        # Documents loses its settings on clear(); re-apply them before
        # every batch (and once up front).
        accessor.set_database(self.database)
        accessor.set_txid(trans.txid())
        accessor.set_format('xml')
        accessor.set_accept("multipart/mixed")
        if self.mirror:
            accessor.set_categories(['content', 'metadata'])
        else:
            accessor.set_category('content')

    docs = Documents(self.connection)
    _arm(docs)
    dlprog = 0
    dlcount = 0
    for uri in down_map.keys():
        dlcount += 1
        docs.add_uri(uri)
        # A URI we are downloading is not a deletion candidate.
        if uri in filehash:
            del filehash[uri]
        if dlcount >= self.batchsize:
            dlprog += dlcount
            perc = (float(dlprog) / download_count) * 100.0
            print("{0:.0f}% ... {1}/{2} files"
                  .format(perc, dlprog, download_count))
            self._download_batch(docs, down_map)
            docs.clear()
            _arm(docs)
            dlcount = 0
    if dlcount > 0:
        # Flush the final, partial batch.
        perc = (float(dlcount) / download_count) * 100.0
        print("{0:.0f}% ... {1} files".format(perc, dlcount))
        self._download_batch(docs, down_map)
    delfiles = []
    for path in filehash.keys():
        localfile = self.path + path
        if localfile not in skip_list:
            delfiles.append(localfile)
    if not self.mirror and delfiles:
        if self.regex or self.list:
            print("Limited download, not deleting {} files..."
                  .format(len(delfiles)))
        else:
            print("Deleting {} files...".format(len(delfiles)))
            if not self.dryrun:
                for path in delfiles:
                    os.remove(path)
                self._remove_empty_dirs(self.path)
def _upload_map(self, trans, upload_map):
    """Upload from an internally constructed map.

    Compares server timestamps against local mtimes and drops
    up-to-date entries from ``upload_map``, then bulk-loads the
    remaining files (posting whenever the accumulated byte count
    exceeds ``self.threshold``). Per-document metadata files and the
    ``self.config["config"]`` match rules supply content type,
    collections and permissions. Finally, server URIs under
    ``self.root`` that were not uploaded are deleted, unless the run
    was limited by ``self.regex``/``self.list`` or is a dry run.

    :param trans: open server transaction; ``trans.txid()`` is attached
        to the bulk loader and the deletion request.
    :param upload_map: mapping of URI suffix -> dict with a 'content'
        path and optional 'metadata' path / 'uuid' flag.
    :raises RuntimeError: if a uuid-keyed entry's metadata lacks a URI.
    """
    print("Reading URIs from server...")
    urihash = {uri: 1 for uri in self.utils.uris(self.database)}
    print("Getting timestamps from server...")
    stamps = self.get_timestamps(list(upload_map))
    if not stamps:
        print("No timestamps, assuming all files newer.")
    else:
        uptodate = []
        for key in upload_map:
            if key in stamps:
                source = upload_map[key]['content']
                statinfo = os.stat(source)
                stamp = self._convert_timestamp(stamps[key])
                if statinfo.st_mtime < stamp.timestamp():
                    uptodate.append(key)
        if uptodate:
            print("{} documents are up-to-date...".format(len(uptodate)))
            for key in uptodate:
                del upload_map[key]
                # remove it from this list so we don't delete it
                del urihash[key]
    upload_count = len(upload_map)
    print("Uploading {} files...".format(upload_count))
    docs = Documents(self.connection)
    bulk = BulkLoader(self.connection)
    bulk.set_database(self.database)
    bulk.set_txid(trans.txid())
    upload_size = 0
    ulcount = 0
    # Plain iteration instead of draining with files.pop(0), which is
    # O(n) per element; the map itself is not mutated inside the loop.
    for doc in list(upload_map.keys()):
        docs.clear()
        source = upload_map[doc]['content']
        target = self.root + doc
        body_content_type = "application/octet-stream"
        if 'metadata' in upload_map[doc]:
            metasource = upload_map[doc]['metadata']
            root = ET.parse(metasource).getroot()
            txml = root.find(
                "{http://marklogic.com/ns/mldbmirror/}content-type")
            if txml is not None:
                body_content_type = txml.text
                root.remove(txml)
            if 'uuid' in upload_map[doc] and upload_map[doc]['uuid']:
                # uuid-keyed entries carry their real URI in metadata.
                txml = root.find(
                    "{http://marklogic.com/ns/mldbmirror/}uri")
                if txml is None:
                    raise RuntimeError("No URI provided in metadata.")
                target = txml.text
                root.remove(txml)
            text = ET.tostring(root, encoding="unicode", method="xml")
            docs.set_metadata(text, "application/xml")
        collections = []
        permissions = []
        for cfg in self.config["config"]:
            if isinstance(cfg["match"], list):
                matches = cfg["match"]
            else:
                matches = [cfg["match"]]
            for match in matches:
                if re.match(match, target):
                    # Plain keys replace; "+"-suffixed keys accumulate.
                    if "content-type" in cfg:
                        body_content_type = cfg["content-type"]
                    if "permissions" in cfg:
                        permissions = cfg["permissions"]
                    if "permissions+" in cfg:
                        permissions = permissions + cfg["permissions+"]
                    if "collections" in cfg:
                        collections = cfg["collections"]
                    if "collections+" in cfg:
                        collections = collections + cfg["collections+"]
        docs.set_collections(collections)
        docs.set_permissions(None)
        for perm in permissions:
            for key in perm:
                docs.add_permission(key, perm[key])
        # An uploaded URI is not a deletion candidate.
        if target in urihash:
            del urihash[target]
        ulcount += 1
        upload_size += os.stat(source).st_size
        docs.set_uri(target)
        # with-block guarantees the file handle is closed even if
        # set_content raises (the original open/close pair did not).
        with open(source, "rb") as datafile:
            docs.set_content(datafile.read(), body_content_type)
        bulk.add(docs)
        if self.verbose:
            print("-> {}".format(target))
        if upload_size > self.threshold:
            perc = (float(ulcount) / upload_count) * 100.0
            print("{0:.0f}% ... {1} files, {2} bytes"
                  .format(perc, bulk.size(), upload_size))
            if self.dryrun:
                bulk.clear_content()
            else:
                bulk.post()
            upload_size = 0
    if bulk.size() > 0:
        # Flush the final, partial bulk load.
        perc = (float(ulcount) / upload_count) * 100.0
        print("{0:.0f}% ... {1} files, {2} bytes"
              .format(perc, bulk.size(), upload_size))
        if self.dryrun:
            bulk.clear_content()
        else:
            bulk.post()
    docs.clear()
    docs.set_txid(trans.txid())
    docs.set_database(self.database)
    delcount = 0
    for uri in urihash:
        if uri.startswith(self.root):
            if self.verbose:
                print("DEL {}".format(uri))
            docs.add_uri(uri)
            delcount += 1
    if delcount > 0:
        if self.regex or self.list:
            # Message fixed: this is the upload path deleting URIs, not
            # the download path deleting files.
            print("Limited upload, not deleting {} URIs..."
                  .format(delcount))
        else:
            print("Deleting {} URIs...".format(delcount))
            if not self.dryrun:
                docs.delete()
def _upload_map(self, trans, upload_map):
    """Upload from an internally constructed map.

    NOTE(review): this is a verbatim duplicate of the ``_upload_map``
    defined earlier in the file; Python silently keeps only this later
    definition. Confirm whether one copy should be removed.

    Compares server timestamps against local mtimes and drops
    up-to-date entries from ``upload_map``, then bulk-loads the
    remaining files (posting whenever the accumulated byte count
    exceeds ``self.threshold``). Per-document metadata files and the
    ``self.config["config"]`` match rules supply content type,
    collections and permissions. Finally, server URIs under
    ``self.root`` that were not uploaded are deleted, unless the run
    was limited by ``self.regex``/``self.list`` or is a dry run.

    :param trans: open server transaction; ``trans.txid()`` is attached
        to the bulk loader and the deletion request.
    :param upload_map: mapping of URI suffix -> dict with a 'content'
        path and optional 'metadata' path / 'uuid' flag.
    :raises RuntimeError: if a uuid-keyed entry's metadata lacks a URI.
    """
    print("Reading URIs from server...")
    urihash = {uri: 1 for uri in self.utils.uris(self.database)}
    print("Getting timestamps from server...")
    stamps = self.get_timestamps(list(upload_map))
    if not stamps:
        print("No timestamps, assuming all files newer.")
    else:
        uptodate = []
        for key in upload_map:
            if key in stamps:
                source = upload_map[key]['content']
                statinfo = os.stat(source)
                stamp = self._convert_timestamp(stamps[key])
                if statinfo.st_mtime < stamp.timestamp():
                    uptodate.append(key)
        if uptodate:
            print("{} documents are up-to-date...".format(len(uptodate)))
            for key in uptodate:
                del upload_map[key]
                # remove it from this list so we don't delete it
                del urihash[key]
    upload_count = len(upload_map)
    print("Uploading {} files...".format(upload_count))
    docs = Documents(self.connection)
    bulk = BulkLoader(self.connection)
    bulk.set_database(self.database)
    bulk.set_txid(trans.txid())
    upload_size = 0
    ulcount = 0
    # Plain iteration instead of draining with files.pop(0), which is
    # O(n) per element; the map itself is not mutated inside the loop.
    for doc in list(upload_map.keys()):
        docs.clear()
        source = upload_map[doc]['content']
        target = self.root + doc
        body_content_type = "application/octet-stream"
        if 'metadata' in upload_map[doc]:
            metasource = upload_map[doc]['metadata']
            root = ET.parse(metasource).getroot()
            txml = root.find(
                "{http://marklogic.com/ns/mldbmirror/}content-type")
            if txml is not None:
                body_content_type = txml.text
                root.remove(txml)
            if 'uuid' in upload_map[doc] and upload_map[doc]['uuid']:
                # uuid-keyed entries carry their real URI in metadata.
                txml = root.find(
                    "{http://marklogic.com/ns/mldbmirror/}uri")
                if txml is None:
                    raise RuntimeError("No URI provided in metadata.")
                target = txml.text
                root.remove(txml)
            text = ET.tostring(root, encoding="unicode", method="xml")
            docs.set_metadata(text, "application/xml")
        collections = []
        permissions = []
        for cfg in self.config["config"]:
            if isinstance(cfg["match"], list):
                matches = cfg["match"]
            else:
                matches = [cfg["match"]]
            for match in matches:
                if re.match(match, target):
                    # Plain keys replace; "+"-suffixed keys accumulate.
                    if "content-type" in cfg:
                        body_content_type = cfg["content-type"]
                    if "permissions" in cfg:
                        permissions = cfg["permissions"]
                    if "permissions+" in cfg:
                        permissions = permissions + cfg["permissions+"]
                    if "collections" in cfg:
                        collections = cfg["collections"]
                    if "collections+" in cfg:
                        collections = collections + cfg["collections+"]
        docs.set_collections(collections)
        docs.set_permissions(None)
        for perm in permissions:
            for key in perm:
                docs.add_permission(key, perm[key])
        # An uploaded URI is not a deletion candidate.
        if target in urihash:
            del urihash[target]
        ulcount += 1
        upload_size += os.stat(source).st_size
        docs.set_uri(target)
        # with-block guarantees the file handle is closed even if
        # set_content raises (the original open/close pair did not).
        with open(source, "rb") as datafile:
            docs.set_content(datafile.read(), body_content_type)
        bulk.add(docs)
        if self.verbose:
            print("-> {}".format(target))
        if upload_size > self.threshold:
            perc = (float(ulcount) / upload_count) * 100.0
            print("{0:.0f}% ... {1} files, {2} bytes"
                  .format(perc, bulk.size(), upload_size))
            if self.dryrun:
                bulk.clear_content()
            else:
                bulk.post()
            upload_size = 0
    if bulk.size() > 0:
        # Flush the final, partial bulk load.
        perc = (float(ulcount) / upload_count) * 100.0
        print("{0:.0f}% ... {1} files, {2} bytes"
              .format(perc, bulk.size(), upload_size))
        if self.dryrun:
            bulk.clear_content()
        else:
            bulk.post()
    docs.clear()
    docs.set_txid(trans.txid())
    docs.set_database(self.database)
    delcount = 0
    for uri in urihash:
        if uri.startswith(self.root):
            if self.verbose:
                print("DEL {}".format(uri))
            docs.add_uri(uri)
            delcount += 1
    if delcount > 0:
        if self.regex or self.list:
            # Message fixed: this is the upload path deleting URIs, not
            # the download path deleting files.
            print("Limited upload, not deleting {} URIs..."
                  .format(delcount))
        else:
            print("Deleting {} URIs...".format(delcount))
            if not self.dryrun:
                docs.delete()