def backend(self):
    if self.Backend is None or self.stale():
        self.Backend = CouchBaseBackend(self.BucketName, config=self.CBConfig)
        #try:    self.Backend = CouchBaseBackend(self.BucketName, config=self.CBConfig)
        #except: return None
    self.LastTouch = time.time()
    #debug("bucket for %s: %s" % (self.BucketURL, id(self.Bucket)))
    return self.Backend
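# The method above relies on a stale() predicate that is not part of this excerpt.
# A minimal sketch is given below, assuming a hypothetical TouchInterval attribute
# (seconds of idleness after which the cached backend is rebuilt); the real
# implementation may differ.
def stale(self):
    # the cached backend is considered stale once TouchInterval seconds have passed
    # since the last call to backend()
    return time.time() - self.LastTouch > self.TouchInterval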
def listDatasets(argv):
    from couchbase.views.iterator import View
    from striped.client import CouchBaseBackend
    from couchbase.exceptions import KeyExistsError, TemporaryFailError, TimeoutError, NotFoundError

    Usage = """
    python listDatasets.py -c <CouchBase config file> [-l] <bucket name> [<dataset name pattern>]
    """

    config_file = None
    opts, args = getopt.getopt(argv, "c:l")
    opts = dict(opts)
    config_file = opts.get("-c")
    long_print = "-l" in opts

    if len(args) < 1:
        print(Usage)
        sys.exit(1)

    bucket_name = args[0]
    pattern = None if len(args) < 2 else args[1]

    backend = CouchBaseBackend(bucket_name, config=config_file)
    bucket = backend.bucket

    if long_print:
        print("%-30s %6s %6s %15s" % ("Dataset", "Frames", "Files", "Events"))
        print("%-30s %6s %6s %15s" % ("-------", "------", "-----", "------"))

    for ds in sorted(backend.datasets()):
        if long_print:
            nevents = 0
            nrgs = 0
            files = set()
            for rginfo in backend.RGInfos(ds):
                nevents += rginfo["NEvents"]
                nrgs += 1
                for s in rginfo["Segments"]:
                    files.add(s["FileName"])
            print("%-30s %6d %6d %15d" % (ds, nrgs, len(files), nevents))
        else:
            print(ds)
def run(self):
    backend = CouchBaseBackend(self.BucketName, print_errors=True, config=self.Config)
    frames_to_do = self.FramesToDo
    segment_frames = set(self.Segment.frameIDs())
    print("Process %d started to load frames: %s" % (os.getpid(), sorted(list(segment_frames & frames_to_do))))
    if frames_to_do:
        assert self.Schema is not None
        sl = segmentLoader(self.Segment, frames_to_do, self.DatasetName, self.DataReaderClass,
                           self.Schema, self.Metadata, backend, self.DryRun)
        try:
            sl.run()
        except:
            print("Process %d exiting with error:" % (os.getpid(),))
            traceback.print_exc()
            sys.exit(1)
        else:
            print("Process %d finished successfully" % (os.getpid(),))
    if len(args) < 2:
        print(Usage)
        sys.exit(1)

    root_file = args[0]
    tree_top = args[1]

    file_name = root_file.rsplit("/", 1)[-1]
    utree = uproot.open(root_file)[tree_top]

    if out_dir is None:
        BucketName = args[2]
        dataset_name = args[3]
        backend = CouchBaseBackend(BucketName, print_errors=True, config=config_file)
    else:
        out_dir = out_dir + "/" + file_name
        try:
            os.makedirs(out_dir)
        except:
            pass

    if correction_file:
        env = {}
        # execfile() is Python 2 only; exec the corrections module source instead
        exec(open(correction_file).read(), env)
        if "DataCorrector" in env:
            data_corrector_class = env["DataCorrector"]
def deleteDataset(argv):
    from couchbase.bucket import Bucket
    from couchbase.views.iterator import View
    from couchbase.views.params import Query
    from couchbase.exceptions import NotFoundError
    from striped.client import CouchBaseBackend, CouchBaseConfig
    import os, sys, json, getopt, random

    Usage = """
    python delete_dataset.py [-c <Couchbase config file>] <bucket> <dataset>
    """

    def delete_metadata(backend, dataset):
        del backend["%s:@@nextRGID" % (dataset,)]
        keys = (k for k in backend.keys(dataset) if k.endswith(".json"))
        return backend.delete(keys)

    def delete_dataset(dataset, bucket, ratio):
        q = Query()
        q.mapkey_single = dataset
        v = View(bucket, "views", "keys", query=q)
        keys = (x.value for x in v if x.key == dataset)

        def pack_in_groups(keys, n, ratio):
            group = []
            for k in keys:
                if ratio > random.random():
                    #print(k)
                    if len(group) >= n:
                        #print(group[0])
                        yield group
                        group = []
                    group.append(k)
            if group:       # was "len(group) >= 0", which also yielded an empty trailing group
                yield group

        nremoved = 0
        for kg in pack_in_groups(keys, 500, ratio):
            try:
                if kg:
                    bucket.remove_multi(kg, quiet=True)
            except NotFoundError as error:
                print(error)
            else:
                nremoved += len(kg)
                if nremoved and nremoved % 10000 == 0:
                    print(nremoved)
        return nremoved

    config = None
    ratio = 1.0
    opts, args = getopt.getopt(argv, "c:r:m")
    opts = dict(opts)
    if "-c" in opts:
        config = opts["-c"]
    ratio = float(opts.get("-r", 1.0))
    meta_only = "-m" in opts

    if not args:
        print(Usage)
        sys.exit(1)

    bucket_name = args[0]
    dataset_name = args[1]

    backend = CouchBaseBackend(bucket_name, config=config)
    bucket = backend.bucket

    n_meta = delete_metadata(backend, dataset_name)
    n_data = 0
    if not meta_only:
        n_data = delete_dataset(dataset_name, bucket, ratio)
    print(n_meta, "metadata items removed")
    print(n_data, "data items removed")
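# Hypothetical stand-alone entry point for delete_dataset.py, shown for illustration only;
# the dispatch code that actually calls deleteDataset() is not part of this excerpt.
# Example (bucket/dataset names made up):
#   python delete_dataset.py -c cb.cfg -r 0.5 my_bucket my_dataset
# where -r 0.5 removes a random ~50% sample of the data keys, per the ratio test above.
if __name__ == "__main__":
    import sys
    deleteDataset(sys.argv[1:])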
def createDataset(argv):
    from couchbase import FMT_BYTES, FMT_JSON
    from couchbase.bucket import Bucket
    from couchbase.exceptions import KeyExistsError, TemporaryFailError, TimeoutError
    import numpy as np
    from striped.client import CouchBaseBackend, CouchBaseConfig
    from striped.common import ColumnDescriptor

    Usage = """
    python createDataset.py [-c <config file>] <schema.json> <bucket name> <dataset>
    """

    SchemaVersion = "3.0"

    """ schema JSON file format
    {
        "version":      "2.2",
        "tree_top":     "....",
        "attributes":
        {
            "path.to.attr": { "dtype":"dtype", "shape":..., "source":... },
            ...
        },
        "branches":
        {
            "path.to.branch":
            {
                "relative.path.to.attr": { "source":"...", "dtype":"dtype" },
                ...
            }
        }
    }
    """

    convert_to = {"boolean": "i1"}

    config = None
    opts, args = getopt.getopt(argv, "c:i")
    for opt, val in opts:
        if opt == "-c":
            config = val
    opts = dict(opts)
    config = opts.get("-c")
    reinit = "-i" in opts

    if len(args) < 3:
        print(Usage)
        sys.exit(1)

    schema_file, bucket_name, Dataset = args
    schema = json.load(open(schema_file, "r"))
    if not "version" in schema:
        schema["version"] = SchemaVersion

    def parseSchema(schema):
        return schema["attributes"], schema["branches"]

    fields, branches = parseSchema(schema)

    backend = CouchBaseBackend(bucket_name, config=config)

    key = "%s:@@schema.json" % (Dataset,)
    backend[key].json = schema

    for fn, fd in fields.items():
        ft = fd["dtype"]
        fn = str(fn)
        ft = str(ft)
        shape = fd.get("shape", [])
        desc = ColumnDescriptor(ft, shape, fd["source"],
                                size_column=fn + ".@size" if (shape and shape[0] is None) else None)
        key = ColumnDescriptor.key(Dataset, fn)
        backend[key].json = desc.toDict()
        #print key, desc

    for branch, items in branches.items():
        for fn, fd in items.items():
            ft = fd["dtype"]
            path = branch + "." + fn if fn else branch
            desc = ColumnDescriptor(ft, fd.get("shape", []), fd["source"],
                                    depth=1, parent_array=branch, size_column=branch + ".@size")
            key = ColumnDescriptor.key(Dataset, path)
            backend[key].json = desc.toDict()

    next_rgid_name = "%s:@@nextRGID" % (Dataset,)
    cb = backend.bucket
    cb.remove(next_rgid_name, quiet=True)
    value = backend.counter(next_rgid_name, initial=0).value
    print("NextRGID counter created with value", value)
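# Illustration only: a small, made-up schema in the format documented above.
# Attribute names, dtypes and sources are hypothetical; json.dump just produces
# a file that the field/branch loops in createDataset() would accept.
import json

example_schema = {
    "version": "3.0",
    "tree_top": "Events",
    "attributes": {
        "met":     {"dtype": "f8", "shape": [],     "source": "MET_pt"},
        "muon_pt": {"dtype": "f4", "shape": [None], "source": "Muon_pt"},   # variable length -> gets a .@size column
    },
    "branches": {
        "jet": {
            "pt":  {"dtype": "f4", "source": "Jet_pt"},
            "eta": {"dtype": "f4", "source": "Jet_eta"},
        }
    },
}

with open("example_schema.json", "w") as f:
    json.dump(example_schema, f, indent=4)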
def run_batch(argv):
    from DataReader import DataReader

    Usage = """
    python run_batch.py [options] <batch file> <bucket name> <dataset name>

    Options:
        -c <CouchBase config file>, default - value of the COUCHBASE_BACKEND_CFG environment variable
        -m <max workers>, default = 5
        -O - override existing frames
        -s <stagger>, default = 10 (seconds)
        -n - dry run
    """

    opts, args = getopt.getopt(argv, "m:s:c:On")
    opts = dict(opts)
    MaxWorkers = int(opts.get("-m", 5))
    Stagger = float(opts.get("-s", 1))
    Config = opts.get("-c", os.environ.get("COUCHBASE_BACKEND_CFG"))
    Override = "-O" in opts
    DryRun = "-n" in opts

    if not Config:
        print("Couchbase config file must be specified either with -c, or using COUCHBASE_BACKEND_CFG env. variable")
        print()
        print(Usage)
        sys.exit(1)

    if len(args) < 3:
        print(Usage)
        sys.exit(2)

    batch_file, bucket_name, dataset_name = args

    batch = Batch.load(batch_file)
    backend = CouchBaseBackend(bucket_name, print_errors=True, config=Config)
    schema = backend.schema(dataset_name)
    if not schema:
        print("Empty schema")
        sys.exit(1)

    existing_frames = set()
    if not Override:
        existing_frames = set(backend.RGIDs(dataset_name))
        if existing_frames:
            print("The following frames exist and will not be overridden:", sorted(list(existing_frames)))

    task_queue = TaskQueue(MaxWorkers, stagger=Stagger)
    printer = Printer(batch)

    for i, segment in enumerate(batch):
        #print "segment:", i, segment, segment.frameIDs()
        frames = set(segment.frameIDs()) - existing_frames
        #print "segment:", i, segment, segment.frameIDs(), frames
        if frames:
            task = LoaderTask(printer, i, segment, Config, schema,
                              None,     # FIX ME: metadata is None for now
                              frames, bucket_name, dataset_name, DataReader, DryRun)
            task_queue.addTask(task)

    task_queue.waitUntilEmpty()
def verifyDataset(argv):
    from striped.client import CouchBaseBackend
    import uproot

    Usage = """
    python validateDataset.py [-b] <dataset directory> <root tree top> <bucket>
        -b means check binary records only
    """

    opts, args = getopt.getopt(argv, "b")
    opts = dict(opts)
    binary_only = "-b" in opts

    dataset_dir, tree_top, bucket_name = args
    dataset_name = dataset_dir.split("/")[-1]

    backend = CouchBaseBackend(bucket_name)

    rgids = list(backend.RGIDs(dataset_name))
    print("I: %d row groups in the dataset" % (len(rgids),))

    if not binary_only:
        files = {}          # filename -> events
        for f in glob.glob("%s/*.root" % (dataset_dir,)):
            fn = f.split("/")[-1]
            tree = uproot.open(f)[tree_top]
            files[fn] = tree.numentries
        print("I: %d files, %d events" % (len(files), sum(files.values())))

        files_in_dataset = {}
        rgids = set()
        total_events = 0
        for info in backend.RGInfos(dataset_name):
            rgid = info["RGID"]
            rgids.add(rgid)
            nevents = info["NEvents"]
            total_events += nevents
            nevents_in_segments = 0
            for s in info["Segments"]:
                fn = s["FileName"]
                ne = s["NEvents"]
                files_in_dataset[fn] = files_in_dataset.get(fn, 0) + ne
                nevents_in_segments += ne
            if nevents != nevents_in_segments:
                print("E: Total number of events in RG #%d (%d) is not equal to the sum of events in segments (%d)"
                      % (rgid, nevents, nevents_in_segments))

        if len(rgids) != max(rgids) + 1:
            maxrgid = max(rgids)
            missing = [i for i in range(maxrgid + 1) if not i in rgids]
            print("W: gap(s) in rgids. Missing %d rgids: %s" % (max(rgids) + 1 - len(rgids), missing))

        for f, n in files.items():
            if not f in files_in_dataset:
                print("E: File %s is not in the database" % (f,))
            else:
                n_file = files[f]
                n_db = files_in_dataset[f]
                if n_file != n_db:
                    print("E: Number of events in file %s (%d) differs from the database (%d)" % (f, n_file, n_db))

    print("I: Scanning keys...")
    rgids_per_column = {}
    data_keys = set()
    nkeys = 0
    for k in backend.keys(dataset_name):
        # parse key; data stripe keys have the form "<dataset>:<column>:<rgid>.bin"
        parts = k.split(":")
        if len(parts) == 3:
            _, column, tail = parts
            if not tail.startswith('@'):
                tail_parts = tail.split(".")
                rgid = int(tail_parts[0])
                key_type = tail_parts[1]
                if rgid in rgids and key_type == 'bin':
                    data_keys.add("%s:%d" % (column, rgid))
                    column_rgids = rgids_per_column.get(column)
                    if not column_rgids:
                        column_rgids = set()
                        rgids_per_column[column] = column_rgids
                    column_rgids.add(rgid)

    print("I: %d data keys found for %d columns" % (len(data_keys), len(rgids_per_column.keys())))
    N = max([len(r) for r in rgids_per_column.values()])
    print("I: max %d data keys per column" % (N,))
    nmissing = 0
    for cn, r in rgids_per_column.items():
        n = len(r)
        if n != N:
            print("E: %d data stripes are missing for column %s" % (N - n, cn))
            nmissing += N - n
    if nmissing:
        print("E: %d data stripes are missing" % (nmissing,))
    -c <couchbase config> - default environment COUCHBASE_BACKEND_CFG
    -i - reinitialize the object id counter so that the next object will be given object id = 1
"""

opts, args = getopt.getopt(sys.argv[1:], "n:h?c:i")
opts = dict(opts)

if '-h' in opts or '-?' in opts or len(args) != 3:
    print(Usage)
    sys.exit(1)

init_oid = "-i" in opts
config = opts.get("-c")
group_size = int(opts.get("-n", 100000))
bucket, dataset, path = args

data = fitsio.read(path)
print("%d objects in the input file %s" % (len(data), path))

backend = CouchBaseBackend(bucket)

if init_oid:
    counter_key = "%s:@bliss_next_object_id" % (dataset,)
    try:
        backend.delete([counter_key])       # remove if exists
    except:
        pass
    backend.counter(counter_key, initial=1)
    print("Counter bliss_next_object_id initialized to 1")

add_objects(backend, data, dataset, group_size)
                       for key, array in out_observations_data.items()])

if __name__ == "__main__":
    import sys, time, getopt

    Usage = """
    python add_observations.py [options] <bucket name> <dataset name> <matches_file.fits>

    options:
        -c <couchbase config> - default environment COUCHBASE_BACKEND_CFG
    """

    opts, args = getopt.getopt(sys.argv[1:], "h?c:")
    opts = dict(opts)

    if '-h' in opts or '-?' in opts or len(args) != 3:
        print(Usage)
        sys.exit(1)

    config = opts.get("-c")
    bucket, dataset, path = args

    data = fitsio.read(path)
    print("%d object-observation pairs in the input file %s" % (len(data), path))

    backend = CouchBaseBackend(bucket)
    add_observations(backend, data, dataset)
    T.printStats()
        log_file_name = val

if len(args) < 3:
    print(Usage)
    sys.exit(1)

input_file = args[0]
BucketName = args[1]
dataset_name = args[2]
reader_params = args[3:]

file_name = input_file.rsplit("/", 1)[-1]

backend = CouchBaseBackend(BucketName, print_errors=True, config=config_file)
try:
    schema = backend["%s:@@schema.json" % (dataset_name,)].json
except:
    print("Can not get dataset schema from the database")
    raise

data_reader = DataReader(input_file, schema, *reader_params)

if profile == "file":
    profile = data_reader.profile()
elif profile:
    profile = parseProfile(profile)
else:
def listDataset(argv):
    from couchbase.views.iterator import View
    from couchbase.views.params import Query
    from striped.client import CouchBaseBackend
    from couchbase.exceptions import KeyExistsError, TemporaryFailError, TimeoutError, NotFoundError

    Usage = """
    python listDataset.py -c <CouchBase config file> [-f|-l] <bucket name> <dataset name>
    """

    config_file = None
    opts, args = getopt.getopt(argv, "c:lfn")
    opts = dict(opts)
    config_file = opts.get("-c")
    files_only = "-f" in opts
    long_print = "-l" in opts
    counter = "-n" in opts

    if len(args) < 2:       # was len(sys.argv), which ignored the argv actually passed in
        print(Usage)
        sys.exit(1)

    bucket_name, dataset_name = args

    backend = CouchBaseBackend(bucket_name, config=config_file)
    bucket = backend.bucket

    if False:
        q = Query()
        q.mapkey_single = dataset_name
        v = View(bucket, "views", "RGInfos", query=q)
        infos = [x.value for x in v if x.key == dataset_name]

    infos = backend.RGInfos(dataset_name)
    infos = sorted(infos, key=lambda info: info["RGID"])

    if long_print:
        print("RGID    NEvents    File(s)")
        print("------- ---------- -------")
        nevents = 0
        files = {}
        rgids = set()
        for info in infos:
            fn = info["Segments"][0]["FileName"]
            print("%7d %10d %s" % (info["RGID"], info["NEvents"], fn))
            rgids.add(info["RGID"])
            files[fn] = 1
            for s in info["Segments"][1:]:
                print("%19s %s" % (" ", s["FileName"]))
                files[s["FileName"]] = 1
            nevents += info["NEvents"]
        print("------- ---------- -------")
        print("%7d %10d %d" % (len(infos), nevents, len(files)))
        maxrgid = max(rgids)
        if len(rgids) != maxrgid + 1:
            print("Missing RGIDs (%d):" % (maxrgid + 1 - len(rgids),))
            for rgid in range(maxrgid):
                if not rgid in rgids:
                    print(rgid, end=' ')
            print()
    elif files_only:
        files = {}          # filename -> nevents
        for info in infos:
            for s in info["Segments"]:
                fn = s["FileName"]
                files[fn] = files.get(fn, 0) + s["NEvents"]
        for fn in sorted(files.keys()):
            print(fn)
    else:
        files = set()
        rgids = set()
        nevents = 0
        try:
            counter = backend.counter("%s:@@nextRGID" % (dataset_name,), delta=0).value
        except NotFoundError:
            counter = None
        for info in infos:
            rgids.add(info["RGID"])
            for s in info["Segments"]:
                files.add(s["FileName"])
            nevents += info["NEvents"]
        print("Next FrameID: ", counter)
        print("Files:        ", len(files))
        print("Frames:       ", len(rgids))
        print("Events:       ", nevents)
        if len(rgids):
            print("Max frame id: ", max(rgids))
            print("Events/frame: ", int(float(nevents) / float(len(rgids)) + 0.5))
            maxrgid = max(rgids)
            if len(rgids) < maxrgid + 1:
                print("Missing RGIDs (%d):" % (maxrgid + 1 - len(rgids),))
                for rgid in range(maxrgid):
                    if not rgid in rgids:
                        print(rgid, end=' ')
                print()
def create_batch(argv):
    from DataReader import DataReader

    Usage = """
    python create_batch.py [options] <batch_file> <bucket name> <dataset name> @<file with input file list>
    python create_batch.py [options] <batch_file> <bucket name> <dataset name> <directory path>
    python create_batch.py [options] <batch_file> <bucket name> <dataset name> <file> <file> ...

    Options:
        -O (reuse|REUSE|allocate) - override existing batch description file and either
                                    reuse same frame id range or allocate new range
        -c <couchbase config file>
        -n <target frame size>, default = 10000
        -p <path prefix> - prefix to add to the file paths read from the file or given as the list
        -k <n> - how many lowest path components, in addition to the file name, to keep
                 in the provenance data, default = 0, i.e. keep the file name only
        -x <extension> - if the input is specified as a directory, then this is the extension
                 of data files under the directory. Default = "root"
        -m <n readers> - run multiple data reader threads in parallel, default = 1
        -q - be quiet
    """

    opts, args = getopt.getopt(argv, "n:p:k:x:O:qc:m:")
    opts = dict(opts)
    Config = opts.get("-c")
    FrameSize = int(opts.get("-n", 10000))
    Prefix = opts.get("-p")
    Keep = int(opts.get("-k", 0))
    Extension = opts.get("-x", "root")
    Override = "-O" in opts
    OverrideMode = opts.get("-O")
    MaxReaders = int(opts.get("-m", 1))
    Quiet = "-q" in opts

    if len(args) < 4 or not OverrideMode in (None, "reuse", "allocate", "REUSE"):
        print(Usage)
        sys.exit(1)

    BatchFile, BucketName, DatasetName = args[:3]

    exists = False
    try:
        exists = os.path.isfile(BatchFile)
    except:
        pass

    old_batch = None
    if exists:
        if not Override:
            print()
            print("ERROR: File %s exists. Use -O (reuse|allocate) to override." % (BatchFile,))
            print("Old file left unchanged.")
            print()
            print(Usage)
            sys.exit(1)
        old_batch = Batch.load(BatchFile)
        #print len(old_batch)

    source = args[3]
    if source[0] == '@':
        paths = [f for f in [ff.strip() for ff in open(source[1:], "r").readlines()] if f]
    elif stat.S_ISDIR(os.stat(source).st_mode):
        assert not Prefix, "\nERROR: Can not use path prefix with the input specified as the directory\n"
        if Extension[0] == '.':
            Extension = Extension[1:]
        paths = sorted(glob.glob("%s/*.%s" % (source, Extension)))      # local directory - assume root files
    else:
        paths = args[3:]        # explicit file path list

    if Prefix:
        paths = [Prefix + f for f in paths]

    provenance_names = []
    for fp in paths:
        parts = fp.split("/")
        provenance_names.append("/".join(parts[-1 - Keep:]))

    if not Quiet:
        print("Building frame map from %d files..." % (len(paths),))
    backend = CouchBaseBackend(BucketName, print_errors=True, config=Config)
    schema = backend.schema(DatasetName)
    if not schema:
        print("Dataset %s not found" % (DatasetName,))
        sys.exit(1)

    class FileCounter(object):
        def __init__(self, ntotal, show_tqdm):
            self.T = None if not (use_tqdm and show_tqdm) else tqdm(total=ntotal)

        def tick(self, n=1):
            if self.T is not None:
                self.T.update(n)

    file_counter = FileCounter(len(paths), not Quiet)

    file_infos = [FileInfo(path, None, prov) for path, prov in zip(paths, provenance_names)]
    queue = TaskQueue(MaxReaders, tasks=[GetNEventsTask(fi, schema, file_counter.tick) for fi in file_infos])
    queue.waitUntilEmpty()

    batch = Batch().build(DataReader, schema, FrameSize, file_infos)
    NFrames = len(batch)
    if not Quiet:
        print("Frame map with %d frames generated" % (NFrames,))

    start_frame_id = None
    if old_batch is not None:
        nold = len(old_batch)
        if OverrideMode.lower() == "reuse":
            if nold < NFrames and OverrideMode != "REUSE":
                print()
                print("ERROR: Can not reuse old frame id range because old range (%d) is shorter than needed (%d)"
                      % (nold, NFrames))
                print("       Use -O REUSE (capitals) to override")
                print()
                sys.exit(1)
            if nold > NFrames:
                print()
                print("WARNING: old frame id range (%d) is larger than new one (%d)" % (nold, NFrames))
                print()
            start_frame_id = old_batch.StartFrameID
            if not Quiet:
                print("Frame ID range starting at %d will be reused" % (start_frame_id,))

    if start_frame_id is None:
        start_frame_id = backend.allocateRGIDs(DatasetName, NFrames)
        if not Quiet:
            print("Frame ID range is allocated starting at %d" % (start_frame_id,))

    batch.setStartFrameID(start_frame_id)

    batch.save(BatchFile)
    if not Quiet:
        print("Batch saved to file: %s" % (BatchFile,))
        if source:
            arr = np.asarray(data[source], dtype=desc["dtype"]).copy()
            header = "#__header:version=%s;dtype=%s#" % (StripeHeaderFormatVersion, arr.dtype.str)
            # encode the text header so it can be concatenated with the stripe bytes under Python 3
            arrays[key] = header.encode("utf-8") + bytes(arr.data)
    backend.put_data(arrays)

    rginfo = RGInfo(rgid, ProvenanceSegment(filename.rsplit("/", 1)[-1], 0, len(data)))
    key = "%s:@@rginfo:%s.json" % (dataset, rgid)
    backend[key].json = rginfo.toDict()

    print("File %s ingested with %d objects, hpix range: %d %d" % (filename, len(data), data["HPIX"][0], data["HPIX"][-1]))

opts, args = getopt.getopt(sys.argv[1:], "")

if not args:
    print(Usage)
    sys.exit(1)

schema = json.load(open(args[0], "r"))
bucket_name = args[1]
dataset = args[2]
files = args[3:]
config_file = None

backend = CouchBaseBackend(bucket_name, print_errors=True, config=config_file)

for filename in files:
    ingest_file(backend, schema, filename, dataset)
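# Illustration only: a hypothetical helper that undoes the framing written above,
# separating the "#__header:version=...;dtype=...#" text prefix from the stripe payload.
# The function name and the way stripes are fetched back from the backend are assumptions,
# not part of the original code.
def split_stripe_header(blob):
    # the header starts and ends with '#'; everything after the closing '#' is array data
    end = blob.index(b"#", 1)
    header = blob[:end + 1].decode("utf-8")
    return header, blob[end + 1:]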