Example #1
 def backend(self):
     if self.Backend is None or self.stale():
         self.Backend = CouchBaseBackend(self.BucketName,
                                         config=self.CBConfig)
         #try: self.Backend = CouchBaseBackend(self.BucketName, config = self.CBConfig)
         #except: return None
     self.LastTouch = time.time()
     #debug("bucket for %s: %s" % (self.BucketURL, id(self.Bucket)))
     return self.Backend
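
This accessor is a fragment of a larger class; the names Backend, BucketName, CBConfig, LastTouch and the stale() method come from that class. A minimal, hypothetical sketch of the surrounding class (the idle-timeout value and class name are illustrative, not from the original source):

import time
from striped.client import CouchBaseBackend

class BackendHolder(object):
    # Illustrative wrapper showing the cached-connection pattern above.
    IdleTimeout = 60.0      # seconds of inactivity before the cached connection is considered stale

    def __init__(self, bucket_name, cb_config):
        self.BucketName = bucket_name
        self.CBConfig = cb_config
        self.Backend = None
        self.LastTouch = 0.0

    def stale(self):
        return time.time() - self.LastTouch > self.IdleTimeout

    def backend(self):
        # reconnect lazily, either on first use or after the connection went stale
        if self.Backend is None or self.stale():
            self.Backend = CouchBaseBackend(self.BucketName, config=self.CBConfig)
        self.LastTouch = time.time()
        return self.Backend
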
Example #2
def listDatasets(argv):
    from couchbase.views.iterator import View
    from striped.client import CouchBaseBackend

    from couchbase.exceptions import KeyExistsError, TemporaryFailError, TimeoutError, NotFoundError

    Usage = """
    python listDatasets.py -c <CouchBase config file> [-l] <bucket name> [<dataset name pattern>]
    """

    config_file = None

    opts, args = getopt.getopt(argv, "c:l")
    opts = dict(opts)
    config_file = opts.get("-c")
    long_print = "-l" in opts

    if len(args) < 1:
        print(Usage)
        sys.exit(1)

    bucket_name = args[0]
    pattern = None if len(args) < 2 else args[1]
    backend = CouchBaseBackend(bucket_name, config=config_file)
    bucket = backend.bucket

    if long_print:
        print("%-30s %6s %6s %15s" % ("Dataset", "Frames", "Files", "Events"))
        print("%-30s %6s %6s %15s" % ("-------", "------", "-----", "------"))

    for ds in sorted(backend.datasets()):
        if long_print:
            nevents = 0
            nrgs = 0
            files = set()
            for rginfo in backend.RGInfos(ds):
                nevents += rginfo["NEvents"]
                nrgs += 1
                for s in rginfo["Segments"]:
                    files.add(s["FileName"])
            print("%-30s %6d %6d %15d" % (ds, nrgs, len(files), nevents))
        else:
            print(ds)
Example #3
 def run(self):
     backend = CouchBaseBackend(self.BucketName,
                                print_errors=True,
                                config=self.Config)
     frames_to_do = self.FramesToDo
     segment_frames = set(self.Segment.frameIDs())
     print("Process %d started to load frames: %s" %
           (os.getpid(), sorted(list(segment_frames & frames_to_do))))
     if frames_to_do:
         assert self.Schema is not None
         sl = segmentLoader(self.Segment, frames_to_do, self.DatasetName,
                            self.DataReaderClass, self.Schema,
                            self.Metadata, backend, self.DryRun)
         try:
             sl.run()
         except:
             print("Process %d exiting with error:" % (os.getpid(), ))
             traceback.print_exc()
             sys.exit(1)
         else:
             print("Process %d finished successfully" % (os.getpid(), ))
Example #4
if len(args) < 2:
    print(Usage)
    sys.exit(1)

root_file = args[0]
tree_top = args[1]

file_name = root_file.rsplit("/", 1)[-1]

utree = uproot.open(root_file)[tree_top]

if out_dir is None:
    BucketName = args[2]
    dataset_name = args[3]
    backend = CouchBaseBackend(BucketName,
                               print_errors=True,
                               config=config_file)

else:
    out_dir = out_dir + "/" + file_name
    try:
        os.makedirs(out_dir)
    except:
        pass

if correction_file:
    env = {}
    with open(correction_file, "r") as f:
        exec(f.read(), env)     # Python 3 replacement for execfile()
    if "DataCorrector" in env:
        data_corrector_class = env["DataCorrector"]
Example #5
def deleteDataset(argv):
    from couchbase.bucket import Bucket
    from couchbase.views.iterator import View
    from couchbase.views.params import Query
    from couchbase.exceptions import NotFoundError
    from striped.client import CouchBaseBackend, CouchBaseConfig
    import os, sys, json, getopt, random

    Usage = """
    python delete_dataset.py [-c <Couchbase config file>] <bucket> <dataset> 
    """

    def delete_metadata(backend, dataset):
        del backend["%s:@@nextRGID" % (dataset, )]
        keys = (k for k in backend.keys(dataset) if k.endswith(".json"))
        return backend.delete(keys)

    def delete_dataset(dataset, bucket, ratio):
        q = Query()
        q.mapkey_single = dataset
        v = View(bucket, "views", "keys", query=q)
        keys = (x.value for x in v if x.key == dataset)

        def pack_in_groups(keys, n, ratio):
            group = []
            for k in keys:
                if ratio > random.random():
                    #print k
                    if len(group) >= n:
                        #print(group[0])
                        yield group
                        group = []
                    group.append(k)
            if group:
                yield group

        nremoved = 0

        for kg in pack_in_groups(keys, 500, ratio):
            try:
                if kg:
                    bucket.remove_multi(kg, quiet=True)
            except NotFoundError as error:
                print(error)
            else:
                nremoved += len(kg)
            if nremoved and nremoved % 10000 == 0:
                print(nremoved)

        return nremoved

    config = None
    ratio = 1.0

    opts, args = getopt.getopt(argv, "c:r:m")
    opts = dict(opts)
    if "-c" in opts:
        config = opts["-c"]

    ratio = float(opts.get("-r", 1.0))
    meta_only = "-m" in opts

    if not args:
        print(Usage)
        sys.exit(1)

    bucket_name = args[0]
    dataset_name = args[1]

    backend = CouchBaseBackend(bucket_name, config=config)
    bucket = backend.bucket
    n_meta = delete_metadata(backend, dataset_name)
    n_data = 0
    if not meta_only:
        n_data = delete_dataset(dataset_name, bucket, ratio)
    print(n_meta, "metadata items removed")
    print(n_data, "data items removed")
Example #6
def createDataset(argv):

    from couchbase import FMT_BYTES, FMT_JSON
    from couchbase.bucket import Bucket
    from couchbase.exceptions import KeyExistsError, TemporaryFailError, TimeoutError
    import numpy as np
    from striped.client import CouchBaseBackend, CouchBaseConfig
    from striped.common import ColumnDescriptor

    Usage = """
    python createDataset.py [-c <config file>] <schema.json> <bucket name> <dataset>
    """

    SchemaVersion = "3.0"
    """ schema JSON file format

    {
        "version":"2.2",
        "tree_top":"....",
        "attributes":
        {
            "path.to.attr": {
                dtype:"dtype",
                shape:...,
                source:...
                },
            ...
        },
        "branches":
        {
            "path.to.branch":
            {
                "relative.path.to.attr": {
                    "source":"...",
                    "dtype":"dtype"
                },
                ...
            }
        }
    }
    """

    convert_to = {"boolean": "i1"}

    config = None

    opts, args = getopt.getopt(argv, "c:i")
    for opt, val in opts:
        if opt == "-c": config = val

    opts = dict(opts)
    config = opts.get("-c")
    reinit = "-i" in opts

    if len(args) < 3:
        print(Usage)
        sys.exit(1)

    schema_file, bucket_name, Dataset = args

    schema = json.load(open(schema_file, "r"))
    if not "version" in schema:
        schema["version"] = SchemaVersion

    def parseSchema(schema):
        return schema["attributes"], schema["branches"]

    fields, branches = parseSchema(schema)

    backend = CouchBaseBackend(bucket_name, config=config)

    key = "%s:@@schema.json" % (Dataset, )
    backend[key].json = schema

    for fn, fd in fields.items():
        ft = fd["dtype"]
        fn = str(fn)
        ft = str(ft)
        shape = fd.get("shape", [])
        desc = ColumnDescriptor(ft,
                                shape,
                                fd["source"],
                                size_column=fn + ".@size" if
                                (shape and shape[0] is None) else None)
        key = ColumnDescriptor.key(Dataset, fn)
        backend[key].json = desc.toDict()
        #print key, desc

    for branch, items in branches.items():
        for fn, fd in items.items():
            ft = fd["dtype"]
            path = branch + "." + fn if fn else branch
            desc = ColumnDescriptor(ft,
                                    fd.get("shape", []),
                                    fd["source"],
                                    depth=1,
                                    parent_array=branch,
                                    size_column=branch + ".@size")

            key = ColumnDescriptor.key(Dataset, path)
            backend[key].json = desc.toDict()

    next_rgid_name = "%s:@@nextRGID" % (Dataset, )
    cb = backend.bucket
    cb.remove(next_rgid_name, quiet=True)
    value = backend.counter(next_rgid_name, initial=0).value
    print("NextRGID counter created with value", value)
Example #7
def run_batch(argv):
    from DataReader import DataReader

    Usage = """
    python run_batch.py [options] <batch file> <bucket name> <dataset name>

    Options:
        -c <CouchBase config file>, default - value of the COUCHBASE_BACKEND_CFG environment variable
        -m <max workers>, default = 5
        -O - override existing frames
        -s <stagger>, default = 10 (seconds)
        -n - dry run
    """

    opts, args = getopt.getopt(argv, "m:s:c:On")
    opts = dict(opts)
    MaxWorkers = int(opts.get("-m", 5))
    Stagger = float(opts.get("-s", 1))
    Config = opts.get("-c", os.environ.get("COUCHBASE_BACKEND_CFG"))
    Override = "-O" in opts
    DryRun = "-n" in opts

    if not Config:
        print(
            "Couchbase config file must be specified either with -c, or using COUCHBASE_BACKEND_CFG env. variable"
        )
        print()
        print(Usage)
        sys.exit(1)

    if len(args) < 3:
        print(Usage)
        sys.exit(2)

    batch_file, bucket_name, dataset_name = args

    batch = Batch.load(batch_file)
    backend = CouchBaseBackend(bucket_name, print_errors=True, config=Config)
    schema = backend.schema(dataset_name)

    if not schema:
        print("Empty schema")
        sys.exit(1)

    existing_frames = set()

    if not Override:
        existing_frames = set(backend.RGIDs(dataset_name))

    if existing_frames:
        print("The following frames exist and will not be overriden:",
              sorted(list(existing_frames)))

    task_queue = TaskQueue(MaxWorkers, stagger=Stagger)
    printer = Printer(batch)
    for i, segment in enumerate(batch):
        #print "segment:", i, segment, segment.frameIDs()
        frames = set(segment.frameIDs()) - existing_frames
        #print "segment:", i, segment, segment.frameIDs(), frames
        if frames:
            task = LoaderTask(
                printer,
                i,
                segment,
                Config,
                schema,
                None,  # FIX ME: metadata is None for now
                frames,
                bucket_name,
                dataset_name,
                DataReader,
                DryRun)
            task_queue.addTask(task)
    task_queue.waitUntilEmpty()
Example #8
def verifyDataset(argv):
    from striped.client import CouchBaseBackend
    import uproot

    Usage = """
    python validateDataset.py [-b] <dataset directory> <root tree top> <bucket>

     -b means check binary records only

    """

    opts, args = getopt.getopt(argv, "b")
    opts = dict(opts)
    binary_only = "-b" in opts
    dataset_dir, tree_top, bucket_name = args

    dataset_name = dataset_dir.split("/")[-1]
    backend = CouchBaseBackend(bucket_name)

    rgids = list(backend.RGIDs(dataset_name))
    print("I: %d row groups in the dataset" % (len(rgids), ))

    if not binary_only:

        files = {}  # filename -> events

        for f in glob.glob("%s/*.root" % (dataset_dir, )):
            fn = f.split("/")[-1]
            tree = uproot.open(f)[tree_top]
            files[fn] = tree.numentries

        print("I: %d files, %d events" % (len(files), sum(files.values())))

        files_in_dataset = {}
        rgids = set()
        total_events = 0

        for info in backend.RGInfos(dataset_name):
            rgid = info["RGID"]
            rgids.add(rgid)
            nevents = info["NEvents"]
            total_events += nevents
            nevents_in_segments = 0
            for s in info["Segments"]:
                fn = s["FileName"]
                ne = s["NEvents"]
                files_in_dataset[fn] = files_in_dataset.get(fn, 0) + ne
                nevents_in_segments += ne
            if nevents != nevents_in_segments:
                print(
                    "E: Total number of events in RG #%d (%d) is not equal sum of events in segments (%d)"
                    % (rgid, nevents, nevents_in_segments))

        if len(rgids) != max(rgids) + 1:
            maxrgid = max(rgids)
            missing = [i for i in range(maxrgid + 1) if not i in rgids]
            print("W: gap(s) in rgids. Missing %d rgids: %s" %
                  (max(rgids) + 1 - len(rgids), missing))

        for f, n in files.items():
            if not f in files_in_dataset:
                print("E: File %s is not in the database" % (f, ))
            else:
                n_file = files[f]
                n_db = files_in_dataset[f]
                if n_file != n_db:
                    print(
                        "E: Number of events in file %s (%d) differs from the database (%d)"
                        % (f, n_file, n_db))

    print("I: Scanning keys...")

    rgids_per_column = {}
    data_keys = set()

    nkeys = 0

    for k in backend.keys(dataset_name):
        # parse key
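        # expected key layout (inferred from the split below): <dataset>:<column>:<rgid>.<key type>,
        # e.g. "mydataset:Muon.pt:12.bin" -- the example key is hypothetical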
        parts = k.split(":")
        if len(parts) == 3:
            _, column, tail = parts
            if not tail.startswith('@'):
                tail_parts = tail.split(".")
                rgid = int(tail_parts[0])
                key_type = tail_parts[1]

                if rgid in rgids and key_type == 'bin':
                    data_keys.add("%s:%d" % (column, rgid))
                    column_rgids = rgids_per_column.get(column)
                    if not column_rgids:
                        column_rgids = set()
                        rgids_per_column[column] = column_rgids
                    column_rgids.add(rgid)

    print("I: %d data keys found for %d columns" %
          (len(data_keys), len(rgids_per_column.keys())))

    N = max([len(r) for r in rgids_per_column.values()])

    print("I: max %d data keys per column" % (N, ))

    nmissing = 0

    for cn, r in rgids_per_column.items():
        n = len(r)
        if n != N:
            print("E: %d data stripes are mising for column %s" % (N - n, cn))
            nmissing += N - n

    if nmissing:
        print("E: %d data stripes are missing" % (nmissing, ))
Example #9
        -c <couchbase config>   - default environment COUCHBASE_BACKEND_CFG 
        -i - reinitialize the object id counter so that the next object will be given object id = 1
    """

    opts, args = getopt.getopt(sys.argv[1:], "n:h?c:i")
    opts = dict(opts)
    if '-h' in opts or '-?' in opts or len(args) != 3:
        print(Usage)
        sys.exit(1)

    init_oid = "-i" in opts
    config = opts.get("-c")
    group_size = int(opts.get("-n", 100000))

    bucket, dataset, path = args

    data = fitsio.read(path)
    print "%d objects in the input file %s" % (len(data), path)
    backend = CouchBaseBackend(bucket)

    if init_oid:
        counter_key = "%s:@bliss_next_object_id" % (dataset, )
        try:
            backend.delete([counter_key])  # remove if exists
        except:
            pass
        backend.counter(counter_key, initial=1)
        print "Counter bliss_next_object_id initialized to 1"

    add_objects(backend, data, dataset, group_size)
Example #10
                     for key, array in out_observations_data.items()])


if __name__ == "__main__":
    import sys, time, getopt

    Usage = """
    python add_observations.py [options] <bucket name> <dataset name> <matches_file.fits>
    options:
        -c <couchbase config>   - default environment COUCHBASE_BACKEND_CFG 
    """

    opts, args = getopt.getopt(sys.argv[1:], "h?c:")
    opts = dict(opts)
    if '-h' in opts or '-?' in opts or len(args) != 3:
        print(Usage)
        sys.exit(1)

    config = opts.get("-c")

    bucket, dataset, path = args

    data = fitsio.read(path)
    print "%d object-observation pairs in the input file %s" % (len(data),
                                                                path)
    backend = CouchBaseBackend(bucket)

    add_observations(backend, data, dataset)

    T.printStats()
Example #11
        log_file_name = val


if len(args) < 3:
    print(Usage)
    sys.exit(1)


input_file = args[0]
BucketName = args[1]
dataset_name = args[2]
reader_params = args[3:]

file_name = input_file.rsplit("/", 1)[-1]

backend = CouchBaseBackend(BucketName, print_errors=True, config=config_file)
    
try:
    schema = backend["%s:@@schema.json" % (dataset_name,)].json
except:
    print "Can not get dataset schema from the database"
    raise


data_reader = DataReader(input_file, schema, *reader_params)

if profile == "file":
    profile = data_reader.profile()
elif profile:
    profile = parseProfile(profile)
else:
Example #12
    sys.exit(1)

schema_file, bucket_name, Dataset = args

schema = json.load(open(schema_file, "r"))
if not "version" in schema:
    schema["version"] = SchemaVersion


def parseSchema(schema):
    return schema["attributes"], schema["branches"]


fields, branches = parseSchema(schema)

backend = CouchBaseBackend(bucket_name, config=config)

key = "%s:@@schema.json" % (Dataset, )
backend[key].json = schema

for fn, fd in fields.items():
    ft = fd["dtype"]
    fn = str(fn)
    ft = str(ft)
    shape = fd.get("shape", [])
    desc = ColumnDescriptor(ft,
                            shape,
                            fd["source"],
                            size_column=fn + ".@size" if
                            (shape and shape[0] is None) else None)
    key = ColumnDescriptor.key(Dataset, fn)
Example #13
def listDataset(argv):
    from couchbase.views.iterator import View
    from couchbase.views.params import Query
    from striped.client import CouchBaseBackend

    from couchbase.exceptions import KeyExistsError, TemporaryFailError, TimeoutError, NotFoundError

    Usage = """
    python listDataset.py -c <CouchBase config file> [-f|-l] <bucket name> <dataset name>
    """

    config_file = None

    opts, args = getopt.getopt(argv, "c:lfn")
    opts = dict(opts)
    config_file = opts.get("-c")
    files_only = "-f" in opts
    long_print = "-l" in opts
    counter = "-n" in opts

    if len(args) < 2:
        print(Usage)
        sys.exit(1)

    bucket_name, dataset_name = args
    backend = CouchBaseBackend(bucket_name, config=config_file)
    bucket = backend.bucket

    if False:
        q = Query()
        q.mapkey_single = dataset_name
        v = View(bucket, "views", "RGInfos", query=q)
        infos = [x.value for x in v if x.key == dataset_name]
    infos = backend.RGInfos(dataset_name)
    infos = sorted(infos, key=lambda info: info["RGID"])

    if long_print:
        print("RGID    NEvents    File(s)")
        print("------- ---------- -------")

        nevents = 0
        files = {}
        rgids = set()

        for info in infos:
            fn = info["Segments"][0]["FileName"]
            print("%7d %10d %s" % (info["RGID"], info["NEvents"], fn))
            rgids.add(info["RGID"])
            files[fn] = 1
            for s in info["Segments"][1:]:
                print("%19s %s" % (" ", s["FileName"]))
                files[s["FileName"]] = 1
            nevents += info["NEvents"]

        print("------- ---------- -------")
        print("%7d %10d %d" % (len(infos), nevents, len(files)))

        maxrgid = max(rgids)
        if len(rgids) != maxrgid + 1:
            print("Missing RGIDs (%d):" % (maxrgid + 1 - len(rgids),))
            for rgid in range(maxrgid):
                if rgid not in rgids:
                    print(rgid, end=' ')
            print()

    elif files_only:
        files = {}          # filename -> nevents
        for info in infos:
            for s in info["Segments"]:
                fn = s["FileName"]
                files[fn] = files.get(fn, 0) + s["NEvents"]
        for fn in sorted(files.keys()):
            print(fn)

    else:
        files = set()
        rgids = set()
        nevents = 0

        try:
            counter = backend.counter("%s:@@nextRGID" % (dataset_name,), delta=0).value
        except NotFoundError:
            counter = None

        for info in infos:
            rgids.add(info["RGID"])
            for s in info["Segments"]:
                files.add(s["FileName"])
            nevents += info["NEvents"]
        print("Next FrameID:      ", counter)
        print("Files:             ", len(files))
        print("Frames:            ", len(rgids))
        print("Events:            ", nevents)
        if len(rgids):
            print("Max frame id:      ", max(rgids))
            print("Events/frame:      ", int(float(nevents) / float(len(rgids)) + 0.5))

            maxrgid = max(rgids)
            if len(rgids) < maxrgid + 1:
                print("Missing RGIDs (%d):" % (maxrgid + 1 - len(rgids),))
                for rgid in range(maxrgid):
                    if rgid not in rgids:
                        print(rgid, end=' ')
                print()
Example #14
def create_batch(argv):
    from DataReader import DataReader

    Usage = """
    python create_batch.py [options] <batch_file> <bucket name> <dataset name> @<file with input file list>
    python create_batch.py [options] <batch_file> <bucket name> <dataset name> <directory path>
    python create_batch.py [options] <batch_file> <bucket name> <dataset name> <file> <file> ...

    Options:
        -O (reuse|REUSE|allocate) - override existing batch description file and either 
                              reuse same frame id range or 
                              allocate new range
        -c <couchbase config file>
        -n <target frame size>, default = 10000
        -p <path prefix> - prefix to add to the file paths read from the file or given as the list
        -k <n> - how many lowest path components, in addition to the file name 
                 to keep in the provenance data, default=0, i.e. keep the file name only
        -x <extension> - if the input is specified as a directory, then this is the extension of data files
                         under the directory. Default = "root"
        -m <n readers> - run multiple data reader threads in parallel, default = 1
        -q - be quiet
    """

    opts, args = getopt.getopt(argv, "n:p:k:x:O:qc:m:")
    opts = dict(opts)
    Config = opts.get("-c")
    FrameSize = int(opts.get("-n", 10000))
    Prefix = opts.get("-p")
    Keep = int(opts.get("-k", 0))
    Extension = opts.get("-x", "root")
    Override = "-O" in opts
    OverrideMode = opts.get("-O")
    MaxReaders = int(opts.get("-m", 1))

    Quiet = "-q" in opts

    if len(args) < 4 or not OverrideMode in (None, "reuse", "allocate",
                                             "REUSE"):
        print(Usage)
        sys.exit(1)

    BatchFile, BucketName, DatasetName = args[:3]

    exists = False
    try:
        exists = os.path.isfile(BatchFile)
    except:
        pass

    old_batch = None

    if exists:

        if not Override:
            print()
            print(
                "ERROR: File %s exists. Use -O (reuse|allocate) to override." %
                (BatchFile, ))
            print("Old file left unchanged.")
            print()
            print(Usage)
            sys.exit(1)

        old_batch = Batch.load(BatchFile)
        #print len(old_batch)

    source = args[3]
    if source[0] == '@':
        paths = [
            f
            for f in [ff.strip() for ff in open(source[1:], "r").readlines()]
            if f
        ]
    elif stat.S_ISDIR(os.stat(source).st_mode):
        assert not Prefix, "\nERROR: Can not use path prefix with the input specified as the directory\n"
        if Extension[0] == '.':
            Extension = Extension[1:]
        paths = sorted(glob.glob(
            "%s/*.%s" %
            (source, Extension)))  # local directory - assume root files
    else:
        paths = args[3:]  # explicit file path list

    if Prefix: paths = [Prefix + f for f in paths]

    provenance_names = []
    for fp in paths:
        parts = fp.split("/")
        provenance_names.append("/".join(parts[-1 - Keep:]))

    if not Quiet:
        print("Building frame map from %d files..." % (len(paths, )))

    backend = CouchBaseBackend(BucketName, print_errors=True, config=Config)
    schema = backend.schema(DatasetName)

    if not schema:
        print("Dataset %s not found" % (DatasetName, ))
        sys.exit(1)

    class FileCounter(object):
        def __init__(self, ntotal, show_tqdm):
            self.T = None if not (use_tqdm and show_tqdm) else tqdm(
                total=ntotal)

        def tick(self, n=1):
            if self.T is not None:
                self.T.update(n)

    file_counter = FileCounter(len(paths), not Quiet)
    file_infos = [
        FileInfo(path, None, prov)
        for path, prov in zip(paths, provenance_names)
    ]
    queue = TaskQueue(MaxReaders,
                      tasks=[
                          GetNEventsTask(fi, schema, file_counter.tick)
                          for fi in file_infos
                      ])
    queue.waitUntilEmpty()

    batch = Batch().build(DataReader, schema, FrameSize, file_infos)

    NFrames = len(batch)

    if not Quiet:
        print("Frame map with %d frames generated" % (NFrames, ))
    start_farme_id = None
    if old_batch is not None:
        nold = len(old_batch)
        if OverrideMode.lower() == "reuse":
            if nold < NFrames and OverrideMode != "REUSE":
                print()
                print(
                    "ERROR: Can not reuse old frame id range because old range (%d) is shorter than needed (%d)"
                    % (nold, NFrames))
                print("       Use -O REUSE (capitals) to override")
                print()
                sys.exit(1)
            if nold > NFrames:
                print()
                print(
                    "WARNING: old frame id range (%d) is larger than new one (%d)"
                    % (nold, NFrames))
                print()
            start_farme_id = old_batch.StartFrameID
            if not Quiet:
                print("Frame ID range starting at %d will be reused" %
                      (start_farme_id, ))

    if start_farme_id is None:
        start_farme_id = backend.allocateRGIDs(DatasetName, NFrames)
        if not Quiet:
            print("Frame ID range is allocated starting at %d" %
                  (start_farme_id, ))

    batch.setStartFrameID(start_farme_id)

    batch.save(BatchFile)

    if not Quiet: print("Batch saved to file: %s" % (BatchFile, ))
Example #15
        if source:
            arr = np.asarray(data[source], dtype=desc["dtype"]).copy()
            header = "#__header:version=%s;dtype=%s#" % (StripeHeaderFormatVersion, arr.dtype.str)
            arrays[key] = header.encode("utf-8") + arr.tobytes()
    backend.put_data(arrays)
    
    rginfo = RGInfo(rgid, ProvenanceSegment(filename.rsplit("/",1)[-1], 0, len(data)))
    key = "%s:@@rginfo:%s.json" % (dataset, rgid)
    backend[key].json = rginfo.toDict()
    
    print "File %s ingested with %d objects, hpix range: %d %d" % (filename, len(data), data["HPIX"][0], data["HPIX"][-1])
            
        

opts, args = getopt.getopt(sys.argv[1:], "")

if not args:
    print(Usage)
    sys.exit(1)

schema = json.load(open(args[0], "r"))
bucket_name = args[1]
dataset = args[2]
files = args[3:]
config_file = None

backend = CouchBaseBackend(bucket_name, print_errors=True, config=config_file)

for filename in files:
    ingest_file(backend, schema, filename, dataset)
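
The stripes written above are raw numpy bytes prefixed with a small text header. A hypothetical helper that parses such a header back (the layout follows the format string used in ingest_file; the function name is not from the original source):

def parse_stripe_header(blob):
    # blob is the header followed by the raw array bytes, as written in ingest_file
    text = blob.decode("utf-8", errors="ignore")
    assert text.startswith("#__header:")
    header, _ = text[len("#__header:"):].split("#", 1)
    fields = dict(item.split("=", 1) for item in header.split(";"))
    return fields["version"], fields["dtype"]
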