def _get_dataset_partition_record(did, pid):
    from databundles.identity import ObjectNumber, DatasetNumber, PartitionNumber

    don = ObjectNumber.parse(did)
    if not don or not isinstance(don, DatasetNumber):
        raise exc.BadRequest('Dataset number {} is not valid'.format(did))

    pon = ObjectNumber.parse(pid)
    if not pon or not isinstance(pon, PartitionNumber):
        raise exc.BadRequest('Partition number {} is not valid'.format(pid))

    if str(pon.dataset) != str(don):
        raise exc.BadRequest('Partition number {} does not belong to dataset {}'.format(pid, did))

    gr = get_library().get(did)

    # Need to read the file early; otherwise exceptions here
    # will result in the client's socket disconnecting.
    if not gr:
        raise exc.NotFound('No dataset for id: {}'.format(did))

    bundle = gr.bundle
    partition = bundle.partitions.get(pid)

    return bundle, partition
def get(self, bp_id):
    '''Get a bundle or partition

    Gets a bundle or Partition object, referenced by a string generated by
    DatasetNumber or PartitionNumber, or by an object that has a name or
    id_ field.

    Args:
        bp_id (Bundle|Partition|str) Specifies a bundle or partition to
            fetch. bp_id may be:
                An ObjectNumber string id for a partition or dataset
                An ObjectNumber object
                An Identity object, for a partition or bundle
                Any object that has an 'identity' attribute that is an
                    Identity object

    Returns:
        A (Dataset, Partition) tuple. partition is None unless bp_id
        references a partition; (None, None) is returned when nothing
        matches.
    '''
    from databundles.identity import ObjectNumber, PartitionNumber, Identity
    from databundles.orm import Dataset, Partition
    import sqlalchemy.orm.exc

    s = self.session

    if isinstance(bp_id, basestring):
        bp_id = ObjectNumber.parse(bp_id)
    elif isinstance(bp_id, ObjectNumber):
        pass
    elif isinstance(bp_id, Identity):
        if not bp_id.id_:
            raise Exception("Identity does not have an id_ defined")
        bp_id = ObjectNumber.parse(bp_id.id_)
    else:
        # Hope that it has an identity field
        bp_id = ObjectNumber.parse(bp_id.identity.id_)

    dataset = None
    partition = None

    try:
        if isinstance(bp_id, PartitionNumber):
            query = s.query(Dataset, Partition).join(Partition).filter(Partition.id_ == str(bp_id))
            dataset, partition = query.one()
        else:
            query = s.query(Dataset).filter(Dataset.id_ == str(bp_id))
            dataset = query.one()
    except sqlalchemy.orm.exc.NoResultFound:
        return None, None

    return dataset, partition
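# A minimal usage sketch for get(), assuming `library` is an instance of the
# class above and `bundle` exposes an Identity; the id strings are
# hypothetical placeholders, not real object numbers.
def _example_get(library, bundle):
    dataset, partition = library.get('dXXXX')          # dataset id -> (Dataset, None)
    dataset, partition = library.get('pXXXXYYY')       # partition id -> (Dataset, Partition)
    dataset, partition = library.get(bundle.identity)  # any Identity with an id_ set
    return dataset, partition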
def add_column(self, name, **kwargs):
    import sqlalchemy.orm.session

    s = sqlalchemy.orm.session.Session.object_session(self)

    name = Column.mangle_name(name)

    if kwargs.get('sequence_id', False):
        sequence = kwargs['sequence_id']
    else:
        sequence = None

    row = Column(id=str(ColumnNumber(ObjectNumber.parse(self.id_), sequence)),
                 name=name, t_id=self.id_, **kwargs)

    for key, value in kwargs.items():
        if key[0] != '_' and key not in ['d_id', 't_id', 'name']:
            setattr(row, key, value)

            # An empty string for a boolean flag is coerced to False.
            if isinstance(value, basestring) and len(value) == 0:
                if key == 'is_primary_key':
                    value = False
                setattr(row, key, value)

    s.add(row)

    if kwargs.get('commit', True):
        s.commit()

    return row
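# A hypothetical call to add_column(); the column name and attributes here
# are illustrative. Note that names pass through Column.mangle_name(), so the
# stored name may differ from the argument.
def _example_add_column(table):
    return table.add_column('median_income', datatype='integer',
                            description='Median household income', commit=False)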
def before_update(mapper, conn, target):
    '''Set the column id number based on the table number and the
    sequence id for the column'''
    if target.id_ is None:
        table_on = ObjectNumber.parse(target.t_id)
        target.id_ = str(ColumnNumber(table_on, target.sequence_id))
def __init__(self, table, **kwargs):
    self.sequence_id = kwargs.get("sequence_id", len(table.columns) + 1)
    self.name = kwargs.get("name", None)
    self.altname = kwargs.get("altname", None)
    self.is_primary_key = _clean_flag(kwargs.get("is_primary_key", False))
    self.datatype = kwargs.get("datatype", None)
    self.size = kwargs.get("size", None)
    self.precision = kwargs.get("precision", None)
    self.width = kwargs.get("width", None)
    self.sql = kwargs.get("sql", None)
    self.flags = kwargs.get("flags", None)
    self.description = kwargs.get("description", None)
    self.keywords = kwargs.get("keywords", None)
    self.measure = kwargs.get("measure", None)
    self.units = kwargs.get("units", None)
    self.universe = kwargs.get("universe", None)
    self.scale = kwargs.get("scale", None)
    self.data = kwargs.get("data", None)

    # The table_name attribute is not stored. It is only for
    # building the schema, linking the columns to tables.
    self.table_name = kwargs.get("table_name", None)

    if not self.name:
        raise ValueError('Column must have a name')

    self.t_id = table.id_
    self.t_vid = table.vid

    ton = ObjectNumber.parse(table.vid)
    con = ColumnNumber(ton, self.sequence_id)

    self.vid = str(con)
    self.id = str(con.rev(None))
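# A sketch restating the id/vid derivation above, assuming ColumnNumber and
# ObjectNumber live in databundles.identity like the other number classes.
# rev(None) strips the revision, so id is stable across revisions while vid
# changes with each one.
def _example_column_ids(table, sequence_id):
    from databundles.identity import ObjectNumber, ColumnNumber
    ton = ObjectNumber.parse(table.vid)   # versioned table number
    con = ColumnNumber(ton, sequence_id)  # column number scoped to that table
    return str(con), str(con.rev(None))   # (vid, id)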
def _put(self, source, identity):
    '''Put the source to the remote, creating a compressed version if
    it is not originally compressed'''
    from databundles.util import bundle_file_type
    import gzip
    import os, tempfile, uuid
    from databundles.identity import ObjectNumber, DatasetNumber, PartitionNumber

    id_ = identity.id_

    on = ObjectNumber.parse(id_)

    if not on:
        raise ValueError("Failed to parse id: '{}'".format(id_))

    if not isinstance(on, (DatasetNumber, PartitionNumber)):
        raise ValueError("Object number '{}' is neither for a dataset nor partition".format(id_))

    type_ = bundle_file_type(source)

    if type_ == 'sqlite' or type_ == 'hdf':
        import shutil

        # If it is a plain sqlite file, compress it before sending it.
        try:
            cf = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))

            with gzip.open(cf, 'wb') as out_f:
                try:
                    shutil.copyfileobj(source, out_f)
                except AttributeError:
                    # source is a path string rather than a file object
                    with open(source) as in_f:
                        shutil.copyfileobj(in_f, out_f)

            with open(cf) as sf_:
                if isinstance(on, DatasetNumber):
                    response = self.remote.datasets(id_).put(sf_)
                else:
                    response = self.remote.datasets(str(on.dataset)).partitions(str(on)).put(sf_)
        finally:
            if os.path.exists(cf):
                os.remove(cf)

    elif type_ == 'gzip':
        # The file is already gzipped, so nothing to do.
        if isinstance(on, DatasetNumber):
            response = self.remote.datasets(id_).put(source)
        else:
            response = self.remote.datasets(str(on.dataset)).partitions(str(on)).put(source)

    else:
        raise Exception("Bad file for id {}: got type {}".format(id_, type_))

    raise_for_status(response)

    return response
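# A hypothetical call; `source` may be an open file object or a plain path
# string. The AttributeError fallback above is what makes the path case work:
# copyfileobj() fails on a string, and the except branch reopens the argument
# as a file.
def _example_put(client, path, identity):
    with open(path) as f:
        client._put(f, identity)        # file-object form
    return client._put(path, identity)  # path form, via the AttributeError fallback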
def before_update(mapper, conn, target):
    '''Set the Table ID based on the dataset number and the sequence
    number for the table'''
    if isinstance(target, Column):
        raise TypeError('Got a column instead of a table')

    if target.id_ is None:
        dataset_id = ObjectNumber.parse(target.d_id)
        target.id_ = str(TableNumber(dataset_id, target.sequence_id))
def _put(self, id_, source):
    '''Put the source to the remote, creating a compressed version if
    it is not originally compressed'''
    from databundles.util import bundle_file_type
    import gzip
    import os, tempfile, uuid
    from databundles.identity import ObjectNumber, DatasetNumber, PartitionNumber

    on = ObjectNumber.parse(id_)

    if not on:
        raise ValueError("Failed to parse id: '{}'".format(id_))

    if not isinstance(on, (DatasetNumber, PartitionNumber)):
        raise ValueError("Object number '{}' is neither for a dataset nor partition".format(id_))

    type_ = bundle_file_type(source)

    if type_ == 'sqlite':
        # If it is a plain sqlite file, compress it before sending it.
        try:
            cf = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))

            with gzip.open(cf, 'wb') as f:
                f.writelines(source)

            with open(cf) as source:
                if isinstance(on, DatasetNumber):
                    response = self.api.datasets(id_).put(source)
                else:
                    response = self.api.datasets(str(on.dataset)).partitions(str(on)).put(source)
        finally:
            if os.path.exists(cf):
                os.remove(cf)

    elif type_ == 'gzip':
        # The file is already gzipped, so nothing to do.
        if isinstance(on, DatasetNumber):
            response = self.api.datasets(id_).put(source)
        else:
            response = self.api.datasets(str(on.dataset)).partitions(str(on)).put(source)

    else:
        raise Exception("Bad file: got type {}".format(type_))

    raise_for_status(response)

    return response
def __init__(self, dataset, **kwargs):
    self.id_ = kwargs.get("id", kwargs.get("id_", None))
    self.name = kwargs.get("name", None)
    self.vname = kwargs.get("vname", None)
    self.sequence_id = kwargs.get("sequence_id", None)
    self.space = kwargs.get("space", None)
    self.time = kwargs.get("time", None)
    self.t_id = kwargs.get("t_id", None)
    self.grain = kwargs.get('grain', None)
    self.format = kwargs.get('format', None)
    self.segment = kwargs.get('segment', None)
    self.data = kwargs.get('data', None)

    self.d_id = dataset.id_
    self.d_vid = dataset.vid

    # See before_insert for setting self.vid and self.id_

    if self.t_id:
        don = ObjectNumber.parse(self.d_vid)
        ton = ObjectNumber.parse(self.t_id)
        self.t_vid = str(ton.rev(don.revision))
def __init__(self, **kwargs):
    self.id_ = kwargs.get("oid", kwargs.get("id", kwargs.get("id_", None)))
    self.name = kwargs.get("name", None)
    self.vname = kwargs.get("vname", None)
    self.source = kwargs.get("source", None)
    self.dataset = kwargs.get("dataset", None)
    self.subset = kwargs.get("subset", None)
    self.variation = kwargs.get("variation", None)
    self.creator = kwargs.get("creator", None)
    self.revision = kwargs.get("revision", None)

    if not self.id_:
        dn = DatasetNumber(None, self.revision)
        self.vid = str(dn)
        self.id_ = str(dn.rev(None))
    elif not self.vid:
        self.vid = str(ObjectNumber.parse(self.id_).rev(self.revision))
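# Worked restatement of the two id branches above ('dXXX' is illustrative):
# Dataset(revision=1) allocates a fresh DatasetNumber, sets vid to the
# revisioned string, and id_ to the same number with the revision stripped
# via rev(None); Dataset(id='dXXX', revision=2) instead derives
# vid = str(ObjectNumber.parse('dXXX').rev(2)). So id_ names the dataset
# across versions, while vid pins one specific revision.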
def before_insert(mapper, conn, target):
    '''event.listen method for SQLAlchemy to set the sequence_id for this
    object and create an ObjectNumber value for the id_'''
    if target.sequence_id is None:
        sql = text('''SELECT max(p_sequence_id)+1 FROM Partitions WHERE p_d_id = :did''')

        max_id, = conn.execute(sql, did=target.d_id).fetchone()

        if not max_id:
            max_id = 1

        target.sequence_id = max_id

    don = ObjectNumber.parse(target.d_vid)
    pon = PartitionNumber(don, target.sequence_id)

    target.vid = str(pon)
    target.id_ = str(pon.rev(None))

    Partition.before_update(mapper, conn, target)
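# The docstring above implies these hooks are wired through SQLAlchemy's
# event system; a minimal sketch of that registration (not shown in these
# snippets), assuming Partition is the ORM class these methods belong to:
from sqlalchemy import event

event.listen(Partition, 'before_insert', Partition.before_insert)
event.listen(Partition, 'before_update', Partition.before_update)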
def __init__(self, dataset, **kwargs):
    self.sequence_id = kwargs.get("sequence_id", None)
    self.name = kwargs.get("name", None)
    self.vname = kwargs.get("vname", None)
    self.altname = kwargs.get("altname", None)
    self.description = kwargs.get("description", None)
    self.keywords = kwargs.get("keywords", None)
    self.data = kwargs.get("data", None)

    self.d_id = dataset.id_
    self.d_vid = dataset.vid

    don = ObjectNumber.parse(dataset.vid)
    ton = TableNumber(don, self.sequence_id)

    self.vid = str(ton)
    self.id_ = str(ton.rev(None))

    if self.name:
        self.name = self.mangle_name(self.name)

    self.init_on_load()
def get_ref(self, bp_id):
    from databundles.identity import ObjectNumber, DatasetNumber, PartitionNumber, Identity, PartitionIdentity

    if isinstance(bp_id, Identity):
        if bp_id.id_:
            bp_id = bp_id.id_
        else:
            bp_id = bp_id.name

    # If dataset is not None, it means the file already is in the cache.
    dataset = None

    try:
        on = ObjectNumber.parse(bp_id)

        if not (isinstance(on, DatasetNumber) or isinstance(on, PartitionNumber)):
            raise ValueError("Object number must be for a Dataset or Partition: {}".format(bp_id))

        dataset, partition = self._get_bundle_path_from_id(bp_id)
    except:
        pass

    # Try it as a dataset name
    if not dataset:
        r = self.find(QueryCommand().identity(name=bp_id))

        if len(r) > 1:
            raise Exception("Got more than one result")
        elif len(r) == 0:
            r = None
        else:
            r = r.pop()

        if r:
            dataset, partition = self._get_bundle_path_from_id(r.id_)

    # Try the name as a partition name
    if not dataset:
        q = self.find(QueryCommand().partition(name=bp_id))

        if q:
            r = q.pop(0)

            if r:
                dataset, partition = self._get_bundle_path_from_id(r.id_)

    # No luck so far, so now try to get it from the remote library
    if not dataset and self.remote:
        import socket

        try:
            r = self.remote.find(bp_id)

            if r:
                r = r[0]

                if r.is_partition:
                    dataset = r.as_dataset
                    partition = r
                else:
                    dataset = r
                    partition = None
        except socket.error:
            self.logger.error("Connection to remote {} failed".format(self.remote))
    elif dataset:
        from identity import new_identity
        dataset = Identity(**dataset.to_dict())
        partition = new_identity(partition.to_dict()) if partition else None

    if not dataset:
        return False, False

    return dataset, partition
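# A hedged usage sketch for get_ref(): the argument is tried as an object
# number, then a dataset name, then a partition name, then against the remote
# library. The name below is hypothetical.
def _example_get_ref(library):
    dataset, partition = library.get_ref('source-dataset-variation')
    if dataset is False:
        pass  # not found locally or remotely; get_ref() returns (False, False)
    return dataset, partition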
def before_update(mapper, conn, target):
    '''Set the partition id number based on the dataset number and the
    sequence id for the partition'''
    dataset = ObjectNumber.parse(target.d_id)
    target.id_ = str(PartitionNumber(dataset, target.sequence_id))
def put_dataset(did):
    '''Store a bundle, calling put() on the bundle file in the Library.

    :param did: A dataset id string; must be parsable as a `DatasetNumber` value.
    :type did: string

    The request body carries the bundle database file, which may be
    compressed. Returns the bundle's identity as a dict, with a 'url'
    entry added.
    '''
    from databundles.identity import ObjectNumber, DatasetNumber

    try:
        cf = _read_body(request)

        size = os.stat(cf).st_size

        if size == 0:
            raise exc.BadRequest("Got a zero size dataset file")

        if not os.path.exists(cf):
            raise exc.BadRequest("Non existent file")

        # Now we have the bundle in cf. Stick it in the library.

        # We raise these exceptions here because if we don't read the body
        # first, the client will get an error when the socket closes.
        try:
            on = ObjectNumber.parse(did)
        except ValueError:
            raise exc.BadRequest("Unparseable dataset id: {}".format(did))

        if not isinstance(on, DatasetNumber):
            raise exc.BadRequest("Bad dataset id, not for a dataset: {}".format(did))

        # Is this a partition or a bundle?
        try:
            tb = DbBundle(cf)
            type_ = tb.db_config.info.type
        except Exception:
            logger.error("Failed to access database: {}".format(cf))
            raise

        if type_ == 'partition':
            raise exc.BadRequest("Bad data type: Got a partition")

        if tb.identity.id_ != did:
            raise exc.BadRequest("Bad request. Dataset id of URL doesn't match payload: {} != {}".format(did, tb.identity.id_))

        library_path, rel_path, url = get_library().put(tb)  # library_path and rel_path unused

        identity = tb.identity

        # If that worked, OK to remove the temporary file.
    finally:
        pass
        # os.remove(cf)

    r = identity.to_dict()
    r['url'] = url

    return r
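# For context: the client-side counterpart in _put() above issues requests
# shaped like `remote.datasets(did).put(gzipped_file)`, which is what arrives
# here; _read_body() (defined elsewhere) spools the request body to a
# temporary file and returns its path as cf.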
def get_ref(self, bp_id):
    from databundles.identity import ObjectNumber, DatasetNumber, PartitionNumber, Identity

    if isinstance(bp_id, Identity):
        if bp_id.id_:
            bp_id = bp_id.id_
        else:
            bp_id = bp_id.name

    # If dataset is not None, it means the file already is in the cache.
    dataset = None

    try:
        on = ObjectNumber.parse(bp_id)

        if not (isinstance(on, DatasetNumber) or isinstance(on, PartitionNumber)):
            raise ValueError("Object number must be for a Dataset or Partition: {}".format(bp_id))

        dataset, partition = self._get_bundle_path_from_id(bp_id)
    except:
        pass

    # Try it as a dataset name
    if not dataset:
        r = self.find(QueryCommand().identity(name=bp_id)).first()

        if r:
            dataset, partition = self._get_bundle_path_from_id(r[0].id_)

    # Try the name as a partition name
    if not dataset:
        q = self.find(QueryCommand().partition(name=bp_id))
        r = q.first()

        if r:
            dataset, partition = self._get_bundle_path_from_id(r[1].id_)

    # No luck so far, so now try to get it from the remote library
    if not dataset and self.api:
        from databundles.identity import Identity, PartitionIdentity
        import socket
        from databundles.orm import Dataset, Partition

        try:
            r = self.api.find(bp_id)

            if r:
                r = r[0]

                if hasattr(r, 'Partition') and r.Partition is not None:
                    identity = PartitionIdentity(**(r.Partition._asdict()))
                    dataset = Dataset(**r.Dataset._asdict())
                    partition = ObjectNumber.parse(r.Partition.id)
                else:
                    identity = Identity(**(r.Dataset._asdict()))
                    dataset = Dataset(**r.Dataset._asdict())
                    partition = None
        except socket.error:
            self.logger.error("Connection to remote {} failed".format(self.remote))

    if not dataset:
        return False, False

    return dataset, partition