Exemple #1
0
def _get_dataset_partition_record(did, pid):
    '''Return the (bundle, partition) pair for a dataset id and partition id.

    :param did: dataset id string; must parse as a `DatasetNumber`
    :param pid: partition id string; must parse as a `PartitionNumber`
    :raises exc.BadRequest: if either id is invalid, or the partition does
        not belong to the dataset
    :raises exc.NotFound: if the library has no dataset for `did`
    '''
    from databundles.identity import ObjectNumber, DatasetNumber, PartitionNumber

    don = ObjectNumber.parse(did)
    if not don or not isinstance(don, DatasetNumber):
        raise exc.BadRequest('Dataset number {} is not valid'.format(did))

    pon = ObjectNumber.parse(pid)
    if not pon or not isinstance(pon, PartitionNumber):
        raise exc.BadRequest('Partition number {} is not valid'.format(pid))

    # A partition number embeds its dataset number; the two must agree.
    if str(pon.dataset) != str(don):
        raise exc.BadRequest('Partition number {} does not belong to dataset {}'.format(pid, did))

    gr = get_library().get(did)

    # Need to read the file early, otherwise exceptions here
    # will result in the client's socket disconnecting.

    if not gr:
        raise exc.NotFound('No dataset for id: {}'.format(did))

    bundle = gr.bundle

    partition = bundle.partitions.get(pid)

    return bundle, partition
Exemple #2
0
    def get(self, bp_id):
        '''Get a bundle or partition.

        Gets a bundle or Partition object, referenced by a string generated by
        DatasetNumber or PartitionNumber, or by an object that has a name
        or id_ field.

        Args:
            bp_id (Bundle|Partition|str): Specifies a bundle or partition to
                fetch. bp_id may be:
                    An ObjectNumber string id for a partition or dataset
                    An ObjectNumber object
                    An Identity object, for a partition or bundle
                    Any object that has an 'identity' attribute that is an
                    Identity object

        Returns:
            (Dataset, Partition) tuple; the partition element is None when
            bp_id refers to a dataset, and (None, None) is returned when no
            matching row exists.
        '''
        from databundles.identity import ObjectNumber, PartitionNumber, Identity
        from databundles.orm import Dataset
        from databundles.orm import Partition
        import sqlalchemy.orm.exc

        s = self.session

        # Normalize every accepted input form down to an ObjectNumber.
        if isinstance(bp_id, basestring):
            bp_id = ObjectNumber.parse(bp_id)
        elif isinstance(bp_id, ObjectNumber):
            pass
        elif isinstance(bp_id, Identity):
            if not bp_id.id_:
                raise Exception("Identity does not have an id_ defined")
            bp_id = ObjectNumber.parse(bp_id.id_)
        else:
            # hope that it has an identity field
            bp_id = ObjectNumber.parse(bp_id.identity.id_)

        dataset = None
        partition = None

        try:
            if isinstance(bp_id, PartitionNumber):
                query = s.query(Dataset, Partition).join(Partition).filter(Partition.id_ == str(bp_id))

                dataset, partition = query.one()
            else:
                # (A dead `query = s.query(Dataset)` that was immediately
                # overwritten has been removed here.)
                query = s.query(Dataset).filter(Dataset.id_ == str(bp_id))

                dataset = query.one()
        except sqlalchemy.orm.exc.NoResultFound:
            return None, None

        return dataset, partition
Exemple #3
0
    def add_column(self, name, **kwargs):
        '''Create a Column row attached to this table and add it to the
        session, committing unless kwargs['commit'] is False.

        The column id is derived from this table's ObjectNumber and the
        optional 'sequence_id' kwarg.
        '''
        import sqlalchemy.orm.session
        session = sqlalchemy.orm.session.Session.object_session(self)

        name = Column.mangle_name(name)

        # Only a truthy sequence_id participates in the ColumnNumber.
        sequence = kwargs['sequence_id'] if kwargs.get('sequence_id', False) else None

        row = Column(
            id=str(ColumnNumber(ObjectNumber.parse(self.id_), sequence)),
            name=name,
            t_id=self.id_,
            **kwargs
        )

        for key, value in kwargs.items():
            # Copy remaining kwargs onto the row, skipping private keys and
            # the fields already fixed above.
            if key[0] != '_' and key not in ('d_id', 't_id', 'name'):
                setattr(row, key, value)

            # An empty-string is_primary_key (e.g. from a CSV) means False.
            if isinstance(value, basestring) and not value:
                if key == 'is_primary_key':
                    setattr(row, key, False)

        session.add(row)

        if kwargs.get('commit', True):
            session.commit()

        return row
Exemple #4
0
 def before_update(mapper, conn, target):
     '''Assign the column's id_ from its table number and sequence id,
     if it has not been set yet.'''
     if target.id_ is not None:
         return
     table_number = ObjectNumber.parse(target.t_id)
     target.id_ = str(ColumnNumber(table_number, target.sequence_id))
Exemple #5
0
    def __init__(self, table, **kwargs):
        '''Build a column attached to *table*.

        sequence_id defaults to one past the table's current column count;
        vid/id are derived from the table's version number and sequence_id.

        :raises ValueError: if no name is given.
        '''
        self.sequence_id = kwargs.get("sequence_id", len(table.columns) + 1)

        self.name = kwargs.get("name", None)
        self.altname = kwargs.get("altname", None)
        self.is_primary_key = _clean_flag(kwargs.get("is_primary_key", False))

        # Plain optional attributes, copied straight from kwargs.
        for attr in ("datatype", "size", "precision", "width", "sql",
                     "flags", "description", "keywords", "measure",
                     "units", "universe", "scale", "data"):
            setattr(self, attr, kwargs.get(attr, None))

        # the table_name attribute is not stored. It is only for
        # building the schema, linking the columns to tables.
        self.table_name = kwargs.get("table_name", None)

        if not self.name:
            raise ValueError('Column must have a name')

        self.t_id = table.id_
        self.t_vid = table.vid
        column_number = ColumnNumber(ObjectNumber.parse(table.vid), self.sequence_id)
        self.vid = str(column_number)
        self.id = str(column_number.rev(None))
Exemple #6
0
    def _put(self, source, identity):
        '''Put the source to the remote, creating a compressed version if
        it is not originally compressed.

        :param source: an open file-like object, or a path string, holding
            the bundle/partition database
        :param identity: identity whose id_ must parse as a DatasetNumber
            or PartitionNumber
        :raises ValueError: if the id does not parse, or is not a dataset
            or partition number
        :raises Exception: if the source is neither sqlite/hdf nor gzip
        '''
        from databundles.util import bundle_file_type
        import gzip
        import os, tempfile, uuid
        from databundles.identity import ObjectNumber, DatasetNumber, PartitionNumber

        id_ = identity.id_

        on = ObjectNumber.parse(id_)

        if not on:
            raise ValueError("Failed to parse id: '{}'".format(id_))

        if not isinstance(on, (DatasetNumber, PartitionNumber)):
            raise ValueError("Object number '{}' is neither for a dataset nor partition".format(id_))

        type_ = bundle_file_type(source)

        if type_ == 'sqlite' or type_ == 'hdf':
            import shutil
            # If it is a plain sqlite file, compress it before sending it.
            cf = None  # so the finally block is safe even if join() fails
            try:
                cf = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))

                with gzip.open(cf, 'wb') as out_f:
                    try:
                        # source may already be an open file object ...
                        shutil.copyfileobj(source, out_f)
                    except AttributeError:
                        # ... or a path string; open it ourselves. Binary
                        # mode: the payload is a database file.
                        with open(source, 'rb') as in_f:
                            shutil.copyfileobj(in_f, out_f)

                # Binary mode here too; text mode would corrupt the gzip
                # stream on platforms that translate line endings.
                with open(cf, 'rb') as sf_:
                    if isinstance(on, DatasetNumber):
                        response = self.remote.datasets(id_).put(sf_)
                    else:
                        response = self.remote.datasets(str(on.dataset)).partitions(str(on)).put(sf_)

            finally:
                if cf and os.path.exists(cf):
                    os.remove(cf)

        elif type_ == 'gzip':
            # the file is already gziped, so nothing to do.

            if isinstance(on, DatasetNumber):
                response = self.remote.datasets(id_).put(source)
            else:
                response = self.remote.datasets(str(on.dataset)).partitions(str(on)).put(source)

        else:
            raise Exception("Bad file for id {}  got type: {} ".format(id_, type_))

        raise_for_status(response)

        return response
Exemple #7
0
 def before_update(mapper, conn, target):
     '''Derive the Table id from the dataset number and this table's
     sequence number, unless the id was already assigned.'''
     if isinstance(target, Column):
         raise TypeError('Got a column instead of a table')

     if target.id_ is not None:
         return
     dataset_number = ObjectNumber.parse(target.d_id)
     target.id_ = str(TableNumber(dataset_number, target.sequence_id))
Exemple #8
0
    def _put(self, id_, source):
        '''Put the source to the remote, creating a compressed version if
        it is not originally compressed.

        :param id_: id string that must parse as a DatasetNumber or
            PartitionNumber
        :param source: iterable of bytes (or open file) holding the bundle
            database
        :raises ValueError: if the id does not parse, or is not a dataset
            or partition number
        :raises Exception: if the source is neither sqlite nor gzip
        '''
        from databundles.util import bundle_file_type
        import gzip
        import os, tempfile, uuid
        from databundles.identity import ObjectNumber, DatasetNumber, PartitionNumber

        on = ObjectNumber.parse(id_)

        if not on:
            raise ValueError("Failed to parse id: '{}'".format(id_))

        if not isinstance(on, (DatasetNumber, PartitionNumber)):
            raise ValueError("Object number '{}' is neither for a dataset nor partition".format(id_))

        type_ = bundle_file_type(source)

        if type_ == 'sqlite':
            # If it is a plain sqlite file, compress it before sending it.
            # Compute the path before the try so the finally block can
            # never see an unbound name.
            cf = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
            try:
                # `with` guarantees the gzip stream is flushed and closed
                # even if writelines fails part-way through.
                with gzip.open(cf, 'wb') as f:
                    f.writelines(source)

                # Binary mode: text mode would corrupt the gzip stream on
                # platforms that translate line endings.
                with open(cf, 'rb') as source:
                    if isinstance(on, DatasetNumber):
                        response = self.api.datasets(id_).put(source)
                    else:
                        response = self.api.datasets(str(on.dataset)).partitions(str(on)).put(source)

            finally:
                if os.path.exists(cf):
                    os.remove(cf)

        elif type_ == 'gzip':
            # the file is already gziped, so nothing to do.

            if isinstance(on, DatasetNumber):
                response = self.api.datasets(id_).put(source)
            else:
                response = self.api.datasets(str(on.dataset)).partitions(str(on)).put(source)

        else:
            raise Exception("Bad file got type: {} ".format(type_))

        raise_for_status(response)

        return response
Exemple #9
0
 def __init__(self, dataset, **kwargs):
     '''Build a partition attached to *dataset*.

     d_id/d_vid always come from the owning dataset; vid and id_ are
     assigned later (see before_insert). If a t_id is given, the
     versioned table id t_vid is derived from the dataset's revision.
     '''
     self.id_ = kwargs.get("id", kwargs.get("id_", None))
     # (was a redundant kwargs.get("name", kwargs.get("name", None)))
     self.name = kwargs.get("name", None)
     self.vname = kwargs.get("vname", None)
     self.sequence_id = kwargs.get("sequence_id", None)
     self.space = kwargs.get("space", None)
     self.time = kwargs.get("time", None)
     self.t_id = kwargs.get("t_id", None)
     self.grain = kwargs.get('grain', None)
     self.format = kwargs.get('format', None)
     self.segment = kwargs.get('segment', None)
     self.data = kwargs.get('data', None)

     # A d_id kwarg was previously read here and then immediately
     # overwritten; the dataset argument is authoritative.
     self.d_id = dataset.id_
     self.d_vid = dataset.vid
     # See before_insert for setting self.vid and self.id_

     if self.t_id:
         don = ObjectNumber.parse(self.d_vid)
         ton = ObjectNumber.parse(self.t_id)
         self.t_vid = str(ton.rev(don.revision))
Exemple #10
0
    def __init__(self,**kwargs):
        '''Initialize identity fields from keyword arguments, minting a new
        DatasetNumber when no id is supplied.

        The object id may arrive as ``oid``, ``id`` or ``id_`` (first match
        wins). When no id is given, a fresh DatasetNumber provides both the
        versioned ``vid`` and the unversioned ``id_``.
        '''
        self.id_ = kwargs.get("oid",kwargs.get("id",kwargs.get("id_", None)) )
        self.name = kwargs.get("name",None) 
        self.vname = kwargs.get("vname",None) 
        self.source = kwargs.get("source",None) 
        self.dataset = kwargs.get("dataset",None) 
        self.subset = kwargs.get("subset",None) 
        self.variation = kwargs.get("variation",None) 
        self.creator = kwargs.get("creator",None) 
        self.revision = kwargs.get("revision",None) 

        if not self.id_:
            # No id given: mint a new dataset number. vid keeps the
            # revision; id_ is the unversioned form (rev(None)).
            dn = DatasetNumber(None, self.revision )
            self.vid = str(dn)
            self.id_ = str(dn.rev(None))
        elif not self.vid:
            # NOTE(review): self.vid is read here before being assigned in
            # this __init__; presumably it is a class-level/mapped attribute
            # that defaults to None — confirm on the enclosing class.
            self.vid = str(ObjectNumber.parse(self.id_).rev(self.revision))
Exemple #11
0
 def before_insert(mapper, conn, target):
     '''event.listen hook: assign the next sequence_id for this partition
     if missing, then build its ObjectNumber-based vid and id_.'''
     if target.sequence_id is None:
         sql = text('''SELECT max(p_sequence_id)+1 FROM Partitions WHERE p_d_id = :did''')

         next_id, = conn.execute(sql, did=target.d_id).fetchone()

         # No existing partitions yields NULL; start the sequence at 1.
         target.sequence_id = next_id if next_id else 1

     dataset_number = ObjectNumber.parse(target.d_vid)
     partition_number = PartitionNumber(dataset_number, target.sequence_id)
     target.vid = str(partition_number)
     target.id_ = str(partition_number.rev(None))

     Partition.before_update(mapper, conn, target)
Exemple #12
0
    def __init__(self, dataset, **kwargs):
        '''Build a table attached to *dataset*, deriving vid and id_ from
        the dataset's version number and this table's sequence_id. The
        name, if given, is normalized via mangle_name.'''
        # Plain optional attributes, copied straight from kwargs.
        for attr in ("sequence_id", "name", "vname", "altname",
                     "description", "keywords", "data"):
            setattr(self, attr, kwargs.get(attr, None))

        self.d_id = dataset.id_
        self.d_vid = dataset.vid
        table_number = TableNumber(ObjectNumber.parse(dataset.vid), self.sequence_id)

        self.vid = str(table_number)
        self.id_ = str(table_number.rev(None))

        if self.name:
            self.name = self.mangle_name(self.name)

        self.init_on_load()
Exemple #13
0
    def get_ref(self, bp_id):
        '''Resolve *bp_id* — an Identity, object-number string, dataset
        name or partition name — to a (dataset, partition) identity pair,
        checking the local library first and then the remote.

        Returns (False, False) when nothing matches; the partition element
        is None when the reference is to a whole dataset.
        '''
        from databundles.identity import ObjectNumber, DatasetNumber, PartitionNumber, Identity, PartitionIdentity

        if isinstance(bp_id, Identity):
            if bp_id.id_:
                bp_id = bp_id.id_
            else:
                bp_id = bp_id.name

        # If dataset is not None, it means the file already is in the cache.
        dataset = None
        partition = None  # ensure it is bound on every path below

        try:
            on = ObjectNumber.parse(bp_id)

            if not (isinstance(on, DatasetNumber) or isinstance(on, PartitionNumber)):
                raise ValueError("Object number must be for a Dataset or Partition: {} ".format(bp_id))

            dataset, partition = self._get_bundle_path_from_id(bp_id)  # @UnusedVariable
        except Exception:
            # bp_id may be a name rather than an object number; fall through
            # to the name lookups. (Narrowed from a bare `except:` so that
            # KeyboardInterrupt/SystemExit are not swallowed.)
            pass

        # Try it as a dataset name
        if not dataset:
            r = self.find(QueryCommand().identity(name=bp_id))

            if len(r) > 1:
                raise Exception("Got more than one result")
            elif len(r) == 0:
                r = None
            else:
                r = r.pop()

            if r:
                dataset, partition = self._get_bundle_path_from_id(r.id_)

        # Try the name as a partition name
        if not dataset:
            q = self.find(QueryCommand().partition(name=bp_id))

            if q:
                r = q.pop(0)
                if r:
                    dataset, partition = self._get_bundle_path_from_id(r.id_)

        # No luck so far, so now try to get it from the remote library
        if not dataset and self.remote:
            import socket

            try:
                r = self.remote.find(bp_id)

                if r:
                    r = r[0]

                    if r.is_partition:
                        dataset = r.as_dataset
                        partition = r
                    else:
                        dataset = r
                        partition = None

            except socket.error:
                # (Message was truncated mid-sentence.)
                self.logger.error("Connection to remote {} failed".format(self.remote))
        elif dataset:
            # Found locally: convert ORM rows to identity objects.
            from identity import new_identity
            dataset = Identity(**dataset.to_dict())
            partition = new_identity(partition.to_dict()) if partition else None

        if not dataset:
            return False, False

        return dataset, partition
Exemple #14
0
 def before_update(mapper, conn, target):
     '''Recompute the partition id from its dataset number and this
     partition's sequence id.'''
     dataset_number = ObjectNumber.parse(target.d_id)
     target.id_ = str(PartitionNumber(dataset_number, target.sequence_id))
Exemple #15
0
def put_dataset(did):
    '''Store a bundle, calling put() on the bundle file in the Library.

        :param did: A dataset id string. must be parsable as a `DatasetNumber`
        value
        :rtype: dict — the bundle identity's dict plus a 'url' key

        The request body is the bundle database file, which may be
        compressed.

        :raises exc.BadRequest: on a missing/empty body, an unparsable or
        mismatched dataset id, or a partition payload
    '''
    from databundles.identity import ObjectNumber, DatasetNumber

    try:
        cf = _read_body(request)

        # Check existence before stat: os.stat raises OSError on a missing
        # file, so the old post-stat existence test could never fire.
        if not os.path.exists(cf):
            raise exc.BadRequest("Non existent file")

        size = os.stat(cf).st_size

        if size == 0:
            raise exc.BadRequest("Got a zero size dataset file")

        # Now we have the bundle in cf. Stick it in the library.

        # We're doing these exceptions here b/c if we don't read the body, the
        # client will get an error when the socket closes.
        try:
            on = ObjectNumber.parse(did)
        except ValueError:
            raise exc.BadRequest("Unparseable dataset id: {}".format(did))

        if not isinstance(on, DatasetNumber):
            raise exc.BadRequest("Bad dataset id, not for a dataset: {}".format(did))

        # Is this a partition or a bundle?

        try:
            tb = DbBundle(cf)
            type_ = tb.db_config.info.type  # was `type`, shadowing the builtin
        except Exception:
            logger.error("Failed to access database: {}".format(cf))
            raise

        if type_ == 'partition':
            raise exc.BadRequest("Bad data type: Got a partition")

        if tb.identity.id_ != did:
            raise exc.BadRequest("""Bad request. Dataset id of URL doesn't
            match payload. {} != {}""".format(did, tb.identity.id_))

        library_path, rel_path, url = get_library().put(tb)  # @UnusedVariable

        identity = tb.identity

        # if that worked, OK to remove the temporary file.
    finally:
        pass
        # os.remove(cf)

    r = identity.to_dict()
    r['url'] = url
    return r
Exemple #16
0
    def get_ref(self, bp_id):
        '''Resolve *bp_id* — an Identity, object-number string, dataset
        name or partition name — to a (dataset, partition) pair, checking
        the local library first and then the remote API.

        Returns (False, False) when nothing matches; the partition element
        is None when the reference is to a whole dataset.
        '''
        from databundles.identity import ObjectNumber, DatasetNumber, PartitionNumber, Identity

        if isinstance(bp_id, Identity):
            if bp_id.id_:
                bp_id = bp_id.id_
            else:
                bp_id = bp_id.name

        # If dataset is not None, it means the file already is in the cache.
        dataset = None
        partition = None  # ensure it is bound on every path below

        try:
            on = ObjectNumber.parse(bp_id)

            if not (isinstance(on, DatasetNumber) or isinstance(on, PartitionNumber)):
                raise ValueError("Object number must be for a Dataset or Partition: {} ".format(bp_id))

            dataset, partition = self._get_bundle_path_from_id(bp_id)  # @UnusedVariable
        except Exception:
            # bp_id may be a name rather than an object number; fall through
            # to the name lookups. (Narrowed from a bare `except:` so that
            # KeyboardInterrupt/SystemExit are not swallowed.)
            pass

        # Try it as a dataset name
        if not dataset:
            r = self.find(QueryCommand().identity(name=bp_id)).first()

            if r:
                dataset, partition = self._get_bundle_path_from_id(r[0].id_)

        # Try the name as a partition name
        if not dataset:
            q = self.find(QueryCommand().partition(name=bp_id))

            r = q.first()
            if r:
                dataset, partition = self._get_bundle_path_from_id(r[1].id_)

        # No luck so far, so now try to get it from the remote library
        if not dataset and self.api:
            from databundles.identity import Identity, PartitionIdentity
            import socket
            from databundles.orm import Dataset, Partition

            try:
                r = self.api.find(bp_id)

                if r:
                    r = r[0]

                    if hasattr(r, 'Partition') and r.Partition is not None:
                        # NOTE(review): `identity` is built but never used;
                        # kept in case the constructor validates the record.
                        identity = PartitionIdentity(**(r.Partition._asdict()))
                        dataset = Dataset(**r.Dataset._asdict())

                        # NOTE(review): this yields an ObjectNumber, unlike
                        # the Partition rows produced by the local paths —
                        # confirm callers accept both forms.
                        partition = ObjectNumber.parse(r.Partition.id)

                    else:
                        identity = Identity(**(r.Dataset._asdict()))
                        dataset = Dataset(**r.Dataset._asdict())
                        partition = None

            except socket.error:
                # The remote is reached through self.api; the old message
                # formatted self.remote, which this class does not set.
                self.logger.error("Connection to remote {} failed".format(self.api))

        if not dataset:
            return False, False

        return dataset, partition