Example #1
0
 def saveLocal(self, name):
     '''
     Save processed SAM file info to a local compressed pickle file.
     'name' can exclude extension: .baga will be added

     Despite the historical wording, the output is not a pickle: each
     attribute of this object becomes one member of a gzipped tar
     archive, stored either as raw array.array bytes or as JSON.
     '''
     fileout = 'baga.AlignReads.SAMs-{}.baga'.format(name)
     
     with _tarfile.open(fileout, "w:gz") as tar:
         print('Writing to {} . . . '.format(fileout))
         for att_name, att in self.__dict__.items():
             if isinstance(att, _array):
                 # array.array attributes are stored as their raw byte string
                 io = _StringIO(att.tostring())
                 # seek to the end to learn the byte length: TarInfo.size
                 # must be set before addfile() reads from the stream
                 io.seek(0, _os.SEEK_END)
                 length = io.tell()
                 io.seek(0)
                 thisone = _tarfile.TarInfo(name = att_name)
                 thisone.size = length
                 tar.addfile(tarinfo = thisone, fileobj = io)
             else:
                 # try saving everything else here by jsoning
                 try:
                     io = _StringIO()
                     _json.dump(att, io)
                     io.seek(0, _os.SEEK_END)
                     length = io.tell()
                     io.seek(0)
                     thisone = _tarfile.TarInfo(name = att_name)
                     thisone.size = length
                     tar.addfile(tarinfo = thisone, fileobj = io)
                 except TypeError:
                     # ignore non-jsonable things like functions
                     # include unicodes, strings, lists etc etc
                     #print('omitting {}'.format(att_name))
                     pass
Example #2
0
    def saveLocal(self, name):
        '''
        Save processed SAM file info to a local compressed pickle file.
        'name' can exclude extension: .baga will be added

        Each attribute is written as one member of a gzipped tar archive:
        array.array attributes as raw bytes, everything JSON-serialisable
        as JSON; attributes that cannot be serialised are skipped.
        '''
        fileout = 'baga.AlignReads.SAMs-{}.baga'.format(name)

        def add_member(archive, member_name, stream):
            # Stamp the member's byte length onto the TarInfo (required
            # before addfile() can consume the stream), then append it.
            stream.seek(0, _os.SEEK_END)
            info = _tarfile.TarInfo(name = member_name)
            info.size = stream.tell()
            stream.seek(0)
            archive.addfile(tarinfo = info, fileobj = stream)

        with _tarfile.open(fileout, "w:gz") as tar:
            print('Writing to {} . . . '.format(fileout))
            for attribute, value in self.__dict__.items():
                if isinstance(value, _array):
                    # raw array.array payload goes in as its byte string
                    add_member(tar, attribute, _StringIO(value.tostring()))
                else:
                    # attempt JSON for everything else; non-serialisable
                    # attributes (e.g. bound functions) are deliberately
                    # omitted from the archive
                    try:
                        serialised = _StringIO()
                        _json.dump(value, serialised)
                    except TypeError:
                        continue
                    add_member(tar, attribute, serialised)
Example #3
0
    def __init__(self, reads=False, genome=False, baga=False):
        '''
        Initialise with:
        a baga.PrepareReads.Reads object and,
        a baga.CollectData.Genome object.
        
        OR
        
        a path to baga.AlignReads.SAMs (like this one) object that 
        was previously saved.

        Raises NameError when neither (reads and genome) nor baga is
        supplied, or when both are.
        '''

        if (reads and genome) and not baga:
            # Prefer quality-trimmed reads; fall back to adaptor-cut reads,
            # then to raw reads, printing a warning at each downgrade.
            try:
                self.read_files = reads.trimmed_read_files
            except AttributeError:
                text = 'WARNING: baga was not used to quality-score trim these reads. Read trimming is recommended for most types of analysis. This can be achieved with the "trim()" method of the Reads class in the PrepareReads module.'
                print(text)
                try:
                    self.read_files = reads.adaptorcut_read_files
                except AttributeError:
                    text = 'WARNING: baga was not used to remove library preparation adaptor sequences from these reads. Adaptor removal is highly recommended so hopefully you already removed adaptor sequences! This can be achieved with the "cutAdaptors()" method of the Reads class in the PrepareReads module.'
                    self.read_files = reads.read_files
                    print(text)
                    print('continuing with these reads . . .')

            # currently baga CollectData includes path to reads in pairname keys to read file pair values
            # check and remove here
            # BUGFIX: iterate over a snapshot of the items. The dict is both
            # inserted into and deleted from inside the loop, which raises
            # RuntimeError under Python 3 (and relied on .items() returning
            # a list under Python 2).
            for pairname, files in list(self.read_files.items()):
                if _os.path.sep in pairname:
                    self.read_files[pairname.split(_os.path.sep)[-1]] = files
                    del self.read_files[pairname]

            self.genome_sequence = genome.sequence
            self.genome_id = genome.id

        elif baga and not (reads and genome):
            # for reloading a previous instantiation: each tar member is one
            # attribute, stored either as JSON or as raw array bytes
            with _tarfile.open(baga, "r:gz") as tar:
                for member in tar:
                    contents = _StringIO(tar.extractfile(member).read())
                    try:
                        # either json serialised conventional objects
                        contents = _json.loads(contents.getvalue())
                    except ValueError:
                        #print('json failed: {}'.format(member.name))
                        # or longer python array.array objects
                        contents = _array('c', contents.getvalue())

                    setattr(self, member.name, contents)

        else:
            # NOTE(review): NameError is unconventional for argument
            # validation (ValueError would be usual) but is kept because
            # existing callers may catch it.
            raise NameError(
                'instantiate baga.AlignReads.SAMs with either loaded baga.PrepareReads.Reads-*.baga and baga.CollectData.Genome-*.baga objects or previous saved alignments (baga.AlignReads.SAMs-*.baga)'
            )
Example #4
0
 def loadFrombaga(local_path):
     '''
     Reload object attributes from a previously saved .baga archive.

     Each tar member holds one attribute: either a JSON document or the
     raw bytes of an array.array. NOTE: 'self' is not a parameter here —
     presumably it is captured from an enclosing scope (closure); verify
     against the surrounding definition.
     '''
     with _tarfile.open(local_path, "r:gz") as tar:
         for member in tar:
             contents = _StringIO(tar.extractfile(member).read())
             try:
                 # either json serialised conventional objects
                 contents = _json.loads(contents.getvalue())
             except ValueError:
                 # or longer python array.array objects
                 contents = _array('c', contents.getvalue())
             
             setattr(self, member.name, contents)
Example #5
0
        def loadFrombaga(local_path):
            '''
            Restore attributes of 'self' (captured from the enclosing
            scope) from a saved .baga tar.gz archive: JSON members are
            parsed, anything unparseable is treated as array bytes.
            '''
            with _tarfile.open(local_path, "r:gz") as tar:
                for entry in tar:
                    raw = tar.extractfile(entry).read()
                    try:
                        # most attributes were stored as JSON documents
                        restored = _json.loads(raw)
                    except ValueError:
                        # anything that fails to parse is an array payload
                        restored = _array('c', raw)

                    setattr(self, entry.name, restored)
Example #6
0
 def __init__(self, reads = False, genome = False, baga = False):
     '''
     Initialise with:
     a baga.PrepareReads.Reads object and,
     a baga.CollectData.Genome object.
     
     OR
     
     a path to baga.AlignReads.SAMs (like this one) object that 
     was previously saved.

     Raises NameError when neither (reads and genome) nor baga is
     supplied, or when both are.
     '''
     
     if (reads and genome) and not baga:
         # Prefer quality-trimmed reads; fall back to adaptor-cut reads,
         # then to raw reads, printing a warning at each downgrade.
         try:
             self.read_files = reads.trimmed_read_files
         except AttributeError:
             text = 'WARNING: baga was not used to quality-score trim these reads. Read trimming is recommended for most types of analysis. This can be achieved with the "trim()" method of the Reads class in the PrepareReads module.'
             print(text)
             try:
                 self.read_files = reads.adaptorcut_read_files
             except AttributeError:
                 text = 'WARNING: baga was not used to remove library preparation adaptor sequences from these reads. Adaptor removal is highly recommended so hopefully you already removed adaptor sequences! This can be achieved with the "cutAdaptors()" method of the Reads class in the PrepareReads module.'
                 self.read_files = reads.read_files
                 print(text)
                 print('continuing with these reads . . .')
         
         # currently baga CollectData includes path to reads in pairname keys to read file pair values
         # check and remove here
         # BUGFIX: iterate over a snapshot of the items. The dict is both
         # inserted into and deleted from inside the loop, which raises
         # RuntimeError under Python 3 (and relied on .items() returning
         # a list under Python 2).
         for pairname, files in list(self.read_files.items()):
             if _os.path.sep in pairname:
                 self.read_files[pairname.split(_os.path.sep)[-1]] = files
                 del self.read_files[pairname]
         
         self.genome_sequence = genome.sequence
         self.genome_id = genome.id
     
     elif baga and not (reads and genome):
         # for reloading a previous instantiation: each tar member is one
         # attribute, stored either as JSON or as raw array bytes
         with _tarfile.open(baga, "r:gz") as tar:
             for member in tar:
                 contents = _StringIO(tar.extractfile(member).read())
                 try:
                     # either json serialised conventional objects
                     contents = _json.loads(contents.getvalue())
                 except ValueError:
                     #print('json failed: {}'.format(member.name))
                     # or longer python array.array objects
                     contents = _array('c', contents.getvalue())
                 
                 setattr(self, member.name, contents)
         
     else:
         # NOTE(review): NameError is unconventional for argument validation
         # (ValueError would be usual) but is kept because existing callers
         # may catch it.
         raise NameError('instantiate baga.AlignReads.SAMs with either loaded baga.PrepareReads.Reads-*.baga and baga.CollectData.Genome-*.baga objects or previous saved alignments (baga.AlignReads.SAMs-*.baga)')
Example #7
0
 def saveLocal(self, name = False):
     '''
     Save a reference genome to a local compressed baga file. This saves 
     Internet bandwidth if downloading from NCBI and time if loading a 
     genbank file.
     'filename' can exclude extension: .baga will be added
     A .baga file is mostly Python dictionaries in JSON strings and
     array.array objects in a tar.gz format.
     '''
     
     # fall back to the genome's own identifier when no name is supplied
     if name:
         fileout = 'baga.CollectData.Genome-{}.baga'.format(name)
     else:
         fileout = 'baga.CollectData.Genome-{}.baga'.format(self.id)
     
     with _tarfile.open(fileout, "w:gz") as tar:
         print('Writing to {} . . . '.format(fileout))
         for att_name, att in self.__dict__.items():
             if isinstance(att, _array):
                 # array.array attributes are stored as their raw byte string
                 io = _StringIO(att.tostring())
                 # seek to the end to learn the byte length: TarInfo.size
                 # must be set before addfile() reads from the stream
                 io.seek(0, _os.SEEK_END)
                 length = io.tell()
                 io.seek(0)
                 thisone = _tarfile.TarInfo(name = att_name)
                 thisone.size = length
                 tar.addfile(tarinfo = thisone, fileobj = io)
             elif isinstance(att, dict) or isinstance(att, str):
                 # ensure only dicts or strings for genome objects but shouldn't be anything else anyway
                 # (attributes of any other type are silently omitted)
                 io = _StringIO()
                 _json.dump(att, io)
                 io.seek(0, _os.SEEK_END)
                 length = io.tell()
                 io.seek(0)
                 thisone = _tarfile.TarInfo(name = att_name)
                 thisone.size = length
                 tar.addfile(tarinfo = thisone, fileobj = io)
Example #8
0
 def DL(url, verbose = True):
     '''
     Download 'url' into an in-memory buffer and return it rewound to 0.

     Prints cumulative bytes downloaded when 'verbose' is True.
     '''
     req = _urllib2.urlopen(url)
     CHUNK = 16 * 1024 * 32
     data = _StringIO()
     c = 0
     try:
         for chunk in iter(lambda: req.read(CHUNK), ''):
             # BUGFIX: count what was actually read — the final chunk is
             # usually shorter than CHUNK, so 'c += CHUNK' over-reported
             c += len(chunk)
             if verbose:
                 print("{:,} bytes".format(c))
             data.write(chunk)
     finally:
         # release the network connection even if a read fails
         req.close()
     
     if verbose:
         print('Download complete . . .')
     data.seek(0)
     return data
Example #9
0
    def saveLocal(self, name=False):
        '''
        Save a reference genome to a local compressed baga file. This saves 
        Internet bandwidth if downloading from NCBI and time if loading a 
        genbank file.
        'filename' can exclude extension: .baga will be added
        A .baga file is mostly Python dictionaries in JSON strings and
        array.array objects in a tar.gz format.
        '''

        # fall back to the genome's own identifier when no name is given
        label = name if name else self.id
        fileout = 'baga.CollectData.Genome-{}.baga'.format(label)

        def store(archive, member_name, stream):
            # Record the payload length on the TarInfo (required before
            # addfile() can consume the stream), then append the member.
            stream.seek(0, _os.SEEK_END)
            info = _tarfile.TarInfo(name=member_name)
            info.size = stream.tell()
            stream.seek(0)
            archive.addfile(tarinfo=info, fileobj=stream)

        with _tarfile.open(fileout, "w:gz") as tar:
            print('Writing to {} . . . '.format(fileout))
            for attribute, value in self.__dict__.items():
                if isinstance(value, _array):
                    # array payloads are written as their raw byte string
                    store(tar, attribute, _StringIO(value.tostring()))
                elif isinstance(value, (dict, str)):
                    # only dicts and strings are expected on genome objects;
                    # attributes of any other type are deliberately skipped
                    serialised = _StringIO()
                    _json.dump(value, serialised)
                    store(tar, attribute, serialised)
Example #10
0
        def DL(url, verbose=True):
            '''
            Download 'url' into an in-memory buffer and return it rewound
            to position 0, printing progress when 'verbose' is True.
            '''
            req = _urllib2.urlopen(url)
            CHUNK = 16 * 1024 * 32
            data = _StringIO()
            c = 0
            try:
                for chunk in iter(lambda: req.read(CHUNK), ''):
                    # BUGFIX: use the actual chunk size — the last read is
                    # usually shorter than CHUNK, so 'c += CHUNK' over-reported
                    c += len(chunk)
                    if verbose:
                        print("{:,} bytes".format(c))
                    data.write(chunk)
            finally:
                # close the response even on error to free the connection
                req.close()

            if verbose:
                print('Download complete . . .')
            data.seek(0)
            return data