def saveLocal(self, name):
    '''
    Save processed SAM file info to a local compressed pickle file.
    'name' can exclude extension: .baga will be added
    '''
    fileout = 'baga.AlignReads.SAMs-{}.baga'.format(name)
    with _tarfile.open(fileout, "w:gz") as tar:
        print('Writing to {} . . . '.format(fileout))
        for att_name, att in self.__dict__.items():
            if isinstance(att, _array):
                # array.array attributes are stored as their raw byte string
                buff = _StringIO(att.tostring())
                buff.seek(0, _os.SEEK_END)
                member = _tarfile.TarInfo(name = att_name)
                member.size = buff.tell()
                buff.seek(0)
                tar.addfile(tarinfo = member, fileobj = buff)
            else:
                # everything else is stored as JSON text where possible
                try:
                    buff = _StringIO()
                    _json.dump(att, buff)
                    buff.seek(0, _os.SEEK_END)
                    member = _tarfile.TarInfo(name = att_name)
                    member.size = buff.tell()
                    buff.seek(0)
                    tar.addfile(tarinfo = member, fileobj = buff)
                except TypeError:
                    # non-JSONable attributes (e.g. functions) are
                    # deliberately omitted from the archive
                    pass
def saveLocal(self, name):
    '''
    Save processed SAM file info to a local compressed pickle file.
    'name' can exclude extension: .baga will be added
    '''
    fileout = 'baga.AlignReads.SAMs-{}.baga'.format(name)
    with _tarfile.open(fileout, "w:gz") as archive:
        print('Writing to {} . . . '.format(fileout))
        for attribute_name, value in self.__dict__.items():
            # choose a serialisation: raw bytes for array.array,
            # JSON text for anything JSON can handle
            if isinstance(value, _array):
                payload = _StringIO(value.tostring())
            else:
                payload = _StringIO()
                try:
                    _json.dump(value, payload)
                except TypeError:
                    # skip non-JSONable attributes such as functions
                    continue
            # measure the buffer, then rewind and add it as a tar member
            payload.seek(0, _os.SEEK_END)
            record = _tarfile.TarInfo(name=attribute_name)
            record.size = payload.tell()
            payload.seek(0)
            archive.addfile(tarinfo=record, fileobj=payload)
def __init__(self, reads=False, genome=False, baga=False):
    '''
    Initialise with:
    a baga.PrepareReads.Reads object and,
    a baga.CollectData.Genome object.
    OR
    a path to baga.AlignReads.SAMs (like this one) object that
    was previously saved.

    Raises NameError unless exactly one of the two input combinations
    (reads+genome, or baga) is supplied.
    '''
    if (reads and genome) and not baga:
        # prefer quality-trimmed reads; fall back through adaptor-cut
        # reads to raw reads, warning at each downgrade
        try:
            self.read_files = reads.trimmed_read_files
        except AttributeError:
            text = 'WARNING: baga was not used to quality-score trim these reads. Read trimming is recommended for most types of analysis. This can be achieved with the "trim()" method of the Reads class in the PrepareReads module.'
            print(text)
            try:
                self.read_files = reads.adaptorcut_read_files
            except AttributeError:
                text = 'WARNING: baga was not used to remove library preparation adaptor sequences from these reads. Adaptor removal is highly recommended so hopefully you already removed adaptor sequences! This can be achieved with the "cutAdaptors()" method of the Reads class in the PrepareReads module.'
                self.read_files = reads.read_files
                print(text)
                print('continuing with these reads . . .')
        # currently baga CollectData includes path to reads in pairname keys
        # to read file pair values: check and remove the path component here.
        # Iterate over a snapshot of the items because entries are deleted
        # inside the loop (mutating a live dict view while iterating it
        # raises RuntimeError on Python 3).
        for pairname, files in list(self.read_files.items()):
            if _os.path.sep in pairname:
                self.read_files[pairname.split(_os.path.sep)[-1]] = files
                del self.read_files[pairname]
        self.genome_sequence = genome.sequence
        self.genome_id = genome.id
    elif baga and not (reads and genome):
        # for reloading a previous instantiation
        with _tarfile.open(baga, "r:gz") as tar:
            for member in tar:
                contents = _StringIO(tar.extractfile(member).read())
                try:
                    # either json serialised conventional objects
                    contents = _json.loads(contents.getvalue())
                except ValueError:
                    #print('json failed: {}'.format(member.name))
                    # or longer python array.array objects
                    contents = _array('c', contents.getvalue())
                setattr(self, member.name, contents)
    else:
        raise NameError(
            'instantiate baga.AlignReads.SAMs with either loaded baga.PrepareReads.Reads-*.baga and baga.CollectData.Genome-*.baga objects or previous saved alignments (baga.AlignReads.SAMs-*.baga)'
        )
def loadFrombaga(local_path):
    '''
    Load the contents of a .baga archive (a tar.gz of JSON strings and
    array.array byte payloads) and return them on a simple container
    object, one attribute per archive member.

    NOTE(review): the original body assigned onto an undefined name
    ``self``, so every call raised NameError. Attributes are now
    collected on a fresh namespace object which is returned; since the
    function previously could never complete, returning a value is
    backward compatible.
    '''
    class _Loaded(object):
        # bare attribute container for the unpacked archive contents
        pass
    loaded = _Loaded()
    with _tarfile.open(local_path, "r:gz") as tar:
        for member in tar:
            contents = _StringIO(tar.extractfile(member).read())
            try:
                # either json serialised conventional objects
                contents = _json.loads(contents.getvalue())
            except ValueError:
                # or longer python array.array objects
                contents = _array('c', contents.getvalue())
            setattr(loaded, member.name, contents)
    return loaded
def __init__(self, reads = False, genome = False, baga = False):
    '''
    Initialise with:
    a baga.PrepareReads.Reads object and,
    a baga.CollectData.Genome object.
    OR
    a path to baga.AlignReads.SAMs (like this one) object that
    was previously saved.

    Raises NameError unless exactly one of the two input combinations
    (reads+genome, or baga) is supplied.
    '''
    if (reads and genome) and not baga:
        # prefer quality-trimmed reads; fall back through adaptor-cut
        # reads to raw reads, warning at each downgrade
        try:
            self.read_files = reads.trimmed_read_files
        except AttributeError:
            text = 'WARNING: baga was not used to quality-score trim these reads. Read trimming is recommended for most types of analysis. This can be achieved with the "trim()" method of the Reads class in the PrepareReads module.'
            print(text)
            try:
                self.read_files = reads.adaptorcut_read_files
            except AttributeError:
                text = 'WARNING: baga was not used to remove library preparation adaptor sequences from these reads. Adaptor removal is highly recommended so hopefully you already removed adaptor sequences! This can be achieved with the "cutAdaptors()" method of the Reads class in the PrepareReads module.'
                self.read_files = reads.read_files
                print(text)
                print('continuing with these reads . . .')
        # currently baga CollectData includes path to reads in pairname keys
        # to read file pair values: check and remove the path component here.
        # Iterate over a snapshot of the items because entries are deleted
        # inside the loop (mutating a live dict view while iterating it
        # raises RuntimeError on Python 3).
        for pairname, files in list(self.read_files.items()):
            if _os.path.sep in pairname:
                self.read_files[pairname.split(_os.path.sep)[-1]] = files
                del self.read_files[pairname]
        self.genome_sequence = genome.sequence
        self.genome_id = genome.id
    elif baga and not (reads and genome):
        # for reloading a previous instantiation
        with _tarfile.open(baga, "r:gz") as tar:
            for member in tar:
                contents = _StringIO(tar.extractfile(member).read())
                try:
                    # either json serialised conventional objects
                    contents = _json.loads(contents.getvalue())
                except ValueError:
                    #print('json failed: {}'.format(member.name))
                    # or longer python array.array objects
                    contents = _array('c', contents.getvalue())
                setattr(self, member.name, contents)
    else:
        raise NameError('instantiate baga.AlignReads.SAMs with either loaded baga.PrepareReads.Reads-*.baga and baga.CollectData.Genome-*.baga objects or previous saved alignments (baga.AlignReads.SAMs-*.baga)')
def saveLocal(self, name = False):
    '''
    Save a reference genome to a local compressed baga file.

    This saves Internet bandwidth if downloading from NCBI and time if
    loading a genbank file. 'filename' can exclude extension: .baga will
    be added. A .baga file is mostly Python dictionaries in JSON strings
    and array.array objects in a tar.gz format.
    '''
    # archive is named after the genome id unless a name is provided
    fileout = 'baga.CollectData.Genome-{}.baga'.format(name if name else self.id)
    with _tarfile.open(fileout, "w:gz") as tar:
        print('Writing to {} . . . '.format(fileout))
        for att_name, att in self.__dict__.items():
            if isinstance(att, _array):
                # arrays are stored as their raw byte string
                payload = _StringIO(att.tostring())
            elif isinstance(att, (dict, str)):
                # ensure only dicts or strings for genome objects but
                # shouldn't be anything else anyway
                payload = _StringIO()
                _json.dump(att, payload)
            else:
                # anything else is not written to the archive
                continue
            # measure, rewind, and add the buffer as a tar member
            payload.seek(0, _os.SEEK_END)
            info = _tarfile.TarInfo(name = att_name)
            info.size = payload.tell()
            payload.seek(0)
            tar.addfile(tarinfo = info, fileobj = payload)
def DL(url, verbose = True):
    '''
    Download a URL into an in-memory buffer and return the buffer
    rewound to position 0.

    url -- address to fetch.
    verbose -- if True, print a running byte total per chunk and a
    completion message.
    '''
    req = _urllib2.urlopen(url)
    CHUNK = 16 * 1024 * 32
    data = _StringIO()
    c = 0
    for chunk in iter(lambda: req.read(CHUNK), ''):
        # count the bytes actually received, not the requested chunk
        # size: the final chunk is usually shorter than CHUNK, so the
        # old 'c += CHUNK' over-reported the download size
        c += len(chunk)
        if verbose:
            print("{:,} bytes".format(c))
        data.write(chunk)
    if verbose:
        print('Download complete . . .')
    data.seek(0)
    return(data)
def saveLocal(self, name=False):
    '''
    Save a reference genome to a local compressed baga file.

    This saves Internet bandwidth if downloading from NCBI and time if
    loading a genbank file. 'filename' can exclude extension: .baga will
    be added. A .baga file is mostly Python dictionaries in JSON strings
    and array.array objects in a tar.gz format.
    '''
    if name:
        fileout = 'baga.CollectData.Genome-{}.baga'.format(name)
    else:
        fileout = 'baga.CollectData.Genome-{}.baga'.format(self.id)
    with _tarfile.open(fileout, "w:gz") as archive:
        print('Writing to {} . . . '.format(fileout))
        def _store(label, text_buffer):
            # register an in-memory buffer as a tar member named 'label'
            text_buffer.seek(0, _os.SEEK_END)
            entry = _tarfile.TarInfo(name=label)
            entry.size = text_buffer.tell()
            text_buffer.seek(0)
            archive.addfile(tarinfo=entry, fileobj=text_buffer)
        for key, value in self.__dict__.items():
            if isinstance(value, _array):
                # arrays go in as their raw byte string
                _store(key, _StringIO(value.tostring()))
            elif isinstance(value, (dict, str)):
                # ensure only dicts or strings for genome objects but
                # shouldn't be anything else anyway
                serialised = _StringIO()
                _json.dump(value, serialised)
                _store(key, serialised)
def DL(url, verbose=True):
    '''
    Download a URL into an in-memory buffer and return the buffer
    rewound to position 0.

    url -- address to fetch.
    verbose -- if True, print a running byte total per chunk and a
    completion message.
    '''
    req = _urllib2.urlopen(url)
    CHUNK = 16 * 1024 * 32
    data = _StringIO()
    c = 0
    for chunk in iter(lambda: req.read(CHUNK), ''):
        # count the bytes actually received, not the requested chunk
        # size: the final chunk is usually shorter than CHUNK, so the
        # old 'c += CHUNK' over-reported the download size
        c += len(chunk)
        if verbose:
            print("{:,} bytes".format(c))
        data.write(chunk)
    if verbose:
        print('Download complete . . .')
    data.seek(0)
    return (data)