def filter(self, filtering):
    """
    Filter the tabular results found at `self.out_path`, in place.
    For the moment only minimum coverage and minimum identity.

    :param filtering: A dictionary that may contain the keys `min_coverage`
                      and/or `min_identity`. Each value is multiplied by 100
                      before being compared to the `qcovs` / `pident` column.
    :raises Exception: When a requested criterion needs a column that is
                       not mentioned in `self.params['-outfmt']`.

    Only the columns of the criteria actually requested are looked up, so
    filtering on coverage alone no longer requires `pident` (and vice versa).
    """
    outfmt = self.params['-outfmt']
    # Conditions #
    if 'min_coverage' in filtering and 'qcovs' not in outfmt:
        msg = "Can't filter on minimum coverage because it wasn't included."
        raise Exception(msg)
    if 'min_identity' in filtering and 'pident' not in outfmt:
        msg = "Can't filter on minimum identity because it wasn't included."
        raise Exception(msg)
    # Columns and thresholds #
    # The index is shifted by one, presumably because the first token of
    # the format string is the format number and not a column name.
    fields = outfmt.strip('"').split()
    criteria = []
    for key, column in (('min_coverage', 'qcovs'), ('min_identity', 'pident')):
        if key in filtering:
            criteria.append((fields.index(column) - 1, filtering[key] * 100))
    # Iterator #
    def filter_lines(blastout):
        for line in blastout:
            columns = line.split()
            too_low = any(float(columns[pos]) < minimum for pos, minimum in criteria)
            if not too_low: yield line
    # Do it #
    # NOTE(review): this relies on `self.out_path` yielding lines when
    # iterated (as the original did) -- confirm against its type.
    temp_path = new_temp_path()
    with open(temp_path, 'w') as handle:
        handle.writelines(filter_lines(self.out_path))
    os.remove(self.out_path)
    shutil.move(temp_path, self.out_path)
def rename_with_prefix(self, prefix="", new_path=None, in_place=True, remove_desc=True):
    """
    Rename every sequence by prepending a prefix to its ID.

    :param prefix: The string placed in front of every sequence ID.
    :param new_path: Where the renamed sequences are written. When None,
                     a path obtained from `new_temp_path()` is used.
    :param in_place: When true, the original file is removed and replaced
                     by the renamed version.
    :param remove_desc: When true, the description of every sequence is
                        set to the empty string.
    :return: `self` when the replacement was done in place (the file that
             was written has been moved onto `self.path`), otherwise the
             object wrapping the newly written file.
    """
    # Temporary path #
    if new_path is None: prefixed = self.__class__(new_temp_path())
    else:                prefixed = self.__class__(new_path)
    # Generator #
    def prefixed_iterator():
        for read in self:
            read.id = prefix + read.id
            if remove_desc: read.description = ""
            yield read
    # Do it #
    prefixed.write(prefixed_iterator())
    # Replace it #
    if in_place:
        os.remove(self.path)
        shutil.move(prefixed, self.path)
        # The file behind `prefixed` does not exist anymore at this point,
        # so hand back the object that does point to the renamed content.
        return self
    # Return #
    return prefixed
def gblocks(self, new_path=None, seq_type='nucl'):
    """
    Apply the gblocks filtering algorithm to the alignment.
    See http://molevol.cmima.csic.es/castresana/Gblocks/Gblocks_documentation.html
    Need to rename all sequences, because it will complain with long names.

    :param new_path: Where the filtered alignment is written. When None,
                     a path obtained from `new_temp_path()` is used.
    :param seq_type: Either 'nucl' (passes `-t=d`) or 'prot' (passes `-t=p`).
    :return: An object of the same class wrapping the resulting file.
    :raises ValueError: When `seq_type` is neither 'nucl' nor 'prot'.
    :raises Exception: When gblocks reports "Execution terminated" or when
                       the expected `-gb` output file is missing.
    """
    # Options #
    # Validate first so that a typo does not cost a full renaming pass
    t_options = {'nucl': "-t=d", 'prot': "-t=p"}
    if seq_type not in t_options:
        raise ValueError("seq_type must be 'nucl' or 'prot', got %r." % (seq_type,))
    t_option = t_options[seq_type]
    # Temporary path #
    if new_path is None: final = self.__class__(new_temp_path())
    else:                final = self.__class__(new_path)
    # Mapping every sequence name with a short temporary name #
    orig_name_to_temp = {seq.description: 'name' + str(i) for i, seq in enumerate(self)}
    temp_name_to_orig = {v: k for k, v in orig_name_to_temp.items()}
    # Rename every sequence with its temporary name #
    temp_fasta = self.rename_sequences(orig_name_to_temp)
    # Run it #
    # Exit codes 0 and 1 are both accepted -- presumably because the
    # program does not return 0 on success; TODO confirm.
    result = sh.gblocks91(temp_fasta.path, t_option, '-p=n', "-b4=3",
                          "-b3=20", "-b5=a", _ok_code=[0, 1])
    # Check errors #
    output = result.stdout
    if isinstance(output, bytes): output = output.decode('utf-8', 'replace')
    if "Execution terminated" in output:
        raise Exception("gblocks crashed again.")
    created_file = temp_fasta.path + '-gb'
    if not os.path.exists(created_file):
        raise Exception("gblocks did not create the file '%s'." % created_file)
    # Back #
    # Restore the original names on the file gblocks produced. Previously
    # the unfiltered temporary file was renamed back, which meant the
    # gblocks output was never used.
    self.__class__(created_file).rename_sequences(temp_name_to_orig, final)
    # Return #
    return final
def subsample(self, down_to=1, new_path=None, verbose=True):
    """
    Pick a given number of sequences from the file pseudo-randomly.

    :param down_to: The number of sequences wanted in the result.
    :param new_path: Where the result is written: None (a path obtained
                     from `new_temp_path()` is used), a `FASTA` object
                     (used as is) or anything accepted by the constructor
                     of this class.
    :param verbose: When true, wrap the iteration in a `tqdm` progress bar.
    :return: The object wrapping the subsampled file. When more sequences
             are requested than available, a warning is printed, the whole
             file is copied to the destination, and that destination is
             returned.
    """
    # Pick the destination path #
    if new_path is None:              subsampled = self.__class__(new_temp_path())
    elif isinstance(new_path, FASTA): subsampled = new_path
    else:                             subsampled = self.__class__(new_path)
    # Check size #
    available = len(self)
    if down_to > available:
        message = "Can't subsample %s down to %i. Only down to %i."
        print(Color.ylw + message % (self, down_to, available) + Color.end)
        # Copy to the resolved destination: `new_path` itself may be None
        self.copy(subsampled.path)
        return subsampled
    # Select verbosity #
    if verbose:
        import tqdm
        wrapper = lambda x: tqdm.tqdm(x, total=self.count)
    else:
        wrapper = lambda x: x
    # Generator #
    def iterator():
        for read in wrapper(isubsample(self, down_to)):
            yield read
    # Do it #
    subsampled.write(iterator())
    # Did it work #
    assert len(subsampled) == down_to
    # Return #
    return subsampled
def sqlite_by_shell(self, destination):
    """
    Method with shell and a temp file. This is hopefully fast.

    A script is written by `self.sqlite_dump_shell` to a temporary path
    and then fed to the `sqlite3` executable through its `-init` option.

    :param destination: The path handed to `sqlite3` as the database file.
    """
    import os
    from shell_command import shell_output
    script_path = new_temp_path()
    try:
        self.sqlite_dump_shell(script_path)
        # The original referenced an undefined name `script` here
        shell_output('sqlite3 -bail -init "%s" "%s" .quit' % (script_path, destination))
    finally:
        # Always clean up the temporary script, even if sqlite3 failed
        if os.path.exists(script_path): os.remove(script_path)
def phred_13_to_18_sed(self, new_path=None, in_place=True):
    """
    Illumina-1.3 format conversion to Illumina-1.8 format via sed (faster).

    :param new_path: Where the converted file is written when not working
                     in place. When None, a path obtained from
                     `new_temp_path()` is used.
    :param in_place: When exactly `True`, the file at `self.path` is
                     modified directly with `sed -i` and `self` is returned.
    :return: `self` when converting in place, otherwise the object wrapping
             the newly written file.
    """
    # String #
    # `4~4` restricts the command to every fourth line (the quality line)
    # and `y/.../.../` maps the characters '@'..'i' onto '!'..'J'.
    # The arguments are handed to sed without going through a shell, so
    # the single quote must appear as itself: the shell idiom '\'' that
    # was used before made both lists of the `y` command unequal in length.
    sed_command = r"""4~4y/@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghi/!"#$%&'()*+,-.\/0123456789:;<=>?@ABCDEFGHIJ/"""
    # Faster with bash utilities #
    if in_place is True:
        sh.sed('-i', sed_command, self.path)
        return self
    # New file #
    if new_path is None: new_fastq = self.__class__(new_temp_path())
    else:                new_fastq = self.__class__(new_path)
    # Redirect the standard output of sed to the new file. Previously the
    # destination was appended to the sed script itself.
    sh.sed(sed_command, self.path, _out=new_fastq.path)
    return new_fastq
def phred_13_to_18(self, new_path=None, in_place=True):
    """
    Illumina-1.3 format conversion to Illumina-1.8 format via BioPython.

    :param new_path: Where the converted file is written. When None, a
                     path obtained from `new_temp_path()` (with the suffix
                     `self.extension`) is used.
    :param in_place: When true, the original file is removed and replaced
                     by the converted one.
    :return: `self` when converting in place, otherwise the object wrapping
             the newly written file.

    While reading, `self.format` is temporarily set to 'fastq-illumina'.
    On success it ends up as 'fastq-sanger'. If the conversion fails, the
    output handle is still closed and the previous format is restored.
    """
    # New file #
    if new_path is None: new_fastq = self.__class__(new_temp_path(suffix=self.extension))
    else:                new_fastq = self.__class__(new_path)
    # Do it #
    previous_format = getattr(self, 'format', 'fastq-sanger')
    self.format = 'fastq-illumina'
    try:
        new_fastq.open('w')
        try:
            new_fastq.handle.writelines(seq.format('fastq-sanger') for seq in self)
        finally:
            new_fastq.close()
    finally:
        # Never leave the object flagged as illumina after a failure
        self.format = previous_format
    self.format = 'fastq-sanger'
    # Return #
    if in_place:
        os.remove(self.path)
        shutil.move(new_fastq, self.path)
        return self
    else:
        return new_fastq
def extract_sequences(self, ids, new_path=None, in_place=False, verbose=False):
    """
    Will take all the sequences from the current file whose id appears in
    the ids given and place them in a new file.
    If no path is given, a new temporary path is created and returned.
    If `in_place` is set to True, the original file is removed and replaced
    with the result of the extraction.
    Optionally, the argument `ids` can be a function which has to take
    one string as only input and return True for keeping the sequence and
    False for discarding the sequence.

    :param ids: A container of sequence IDs, or a callable as described.
    :param new_path: None, a `FASTA` object (used as is) or anything
                     accepted by the constructor of this class.
    :param in_place: Replace the original file with the extraction.
    :param verbose: When true, wrap the iteration in a `tqdm` progress bar.
    :return: `self` when working in place, otherwise the new object.
    """
    # Temporary path #
    if new_path is None:              new_fasta = self.__class__(new_temp_path())
    elif isinstance(new_path, FASTA): new_fasta = new_path
    else:                             new_fasta = self.__class__(new_path)
    # Select verbosity #
    if verbose:
        import tqdm
        wrapper = tqdm.tqdm
    else:
        wrapper = lambda x: x
    # Select the test applied to every sequence ID #
    if callable(ids):
        keep = ids
    else:
        # Membership in a list or tuple is a linear scan for every read,
        # hashing the IDs once makes each lookup constant time.
        if isinstance(ids, (list, tuple)):
            try: ids = frozenset(ids)
            except TypeError: pass
        keep = lambda name: name in ids
    # Generator #
    def matching(reads):
        for r in wrapper(reads):
            if keep(r.id): yield r
    # Do it #
    new_fasta.write(matching(self))
    # Return #
    if in_place:
        os.remove(self.path)
        shutil.move(new_fasta, self.path)
        return self
    else:
        return new_fasta
def set_paths(self, base_dir, script_path):
    """
    Set the directory, the script path and the outfile path.

    :param base_dir: Optional directory. When given, the script is placed
                     there as `run.<extension>`, the output file as
                     `run.out`, and `change_dir` points to it.
    :param script_path: Optional explicit path for the script. When given
                        it takes precedence over the one derived from
                        `base_dir`.

    When neither is given, the script goes to a path obtained from
    `new_temp_path()`. All the paths stored are absolute.
    """
    # Make absolute paths #
    if 'change_dir' in self.kwargs:
        self.kwargs['change_dir'] = DirectoryPath(
            os.path.abspath(self.kwargs['change_dir']))
    if 'out_file' in self.kwargs:
        self.kwargs['out_file'] = FilePath(
            os.path.abspath(self.kwargs['out_file']))
    # In case there is a base directory #
    if base_dir is not None:
        # Join on the absolute directory instead of concatenating onto the
        # raw argument, which broke when it had no trailing slash.
        absolute_base = os.path.abspath(base_dir)
        script_name = "run." + self.extensions[self.language]
        self.base_dir = DirectoryPath(absolute_base)
        self.script_path = FilePath(os.path.join(absolute_base, script_name))
        self.kwargs['change_dir'] = self.base_dir
        self.kwargs['out_file'] = FilePath(os.path.join(absolute_base, "run.out"))
    # Other cases #
    if base_dir is None and script_path is None:
        self.script_path = FilePath(new_temp_path())
    if script_path is not None:
        self.script_path = FilePath(os.path.abspath(script_path))
def remove_trailing_stars(self, new_path=None, in_place=True, check=False):
    """
    Remove the bad character that can be inserted by some programs at the
    end of sequences.

    :param new_path: Where the cleaned file is written when not working in
                     place. When None, a path obtained from
                     `new_temp_path()` is used.
    :param in_place: When exactly `True`, the file at `self.path` is
                     modified directly with `sed -i`.
    :param check: When true, first count the lines containing a star with
                  `grep -c` and return immediately if there are none.
    :return: `self` when nothing had to be done or when working in place,
             otherwise the object wrapping the newly written file.
    """
    # Optional check #
    if check and int(sh.grep('-c', '\\*', self.path, _ok_code=[0, 1])) == 0:
        return self
    # Faster with bash utilities #
    if in_place is True:
        sh.sed('-i', 's/\\*$//g', self.path)
        return self
    # Standard way #
    if new_path is None: new_fasta = self.__class__(new_temp_path())
    else:                new_fasta = self.__class__(new_path)
    new_fasta.create()
    try:
        for seq in self:
            new_fasta.add_str(str(seq.seq).rstrip('*'), seq.id)
    finally:
        # Close the output even if reading or writing a sequence fails
        new_fasta.close()
    # Return #
    return new_fasta
def extract_length(self, lower_bound=None, upper_bound=None, new_path=None):
    """
    Copy the sequences whose length lies within the given bounds (both
    inclusive) to a new file.

    :param lower_bound: Minimum length kept. None means zero.
    :param upper_bound: Maximum length kept. None means `sys.maxsize`.
    :param new_path: None (a path obtained from `new_temp_path()` is
                     used), a `FASTA` object (used as is) or anything
                     accepted by the constructor of this class.
    :return: The object wrapping the file that was written.
    """
    # Where do the selected reads go #
    if new_path is None:
        destination = new_temp_path()
        fraction = self.__class__(destination)
    else:
        already_wrapped = isinstance(new_path, FASTA)
        fraction = new_path if already_wrapped else self.__class__(new_path)
    # Open-ended bounds fall back on the widest possible range #
    shortest = 0 if lower_bound is None else lower_bound
    longest = sys.maxsize if upper_bound is None else upper_bound
    # Stream only the reads that fit #
    def reads_within_bounds():
        for candidate in self:
            size = len(candidate)
            if not (shortest <= size <= longest): continue
            yield candidate
    # Write them and hand back the result #
    fraction.write(reads_within_bounds())
    return fraction
def rename_sequences(self, mapping, new_path=None, in_place=False):
    """
    Will rename all sequences in the current fasta file using
    the mapping dictionary also provided. In place or at a new path.

    :param mapping: A dictionary whose keys are the current sequence
                    descriptions and whose values are the new names.
    :param new_path: Where the renamed sequences are written. When None,
                     a path obtained from `new_temp_path()` is used.
    :param in_place: When true, the original file is removed and replaced
                     by the renamed version.
    :return: `self` when working in place, otherwise the new object.
    :raises KeyError: When a description is absent from `mapping`. The
                      output file is closed before the error propagates.
    """
    # Where is the new file #
    if new_path is None: new_fasta = self.__class__(new_temp_path())
    else:                new_fasta = self.__class__(new_path)
    # Do it #
    new_fasta.create()
    try:
        for seq in self:
            new_name = mapping[seq.description]
            nucleotides = str(seq.seq)
            new_fasta.add_str(nucleotides, new_name)
    finally:
        # Close the output even if a name is missing from the mapping
        new_fasta.close()
    # Return #
    if in_place:
        os.remove(self.path)
        shutil.move(new_fasta, self.path)
        return self
    else:
        return new_fasta
def rename_with_num(self, prefix="", new_path=None, remove_desc=True):
    """
    Rename every sequence based on a prefix and a number.

    The new ID of a sequence is the prefix followed by its zero-based
    position in the file. Sequences are also converted to upper case.

    :param prefix: The string placed in front of every number.
    :param new_path: Where the renamed sequences are written. When None,
                     they are written to a path obtained from
                     `new_temp_path()` which then replaces the original.
    :param remove_desc: When true, the description of every sequence is
                        set to the empty string.
    :return: `self` when the original file was replaced, otherwise the
             object wrapping the newly written file.
    """
    # Temporary path #
    if new_path is None: numbered = self.__class__(new_temp_path())
    else:                numbered = self.__class__(new_path)
    # Generator #
    def numbered_iterator():
        for i, read in enumerate(self):
            read.id = prefix + str(i)
            read.seq = read.seq.upper()
            if remove_desc: read.description = ""
            yield read
    # Do it #
    numbered.write(numbered_iterator())
    # Replace it #
    if new_path is None:
        os.remove(self.path)
        shutil.move(numbered, self.path)
        # The temporary file has just been moved away, so return the
        # object that points to the renamed content instead.
        return self
    # Return #
    return numbered
def sqlite_by_shell(self, destination):
    """
    Method with shell and a temp file. This is hopefully fast.

    A script is written by `self.sqlite_dump_shell` to a temporary path
    and then fed to the `sqlite3` executable through its `-init` option.

    :param destination: The path handed to `sqlite3` as the database file.
    """
    import os
    script_path = new_temp_path()
    try:
        self.sqlite_dump_shell(script_path)
        # The original referenced an undefined name `script` here
        shell_output('sqlite3 -bail -init "%s" "%s" .quit' % (script_path, destination))
    finally:
        # Always clean up the temporary script, even if sqlite3 failed
        if os.path.exists(script_path): os.remove(script_path)