def filter(self, filtering):
    """We can do some special filtering on the results.
    For the moment only minimum coverage and minimum identity.

    :param filtering: A dictionary that may contain 'min_coverage' and/or
                      'min_identity'. Both values are multiplied by 100
                      before being compared against the 'qcovs' and 'pident'
                      columns respectively.
    :raises Exception: If a threshold is requested for a column that does
                       not appear in `self.params['-outfmt']`.

    The file at `self.out_path` is replaced by a version that contains only
    the lines passing every requested threshold.
    """
    # Conditions #
    outfmt = self.params['-outfmt']
    if 'min_coverage' in filtering and 'qcovs' not in outfmt:
        raise Exception("Can't filter on minimum coverage because it wasn't included.")
    if 'min_identity' in filtering and 'pident' not in outfmt:
        raise Exception("Can't filter on minimum identity because it wasn't included.")
    # Only locate the columns we were actually asked to filter on, so that
    # filtering on one criterion does not require the other column to exist.
    # The -1 shifts past the first token of -outfmt (presumably the numeric
    # format code, which has no matching column in the output -- TODO confirm).
    columns = outfmt.strip('"').split()
    checks = []
    if 'min_coverage' in filtering:
        checks.append((columns.index('qcovs') - 1, filtering['min_coverage'] * 100))
    if 'min_identity' in filtering:
        checks.append((columns.index('pident') - 1, filtering['min_identity'] * 100))
    # Iterator #
    def filter_lines(blastout):
        for line in blastout:
            fields = line.split()
            if any(float(fields[position]) < threshold for position, threshold in checks):
                continue
            yield line
    # Do it #
    # NOTE(review): this iterates directly over `self.out_path`; it assumes
    # that object yields the lines of the file -- confirm against its class.
    temp_path = new_temp_path()
    with open(temp_path, 'w') as handle:
        handle.writelines(filter_lines(self.out_path))
    os.remove(self.out_path)
    shutil.move(temp_path, self.out_path)
def gblocks(self, new_path=None, seq_type='nucl'):
    """Apply the gblocks filtering algorithm to the alignment. See
    http://molevol.cmima.csic.es/castresana/Gblocks/Gblocks_documentation.html
    Need to rename all sequences, because it will complain with long names.

    :param new_path: Where to place the result. A temporary path is used
                     when this is None.
    :param seq_type: Either 'nucl' (passes "-t=d") or 'prot' (passes "-t=p").
    :returns: An instance of the same class as `self` built on the new path.
    :raises ValueError: If `seq_type` is not one of the two accepted values.
    :raises Exception: If gblocks reports "Execution terminated".
    """
    # Options #
    # Validate first: previously an unknown type left `t_option` undefined
    # and only failed later, after the temporary files had been created.
    t_options = {'nucl': "-t=d", 'prot': "-t=p"}
    if seq_type not in t_options:
        raise ValueError("seq_type must be 'nucl' or 'prot', got %r." % (seq_type,))
    t_option = t_options[seq_type]
    # Temporary path #
    if new_path is None:
        final = self.__class__(new_temp_path())
    else:
        final = self.__class__(new_path)
    # Mapping every sequence name with a random name #
    orig_name_to_temp = {seq.description: 'name' + str(i) for i, seq in enumerate(self)}
    temp_name_to_orig = {v: k for k, v in orig_name_to_temp.items()}
    # Rename every sequence with a random name #
    temp_fasta = self.rename_sequences(orig_name_to_temp)
    # Run it #
    result = sh.gblocks91(temp_fasta.path, t_option, '-p=n', "-b4=3", "-b3=20", "-b5=a",
                          _ok_code=[0, 1])
    created_file = temp_fasta.path + '-gb'
    assert os.path.exists(created_file)
    # Check errors #
    if "Execution terminated" in result.stdout:
        raise Exception("gblocks crashed again.")
    # Back #
    # NOTE(review): the '-gb' file is only checked for existence; what gets
    # renamed back into `final` is `temp_fasta` itself -- confirm intended.
    temp_fasta.rename_sequences(temp_name_to_orig, final)
    # Return #
    return final
def gblocks(self, new_path=None, seq_type='nucl'):
    """Apply the gblocks filtering algorithm to the alignment. See
    http://molevol.cmima.csic.es/castresana/Gblocks/Gblocks_documentation.html
    Need to rename all sequences, because it will complain with long names.

    :param new_path: Destination of the result; a temporary path when None.
    :param seq_type: 'nucl' selects the "-t=d" option, 'prot' selects "-t=p".
    :returns: A new instance of `self.__class__` located at the destination.
    :raises ValueError: When `seq_type` is neither 'nucl' nor 'prot'.
    :raises Exception: When the gblocks output contains "Execution terminated".
    """
    # Options #
    # Checked before any file is touched. An unrecognised type used to leave
    # `t_option` unassigned and crash at the call to gblocks further down.
    if seq_type == 'nucl':
        t_option = "-t=d"
    elif seq_type == 'prot':
        t_option = "-t=p"
    else:
        raise ValueError("seq_type must be 'nucl' or 'prot', got %r." % (seq_type,))
    # Temporary path #
    if new_path is None:
        final = self.__class__(new_temp_path())
    else:
        final = self.__class__(new_path)
    # Mapping every sequence name with a random name #
    orig_name_to_temp = {
        seq.description: 'name' + str(i)
        for i, seq in enumerate(self)
    }
    temp_name_to_orig = {v: k for k, v in orig_name_to_temp.items()}
    # Rename every sequence with a random name #
    temp_fasta = self.rename_sequences(orig_name_to_temp)
    # Run it #
    result = sh.gblocks91(temp_fasta.path,
                          t_option,
                          '-p=n',
                          "-b4=3",
                          "-b3=20",
                          "-b5=a",
                          _ok_code=[0, 1])
    created_file = temp_fasta.path + '-gb'
    assert os.path.exists(created_file)
    # Check errors #
    if "Execution terminated" in result.stdout:
        raise Exception("gblocks crashed again.")
    # Back #
    # NOTE(review): `created_file` is never read; the sequences written to
    # `final` come from `temp_fasta` -- confirm this is the intent.
    temp_fasta.rename_sequences(temp_name_to_orig, final)
    # Return #
    return final
def set_size(self, length):
    """Trim all sequences to a specific length starting from the end.

    Reads shorter than `length` are dropped; every other read is cut down
    to its last `length` positions. The result is written to a temporary
    FASTA (kept in `self.size_trimmed`) which then replaces `self.reads`.
    """
    # Generator #
    def keep_tails(sequences):
        for sequence in sequences:
            if len(sequence) >= length:
                yield sequence[-length:]
    # Do it #
    trimmed = FASTA(new_temp_path())
    self.size_trimmed = trimmed
    trimmed.write(keep_tails(self.reads))
    trimmed.close()
    # Replace it #
    self.reads.remove()
    shutil.move(self.size_trimmed, self.reads)
def phred_13_to_18_sed(self, new_path=None, in_place=True):
    """Illumina-1.3 format conversion to Illumina-1.8 format via sed (faster).

    :param new_path: Destination used when `in_place` is not True. A
                     temporary path is used when this is None.
    :param in_place: When True (the default) the file at `self.path` is
                     edited with `sed -i` and `new_path` is ignored.
    :returns: `self` when editing in place, otherwise a new instance of the
              same class located at the destination.
    """
    # String #
    # A `y` (transliterate) command restricted by the `4~4` address.
    # NOTE(review): the `'\''` sequence looks like shell quoting, yet `sh`
    # hands arguments to sed without a shell -- confirm sed accepts it.
    sed_command = r"""4~4y/@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghi/!"#$%&'\''()*+,-.\/0123456789:;<=>?@ABCDEFGHIJ/"""
    # Faster with bash utilities #
    if in_place is True:
        sh.sed('-i', sed_command, self.path)
        return self
    # New file #
    if new_path is None:
        new_fastq = self.__class__(new_temp_path())
    else:
        new_fastq = self.__class__(new_path)
    # Send sed's standard output to the new file. The path used to be glued
    # onto the end of the sed script itself, which never produced a file.
    sh.sed(sed_command, self.path, _out=new_fastq.path)
    return new_fastq
def extract_sequences(self, ids, new_path=None):
    """Copy every sequence whose id appears in `ids` into another file.

    `new_path` may be None (a temporary path is used), an existing FASTA
    object (used as is) or anything else (passed to the constructor of the
    current class). The destination object is returned once closed.
    """
    # Destination #
    if new_path is None:
        destination = self.__class__(new_temp_path())
    else:
        destination = new_path if isinstance(new_path, FASTA) else self.__class__(new_path)
    # Do it #
    destination.create()
    for wanted in (seq for seq in self if seq.id in ids):
        destination.add_seq(wanted)
    destination.close()
    return destination
def extract_length(self, lower_bound, upper_bound, new_path=None, cls=None):
    """Extract a certain length fraction and place them in a new file.

    :param lower_bound: Minimum length (inclusive). None means zero.
    :param upper_bound: Maximum length (inclusive). None means no limit.
    :param new_path: Destination; a temporary path is used when None.
    :param cls: Class used to build the destination object. Defaults to the
                class of `self`.
    :returns: The destination object, after it was written and closed.
    """
    # Temporary path #
    cls = cls or self.__class__
    fraction = cls(new_temp_path()) if new_path is None else cls(new_path)
    # Generator #
    if lower_bound is None:
        lower_bound = 0
    def fraction_iterator():
        for read in self:
            size = len(read)
            # A missing upper bound is treated as unbounded, mirroring the
            # way a missing lower bound is treated as zero.
            if lower_bound <= size and (upper_bound is None or size <= upper_bound):
                yield read
    # Do it #
    fraction.write(fraction_iterator())
    fraction.close()
    return fraction
def set_paths(self, base_dir, script_path):
    """Set the directory, the script path and the outfile path.

    :param base_dir: Optional directory. When given, the script is placed at
                     "run.<extension>" and the output at "run.out" inside
                     it, and `change_dir` is pointed at it.
    :param script_path: Optional explicit script location. When given it is
                        made absolute and overrides any other choice.

    When neither argument is given the script goes to a temporary path.
    """
    # Make absolute paths #
    if 'change_dir' in self.kwargs:
        self.kwargs['change_dir'] = DirectoryPath(os.path.abspath(self.kwargs['change_dir']))
    if 'out_file' in self.kwargs:
        self.kwargs['out_file'] = FilePath(os.path.abspath(self.kwargs['out_file']))
    # In case there is a base directory #
    if base_dir is not None:
        self.base_dir = DirectoryPath(os.path.abspath(base_dir))
        # Join instead of concatenating so that a `base_dir` without a
        # trailing separator does not yield paths such as "dirrun.out".
        script_name = "run." + self.extensions[self.language]
        self.script_path = FilePath(os.path.join(base_dir, script_name))
        self.kwargs['change_dir'] = base_dir
        self.kwargs['out_file'] = FilePath(os.path.join(base_dir, "run.out"))
    # Other cases #
    if base_dir is None and script_path is None:
        self.script_path = FilePath(new_temp_path())
    if script_path is not None:
        self.script_path = FilePath(os.path.abspath(script_path))
def remove_trailing_stars(self, new_path=None, in_place=True, check=False):
    """Remove the bad character that can be inserted by some programs at the
    end of sequences.

    :param new_path: Destination used when `in_place` is not True. A
                     temporary path is used when this is None.
    :param in_place: When True (the default) the file at `self.path` is
                     edited with `sed -i` and `new_path` is ignored.
    :param check: When true, first count the lines containing a star with
                  `grep -c` and return immediately if there are none.
    :returns: `self` when nothing had to be done or when editing in place,
              otherwise the newly written object.
    """
    # Optional check #
    # Raw strings keep the backslash without relying on Python leaving an
    # unknown escape sequence untouched.
    if check and int(sh.grep('-c', r'\*', self.path, _ok_code=[0, 1])) == 0:
        return self
    # Faster with bash utilities #
    if in_place is True:
        sh.sed('-i', r's/\*$//g', self.path)
        return self
    # Standard way #
    if new_path is None:
        new_fasta = self.__class__(new_temp_path())
    else:
        new_fasta = self.__class__(new_path)
    new_fasta.create()
    for seq in self:
        new_fasta.add_str(str(seq.seq).rstrip('*'), seq.id)
    new_fasta.close()
    return new_fasta
def phred_13_to_18(self, new_path=None, in_place=True):
    """Illumina-1.3 format conversion to Illumina-1.8 format via BioPython.

    Every record is read with `self.format` set to 'fastq-illumina' and
    written out in 'fastq-sanger' format; `self.format` is then set to
    'fastq-sanger'. With `in_place` the converted file replaces the one at
    `self.path` and `self` is returned, otherwise the new object is returned.
    """
    # New file #
    destination = new_temp_path(suffix=self.extension) if new_path is None else new_path
    new_fastq = self.__class__(destination)
    # Do it #
    self.format = 'fastq-illumina'
    new_fastq.open('w')
    write_all = new_fastq.handle.writelines
    write_all(seq.format('fastq-sanger') for seq in self)
    new_fastq.close()
    self.format = 'fastq-sanger'
    # Return #
    if not in_place:
        return new_fastq
    os.remove(self.path)
    shutil.move(new_fastq, self.path)
    return self
def extract_length(self, lower_bound=None, upper_bound=None, new_path=None):
    """Extract a certain length fraction and place them in a new file.

    :param lower_bound: Minimum length (inclusive). None means zero.
    :param upper_bound: Maximum length (inclusive). None means no limit.
    :param new_path: None (a temporary path is used), an existing FASTA
                     object (used as is) or anything else (passed to the
                     constructor of the current class).
    :returns: The destination object, after it was written and closed.
    """
    # Temporary path #
    if new_path is None:
        fraction = self.__class__(new_temp_path())
    elif isinstance(new_path, FASTA):
        fraction = new_path
    else:
        fraction = self.__class__(new_path)
    # Generator #
    if lower_bound is None:
        lower_bound = 0
    def fraction_iterator():
        for read in self:
            size = len(read)
            # No upper bound is expressed directly rather than through
            # `sys.maxint`, which does not exist on Python 3.
            if lower_bound <= size and (upper_bound is None or size <= upper_bound):
                yield read
    # Do it #
    fraction.write(fraction_iterator())
    fraction.close()
    return fraction
def rename_with_num(self, prefix="", new_path=None, remove_desc=True):
    """Rename every sequence based on a prefix and a number.

    Each read gets `prefix` followed by its zero-based position as its id,
    and an empty description when `remove_desc` is set. The result goes to
    `new_path`; when that is None it goes to a temporary path which then
    replaces the file at `self.path`. Nothing is returned.
    """
    # Temporary path #
    replace_original = new_path is None
    numbered = self.__class__(new_temp_path() if replace_original else new_path)
    # Generator #
    def renumber(reads):
        for position, read in enumerate(reads):
            read.id = prefix + str(position)
            if remove_desc:
                read.description = ""
            yield read
    # Do it #
    numbered.write(renumber(self))
    numbered.close()
    # Replace it #
    if not replace_original:
        return
    os.remove(self.path)
    shutil.move(numbered, self.path)
def rename_sequences(self, mapping, new_path=None, in_place=False):
    """Rename all sequences of the current file using `mapping`.

    `mapping` is indexed by the description of each sequence and provides
    its new name. The output goes to `new_path`, or to a temporary path
    when that is None. With `in_place` the output then replaces the file at
    `self.path` and `self` is returned; otherwise the new object is returned.
    """
    # Where is the new file #
    destination = new_temp_path() if new_path is None else new_path
    renamed = self.__class__(destination)
    # Do it #
    renamed.create()
    for record in self:
        replacement = mapping[record.description]
        residues = str(record.seq)
        renamed.add_str(residues, replacement)
    renamed.close()
    # Return #
    if not in_place:
        return renamed
    os.remove(self.path)
    shutil.move(renamed, self.path)
    return self
def rename_with_prefix(self, prefix="", new_path=None, in_place=True, remove_desc=True):
    """Rename every sequence based on a prefix.

    :param prefix: String placed in front of every sequence id.
    :param new_path: Where to write the result; a temporary path when None.
    :param in_place: When true (the default) the written file is moved over
                     the file at `self.path` afterwards.
    :param remove_desc: When true the description of every read is emptied.
    :returns: `self` when operating in place, otherwise the new object.
    """
    # Temporary path #
    if new_path is None:
        prefixed = self.__class__(new_temp_path())
    else:
        prefixed = self.__class__(new_path)
    # Generator #
    def prefixed_iterator():
        for read in self:
            read.id = prefix + read.id
            if remove_desc:
                read.description = ""
            yield read
    # Do it #
    prefixed.write(prefixed_iterator())
    prefixed.close()
    # Replace it #
    if in_place:
        os.remove(self.path)
        shutil.move(prefixed, self.path)
        # The file behind `prefixed` has just been moved away, so hand back
        # the object that still points at an existing file.
        return self
    return prefixed
def subsample(self, down_to=1, new_path=None):
    """Pick a number of sequences from the file pseudo-randomly.

    :param down_to: How many sequences to keep.
    :param new_path: None (a temporary path is used), an existing FASTA
                     object (used as is) or anything else (passed to the
                     constructor of the current class).
    :returns: The destination object. When the file holds fewer than
              `down_to` sequences a warning is printed and the destination
              receives a plain copy of the whole file instead.
    """
    # Auto path #
    if new_path is None:
        subsampled = self.__class__(new_temp_path())
    elif isinstance(new_path, FASTA):
        subsampled = new_path
    else:
        subsampled = self.__class__(new_path)
    # Check size #
    if down_to > len(self):
        message = "Can't subsample %s down to %i. Only down to %i."
        # Single-argument call form: valid on both Python 2 and Python 3.
        print(Color.ylw + message % (self, down_to, len(self)) + Color.end)
        # Copy to the resolved destination (`new_path` itself may be None)
        # and return it, so the caller gets a usable object in both cases.
        # NOTE(review): assumes `copy` accepts a plain path -- confirm.
        self.copy(subsampled.path)
        return subsampled
    # Do it #
    subsampled.create()
    for seq in isubsample(self, down_to):
        subsampled.add_seq(seq)
    subsampled.close()
    # Did it work #
    assert len(subsampled) == down_to
    return subsampled
def phred_13_to_18(self, new_path=None, in_place=True):
    """Illumina-1.3 format conversion to Illumina-1.8 format.

    Records are read while `self.format` is 'fastq-illumina', written in
    'fastq-sanger' format to the new file, and `self.format` is left at
    'fastq-sanger'. Returns `self` after replacing the file at `self.path`
    when `in_place` is true, otherwise returns the new object.
    """
    # New file #
    if new_path is not None:
        converted = self.__class__(new_path)
    else:
        converted = self.__class__(new_temp_path(suffix=self.extension))
    # Do it #
    self.format = 'fastq-illumina'
    converted.open('w')
    sink = converted.handle
    sink.writelines(record.format('fastq-sanger') for record in self)
    converted.close()
    self.format = 'fastq-sanger'
    # Return #
    if in_place:
        os.remove(self.path)
        shutil.move(converted, self.path)
    return self if in_place else converted
def set_paths(self, base_dir, script_path):
    """Set the directory, the script path and the outfile path.

    :param base_dir: Optional directory that will hold "run.<extension>"
                     (the script) and "run.out" (the output file). It is
                     also stored as `change_dir`.
    :param script_path: Optional explicit script location; made absolute
                        and applied last, so it wins over `base_dir`.

    With neither argument the script is placed at a temporary path.
    """
    # Make absolute paths #
    if 'change_dir' in self.kwargs:
        self.kwargs['change_dir'] = DirectoryPath(
            os.path.abspath(self.kwargs['change_dir']))
    if 'out_file' in self.kwargs:
        self.kwargs['out_file'] = FilePath(
            os.path.abspath(self.kwargs['out_file']))
    # In case there is a base directory #
    if base_dir is not None:
        self.base_dir = DirectoryPath(os.path.abspath(base_dir))
        # `os.path.join` inserts the separator when `base_dir` lacks a
        # trailing one; plain concatenation produced e.g. "dirrun.out".
        self.script_path = FilePath(
            os.path.join(base_dir, "run." + self.extensions[self.language]))
        self.kwargs['change_dir'] = base_dir
        self.kwargs['out_file'] = FilePath(os.path.join(base_dir, "run.out"))
    # Other cases #
    if base_dir is None and script_path is None:
        self.script_path = FilePath(new_temp_path())
    if script_path is not None:
        self.script_path = FilePath(os.path.abspath(script_path))