Beispiel #1
0
 def filter(self, filtering):
     """Filter the results file in place on coverage and/or identity.

     For the moment only minimum coverage and minimum identity are supported.

     :param filtering: A dictionary that may contain the keys 'min_coverage'
                       and/or 'min_identity'. Each value is multiplied by
                       100 before being compared with the 'qcovs' or the
                       'pident' column respectively.
     :raises Exception: When a requested criterion needs a column that is
                        not listed in the '-outfmt' parameter.

     Lines passing every requested criterion are written to a temporary
     file which then replaces `self.out_path`. A criterion that is not
     requested is not evaluated at all, so its column need not be present.
     """
     # Conditions #
     columns      = self.params['-outfmt'].strip('"').split()
     use_coverage = 'min_coverage' in filtering
     use_identity = 'min_identity' in filtering
     if use_coverage and 'qcovs' not in columns:
         raise Exception("Can't filter on minimum coverage because it wasn't included.")
     if use_identity and 'pident' not in columns:
         raise Exception("Can't filter on minimum identity because it wasn't included.")
     # Only look up the columns we really filter on. The offset of one is
     # presumably there because the first '-outfmt' token is the format
     # number and not a column name -- TODO confirm #
     cov_position = columns.index('qcovs')  - 1 if use_coverage else None
     idy_position = columns.index('pident') - 1 if use_identity else None
     # Iterator #
     def filter_lines(blastout):
         cov_threshold = filtering.get('min_coverage', 0.0) * 100
         idy_threshold = filtering.get('min_identity', 0.0) * 100
         for line in blastout:
             # Split once per line instead of once per criterion #
             fields = line.split()
             if use_coverage and float(fields[cov_position]) < cov_threshold: continue
             if use_identity and float(fields[idy_position]) < idy_threshold: continue
             yield line
     # Do it #
     temp_path = new_temp_path()
     with open(temp_path, 'w') as handle: handle.writelines(filter_lines(self.out_path))
     os.remove(self.out_path)
     shutil.move(temp_path, self.out_path)
Beispiel #2
0
 def gblocks(self,
             new_path = None,
             seq_type = 'nucl'):
     """Apply the gblocks filtering algorithm to the alignment.
     See http://molevol.cmima.csic.es/castresana/Gblocks/Gblocks_documentation.html
     Need to rename all sequences, because it will complain with long names.

     :param new_path: Where to place the result. A temporary path is used
                      when it is None.
     :param seq_type: Either 'nucl' (the default) or 'prot'.
     :raises ValueError: When `seq_type` is neither 'nucl' nor 'prot'.
     :raises Exception: When the gblocks output contains "Execution terminated".
     :returns: An instance of this class built on the destination path.
     """
     # Options: validate first so a bad value fails before any file is made #
     t_options = {'nucl': "-t=d", 'prot': "-t=p"}
     if seq_type not in t_options:
         raise ValueError("seq_type must be 'nucl' or 'prot', got %r." % (seq_type,))
     t_option = t_options[seq_type]
     # Temporary path #
     if new_path is None: final = self.__class__(new_temp_path())
     else:                final = self.__class__(new_path)
     # Mapping every sequence name with a short numbered name #
     orig_name_to_temp = {seq.description: 'name' + str(i) for i,seq in enumerate(self)}
     temp_name_to_orig = {v: k for k, v in orig_name_to_temp.items()}
     # Rename every sequence with its short name #
     temp_fasta = self.rename_sequences(orig_name_to_temp)
     # Run it (exit code 1 is tolerated on purpose) #
     result = sh.gblocks91(temp_fasta.path, t_option, '-p=n', "-b4=3", "-b3=20", "-b5=a", _ok_code=[0,1])
     created_file = temp_fasta.path + '-gb'
     assert os.path.exists(created_file)
     # Check errors #
     if "Execution terminated" in result.stdout: raise Exception("gblocks crashed again.")
     # Back #
     # NOTE(review): `created_file` is only checked for existence; the renaming
     # below reads from `temp_fasta`, not from the '-gb' output -- confirm #
     temp_fasta.rename_sequences(temp_name_to_orig, final)
     # Return #
     return final
Beispiel #3
0
 def gblocks(self, new_path=None, seq_type='nucl'):
     """Apply the gblocks filtering algorithm to the alignment.
     See http://molevol.cmima.csic.es/castresana/Gblocks/Gblocks_documentation.html
     Need to rename all sequences, because it will complain with long names.

     :param new_path: Destination of the result; a temporary path is used
                      when it is None.
     :param seq_type: Either 'nucl' (the default) or 'prot'.
     :raises ValueError: When `seq_type` is neither 'nucl' nor 'prot'.
     :raises Exception: When the gblocks output contains "Execution terminated".
     :returns: An instance of this class built on the destination path.
     """
     # Options: checked up front so that nothing is created for a bad value #
     t_options = {'nucl': "-t=d", 'prot': "-t=p"}
     if seq_type not in t_options:
         raise ValueError(
             "seq_type must be 'nucl' or 'prot', got %r." % (seq_type,))
     t_option = t_options[seq_type]
     # Temporary path #
     if new_path is None: final = self.__class__(new_temp_path())
     else: final = self.__class__(new_path)
     # Mapping every sequence name with a short numbered name #
     orig_name_to_temp = {
         seq.description: 'name' + str(i)
         for i, seq in enumerate(self)
     }
     temp_name_to_orig = {v: k for k, v in orig_name_to_temp.items()}
     # Rename every sequence with its short name #
     temp_fasta = self.rename_sequences(orig_name_to_temp)
     # Run it (exit code 1 is tolerated on purpose) #
     result = sh.gblocks91(temp_fasta.path,
                           t_option,
                           '-p=n',
                           "-b4=3",
                           "-b3=20",
                           "-b5=a",
                           _ok_code=[0, 1])
     created_file = temp_fasta.path + '-gb'
     assert os.path.exists(created_file)
     # Check errors #
     if "Execution terminated" in result.stdout:
         raise Exception("gblocks crashed again.")
     # Back #
     # NOTE(review): `created_file` is only checked for existence; the
     # renaming below reads from `temp_fasta`, not from the '-gb' output --
     # confirm that this is intended #
     temp_fasta.rename_sequences(temp_name_to_orig, final)
     # Return #
     return final
Beispiel #4
0
 def set_size(self, length):
     """Trim all sequences to a specific length starting from the end.

     Reads shorter than `length` are dropped, the others keep only their
     last `length` positions. The result is written to a new temporary
     FASTA (kept as `self.size_trimmed`) which then replaces `self.reads`.

     :param length: The number of trailing positions to keep in every read.
     """
     self.size_trimmed = FASTA(new_temp_path())
     def trim_iterator(reads):
         for read in reads:
             if len(read) < length: continue
             # Slice from an explicit start: `read[-length:]` would hand
             # back the whole read when length is 0 #
             yield read[len(read) - length:]
     self.size_trimmed.write(trim_iterator(self.reads))
     self.size_trimmed.close()
     # Replace it #
     self.reads.remove()
     shutil.move(self.size_trimmed, self.reads)
Beispiel #5
0
 def phred_13_to_18_sed(self, new_path=None, in_place=True):
     """Illumina-1.3 format conversion to Illumina-1.8 format via sed (faster).

     :param new_path: Destination when not converting in place; a temporary
                      path is used when it is None.
     :param in_place: When True (the default) the file at `self.path` is
                      edited with `sed -i` and `self` is returned.
     :returns: `self` when in place, otherwise the newly written file object.
     """
     # String: `4~4` addresses every fourth line and `y` maps characters #
     # NOTE(review): the `'\''` in the target set looks like shell quoting for
     # a single quote; `sh` does not go through a shell, so confirm that sed
     # accepts the script as it is #
     sed_command = r"""4~4y/@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghi/!"#$%&'\''()*+,-.\/0123456789:;<=>?@ABCDEFGHIJ/"""
     # Faster with bash utilities #
     if in_place is True:
         sh.sed('-i', sed_command, self.path)
         return self
     # New file #
     if new_path is None: new_fastq = self.__class__(new_temp_path())
     else: new_fastq = self.__class__(new_path)
     # The script and the input are separate arguments; what sed prints on
     # stdout is redirected into the new file #
     sh.sed(sed_command, self.path, _out=new_fastq.path)
     return new_fastq
Beispiel #6
0
 def extract_sequences(self, ids, new_path=None):
     """Copy every sequence whose id appears in `ids` into another file.

     The destination is a fresh temporary file when `new_path` is None, the
     given object itself when `new_path` is already a FASTA instance, and
     otherwise a new instance of this class built on `new_path`. The
     destination is returned once it has been closed.
     """
     # Destination #
     if new_path is None:
         target = self.__class__(new_temp_path())
     elif isinstance(new_path, FASTA):
         target = new_path
     else:
         target = self.__class__(new_path)
     # Copy the wanted records #
     target.create()
     for record in self:
         if record.id not in ids:
             continue
         target.add_seq(record)
     target.close()
     return target
Beispiel #7
0
 def phred_13_to_18_sed(self, new_path=None, in_place=True):
     """Illumina-1.3 format conversion to Illumina-1.8 format via sed (faster).

     :param new_path: Destination when not converting in place; a temporary
                      path is used when it is None.
     :param in_place: When True (the default) the file at `self.path` is
                      edited with `sed -i` and `self` is returned.
     :returns: `self` when in place, otherwise the newly written file object.
     """
     # String: `4~4` addresses every fourth line and `y` maps characters #
     # NOTE(review): the `'\''` in the target set looks like shell quoting for
     # a single quote; `sh` does not go through a shell, so confirm that sed
     # accepts the script as it is #
     sed_command = r"""4~4y/@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghi/!"#$%&'\''()*+,-.\/0123456789:;<=>?@ABCDEFGHIJ/"""
     # Faster with bash utilities #
     if in_place is True:
         sh.sed('-i', sed_command, self.path)
         return self
     # New file #
     if new_path is None: new_fastq = self.__class__(new_temp_path())
     else:                new_fastq = self.__class__(new_path)
     # Script and input file go in as separate arguments, and the stream
     # that sed prints on stdout is redirected into the new file #
     sh.sed(sed_command, self.path, _out=new_fastq.path)
     return new_fastq
Beispiel #8
0
 def extract_length(self, lower_bound, upper_bound, new_path=None, cls=None):
     """Extract a certain length fraction and place them in a new file.

     :param lower_bound: Minimum read length (inclusive); None means zero.
     :param upper_bound: Maximum read length (inclusive); None means no limit.
     :param new_path: Destination path; a temporary path is used when None.
     :param cls: Class used to build the destination object; defaults to
                 the class of `self`.
     :returns: The destination object, after it was written and closed.
     """
     # Temporary path #
     cls = cls or self.__class__
     fraction = cls(new_temp_path()) if new_path is None else cls(new_path)
     # Bounds: a missing bound means "unbounded" on that side #
     if lower_bound is None: lower_bound = 0
     if upper_bound is None: upper_bound = float('inf')
     # Generator #
     def fraction_iterator():
         for read in self:
             if lower_bound <= len(read) <= upper_bound:
                 yield read
     # Do it #
     fraction.write(fraction_iterator())
     fraction.close()
     return fraction
Beispiel #9
0
 def set_paths(self, base_dir, script_path):
     """Set the directory, the script path and the outfile path.

     :param base_dir: Optional directory. When given, the script is placed
                      at "run.<extension>" and the output at "run.out"
                      inside it, and it becomes the 'change_dir' keyword.
     :param script_path: Optional explicit script location. When given it
                         takes precedence over the one derived from `base_dir`.

     When neither is given, the script goes to a new temporary path.
     All paths stored by this method are made absolute.
     """
     # Make absolute paths #
     if 'change_dir' in self.kwargs:
         self.kwargs['change_dir'] = DirectoryPath(os.path.abspath(self.kwargs['change_dir']))
     if 'out_file' in self.kwargs:
         self.kwargs['out_file']   = FilePath(os.path.abspath(self.kwargs['out_file']))
     # In case there is a base directory #
     if base_dir is not None:
         # Join on the absolute directory so the result does not depend on
         # `base_dir` ending with a slash or on the current directory #
         abs_base_dir              = os.path.abspath(base_dir)
         script_name               = "run." + self.extensions[self.language]
         self.base_dir             = DirectoryPath(abs_base_dir)
         self.script_path          = FilePath(os.path.join(abs_base_dir, script_name))
         self.kwargs['change_dir'] = self.base_dir
         self.kwargs['out_file']   = FilePath(os.path.join(abs_base_dir, "run.out"))
     # Other cases #
     if script_path is not None: self.script_path = FilePath(os.path.abspath(script_path))
     elif base_dir is None:      self.script_path = FilePath(new_temp_path())
Beispiel #10
0
 def remove_trailing_stars(self, new_path=None, in_place=True, check=False):
     """Remove the bad character that can be inserted by some programs at the
     end of sequences.

     :param new_path: Destination when not working in place; a temporary
                      path is used when it is None.
     :param in_place: When True (the default) the file is edited with
                      `sed -i` and `self` is returned.
     :param check: When True, first count the lines containing a star with
                   `grep -c` and return `self` untouched if there are none.
     :returns: `self`, or the newly written file object.
     """
     # Optional check (grep's exit code 1 is accepted via `_ok_code`) #
     # Raw strings keep the backslash without relying on an invalid escape #
     if check and int(sh.grep('-c', r'\*', self.path, _ok_code=[0,1])) == 0: return self
     # Faster with bash utilities #
     if in_place is True:
         sh.sed('-i', r's/\*$//g', self.path)
         return self
     # Standard way #
     if new_path is None: new_fasta = self.__class__(new_temp_path())
     else:                new_fasta = self.__class__(new_path)
     new_fasta.create()
     for seq in self: new_fasta.add_str(str(seq.seq).rstrip('*'), seq.id)
     new_fasta.close()
     return new_fasta
Beispiel #11
0
 def phred_13_to_18(self, new_path=None, in_place=True):
     """Illumina-1.3 format conversion to Illumina-1.8 format via BioPython.

     :param new_path: Where to write the converted records; a temporary
                      path with this file's extension is used when None.
     :param in_place: When True (the default) the converted file replaces
                      the file at `self.path` and `self` is returned.
     :returns: `self` when in place, otherwise the newly written file object.

     `self.format` is switched to 'fastq-illumina' while the records are
     read and is always set to 'fastq-sanger' afterwards, even on failure.
     """
     # New file #
     if new_path is None: new_fastq = self.__class__(new_temp_path(suffix=self.extension))
     else:                new_fastq = self.__class__(new_path)
     # Do it #
     self.format = 'fastq-illumina'
     try:
         new_fastq.open('w')
         # Never leave the output handle open if a record fails to convert #
         try:     new_fastq.handle.writelines(seq.format('fastq-sanger') for seq in self)
         finally: new_fastq.close()
     finally:
         self.format = 'fastq-sanger'
     # Return #
     if in_place:
         os.remove(self.path)
         shutil.move(new_fastq, self.path)
         return self
     else: return new_fastq
Beispiel #12
0
 def extract_length(self, lower_bound=None, upper_bound=None, new_path=None):
     """Extract a certain length fraction and place them in a new file.

     :param lower_bound: Minimum read length (inclusive); None means zero.
     :param upper_bound: Maximum read length (inclusive); None means no limit.
     :param new_path: None for a temporary path, a FASTA instance to write
                      into that object directly, or a path for a new
                      instance of this class.
     :returns: The destination object, after it was written and closed.
     """
     # Temporary path #
     if new_path is None: fraction = self.__class__(new_temp_path())
     elif isinstance(new_path, FASTA): fraction = new_path
     else:                fraction = self.__class__(new_path)
     # Bounds: infinity works on every Python version, `sys.maxint` does not #
     if lower_bound is None: lower_bound = 0
     if upper_bound is None: upper_bound = float('inf')
     # Generator #
     def fraction_iterator():
         for read in self:
             if lower_bound <= len(read) <= upper_bound:
                 yield read
     # Do it #
     fraction.write(fraction_iterator())
     fraction.close()
     return fraction
Beispiel #13
0
 def rename_with_num(self, prefix="", new_path=None, remove_desc=True):
     """Give every sequence an id made of `prefix` followed by its index.

     Indices start at zero. With `remove_desc` the description of every
     sequence is emptied as well. The renamed sequences are written to
     `new_path`; when `new_path` is None they go to a temporary file that
     then replaces the file at `self.path`. Nothing is returned.
     """
     # Destination #
     if new_path is None:
         destination = new_temp_path()
     else:
         destination = new_path
     numbered = self.__class__(destination)
     # Generator #
     def renumber(reads):
         position = 0
         for read in reads:
             read.id = prefix + str(position)
             if remove_desc:
                 read.description = ""
             yield read
             position += 1
     # Write everything out #
     numbered.write(renumber(self))
     numbered.close()
     # An explicit destination means the original stays untouched #
     if new_path is not None:
         return
     # Replace it #
     os.remove(self.path)
     shutil.move(numbered, self.path)
Beispiel #14
0
 def rename_sequences(self, mapping, new_path=None, in_place=False):
     """Write a copy of this file in which every sequence is renamed.

     The new name of a sequence is looked up in `mapping` using its
     description as the key. The copy goes to `new_path`, or to a temporary
     path when that is None. With `in_place` the copy then replaces the file
     at `self.path` and `self` is returned; otherwise the copy is returned.
     """
     # Where is the new file #
     destination = new_temp_path() if new_path is None else new_path
     renamed_fasta = self.__class__(destination)
     # Do it #
     renamed_fasta.create()
     for record in self:
         replacement = mapping[record.description]
         renamed_fasta.add_str(str(record.seq), replacement)
     renamed_fasta.close()
     # Return #
     if not in_place:
         return renamed_fasta
     os.remove(self.path)
     shutil.move(renamed_fasta, self.path)
     return self
Beispiel #15
0
 def rename_with_prefix(self, prefix="", new_path=None, in_place=True, remove_desc=True):
     """Rename every sequence based on a prefix.

     :param prefix: String placed in front of every sequence id.
     :param new_path: Where to write the renamed sequences; a temporary
                      path is used when it is None.
     :param in_place: When True (the default) the written file is moved
                      over the file at `self.path`.
     :param remove_desc: When True the description of every sequence is
                         emptied.
     :returns: `self` when in place, otherwise the newly written file object.
     """
     # Temporary path #
     if new_path is None: prefixed = self.__class__(new_temp_path())
     else:                prefixed = self.__class__(new_path)
     # Generator #
     def prefixed_iterator():
         for read in self:
             read.id = prefix + read.id
             if remove_desc: read.description = ""
             yield read
     # Do it #
     prefixed.write(prefixed_iterator())
     prefixed.close()
     # Replace it #
     if in_place:
         os.remove(self.path)
         shutil.move(prefixed, self.path)
         # The written file was just moved away, so hand back the object
         # that still points at existing data #
         return self
     return prefixed
Beispiel #16
0
 def subsample(self, down_to=1, new_path=None):
     """Pick a number of sequences from the file pseudo-randomly.

     :param down_to: How many sequences to keep.
     :param new_path: None for a temporary path, a FASTA instance to write
                      into that object directly, or a path for a new
                      instance of this class.
     :returns: The destination file object. When `down_to` exceeds the
               number of sequences available, a warning is printed and
               the destination receives a plain copy of this file.
     """
     # Auto path #
     if new_path is None: subsampled = self.__class__(new_temp_path())
     elif isinstance(new_path, FASTA): subsampled = new_path
     else:                subsampled = self.__class__(new_path)
     # Check size #
     if down_to > len(self):
         message = "Can't subsample %s down to %i. Only down to %i."
         # Single-argument call form, valid as statement and as function #
         print(Color.ylw + message % (self, down_to, len(self)) + Color.end)
         # Without a `new_path` the auto-generated file is the copy target #
         self.copy(subsampled.path if new_path is None else new_path)
         return subsampled
     # Do it #
     subsampled.create()
     for seq in isubsample(self, down_to): subsampled.add_seq(seq)
     subsampled.close()
     # Did it work #
     assert len(subsampled) == down_to
     return subsampled
Beispiel #17
0
 def phred_13_to_18(self, new_path=None, in_place=True):
     """Illumina-1.3 format conversion to Illumina-1.8 format.

     :param new_path: Where to write the converted records; a temporary
                      path with this file's extension is used when None.
     :param in_place: When True (the default) the converted file replaces
                      the file at `self.path` and `self` is returned.
     :returns: `self` when in place, otherwise the newly written file object.

     `self.format` is switched to 'fastq-illumina' while the records are
     read and is always set to 'fastq-sanger' afterwards, even on failure.
     """
     # New file #
     if new_path is None:
         new_fastq = self.__class__(new_temp_path(suffix=self.extension))
     else:
         new_fastq = self.__class__(new_path)
     # Do it #
     self.format = 'fastq-illumina'
     try:
         new_fastq.open('w')
         # Never leave the output handle open if a record fails to convert #
         try:
             new_fastq.handle.writelines(
                 seq.format('fastq-sanger') for seq in self)
         finally:
             new_fastq.close()
     finally:
         self.format = 'fastq-sanger'
     # Return #
     if in_place:
         os.remove(self.path)
         shutil.move(new_fastq, self.path)
         return self
     else:
         return new_fastq
Beispiel #18
0
 def set_paths(self, base_dir, script_path):
     """Set the directory, the script path and the outfile path.

     :param base_dir: Optional directory. When given, the script is placed
                      at "run.<extension>" and the output at "run.out"
                      inside it, and it becomes the 'change_dir' keyword.
     :param script_path: Optional explicit script location. When given it
                         takes precedence over the one derived from `base_dir`.

     When neither is given, the script goes to a new temporary path.
     All paths stored by this method are made absolute.
     """
     # Make absolute paths #
     if 'change_dir' in self.kwargs:
         self.kwargs['change_dir'] = DirectoryPath(
             os.path.abspath(self.kwargs['change_dir']))
     if 'out_file' in self.kwargs:
         self.kwargs['out_file'] = FilePath(
             os.path.abspath(self.kwargs['out_file']))
     # In case there is a base directory #
     if base_dir is not None:
         # Join on the absolute directory so the result does not depend on
         # `base_dir` ending with a slash or on the current directory #
         abs_base_dir = os.path.abspath(base_dir)
         script_name = "run." + self.extensions[self.language]
         self.base_dir = DirectoryPath(abs_base_dir)
         self.script_path = FilePath(
             os.path.join(abs_base_dir, script_name))
         self.kwargs['change_dir'] = self.base_dir
         self.kwargs['out_file'] = FilePath(
             os.path.join(abs_base_dir, "run.out"))
     # Other cases #
     if script_path is not None:
         self.script_path = FilePath(os.path.abspath(script_path))
     elif base_dir is None:
         self.script_path = FilePath(new_temp_path())