コード例 #1
0
ファイル: blast.py プロジェクト: xapple/seqsearch
    def filter(self, filtering):
        """
        Filter the tabular BLAST output in place.

        `filtering` is a dictionary that may contain the keys `min_coverage`
        and/or `min_identity`. Each value is multiplied by 100 before being
        compared with the `qcovs` and `pident` columns respectively. Lines
        that fall below a requested threshold are dropped and the file at
        `self.out_path` is replaced by the surviving lines.

        Raises an Exception when a requested filter needs a column that is
        not listed in the `-outfmt` parameter.
        """
        outfmt = self.params['-outfmt']
        # Conditions #
        if 'min_coverage' in filtering and 'qcovs' not in outfmt:
            msg = "Can't filter on minimum coverage because it wasn't included."
            raise Exception(msg)
        if 'min_identity' in filtering and 'pident' not in outfmt:
            msg = "Can't filter on minimum identity because it wasn't included."
            raise Exception(msg)
        # Columns #
        # Only locate the columns of the filters that were actually requested,
        # so that asking for one filter never requires the other column.
        # The first token of `-outfmt` is the format number and not a column
        # name, hence the minus one.
        columns = outfmt.strip('"').split()
        checks = []
        if 'min_coverage' in filtering:
            checks.append((columns.index('qcovs') - 1,
                           filtering['min_coverage'] * 100))
        if 'min_identity' in filtering:
            checks.append((columns.index('pident') - 1,
                           filtering['min_identity'] * 100))

        # Iterator #
        def filter_lines(blastout):
            for line in blastout:
                fields = line.split()
                too_low = any(float(fields[position]) < minimum
                              for position, minimum in checks)
                if not too_low: yield line

        # Do it #
        # NOTE(review): iterating `self.out_path` is expected to yield the
        # lines of the file (path object) - confirm against its class.
        temp_path = new_temp_path()
        with open(temp_path, 'w') as handle:
            handle.writelines(filter_lines(self.out_path))
        os.remove(self.out_path)
        shutil.move(temp_path, self.out_path)
コード例 #2
0
    def rename_with_prefix(self,
                           prefix="",
                           new_path=None,
                           in_place=True,
                           remove_desc=True):
        """
        Rename every sequence by prepending `prefix` to its id.

        The renamed sequences are written to `new_path`, or to a new
        temporary path when `new_path` is None. If `remove_desc` is true the
        description of every sequence is blanked.

        If `in_place` is true the original file is removed and replaced by
        the renamed one, and `self` is returned. Otherwise the original is
        left untouched and the new file object is returned.
        """
        # Temporary path #
        if new_path is None: prefixed = self.__class__(new_temp_path())
        else: prefixed = self.__class__(new_path)

        # Generator #
        def prefixed_iterator():
            for read in self:
                read.id = prefix + read.id
                if remove_desc: read.description = ""
                yield read

        # Do it #
        prefixed.write(prefixed_iterator())
        # Keep both files #
        if not in_place: return prefixed
        # Replace it #
        os.remove(self.path)
        shutil.move(prefixed, self.path)
        # The file behind `prefixed` has just been moved away, so hand back
        # the object that points at the file which now holds the result.
        return self
コード例 #3
0
 def gblocks(self,
             new_path = None,
             seq_type = 'nucl'):
     """Apply the gblocks filtering algorithm to the alignment.
     See http://molevol.cmima.csic.es/castresana/Gblocks/Gblocks_documentation.html
     Need to rename all sequences, because it will complain with long names.

     `seq_type` must be 'nucl' (the default) or 'prot'; anything else
     raises a ValueError. The filtered alignment, with the original
     sequence names restored, is written to `new_path` (or to a new
     temporary path when `new_path` is None) and returned.
     Raises an Exception if gblocks reports "Execution terminated"."""
     # Options #
     # Validate first so a bad value fails before any file is created,
     # instead of surfacing later as an unbound `t_option`.
     t_options = {'nucl': "-t=d", 'prot': "-t=p"}
     if seq_type not in t_options:
         raise ValueError("seq_type must be 'nucl' or 'prot', got %r." % (seq_type,))
     t_option = t_options[seq_type]
     # Temporary path #
     if new_path is None: final = self.__class__(new_temp_path())
     else:                final = self.__class__(new_path)
     # Mapping every sequence name with a random name #
     orig_name_to_temp = {seq.description: 'name' + str(i) for i,seq in enumerate(self)}
     temp_name_to_orig = {v: k for k, v in orig_name_to_temp.items()}
     # Rename every sequence with a random name #
     temp_fasta = self.rename_sequences(orig_name_to_temp)
     # Run it #
     result = sh.gblocks91(temp_fasta.path, t_option, '-p=n', "-b4=3", "-b3=20", "-b5=a", _ok_code=[0,1])
     created_file = temp_fasta.path + '-gb'
     # Check errors #
     # The captured output may be bytes, so decode before searching it.
     # This runs before the existence check so that a crash is reported
     # with the more informative message.
     stdout = result.stdout
     if isinstance(stdout, bytes): stdout = stdout.decode('utf-8', 'replace')
     if "Execution terminated" in stdout: raise Exception("gblocks crashed again.")
     assert os.path.exists(created_file)
     # Back #
     # Restore the names on the file gblocks produced, not on its input,
     # otherwise the filtering result would be thrown away.
     # NOTE(review): assumes gblocks keeps the temporary names unchanged
     # in the headers of its output - confirm on a real run.
     self.__class__(created_file).rename_sequences(temp_name_to_orig, final)
     # Return #
     return final
コード例 #4
0
    def subsample(self, down_to=1, new_path=None, verbose=True):
        """
        Pick a given number of sequences from the file pseudo-randomly.

        The selected sequences are written to `new_path`, which may be a
        path, a FASTA object, or None for a new temporary path. The
        destination object is returned.

        If the file holds fewer than `down_to` sequences a warning is
        printed and the whole file is copied to the destination instead.
        If `verbose` is true a tqdm progress bar is displayed.
        """
        # Pick the destination path #
        if new_path is None:
            subsampled = self.__class__(new_temp_path())
        elif isinstance(new_path, FASTA):
            subsampled = new_path
        else:
            subsampled = self.__class__(new_path)
        # Check size #
        available = len(self)
        if down_to > available:
            message = "Can't subsample %s down to %i. Only down to %i."
            print(Color.ylw + message % (self, down_to, available) + Color.end)
            # Copy to the resolved destination (never to a None path) and
            # return it, so callers get a usable object in every case.
            self.copy(subsampled)
            return subsampled
        # Select verbosity #
        if verbose:
            # Only needed for the progress bar, so import it on demand #
            import tqdm
            wrapper = lambda x: tqdm.tqdm(x, total=self.count)
        else:
            wrapper = lambda x: x

        # Generator #
        def iterator():
            for read in wrapper(isubsample(self, down_to)):
                yield read

        # Do it #
        subsampled.write(iterator())
        # Did it work #
        assert len(subsampled) == down_to
        # Return #
        return subsampled
コード例 #5
0
 def sqlite_by_shell(self, destination):
     """Method with shell and a temp file. This is hopefully fast.

     Writes a script to a temporary path with `self.sqlite_dump_shell`,
     feeds it to the `sqlite3` command line tool with `destination` as the
     database argument, then deletes the temporary script."""
     import os
     from shell_command import shell_output
     script_path = new_temp_path()
     try:
         self.sqlite_dump_shell(script_path)
         shell_output('sqlite3 -bail -init "%s" "%s" .quit' %
                      (script_path, destination))
     finally:
         # Clean up the temporary script even when the conversion failed #
         if os.path.exists(script_path): os.remove(script_path)
コード例 #6
0
 def phred_13_to_18_sed(self, new_path=None, in_place=True):
     """Illumina-1.3 format conversion to Illumina-1.8 format via sed (faster).

     With `in_place` true (the default) the file is edited directly and
     `self` is returned; `new_path` is ignored in that case. Otherwise the
     converted records are written to `new_path` (or to a new temporary
     path when `new_path` is None) and that new object is returned."""
     # String #
     # Transliterate the 42 quality characters '@'..'i' into '!'..'J' on
     # every fourth line (`4~4` is a GNU sed address). The command reaches
     # sed without going through a shell, so the single quote is written
     # as-is: shell-style quoting would make the two lists differ in length.
     sed_command = r"""4~4y/@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghi/!"#$%&'()*+,-.\/0123456789:;<=>?@ABCDEFGHIJ/"""
     # Faster with bash utilities #
     if in_place is True:
         sh.sed('-i', sed_command, self.path)
         return self
     # New file #
     if new_path is None: new_fastq = self.__class__(new_temp_path())
     else:                new_fastq = self.__class__(new_path)
     # Send the output of sed to the new file; the destination must not
     # be glued onto the sed script itself.
     sh.sed(sed_command, self.path, _out=new_fastq.path)
     return new_fastq
コード例 #7
0
 def phred_13_to_18(self, new_path=None, in_place=True):
     """Illumina-1.3 format conversion to Illumina-1.8 format via BioPython.

     Every record is read with `self.format` set to 'fastq-illumina' and
     written out as 'fastq-sanger' to `new_path` (or to a new temporary
     path with the same extension when `new_path` is None). Afterwards
     `self.format` is set to 'fastq-sanger'.

     With `in_place` true (the default) the original file is replaced by
     the converted one and `self` is returned; otherwise the new object
     is returned."""
     # New file #
     if new_path is None: new_fastq = self.__class__(new_temp_path(suffix=self.extension))
     else:                new_fastq = self.__class__(new_path)
     # Do it #
     self.format = 'fastq-illumina'
     try:
         new_fastq.open('w')
         try:
             new_fastq.handle.writelines(seq.format('fastq-sanger') for seq in self)
         finally:
             # Never leave the output handle open, even on a parsing error #
             new_fastq.close()
     finally:
         # Always undo the temporary switch of the input format #
         self.format = 'fastq-sanger'
     # Return #
     if in_place:
         os.remove(self.path)
         shutil.move(new_fastq, self.path)
         return self
     else: return new_fastq
コード例 #8
0
    def extract_sequences(self,
                          ids,
                          new_path=None,
                          in_place=False,
                          verbose=False):
        """
        Will take all the sequences from the current file whose id appears in
        the ids given and place them in a new file.
        If no path is given, a new temporary path is created and returned.
        If `in_place` is set to True, the original file is removed and replaced
        with the result of the extraction, and `self` is returned.
        Optionally, the argument `ids` can be a function which has to take
        one string as only input and return True for keeping the sequence and
        False for discarding the sequence.
        If `verbose` is set to True a tqdm progress bar is displayed.
        """
        # Temporary path #
        if new_path is None: new_fasta = self.__class__(new_temp_path())
        elif isinstance(new_path, FASTA): new_fasta = new_path
        else: new_fasta = self.__class__(new_path)
        # Select verbosity #
        if verbose:
            # Only needed for the progress bar, so import it on demand #
            import tqdm
            wrapper = tqdm.tqdm
        else:
            wrapper = lambda x: x
        # Select the test applied to every id #
        if callable(ids):
            keep = ids
        else:
            wanted = ids
            # A list or tuple would be scanned entirely for every read,
            # so switch to a hashed lookup when its items allow it.
            if isinstance(ids, (list, tuple)):
                try: wanted = frozenset(ids)
                except TypeError: wanted = ids
            keep = lambda name: name in wanted

        # Generator #
        def matching(reads):
            for r in wrapper(reads):
                if keep(r.id): yield r

        # Do it #
        new_fasta.write(matching(self))
        # Return #
        if in_place:
            os.remove(self.path)
            shutil.move(new_fasta, self.path)
            return self
        else:
            return new_fasta
コード例 #9
0
 def set_paths(self, base_dir, script_path):
     """Set the directory, the script path and the outfile path.

     Any 'change_dir' or 'out_file' entry already in `self.kwargs` is made
     absolute. When `base_dir` is given, the script is placed there as
     "run.<extension>" and the output file as "run.out", and 'change_dir'
     is pointed at it. When `script_path` is given it takes precedence
     for the script location. With neither, the script goes to a new
     temporary path."""
     # Make absolute paths #
     if 'change_dir' in self.kwargs:
         self.kwargs['change_dir'] = DirectoryPath(
             os.path.abspath(self.kwargs['change_dir']))
     if 'out_file' in self.kwargs:
         self.kwargs['out_file'] = FilePath(
             os.path.abspath(self.kwargs['out_file']))
     # In case there is a base directory #
     if base_dir is not None:
         # Join on the absolute directory so that the result is right
         # whether or not `base_dir` was given with a trailing separator.
         absolute_base = os.path.abspath(base_dir)
         script_name = "run." + self.extensions[self.language]
         self.base_dir = DirectoryPath(absolute_base)
         self.script_path = FilePath(os.path.join(absolute_base, script_name))
         self.kwargs['change_dir'] = self.base_dir
         self.kwargs['out_file'] = FilePath(
             os.path.join(absolute_base, "run.out"))
     # Other cases #
     if base_dir is None and script_path is None:
         self.script_path = FilePath(new_temp_path())
     if script_path is not None:
         self.script_path = FilePath(os.path.abspath(script_path))
コード例 #10
0
 def remove_trailing_stars(self, new_path=None, in_place=True, check=False):
     """
     Remove the bad character that can be inserted by some programs at the
     end of sequences.

     If `check` is true and `grep -c` counts no line containing a star,
     nothing is done and `self` is returned.
     With `in_place` true (the default) the file is edited directly with
     `sed` and `self` is returned; `new_path` is ignored in that case.
     Otherwise the cleaned sequences are written to `new_path` (or to a
     new temporary path when `new_path` is None) and that new object is
     returned.
     """
     # Optional check #
     if check and int(sh.grep('-c', '\\*', self.path, _ok_code=[0, 1
                                                                ])) == 0:
         return self
     # Faster with bash utilities #
     if in_place is True:
         sh.sed('-i', 's/\\*$//g', self.path)
         return self
     # Standard way #
     if new_path is None: new_fasta = self.__class__(new_temp_path())
     else: new_fasta = self.__class__(new_path)
     new_fasta.create()
     try:
         for seq in self:
             new_fasta.add_str(str(seq.seq).rstrip('*'), seq.id)
     finally:
         # Never leave the output open, even if reading a record fails #
         new_fasta.close()
     # Return #
     return new_fasta
コード例 #11
0
    def extract_length(self,
                       lower_bound=None,
                       upper_bound=None,
                       new_path=None):
        """
        Copy the sequences whose length lies between `lower_bound` and
        `upper_bound` (both inclusive) to another file and return it.
        A bound left at None places no limit on that side.
        The destination is `new_path`, which may be a path or a FASTA
        object, or a new temporary path when `new_path` is None.
        """
        # Destination #
        if new_path is None:
            fraction = self.__class__(new_temp_path())
        elif isinstance(new_path, FASTA):
            fraction = new_path
        else:
            fraction = self.__class__(new_path)
        # Bounds #
        shortest = 0 if lower_bound is None else lower_bound
        longest = sys.maxsize if upper_bound is None else upper_bound

        # Generator #
        def within_bounds():
            for read in self:
                size = len(read)
                if size < shortest or size > longest: continue
                yield read

        # Do it #
        fraction.write(within_bounds())
        # Return #
        return fraction
コード例 #12
0
 def rename_sequences(self, mapping, new_path=None, in_place=False):
     """
     Will rename all sequences in the current fasta file using
     the mapping dictionary also provided. In place or at a new path.

     `mapping` is looked up with the description of every sequence, so a
     sequence whose description is missing from it raises a KeyError.
     The result is written to `new_path`, or to a new temporary path when
     `new_path` is None. With `in_place` true the original file is
     replaced by the result and `self` is returned; otherwise the new
     object is returned.
     """
     # Where is the new file #
     if new_path is None: new_fasta = self.__class__(new_temp_path())
     else: new_fasta = self.__class__(new_path)
     # Do it #
     new_fasta.create()
     try:
         for seq in self:
             new_name = mapping[seq.description]
             nucleotides = str(seq.seq)
             new_fasta.add_str(nucleotides, new_name)
     finally:
         # Never leave the output open, even if a name is missing #
         new_fasta.close()
     # Return #
     if in_place:
         os.remove(self.path)
         shutil.move(new_fasta, self.path)
         return self
     else:
         return new_fasta
コード例 #13
0
    def rename_with_num(self, prefix="", new_path=None, remove_desc=True):
        """
        Rename every sequence based on a prefix and a number.

        Each sequence gets `prefix` followed by its zero-based position as
        its new id, and its residues are converted to upper case. If
        `remove_desc` is true its description is blanked.

        When `new_path` is given the result is written there and that new
        object is returned. When `new_path` is None the original file is
        replaced by the result and `self` is returned.
        """
        # Temporary path #
        if new_path is None: numbered = self.__class__(new_temp_path())
        else: numbered = self.__class__(new_path)

        # Generator #
        def numbered_iterator():
            for i, read in enumerate(self):
                read.id = prefix + str(i)
                read.seq = read.seq.upper()
                if remove_desc: read.description = ""
                yield read

        # Do it #
        numbered.write(numbered_iterator())
        # A destination was given: the original stays as it is #
        if new_path is not None: return numbered
        # Replace it #
        os.remove(self.path)
        shutil.move(numbered, self.path)
        # The file behind `numbered` has just been moved away, so hand back
        # the object that points at the file which now holds the result.
        return self
コード例 #14
0
ファイル: access_database.py プロジェクト: xapple/plumbing
 def sqlite_by_shell(self, destination):
     """Method with shell and a temp file. This is hopefully fast.

     Writes a script to a temporary path with `self.sqlite_dump_shell`,
     feeds it to the `sqlite3` command line tool with `destination` as the
     database argument, then deletes the temporary script."""
     import os
     script_path = new_temp_path()
     try:
         self.sqlite_dump_shell(script_path)
         shell_output('sqlite3 -bail -init "%s" "%s" .quit' % (script_path, destination))
     finally:
         # Clean up the temporary script even when the conversion failed #
         if os.path.exists(script_path): os.remove(script_path)