Ejemplo n.º 1
0
    def make_master_fasta(self, output_fasta=None, force=None):
        '''
        Build a multi-fasta file that holds only the master sequence of
        every cluster returned by self.parse().

        @param:    output_fasta
        @pdef:     name of the Fasta file to create
        @pdefault: self.master_fasta_file
        @ptype:    {String}

        @param:    force
        @pdef:     overwrite previous files with the same name; handed over
                   unchanged to Fasta.build_multifasta
        @pdefault: _None_
        @ptype:    {Boolean}

        @return: the value of Fasta.build_multifasta (presumably a {Fasta};
                 confirm against that method)
        '''
        destination = self.master_fasta_file if output_fasta is None else output_fasta
        cluster_list = self.parse()
        source_fasta = Fasta(self._input)
        # A set keeps each master name only once.
        master_names = set(cluster.master.name
                           for cluster in cluster_list.clusters)
        master_seqs = source_fasta.retrieve(master_names)
        return Fasta.build_multifasta(destination, master_seqs, force=force)
Ejemplo n.º 2
0
    def execute_query(self, query_file = None, blast_output_file = None):
        '''
        Run BLAST for the single sequence stored in a fasta file.

        @param:    query_file
        @pdef:     fasta with the query sequence.
        @ptype:    {String} or {File} or {Fasta}

        @param:    blast_output_file
        @pdef:     name of the BLAST output file.
        @pdefault: current directory + query prefix + pid + .blast.xml.out
        @ptype:    {String}

        @raises: {TypeError} if query_file is none of the accepted types.
        @raises: {BE} with code -4 if query_file is a multi-fasta.

        @returns: the value of self._parse_blast, or an empty BR.BlastResult
                  when the whole query is made of X.
        '''
        if isinstance(query_file, (basestring, File)):
            newFasta = Fasta(fasta_file = query_file)
        elif isinstance(query_file, Fasta):
            newFasta = query_file
        else:
            # Without this branch newFasta stays unbound and the next line
            # dies with an UnboundLocalError that hides the real problem.
            raise TypeError('query_file must be a file name, a File or a Fasta; '
                            'got {0}'.format(type(query_file).__name__))

        if newFasta.is_multifasta:
            raise BE(code = -4, value = newFasta.file.full)

        newFasta.load()
        query = newFasta.sequences[0]
        # All the sequence is unknown, it will crash blast
        if len(re.sub(r'[Xx]', '', query.sequence)) == 0:
            return BR.BlastResult(queryname=query.id, querylength=len(query.sequence))

        if blast_output_file is None:
            temp_output_name = os.path.join(os.getcwd(), newFasta.file.prefix + "." + str(os.getpid()) + ".blast.xml.out")
        else:
            temp_output_name = blast_output_file

        self._execute(input_file = newFasta, output_file = temp_output_name)

        # Named blast_result so the local does not shadow a BlastResult class.
        blast_result = self._parse_blast(query.sequence, temp_output_name)

        if self.clean_files:
            self._clean([temp_output_name])

        return blast_result
Ejemplo n.º 3
0
 def get_PDBseq_filtered(self, resolution_threshold, output_file):
     '''
     Write a multi-fasta with the PDBseq entries whose resolution is at or
     below a threshold.

     @param:    resolution_threshold
     @pdef:     inclusive upper bound for the resolution; converted with
                float() before comparing.
     @ptype:    anything float() accepts

     @param:    output_file
     @pdef:     name handed to Fasta.build_multifasta.
     @ptype:    {String}

     @returns: the value of Fasta.build_multifasta, which is called with True
               as third argument (presumably "force"; confirm).
     '''
     selected_names = []
     for pdb_name, resolution in self.get_resolutions().iteritems():
         if float(resolution) <= float(resolution_threshold):
             selected_names.append(pdb_name)
     pdbseq_fasta = Fasta(os.path.join(self.PDBseq, 'PDBseq.fa'))
     # retrieve() receives a deep copy of the name list, not the list itself.
     picked = pdbseq_fasta.retrieve(copy.deepcopy(selected_names),
                                    prefix_size=4)
     return Fasta.build_multifasta(output_file, picked, True)
Ejemplo n.º 4
0
    def execute_query(self, query_file, blast_output_file = None,
                      work_directory = None):
        '''
        Execute BLAST given a fasta file with a single query sequence.

        @param:    query_file
        @pdef:     Fasta file with the query sequence.
        @ptype:    {String} or {File} or {Fasta}

        @param:    blast_output_file
        @pdef:     name of the temporary BLAST output file; it is joined to
                   work_directory.
        @pdefault: query_file.prefix + job.pid + .blast.xml.out
        @ptype:    {String}

        @param:    work_directory
        @pdef:     Directory to which the temporary files will be created.
        @pdefault: Current working directory at the time of the call.
        @ptype:    {String}

        @raises: {AttributeError} if query_file is multi-fasta or is none of
                 the accepted types.
        @raises: {BlastError} in BLAST execution or output parsing errors.

        @returns: {BlastResult}
        '''
        if isinstance(query_file, (basestring, File)):
            newFasta = Fasta(fasta_file = query_file)
        elif isinstance(query_file, Fasta):
            newFasta = query_file
        else:
            # Without this branch newFasta stays unbound and the next line
            # dies with an UnboundLocalError that hides the real problem.
            msg = 'query_file must be a file name, a File or a Fasta object.'
            raise AttributeError(msg)

        if newFasta.is_multifasta:
            msg = 'Blasts can only be executed one at a time due to XML output restrictions.'
            raise AttributeError(msg)

        newFasta.load()
        query_sequence = newFasta.sequence
        # All the sequence is unknown, it will crash blast
        if len(re.sub(r'[Xx]', '', query_sequence.sequence)) == 0:
            SBIg.warn(self, 'Created an empty BlastResult.')
            return BlastResult(query_name     = query_sequence.id,
                               query_sequence = query_sequence.sequence)

        # Resolved per call: a os.getcwd() default in the signature is
        # evaluated once, when the method is defined, and goes stale after
        # any later os.chdir().
        if work_directory is None:
            work_directory = os.getcwd()
        Path.mkdir(work_directory)

        if blast_output_file is None:
            file_prefixes = ".".join([newFasta.file.prefix, str(os.getpid())])
            tmp_output    = os.path.join(work_directory,
                                         file_prefixes + ".blast.xml.out")
        else:
            tmp_output = os.path.join(work_directory, blast_output_file)

        self._execute(input_file = newFasta, output_file = tmp_output)

        blast_result = self._parse_blast(newFasta.sequence.sequence, tmp_output)

        self._clean([tmp_output, ])

        return blast_result
Ejemplo n.º 5
0
    def get_FASTA_IDX_by_names_to_file(self, names, outfile):
        '''
        Write the sequences listed in names to outfile and the matching
        lines of the index to outfile + '.idx'.

        The source fasta is self.PDBseq and its index is
        self.PDBseq + '.idx'.

        @param:    names
        @pdef:     names of the sequences to extract.
        @ptype:    collection of {String}

        @param:    outfile
        @pdef:     name of the fasta file to write.
        @ptype:    {String}
        '''
        fastafile = Fasta(self.PDBseq)
        # retrieve() gets a deep copy, presumably because it alters the
        # collection it receives -- confirm against Fasta.retrieve.
        selectedfasta = fastafile.retrieve(copy.deepcopy(names))

        # try/finally so the handles are released even if a write fails.
        output_fasta = File(outfile, 'w')
        try:
            for sequence in selectedfasta:
                output_fasta.write(sequence.format('FASTA') + "\n")
        finally:
            output_fasta.close()

        # Built once so each index line costs a hash lookup, not a scan.
        wanted = frozenset(names)
        output_idx = File(outfile + '.idx', 'w')
        try:
            input_idx = File(self.PDBseq + '.idx', 'r')
            try:
                for line in input_idx.descriptor:
                    info = line.split()
                    if not info:
                        # Blank line: nothing to match, and info[0] would fail.
                        continue
                    # The first field carries a one-character prefix
                    # before the name.
                    if info[0][1:] in wanted:
                        output_idx.write(line)
            finally:
                input_idx.close()
        finally:
            output_idx.close()
Ejemplo n.º 6
0
    def execute_query_seq(self, sequenceID = None, sequence = None, blast_input_file = None, blast_output_file = None):
        '''
        Run BLAST for one query given as a name and/or a sequence.

        When only sequenceID is given, the sequence is fetched from
        self._database. When only sequence is given, the query is named
        'QuerySequence'.

        @param:    sequenceID
        @pdef:     name of the query sequence.
        @ptype:    {String}

        @param:    sequence
        @pdef:     query sequence.
        @ptype:    {String}

        @param:    blast_input_file
        @pdef:     name of the fasta file written as BLAST input.
        @pdefault: current directory + pid + clock + .tmp.fa
        @ptype:    {String}

        @param:    blast_output_file
        @pdef:     name of the BLAST output file.
        @pdefault: current directory + pid + clock + .blast.xml.out
        @ptype:    {String}

        @raises: {AttributeError} if neither sequenceID nor sequence is
                 given, or if sequenceID is a list, set or tuple.

        @returns: the value of self._parse_blast, or an empty BR.BlastResult
                  when the whole query is made of X.
        '''
        if sequence is None and sequenceID is None:
            raise AttributeError('Either a sequence or ID is needed to perform the blast')
        if isinstance(sequenceID, (list, set, tuple)):
            raise AttributeError('Blasts can only be executed one at a time due to XML output restrictions')

        query_id  = 'QuerySequence' if sequenceID is None else sequenceID
        query_seq = sequence
        # Given only a code implies that the protein of interest is in the
        # database itself
        if query_seq is None:
            stored    = self._database.retrieve(query_id)
            query_id  = stored.id
            query_seq = stored.sequence

        # All the sequence is unknown, it will crash blast
        if not re.sub(r'[Xx]', '', query_seq):
            return BR.BlastResult(queryname=query_id, querylength=len(query_seq))

        stamp = ".".join([str(os.getpid()), str(int(time.clock()*100000))])

        fasta_path = blast_input_file
        if fasta_path is None:
            fasta_path = os.path.join(os.getcwd(), stamp + ".tmp.fa")
        result_path = blast_output_file
        if result_path is None:
            result_path = os.path.join(os.getcwd(), stamp + ".blast.xml.out")

        query_fasta = Fasta.build(file_name = fasta_path, sequenceID = query_id, sequence = query_seq, force = True)
        self._execute(input_file = query_fasta, output_file = result_path)
        parsed = self._parse_blast(query_seq, result_path)

        if self.clean_files:
            self._clean([fasta_path, result_path])

        return parsed
Ejemplo n.º 7
0
    def _check_database(self, database):
        '''
        Ensures that the given database to blast upon exists and that it is
        formatted for blast.
        It also sets the index file for the database (self._idx) if it
        exists, or None otherwise.

        @param:    database
        @pdef:     database to blast upon.
        @ptype:    {String}

        @raises: the error built by self._error.database_does_not_exist if
                 the database file is missing.
        @raises: the error built by self._error.no_blast_format_exe if the
                 formatting executable is not configured.
        @raises: the error built by self._error.wrong_db_format if formatting
                 the database fails.

        @returns: {Fasta} object pointed to the database fasta file.
        '''
        # Database file does not exist. The error is raised, like the other
        # self._error products below; returning it would hand the caller an
        # error object where a Fasta is expected.
        if not os.path.isfile(database):
            raise self._error.database_does_not_exist(database)

        # Database is not formatted (if dbformatexe is added in the
        # configuration path it will be auto-formatted)
        formatdb_sufix = BlastExe._EXEC_TYPES[self._search_type]
        needs_format   = any(not os.path.isfile(database + sufix)
                             for sufix in formatdb_sufix)

        if needs_format:
            try:
                self._format_database(database)
            except ConfigParser.NoOptionError as e:
                raise self._error.no_blast_format_exe(e)
            except SystemError as e:
                raise self._error.wrong_db_format(database, e)

        idx       = os.path.abspath(database) + ".idx"
        self._idx = idx if os.path.isfile(idx) else None

        return Fasta(fasta_file = database)
Ejemplo n.º 8
0
    def execute_query_seq(self, sequenceID = None, sequence          = None,
                          blast_input_file = None, blast_output_file = None,
                          work_directory   = None):
        '''
        Execute BLAST given a query sequence.

        @param:    sequenceID
        @pdef:     name of the query sequence.
        @pdefault: 'QuerySequence'
        @pclash:   If sequence is not provided, it assumes that the sequenceID
                   belongs to a protein in the database and, thus, it searches
                   for it. Either sequenceID or sequence needs to be provided.
        @ptype:    {String}

        @param:    sequence
        @pdef:     query sequence.
        @pdefault: _None_
        @pclash:   Either sequenceID or sequence needs to be provided.
        @ptype:    {String}

        @param:    blast_input_file
        @pdef:     name of the temporary fasta file to use as BLAST input; it
                   is joined to work_directory.
        @pdefault: job.pid + clock + .tmp.fa
        @ptype:    {String}

        @param:    blast_output_file
        @pdef:     name of the temporary BLAST output file; it is joined to
                   work_directory.
        @pdefault: job.pid + clock + .blast.xml.out
        @ptype:    {String}

        @param:    work_directory
        @pdef:     Directory to which the temporary files will be created.
        @pdefault: Current working directory at the time of the call.
        @ptype:    {String}

        @raises: {AttributeError} if neither sequenceID nor sequence are
                  provided or if sequenceID is a list of sequence names.
        @raises: {BlastError} in BLAST execution or output parsing errors.

        @returns: {BlastResult}
        '''
        if sequenceID is None and sequence is None:
            msg = 'Either a sequence or sequenceID is needed to perform the blast.'
            raise AttributeError(msg)

        if isinstance(sequenceID, (list, set, tuple)):
            msg = 'Blasts can only be executed one at a time due to XML output restrictions.'
            raise AttributeError(msg)

        sequenceID = 'QuerySequence' if sequenceID is None else sequenceID

        # Given only a code implies that the protein of interest is in the
        # database itself
        if sequence is None:
            grabbedSequence = self._database.retrieve(sequenceID)
            sequenceID      = grabbedSequence[0].id
            sequence        = grabbedSequence[0].sequence

        # All the sequence is unknown, it will crash blast
        if len(re.sub(r'[Xx]', '', sequence)) == 0:
            SBIg.warn(self, 'Created an empty BlastResult.')
            return BlastResult(query_name     = sequenceID,
                               query_sequence = sequence)

        # Resolved per call: a os.getcwd() default in the signature is
        # evaluated once, when the method is defined, and goes stale after
        # any later os.chdir().
        if work_directory is None:
            work_directory = os.getcwd()
        Path.mkdir(work_directory)

        # pid + clock keeps the default temporary names distinct.
        file_prefixes = ".".join([str(os.getpid()), str(int(time.clock()*100000))])

        if blast_input_file is None:
            tmp_input = os.path.join(work_directory, file_prefixes + ".tmp.fa")
        else:
            tmp_input = os.path.join(work_directory, blast_input_file)

        if blast_output_file is None:
            tmp_output = os.path.join(work_directory, file_prefixes + ".blast.xml.out")
        else:
            tmp_output = os.path.join(work_directory, blast_output_file)

        QueryFasta = Fasta.build(file_name = tmp_input, sequence_id = sequenceID,
                                 sequence  = sequence,  force       = True)

        self._execute(input_file = QueryFasta, output_file = tmp_output)

        blast_result = self._parse_blast(sequence, tmp_output)

        self._clean([tmp_input, tmp_output])

        return blast_result
Ejemplo n.º 9
0
    def execute_query_seq(self,
                          sequenceID=None,
                          sequence=None,
                          hmmer_input_file=None,
                          hmmer_output_file=None,
                          work_directory=None):
        '''
        Execute HMMER given a query sequence.

        @param:    sequenceID
        @pdef:     name of the query sequence.
        @pdefault: _None_
        @pclash:   Both sequenceID and sequence need to be provided.
        @ptype:    {String}

        @param:    sequence
        @pdef:     query sequence.
        @pdefault: _None_
        @pclash:   Both sequenceID and sequence need to be provided.
        @ptype:    {String}

        @param:    hmmer_input_file
        @pdef:     name of the temporary fasta file to use as HMMER input.
                   Used as given; it is not joined to work_directory.
        @pdefault: job.pid + clock + .tmp.fa
        @ptype:    {String}

        @param:    hmmer_output_file
        @pdef:     name of the temporary HMMER output file. Used as given; it
                   is not joined to work_directory.
        @pdefault: job.pid + clock + .blast.xml.out
        @ptype:    {String}

        @param:    work_directory
        @pdef:     Directory to which the default temporary files will be
                   created.
        @pdefault: Current working directory at the time of the call.
        @ptype:    {String}

        @raises: {AttributeError} if sequenceID or sequence is missing, or
                  if sequenceID is a list of sequence names.
        @raises: {HmmError} in HMMER execution or output parsing errors.

        @returns: {HmmResult}
        '''
        if sequenceID is None or sequence is None:
            msg = 'Both a sequence and sequenceID are needed to perform the blast.'
            raise AttributeError(msg)

        if isinstance(sequenceID, (list, set, tuple)):
            msg = 'Blasts can only be executed one at a time due to parse restrictions.'
            raise AttributeError(msg)

        # A sequence made only of X is not searched; an empty result is
        # returned instead.
        if len(re.sub(r'[Xx]', '', sequence)) == 0:
            SBIg.warn(self, 'Created an empty BlastResult.')
            return HmmResult(query_name=sequenceID,
                             query_sequence=sequence,
                             database=self.database)

        # Resolved per call: a os.getcwd() default in the signature is
        # evaluated once, when the method is defined, and goes stale after
        # any later os.chdir().
        if work_directory is None:
            work_directory = os.getcwd()

        # pid + clock keeps the default temporary names distinct.
        file_prefixes = ".".join(
            [str(os.getpid()),
             str(int(time.clock() * 100000))])
        file_prefixes = os.path.join(work_directory, file_prefixes)

        if hmmer_input_file is None:
            tmp_input = file_prefixes + ".tmp.fa"
        else:
            tmp_input = hmmer_input_file

        if hmmer_output_file is None:
            tmp_output = file_prefixes + ".blast.xml.out"
        else:
            tmp_output = hmmer_output_file

        QueryFasta = Fasta.build(file_name=tmp_input,
                                 sequence_id=sequenceID,
                                 sequence=sequence,
                                 force=True)

        self._execute(input_file=QueryFasta, output_file=tmp_output)

        hmmer_result = self._parse_hmmer(sequenceID, sequence, tmp_output)

        self._clean([tmp_input, tmp_output])

        return hmmer_result
Ejemplo n.º 10
0
        formatdb_files = []
        if self._search_type == 'nucl':     formatdb_sufix = ['.nhr','.nin','.nsq']
        elif self._search_type == 'prot':   formatdb_sufix = ['.phr','.pin','.psq']
        for sufix in formatdb_sufix:
            if not os.path.isfile(database + sufix):
                formatdb_files.append(database + sufix)

        if len(formatdb_files) > 0:
            try:
                dbexe  = Executable(executable    = self._configurator.get('blast','dbformatexe'),
                                    path          = self._configurator.get('blast','path'),
                                    variable_path = self._configurator.get('blast','variable_path'))

                SBIglobals.alert('debug', self, 'Trying to format de DB {0} to perform a blast search.\n'.format(database))

                dbexe.add_attribute(database, '-in')
                dbexe.add_attribute(self._search_type, '-dbtype')

                SBIglobals.alert('debug', self, 'Executing command {0}\n'.format(dbexe))

                dbexe.execute()

            except ConfigParser.NoOptionError, e:
                raise BE(code = -6, value = formatdb_files)
            except SystemError, e:
                raise BE(code = -11, value = e)

        return Fasta(fasta_file = database)