def make_master_fasta(self, output_fasta=None, force=None):
    '''
    Write the master sequence of every parsed cluster into one fasta file.

    @param:    output_fasta
    @pdef:     name of the fasta file to create.
    @pdefault: _self.master_fasta_file_
    @ptype:    {String}

    @param:    force
    @pdef:     overwrite flag, handed over untouched to
               Fasta.build_multifasta.
    @pdefault: _None_
    @ptype:    {Boolean}

    @return: {Fasta}
    '''
    target = self.master_fasta_file if output_fasta is None else output_fasta

    cluster_list = self.parse()
    source = Fasta(self._input)

    # A set is used so that a master shared by several clusters is
    # requested only once.
    master_names = set(cluster.master.name for cluster in cluster_list.clusters)

    masters = source.retrieve(master_names)
    return Fasta.build_multifasta(target, masters, force=force)
def execute_query(self, query_file = None, blast_output_file = None):
    '''
    Execute BLAST for the single sequence stored in a fasta file.

    @param:    query_file
    @pdef:     fasta holding the query sequence. Although the signature
               defaults to _None_, a value is mandatory.
    @ptype:    {String}, {File} or {Fasta}

    @param:    blast_output_file
    @pdef:     name of the BLAST output file.
    @pdefault: <cwd>/<query file prefix>.<pid>.blast.xml.out
    @ptype:    {String}

    @raises: {AttributeError} if query_file is not a {String}, {File} or
             {Fasta}.
    @raises: {BE} with code -4 if query_file is a multi-fasta.

    @returns: {BlastResult}
    '''
    if isinstance(query_file, (basestring, File)):
        newFasta = Fasta(fasta_file = query_file)
    elif isinstance(query_file, Fasta):
        newFasta = query_file
    else:
        # Previously this fell through and crashed later with an
        # UnboundLocalError on newFasta.
        msg = 'query_file must be a file name, a File or a Fasta; got {0}.'
        raise AttributeError(msg.format(type(query_file).__name__))

    if newFasta.is_multifasta:
        raise BE(code = -4, value = newFasta.file.full)

    newFasta.load()
    query = newFasta.sequences[0]

    # All the sequence is unknown, it will crash blast
    if len(re.sub(r'[Xx]', '', query.sequence)) == 0:
        return BR.BlastResult(queryname   = query.id,
                              querylength = len(query.sequence))

    if blast_output_file is None:
        temp_output_name = os.path.join(os.getcwd(),
                                        newFasta.file.prefix + "." + str(os.getpid()) + ".blast.xml.out")
    else:
        temp_output_name = blast_output_file

    self._execute(input_file = newFasta, output_file = temp_output_name)
    # Lower-case local so the result does not shadow a BlastResult name.
    blast_result = self._parse_blast(query.sequence, temp_output_name)

    if self.clean_files:
        self._clean([temp_output_name])

    return blast_result
def get_PDBseq_filtered(self, resolution_threshold, output_file):
    '''
    Build a multi-fasta with the PDBseq entries whose resolution does not
    exceed a threshold.

    @param:    resolution_threshold
    @pdef:     maximum accepted resolution; compared as a float.
    @ptype:    anything accepted by float()

    @param:    output_file
    @pdef:     name of the multi-fasta file to create.
    @ptype:    {String}

    @return: whatever Fasta.build_multifasta returns.
    '''
    accepted = []
    for name, resolution in self.get_resolutions().iteritems():
        if float(resolution) <= float(resolution_threshold):
            accepted.append(name)

    pdbseq = Fasta(os.path.join(self.PDBseq, 'PDBseq.fa'))
    # retrieve() is handed a copy of the name list, never the list itself.
    matches = pdbseq.retrieve(copy.deepcopy(accepted), prefix_size=4)

    # NOTE(review): the third positional argument is presumably the
    # overwrite/force flag -- confirm against Fasta.build_multifasta.
    return Fasta.build_multifasta(output_file, matches, True)
def execute_query(self, query_file, blast_output_file = None, work_directory = None):
    '''
    Execute BLAST given a fasta file with a single query sequence.

    @param:    query_file
    @pdef:     Fasta file with the query sequence.
    @ptype:    {String} or {File} or {Fasta}

    @param:    blast_output_file
    @pdef:     name of the temporary BLAST output file. When given, it is
               joined onto work_directory.
    @pdefault: query_file.prefix + job.pid + .blast.xml.out
    @ptype:    {String}

    @param:    work_directory
    @pdef:     Directory to which the temporary files will be created.
    @pdefault: Current working directory at the moment of the call.
    @ptype:    {String}

    @raises: {AttributeError} if query_file is multi-fasta or is not a
             {String}, {File} or {Fasta}.
    @raises: NOTE(review): the previous documentation also listed
             {BlastError} for BLAST execution or output parsing errors;
             presumably raised by the _execute/_parse_blast helpers.

    @returns: {BlastResult}
    '''
    if isinstance(query_file, (basestring, File)):
        newFasta = Fasta(fasta_file = query_file)
    elif isinstance(query_file, Fasta):
        newFasta = query_file
    else:
        # Previously this fell through and crashed later with an
        # UnboundLocalError on newFasta.
        msg = 'query_file must be a file name, a File or a Fasta; got {0}.'
        raise AttributeError(msg.format(type(query_file).__name__))

    if newFasta.is_multifasta:
        msg = 'Blasts can only be executed one at a time due to XML output restrictions.'
        raise AttributeError(msg)

    # Resolved per call: a default of os.getcwd() in the signature would be
    # frozen to the directory in which the module was first imported.
    if work_directory is None:
        work_directory = os.getcwd()

    # All the sequence is unknown, it will crash blast
    newFasta.load()
    query_sequence = newFasta.sequence
    if len(re.sub(r'[Xx]', '', query_sequence.sequence)) == 0:
        SBIg.warn(self, 'Created an empty BlastResult.')
        return BlastResult(query_name     = query_sequence.id,
                           query_sequence = query_sequence.sequence)

    Path.mkdir(work_directory)
    file_prefixes = ".".join([newFasta.file.prefix, str(os.getpid())])
    file_prefixes = os.path.join(work_directory, file_prefixes)
    if blast_output_file is None:
        tmp_output = file_prefixes + ".blast.xml.out"
    else:
        tmp_output = os.path.join(work_directory, blast_output_file)

    self._execute(input_file = newFasta, output_file = tmp_output)
    blast_result = self._parse_blast(newFasta.sequence.sequence, tmp_output)

    # The output file is always handed to _clean, even when the caller named it.
    self._clean([tmp_output, ])

    return blast_result
def get_FASTA_IDX_by_names_to_file(self, names, outfile):
    '''
    Dump a selection of sequences and their index lines into new files.

    Two files are written: outfile, with the selected sequences in FASTA
    format, and outfile + '.idx', with the lines of self.PDBseq + '.idx'
    whose name is listed in names.

    @param:    names
    @pdef:     names of the sequences to extract.
    @ptype:    container supporting copy.deepcopy and the 'in' operator

    @param:    outfile
    @pdef:     name of the fasta file to create; the index file name is
               derived from it by appending '.idx'.
    @ptype:    {String}
    '''
    fastafile     = Fasta(self.PDBseq)
    selectedfasta = fastafile.retrieve(copy.deepcopy(names))

    # try/finally guarantees the handles are released even if a write fails.
    output_fasta = File(outfile, 'w')
    try:
        for sequence in selectedfasta:
            output_fasta.write(sequence.format('FASTA') + "\n")
    finally:
        output_fasta.close()

    output_idx = File(outfile + '.idx', 'w')
    try:
        input_idx = File(self.PDBseq + '.idx', 'r')
        try:
            for line in input_idx.descriptor:
                info = line.split()
                if not info:
                    # Blank line: there is no name field to compare.
                    continue
                # The name is the first field minus its leading character
                # (presumably a '>' marker -- confirm against the idx format).
                pdbname = info[0][1:]
                if pdbname in names:
                    output_idx.write(line)
        finally:
            input_idx.close()
    finally:
        output_idx.close()
def execute_query_seq(self, sequenceID = None, sequence = None, blast_input_file = None, blast_output_file = None):
    '''
    Execute BLAST for one query given as a sequence and/or a sequence name.

    @param:    sequenceID
    @pdef:     name of the query. When no sequence is given, the name is
               looked up in self._database.
    @pdefault: 'QuerySequence'
    @ptype:    {String}

    @param:    sequence
    @pdef:     query sequence.
    @pdefault: _None_
    @ptype:    {String}

    @param:    blast_input_file
    @pdef:     name of the fasta file written as BLAST input.
    @pdefault: <cwd>/<pid>.<clock>.tmp.fa
    @ptype:    {String}

    @param:    blast_output_file
    @pdef:     name of the BLAST output file.
    @pdefault: <cwd>/<pid>.<clock>.blast.xml.out
    @ptype:    {String}

    @raises: {AttributeError} if neither sequenceID nor sequence is given,
             or if sequenceID is a list, set or tuple.

    @returns: {BlastResult}
    '''
    if sequenceID is None and sequence is None:
        raise AttributeError('Either a sequence or ID is needed to perform the blast')
    if isinstance(sequenceID, (list,set,tuple)):
        raise AttributeError('Blasts can only be executed one at a time due to XML output restrictions')

    query_id  = 'QuerySequence' if sequenceID is None else sequenceID
    query_seq = sequence

    # Only a name was given: the protein is expected to be in the database.
    if query_seq is None:
        entry     = self._database.retrieve(query_id)
        query_id  = entry.id
        query_seq = entry.sequence

    # A query made only of X would crash blast; answer with an empty result.
    if not re.sub(r'[Xx]','',query_seq):
        return BR.BlastResult(queryname=query_id, querylength=len(query_seq))

    stamp = "{0}.{1}".format(os.getpid(), int(time.clock()*100000))

    temp_input_name = blast_input_file
    if temp_input_name is None:
        temp_input_name = os.path.join(os.getcwd(), stamp + ".tmp.fa")

    temp_output_name = blast_output_file
    if temp_output_name is None:
        temp_output_name = os.path.join(os.getcwd(), stamp + ".blast.xml.out")

    query_fasta = Fasta.build(file_name  = temp_input_name,
                              sequenceID = query_id,
                              sequence   = query_seq,
                              force      = True)
    self._execute(input_file = query_fasta, output_file = temp_output_name)
    outcome = self._parse_blast(query_seq, temp_output_name)

    if self.clean_files:
        self._clean([temp_input_name,temp_output_name])

    return outcome
def _check_database(self, database):
    '''
    Ensures that the given database to blast upon exists and that it is
    formatted for blast. It also sets the index file for the database if it
    exists.

    @param:    database
    @pdef:     database to blast upon.
    @ptype:    {String}

    @raises: the error built by self._error.database_does_not_exist if the
             database file is missing.
    @raises: the error built by self._error.no_blast_format_exe if the
             formatting attempt hits a ConfigParser.NoOptionError.
    @raises: the error built by self._error.wrong_db_format if the
             formatting attempt hits a SystemError.

    @returns: {Fasta} object pointed to the database fasta file.
    '''
    # Database file does not exist.
    # The error is raised, like the other self._error.* errors below;
    # returning it would hand the caller an error object in place of the
    # documented Fasta.
    if not os.path.isfile(database):
        raise self._error.database_does_not_exist(database)

    # Database is not formatted (if dbformatexe is added in the
    # configuration path it will be auto-formatted).
    formatdb_sufix = BlastExe._EXEC_TYPES[self._search_type]
    needs_format = any(not os.path.isfile(database + sufix)
                       for sufix in formatdb_sufix)
    if needs_format:
        try:
            self._format_database(database)
        except ConfigParser.NoOptionError as e:
            raise self._error.no_blast_format_exe(e)
        except SystemError as e:
            raise self._error.wrong_db_format(database, e)

    # The index is optional: self._idx stays None when the file is absent.
    idx       = os.path.abspath(database) + ".idx"
    self._idx = idx if os.path.isfile(idx) else None

    return Fasta(fasta_file = database)
def execute_query_seq(self, sequenceID = None, sequence = None, blast_input_file = None, blast_output_file = None, work_directory = None):
    '''
    Execute BLAST given a query sequence.

    @param:    sequenceID
    @pdef:     name of the query sequence.
    @pdefault: 'QuerySequence'
    @pclash:   If sequence is not provided, it assumes that the sequenceID
               belongs to a protein in the database and, thus, it searches
               for it. Either sequenceID or sequence needs to be provided.
    @ptype:    {String}

    @param:    sequence
    @pdef:     query sequence.
    @pdefault: _None_
    @pclash:   Either sequenceID or sequence needs to be provided.
    @ptype:    {String}

    @param:    blast_input_file
    @pdef:     name of the temporary fasta file to use as BLAST input.
               When given, it is joined onto work_directory.
    @pdefault: job.pid + clock + .tmp.fa
    @ptype:    {String}

    @param:    blast_output_file
    @pdef:     name of the temporary BLAST output file. When given, it is
               joined onto work_directory.
    @pdefault: job.pid + clock + .blast.xml.out
    @ptype:    {String}

    @param:    work_directory
    @pdef:     Directory to which the temporary files will be created.
    @pdefault: Current working directory at the moment of the call.
    @ptype:    {String}

    @raises: {AttributeError} if neither sequenceID nor sequence are
             provided or if sequenceID is a list of sequence names.
    @raises: NOTE(review): the previous documentation also listed
             {BlastError} for BLAST execution or output parsing errors;
             presumably raised by the _execute/_parse_blast helpers.

    @returns: {BlastResult}
    '''
    if sequenceID is None and sequence is None:
        msg = 'Either a sequence or sequenceID is needed to perform the blast.'
        raise AttributeError(msg)
    if isinstance(sequenceID, (list, set, tuple)):
        msg = 'Blasts can only be executed one at a time due to XML output restrictions.'
        raise AttributeError(msg)

    # Resolved per call: a default of os.getcwd() in the signature would be
    # frozen to the directory in which the module was first imported.
    if work_directory is None:
        work_directory = os.getcwd()

    sequenceID = 'QuerySequence' if sequenceID is None else sequenceID

    # Given only a code implies that the protein of interest is in the
    # database itself
    if sequence is None:
        grabbedSequence = self._database.retrieve(sequenceID)
        sequenceID      = grabbedSequence[0].id
        sequence        = grabbedSequence[0].sequence

    # All the sequence is unknown, it will crash blast
    if len(re.sub(r'[Xx]', '', sequence)) == 0:
        SBIg.warn(self, 'Created an empty BlastResult.')
        return BlastResult(query_name     = sequenceID,
                           query_sequence = sequence)

    Path.mkdir(work_directory)
    file_prefixes = ".".join([str(os.getpid()), str(int(time.clock()*100000))])
    file_prefixes = os.path.join(work_directory, file_prefixes)
    tmp_input     = file_prefixes + ".tmp.fa"
    tmp_output    = file_prefixes + ".blast.xml.out"

    tmp_input  = tmp_input  if blast_input_file  is None else os.path.join(work_directory, blast_input_file)
    tmp_output = tmp_output if blast_output_file is None else os.path.join(work_directory, blast_output_file)

    QueryFasta = Fasta.build(file_name   = tmp_input,
                             sequence_id = sequenceID,
                             sequence    = sequence,
                             force       = True)

    self._execute(input_file = QueryFasta, output_file = tmp_output)
    blast_result = self._parse_blast(sequence, tmp_output)

    # Both files are always handed to _clean, even when the caller named them.
    self._clean([tmp_input, tmp_output])

    return blast_result
def execute_query_seq(self, sequenceID=None, sequence=None,
                      hmmer_input_file=None, hmmer_output_file=None,
                      work_directory=None):
    '''
    Execute HMMER given a query sequence.

    @param:    sequenceID
    @pdef:     name of the query sequence.
    @pclash:   Although the signature defaults to _None_, both sequenceID
               and sequence must be provided.
    @ptype:    {String}

    @param:    sequence
    @pdef:     query sequence.
    @pclash:   Although the signature defaults to _None_, both sequenceID
               and sequence must be provided.
    @ptype:    {String}

    @param:    hmmer_input_file
    @pdef:     name of the temporary fasta file to use as HMMER input.
               Used as given; it is not joined onto work_directory.
    @pdefault: job.pid + clock + .tmp.fa (inside work_directory)
    @ptype:    {String}

    @param:    hmmer_output_file
    @pdef:     name of the temporary HMMER output file. Used as given; it
               is not joined onto work_directory.
    @pdefault: job.pid + clock + .blast.xml.out (inside work_directory)
    @ptype:    {String}

    @param:    work_directory
    @pdef:     Directory to which the temporary files will be created.
               It is not created here; it must already exist.
    @pdefault: Current working directory at the moment of the call.
    @ptype:    {String}

    @raises: {AttributeError} if sequenceID or sequence is missing or if
             sequenceID is a list of sequence names.
    @raises: NOTE(review): the previous documentation also listed
             {HmmError} for HMMER execution or output parsing errors;
             presumably raised by the _execute/_parse_hmmer helpers.

    @returns: {HmmResult}
    '''
    if sequenceID is None or sequence is None:
        msg = 'Both a sequence and sequenceID are needed to perform the blast.'
        raise AttributeError(msg)
    if isinstance(sequenceID, (list, set, tuple)):
        msg = 'Blasts can only be executed one at a time due to parse restrictions.'
        raise AttributeError(msg)

    # Resolved per call: a default of os.getcwd() in the signature would be
    # frozen to the directory in which the module was first imported.
    if work_directory is None:
        work_directory = os.getcwd()

    # A query made only of X is answered with an empty result instead of
    # being handed to the external program.
    # NOTE(review): the warning text says BlastResult although an
    # HmmResult is returned; left untouched since it is runtime output.
    if len(re.sub(r'[Xx]', '', sequence)) == 0:
        SBIg.warn(self, 'Created an empty BlastResult.')
        return HmmResult(query_name=sequenceID,
                         query_sequence=sequence,
                         database=self.database)

    file_prefixes = ".".join(
        [str(os.getpid()), str(int(time.clock() * 100000))])
    file_prefixes = os.path.join(work_directory, file_prefixes)
    tmp_input = file_prefixes + ".tmp.fa"
    # NOTE(review): this suffix looks inherited from the BLAST wrapper;
    # kept as-is so generated file names do not change.
    tmp_output = file_prefixes + ".blast.xml.out"

    tmp_input = tmp_input if hmmer_input_file is None else hmmer_input_file
    tmp_output = tmp_output if hmmer_output_file is None else hmmer_output_file

    QueryFasta = Fasta.build(file_name=tmp_input, sequence_id=sequenceID,
                             sequence=sequence, force=True)

    self._execute(input_file=QueryFasta, output_file=tmp_output)
    hmmer_result = self._parse_hmmer(sequenceID, sequence, tmp_output)

    # Both files are always handed to _clean, even when the caller named them.
    self._clean([tmp_input, tmp_output])

    return hmmer_result
formatdb_files = [] if self._search_type == 'nucl': formatdb_sufix = ['.nhr','.nin','.nsq'] elif self._search_type == 'prot': formatdb_sufix = ['.phr','.pin','.psq'] for sufix in formatdb_sufix: if not os.path.isfile(database + sufix): formatdb_files.append(database + sufix) if len(formatdb_files) > 0: try: dbexe = Executable(executable = self._configurator.get('blast','dbformatexe'), path = self._configurator.get('blast','path'), variable_path = self._configurator.get('blast','variable_path')) SBIglobals.alert('debug', self, 'Trying to format de DB {0} to perform a blast search.\n'.format(database)) dbexe.add_attribute(database, '-in') dbexe.add_attribute(self._search_type, '-dbtype') SBIglobals.alert('debug', self, 'Executing command {0}\n'.format(dbexe)) dbexe.execute() except ConfigParser.NoOptionError, e: raise BE(code = -6, value = formatdb_files) except SystemError, e: raise BE(code = -11, value = e) return Fasta(fasta_file = database)