def execute_query(self, query_file, blast_output_file=None, work_directory=None):
    '''
    Execute BLAST given a query sequence.

    @param:    query_file
    @pdef:     Fasta file with the query sequence.
    @pdefault: 'QuerySequence'
    @ptype:    {String} or {File} or {Fasta}

    @param:    blast_output_file
    @pdef:     name of the temporary BLAST output file.
    @pdefault: query_file.prefix + job.pid + .blast.xml.out
    @ptype:    {String}

    @param:    work_directory
    @pdef:     directory in which the temporary files will be created.
    @pdefault: current working directory.
    @ptype:    {String}

    @raises: {AttributeError} if query_file is multi-fasta or of an
             unsupported type.
    @raises: {BlastError} on BLAST execution or output parsing errors.

    @returns: {BlastResult}
    '''
    # Resolve the default at call time; os.getcwd() in the signature would
    # be frozen at import time.
    if work_directory is None:
        work_directory = os.getcwd()

    if isinstance(query_file, (basestring, File)):
        newFasta = Fasta(fasta_file=query_file)
    elif isinstance(query_file, Fasta):
        newFasta = query_file
    else:
        raise AttributeError('query_file must be {String}, {File} or {Fasta}.')

    if newFasta.is_multifasta:
        msg = 'Blasts can only be executed one at a time due to XML output restrictions.'
        raise AttributeError(msg)

    # If the whole sequence is unknown, blast will crash.
    newFasta.load()
    query_sequence = newFasta.sequence
    if len(re.sub(r'[Xx]', '', query_sequence.sequence)) == 0:
        SBIg.warn(self, 'Created an empty BlastResult.')
        return BlastResult(query_name=query_sequence.id,
                           query_sequence=query_sequence.sequence)

    Path.mkdir(work_directory)
    file_prefixes = ".".join([newFasta.file.prefix, str(os.getpid())])
    file_prefixes = os.path.join(work_directory, file_prefixes)
    tmp_output = file_prefixes + ".blast.xml.out"
    tmp_output = tmp_output if blast_output_file is None else os.path.join(work_directory, blast_output_file)

    self._execute(input_file=newFasta, output_file=tmp_output)
    blast_result = self._parse_blast(newFasta.sequence.sequence, tmp_output)
    self._clean([tmp_output, ])
    return blast_result
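
# Usage sketch for execute_query(). BlastExe is assumed to be the owning
# class and its constructor arguments are assumptions; the database and
# query file names are illustrative. query_name is the attribute the
# BlastResult above is built with.
blaster = BlastExe(database='pdb_seqres.fasta')
result = blaster.execute_query(query_file='query.fa')
print result.query_name
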
def _add_AAc(self, chain):
    # A stray 'raise NotImplementedError' made this body unreachable; removed
    # so the method mirrors _add_HTc below.
    if self._AAdo and chain.chaintype == 'P':
        ppi_id = PPInnerContact.test_identifier(chain)
        SBIglobals.alert('debug', self,
                         'Analyzing Protein Inner Contacts {0} for {1.chain}'.format(ppi_id, chain))
        ppi = PPInnerContact(chain, self._AA_type, self._AA_distance)
        SBIglobals.alert('debug', self, '\tAdding new Inner Contacts')
        self._AAcontacts[ppi_id] = ppi
def get_residue_by_identifier(self, identifier):
    if len(self._chaindict) == 0:
        for r in self.full_chain:
            self._chaindict[r.identifier.strip()] = r
    try:
        r = self._chaindict[str(identifier.strip())]
    except KeyError as e:
        SBIglobals.alert('error', self,
                         '{0} is not a valid identifier for chain {1}'.format(identifier, self.globalID), e)
    else:
        return r
def index_file(self, value):
    '''
    @param: value
    @pdef:  name of the related index file
    @ptype: {String}
    '''
    if os.path.isfile(value):
        self._index_file = value
    else:
        SBIg.warn(self, '{0} does not exist.'.format(value))
def _check_action(self, action):
    if action not in self.available_action:
        # Raise 'Wrong action' Error
        raise FileError(1, action, self.available_action)
    if self.is_gziped and not action.endswith('b'):
        action += 'b'  # Better if working with compressed files
    self._action = action
    SBIglobals.alert('debug', self, '\tAction {0} is OK...'.format(self._action))
def _build(self):
    count = 1
    SBIglobals.alert('debug', self, 'Analyzing Inner Contacts of {0:03} chains'.format(len(self.pdb)))
    for chain in self.pdb.chains:
        SBIglobals.alert('debug', self,
                         'Analyzing Chain {0:03} out of {1:03}'.format(count, len(self.pdb)))
        self._add_AAc(chain)
        self._add_NCc(chain)
        self._add_HTc(chain)
        count += 1
def _execute(self):
    '''
    Executes the DSSP call.
    '''
    self._EXE.add_parameter(self._pdbfile)
    self._EXE.add_parameter(self._dsspfile)
    try:
        self._EXE.execute(silent=True)
    except SystemError as e:
        msg = 'Some error occurred while executing dssp\n{0}\n'.format(e)
        SBIg.throw(self, msg, e)
def is_reference_to_db(self, db_minicode):
    db_minicode = db_minicode.upper()
    if db_minicode not in self.valid_references:
        line1 = '{0} is not a valid DBref code'.format(db_minicode)
        line2 = 'Available codes: {0}'.format(self.valid_references.keys())
        SBIglobals.error(self, "\n".join([line1, line2]))
    # SWS and TREMBL are parts of UNP
    if db_minicode == 'UNP' and self._db in ['SWS', 'TREMBL']:
        return True
    return self._db == db_minicode
def dump(self, object_file, overwrite=None):
    """
    - dump(): Stores the object into a file

    - object_file (string): Name for the output file
                            @Mandatory
    - overwrite   (bool):   Overwrite a previous file of the same name

    @Raises FileError
    """
    SBIglobals.alert('verbose', self, 'Writing object to file {0}'.format(object_file))
    dumpFile = File(file_name=object_file, action='wb', overwrite=overwrite)
    pickle.dump(self, dumpFile.descriptor)
    dumpFile.close()
def _add_HTc(self, chain):
    if self._HTdo and chain.chaintype == 'P':
        phi_id = PHInnerContact.test_identifier(chain)
        SBIglobals.alert('debug', self,
                         'Analyzing Protein-Heteroatom Inner Contacts {0} for {1.chain}'.format(phi_id, chain))
        phi = PHInnerContact(chain, self._HT_type, self._HT_distance)
        SBIglobals.alert('debug', self, '\tAdding new Inner Contacts')
        self._HTcontacts[phi_id] = phi
@staticmethod
def load(object_file):
    """
    > load(): Retrieves the object from a python object file

    - object_file (string): Name of the file containing the object
                            @Mandatory

    @Returns the loaded object.
    @staticmethod: can be called without any instance declared
    @Raises FileError
    """
    SBIglobals.alert('verbose', StorableObject,
                     'Preparing to load object from file {0}'.format(object_file))
    loadFile = File(file_name=object_file, action='rb')
    Object = pickle.load(loadFile.descriptor)
    loadFile.close()
    return Object
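
# Usage sketch for the dump()/load() pair. MyData stands for any hypothetical
# subclass of StorableObject; the file name is illustrative.
obj = MyData()
obj.dump('mydata.obj', overwrite=True)    # pickle the instance to disk
copy = StorableObject.load('mydata.obj')  # rebuild it, no instance needed
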
def _check_file(self):
    if self._action.startswith('r'):
        if not os.path.isfile(self.full):
            raise FileError(3, self.full, 'noexists')
        if not os.access(self.full, os.R_OK):
            raise FileError(4, self.full, 'read')
    if self._action.startswith('w') or self._action.startswith('a'):
        if os.path.isfile(self.full):
            if not self._overwrite:
                raise FileError(3, self.full, 'exists')
        if not os.path.isdir(self.dir):
            raise FileError(4, self.dir, 'nodir')
        if not os.access(self.dir, os.W_OK):
            raise FileError(4, self.dir, 'write')
    SBIglobals.alert('debug', self, '\tFile is OK...')
def __init__(self, file_name=None, action='r', overwrite=None):
    if file_name is None:
        raise FileError(0)  # Raise 'No file specified' Error
    self._file = file_name
    SBIglobals.alert('debug', self, 'Preparing File: {0}'.format(self.full))

    self._action = None
    self._check_action(action.lower())  # action must be valid

    self._fd = None

    # Local overwrite takes precedence over Global overwrite
    self._overwrite = SBIglobals.decide_overwrite(overwrite)

    self._check_file()
def __init__(self, database, overwrite=None, clean=True):
    '''
    @param: database
    @pdef:  database to search upon.
    @ptype: {String}

    @param:    overwrite
    @pdef:     For writing actions. Decides whether it can overwrite an
               existing file.
    @pdefault: _SBIglobals.overwrite_
    @ptype:    {Boolean}

    @param:    clean
    @pdef:     remove the temporary files after the data is read.
    @pdefault: _True_
    @pclash:   if _SBIglobals.debug_ is _True_, clean is _False_
    @ptype:    {Boolean}

    @raises: {HmmError}
    '''
    self._error = HmmError()

    # hmmer executable configuration
    if HmmExe._EXE is None:
        self._set_default_executable('hmmer')

    HmmExe._DBFORMATER = HmmExe._CONFIG.get('hmmer', 'dbformatexe')

    # Local overwrite takes precedence over Global overwrite
    self._overwrite = SBIg.decide_overwrite(overwrite)

    self._database = self._check_database(os.path.abspath(database))
    self._clean_files = clean

    # Optional execution parameters
    self._parameters = {'attr': [], 'flag': []}
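
# Usage sketch for HmmExe: wrap a local HMM database for searching. The
# database path is illustrative; only the constructor shown above is assumed.
hmmer = HmmExe(database='Pfam-A.hmm', clean=True)
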
def items(self):
    '''
    Loops through the items of the database.

    @yields: Object depending on the database.
    '''
    if not self.has_local:
        SBIg.throw(self, 'A local database needs to be built first', IOError)

    for ifile in self._ITEM_FILES:
        ifile = os.path.join(self.local, ifile)
        f = File(ifile)
        for line in f.read():
            yield self._DBOBJECT.grab(line.strip())
        f.close()
def __init__(self, pdb_line):
    '''
    @type  pdb_line: String
    @param pdb_line: Line of a PDB. Starts with DBREF

    @raise AttributeError if line does not start with DBREF
    '''
    if not pdb_line.startswith('DBREF '):
        SBIglobals.error(self, '{0} cannot create DBref'.format(pdb_line))

    data = self._process_line(pdb_line)
    self._pdb = data[0]
    self._chain = data[1]
    self._start = data[2]
    self._end = data[3]
    self._db = data[4]
    self._ref = data[5]
def create(self):
    '''
    Create the local database.
    Returns _True_ on success, _False_ otherwise.

    @return: {Boolean}
    '''
    if self.has_local:
        SBIg.warn(self, 'A local copy exists. Executing update.')
        return self.update()

    self._download_sources()
    self._process()
    self._save_release()
    # self._clean_sources()
    return True
def retrieve(self, sequence_ids, all_but=False, prefix_size=None):
    '''
    Get specific sequences from the FASTA file.

    @param: sequence_ids
    @pdef:  sequence identifier(s)
    @ptype: {String}, {List} or {Set}

    @param:    all_but
    @pdef:     Flag. Instead of retrieving the given ids, we retrieve all
               except the given ids.
    @pdefault: _False_
    @ptype:    {Boolean}

    @param:    prefix_size
    @pdef:     maximum characters for the prefix. If _None_, all the
               characters are included.
    @pdefault: _None_
    @ptype:    {Integer}

    @raises: {AttributeError} if sequence_ids is not a valid type.
    @return: {List} of {Sequence}
    '''
    info = 'Skipping sequence {0}' if all_but else 'Getting sequence {0}'
    if not isinstance(sequence_ids, (list, set)):
        SBIg.alert('debug', self, info.format(sequence_ids))
    else:
        SBIg.alert('debug', self, [info.format(x) for x in sequence_ids])

    if isinstance(sequence_ids, basestring):
        sequence_ids = set([sequence_ids])
    if isinstance(sequence_ids, list):
        sequence_ids = set(sequence_ids)
    if isinstance(sequence_ids, set):
        sequences = []
        for s in self.live_show():
            seq_id = s.id if prefix_size is None else s.id[:prefix_size]
            if seq_id in sequence_ids and not all_but:
                sequences.append(s)
            if seq_id not in sequence_ids and all_but:
                sequences.append(s)
        return sequences
    else:
        raise AttributeError('sequence_ids must be a string, list or set.')
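
# Usage sketch for retrieve(). The fasta file name and the identifiers are
# illustrative; the Fasta constructor call mirrors the one used in
# execute_query() above.
fasta = Fasta(fasta_file='sequences.fa')
picked = fasta.retrieve(['seq1', 'seq2'])                # only these two ids
others = fasta.retrieve(['seq1', 'seq2'], all_but=True)  # everything else
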
def _format_database(self, database):
    '''
    Executes the blast script to format the database.

    @param: database
    @pdef:  database to blast upon.
    @ptype: {String}
    '''
    SBIg.warn(self, 'Formatting {0} for blast.'.format(database))
    dbexe = Executable(executable=BlastExe._DBFORMATER, path=self._EXE.path)
    dbexe.add_attribute(database, '-in')
    dbexe.add_attribute(self._search_type[0:4], '-dbtype')
    SBIg.alert('debug', self, 'Executing command {0}\n'.format(dbexe))
    dbexe.execute()
def _build(self):
    if len(self.protein.aminoacids) == 0 or len(self.nucleotide.nucleotides) == 0:
        return
    super(PNInterface, self)._build()
    for i in range(len(self._filtered[0])):
        SBIglobals.alert('deepdebug', self,
                         'Analyze AN Contact for {0.type}:{0.number} - {1.type}:{1.number}'.format(
                             self.protein.aminoacids[self._filtered[0][i]],
                             self.nucleotide.nucleotides[self._filtered[1][i]]))
        new_contact = ContactAN(aminoacid=self.protein.aminoacids[self._filtered[0][i]],
                                nucleotide=self.nucleotide.nucleotides[self._filtered[1][i]],
                                threshold_type=self.threshold_type,
                                threshold_distance=self.threshold_distance)
        if new_contact.is_underthreshold:
            self.contacts = new_contact
def list_directories(root=os.curdir, rootless=False):
    """
    > list_directories(): Returns all dirs in a directory tree

    - root     (string): Root of the directory tree to search
                         @Default: current working directory
    - rootless (bool):   When False, the names of the dirs are returned with
                         absolute path. Otherwise the root is removed.
                         @Default: False

    @Yields directory names
    """
    for path, dirs, files in os.walk(os.path.abspath(root)):
        for onedir in dirs:
            SBIglobals.alert('debug', Path(),
                             'Found directory {0}'.format(os.path.join(path, onedir)))
            if not rootless:
                yield os.path.join(path, onedir)
            else:
                yield os.path.join(path, onedir).replace(root, '')
def _process(self, update=False):
    '''
    Transform the source files into the final local db files.

    @param:    update
    @pdef:     toggles between create and update processing
    @pdefault: _False_
    @ptype:    {Boolean}
    '''
    if update:
        old = self._RELEASE['total_items'].copy()

    j = 0
    for i in range(len(self._SOURCES)):
        dfilen = os.path.join(self.local, self._SOURCES[i])
        ofilen = os.path.join(self.local, self._MANDATORY_FILES[j])
        ffilen = os.path.join(self.local, self._MANDATORY_FILES[j + 1])
        if not os.path.isfile(dfilen):
            continue
        SBIg.alert('verbose', self, 'Parsing: {0}'.format(dfilen))
        SBIg.alert('verbose', self, 'DB file to: {0}'.format(ofilen))
        SBIg.alert('verbose', self, 'Fasta file to: {0}'.format(ffilen))
        dfile = File(dfilen)
        ofile = File(ofilen, 'w', update)
        ffile = File(ffilen, 'w', update)
        protein = None
        for protein in Connect._parse_uniprot(dfile):
            pname = protein.entry_name
            pvers = protein.version
            SBIg.alert('verbose', self, 'Protein: {0}'.format(pname))
            if not update:
                self._RELEASE['total_items'][pname] = pvers
            else:
                if pname not in self._RELEASE['total_items']:
                    self._RELEASE['new_items'][pname] = pvers
                else:
                    del old[pname]
                    if self._RELEASE['total_items'][pname] != pvers:
                        self._RELEASE['update_items'][pname] = pvers
            ffile.write(protein.sequence.format('FASTA') + '\n')
            ofile.write(protein.json() + '\n')
        j += 2
        dfile.close()
        ofile.close()
        ffile.close()

    if update:
        self._RELEASE['total_items'].update(self._RELEASE['new_items'])
        self._RELEASE['total_items'].update(self._RELEASE['update_items'])
        self._RELEASE['deleted_items'] = old
        for k in self._RELEASE['deleted_items']:
            del self._RELEASE['total_items'][k]
def _format_database(self, database):
    '''
    Executes the hmmer script to format the database.

    @param: database
    @pdef:  database to format.
    @ptype: {String}
    '''
    SBIg.warn(self, 'Formatting {0} for hmmer.'.format(database))
    dbexe = Executable(executable=HmmExe._DBFORMATER, path=self._EXE.path)
    if self.overwrite:
        dbexe.add_parameter('-f')
    dbexe.add_parameter(database)
    SBIg.alert('debug', self, 'Executing command {0}\n'.format(dbexe))
    dbexe.execute()
def reposition(self, matrix=None, vector=None):
    """
    Rotates and translates the {Residue} according to a rotation matrix and
    a translation vector.

    @type matrix: numpy.matrix
    @type vector: numpy.array
    """
    if matrix is None:
        matrix = np.identity(3, float)
    if vector is None:
        vector = np.zeros(3, float)
    SBIglobals.alert('deepdebug', self, 'Reposition residue {0.type}:{0.number}'.format(self))
    self._backbone_coordinates = None
    self._sidechain_coordinates = None
    for atom in self.atoms:
        SBIglobals.alert('deepdebug', self, 'Atom {0.name} {0.is_backbone}'.format(atom))
        atom.rotate(matrix=matrix)
        atom.translate(vector=vector)
        self._add_to_matrix(atom)
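
# Usage sketch for reposition(): rotate a residue 90 degrees around the z
# axis and shift it 5 A along x. The residue object itself is assumed to
# already exist.
import numpy as np

rot = np.matrix([[0.0, -1.0, 0.0],
                 [1.0,  0.0, 0.0],
                 [0.0,  0.0, 1.0]])  # 90-degree rotation around z
mov = np.array([5.0, 0.0, 0.0])      # translation along x
residue.reposition(matrix=rot, vector=mov)
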
def __init__(self, pdb_file=None):
    SBIglobals.alert('debug', self, 'Loading PDB file {0}'.format(pdb_file))
    super(PDB, self).__init__(pdb_file=pdb_file, dehydrate=True, header=True)

    SBIglobals.alert('debug', self, 'Calculating Inner Contacts for Protein - Heteroatom')
    self.innercontacts = InnerContacts(pdb=self, AA=False, NC=False,
                                       HT=True, HT_type="min", HT_distance=6)

    SBIglobals.alert('debug', self, 'Calculating PP and PN interfaces of the biomolecules in the PDB')
    self.interfaces = Complex(pdb=self, biomolecule=True,
                              PPI=True, PPI_type="cb", PPI_distance=12,
                              PNI=True, PNI_type="min", PNI_distance=8,
                              PHI=True, PHI_type="min", PHI_distance=6)
def sortarchs(inputdir, outputdir):
    archsdir = outputdir
    Path.mkdir(archsdir)
    sorted_archs = {}
    loop_file_name = os.path.join(archsdir, 'ArchDB.{0}.db')
    loop_split_file_name = os.path.join(archsdir, 'ArchDB.{0}.{1:02d}-{2:02d}.db')
    sections_ini = [0, 4, 7, 14, 21]
    sections_end = [4, 6, 13, 20, 0]

    # Group the arch files by arch type and loop length.
    for archfile in Path.list_files(root=inputdir, pattern='*.archObj'):
        filename = os.path.basename(archfile)
        data = filename.split('_')
        length = int(data[0])
        archtype = data[1]
        sorted_archs.setdefault(archtype, {}).setdefault(length, [])
        sorted_archs[archtype][length].append(archfile)

    # Write one global file per arch type plus one file per length section
    # (a section end of 0 means 'no upper bound').
    for archtype in sorted_archs:
        SBIglobals.alert('verbose', None, "ARCHS: " + archtype + "\n")
        fd = File(loop_file_name.format(archtype), 'w')
        fdp = []
        for x in range(len(sections_ini)):
            fdp.append(File(loop_split_file_name.format(archtype, sections_ini[x], sections_end[x]), 'w'))
        for length in sorted(sorted_archs[archtype]):
            SBIglobals.alert('verbose', None, '\t{0}'.format(length))
            for archfile in sorted_archs[archtype][length]:
                SBIglobals.alert('verbose', None, '\t\t{0}\n'.format(archfile))
                nsp = Arch.load(archfile)
                fd.descriptor.write(nsp.archtype_format() + "\n")
                for x in range(len(fdp)):
                    if length >= sections_ini[x] and (sections_end[x] == 0 or length <= sections_end[x]):
                        fdp[x].descriptor.write(nsp.archtype_format() + "\n")
        fd.close()
        for x in range(len(fdp)):
            fdp[x].close()
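
# Usage sketch for sortarchs(): classify the *.archObj files found under an
# input tree into per-archtype (and per-length-section) ArchDB files. The
# directory names are illustrative.
sortarchs('archobjs/', 'archdb/')
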
def _download_sources(self):
    '''
    Download the source files to the local directory.
    '''
    for dfile in self._SOURCES:
        download = False
        source = os.path.join(self._FTP, dfile)
        source_size = DBlink._file_size(source)
        # source_date = DBlink._file_date(source)
        DBlink._SOURCES_SIZES.append(source_size)
        destination = os.path.join(self.local, dfile)
        if not os.path.isfile(destination):
            if DBlink._RELEASE['date'] != DBlink._TODAY:
                # if source_date < DBlink._RELEASE:
                #     SBIg.alert('verbose', self, 'No new updates in the source side' +
                #                ' for {0}'.format(source) +
                #                ' since the last local update.')
                # else:
                download = True
            else:
                download = True
        else:
            SBIg.alert('verbose', self,
                       'Looks like {0} has already been downloaded.'.format(source))
        if download:
            SBIg.alert('verbose', self, 'Downloading {0} to {1}'.format(source, destination))
            SBIg.alert('verbose', self, 'Source file size is {0:.3f} MB.'.format(source_size))
            urllib.urlretrieve(source, destination)
def list_files(root=os.curdir, pattern='*', avoid_empty_files=True, rootless=False):
    """
    > list_files(): Returns any file in a directory tree matching a specific
                    pattern

    - root              (string): Root of the directory tree to search
                                  @Default: current working directory
    - pattern           (string): Expression to match (ls-like format)
                                  (Accepts list of strings)
                                  @Default: *
    - avoid_empty_files (bool):   Ignore files with size 0
                                  @Default: True
    - rootless          (bool):   When False, the names of the files are
                                  returned with absolute path. Otherwise the
                                  root is removed.
                                  @Default: False

    @Yields file names
    """
    if os.path.isfile(root):
        yield root
    search_patterns = []
    if not isinstance(pattern, list):
        search_patterns.append(pattern)
    else:
        search_patterns = pattern
    for pat in search_patterns:
        for path, dirs, files in os.walk(os.path.abspath(root)):
            for filename in fnmatch.filter(files, pat):
                if not avoid_empty_files or os.path.getsize(os.path.join(path, filename)) > 0:
                    SBIglobals.alert('debug', Path(),
                                     'Found file {0}'.format(os.path.join(path, filename)))
                    if not rootless:
                        yield os.path.join(path, filename)
                    else:
                        root = os.path.abspath(root) + "/"
                        yield os.path.join(path, filename).replace(root, '')
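
# Usage sketch for list_files() (and list_directories() above): walk a tree
# and pick up every PDB file. The root path is illustrative.
for pdb_path in Path.list_files(root='/data/pdb', pattern='*.pdb'):
    print pdb_path
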
def _build(self):
    for dist_type in self.available_distance_types:
        self._distance.setdefault(dist_type, None)
    self._distance[self._threshold_type] = self.aminoacid.distance(self.nucleotide,
                                                                   dist_type=self._threshold_type)
    SBIglobals.alert('deepdebug', self,
                     '\tEvaluating distance {0:.3f} of {1}'.format(self._distance[self._threshold_type][2],
                                                                   self._threshold_type))
    dist = float(self._distance[self._threshold_type][2])
    if 0 <= dist <= self._threshold_distance:
        SBIglobals.alert('deepdebug', self, '\tDistance under threshold.')
        self._underthreshold = True
        for dist_type in self._distance:
            if dist_type != self._threshold_type:
                SBIglobals.alert('deepdebug', self, '\tGathering {0} distance'.format(dist_type))
                self._distance[dist_type] = self.aminoacid.distance(self.nucleotide,
                                                                    dist_type=dist_type)
                SBIglobals.alert('deepdebug', self,
                                 '\t\tDistance {0:.3f} of {1}'.format(self._distance[dist_type][2], dist_type))
def reduce(self, new_fasta_file, list_file, force=None):
    '''
    Reduces the {Fasta} by removing identical sequences.

    @param: new_fasta_file
    @pdef:  name of the new fasta file
    @ptype: {String}

    @param: list_file
    @pdef:  name of the repetition list file
    @ptype: {String}

    @param:    force
    @pdef:     overwrite previous files with the same name
    @pdefault: _SBIglobals.overwrite_
    @ptype:    {Boolean}

    @return: {Fasta} and {File} with the list of identical sequences.
    '''
    seq_md5 = {}
    sequences = []
    for seq in self.live_show():
        md5 = seq.md5
        if md5 not in seq_md5:
            sequences.append(seq)
            # Register the kept sequence id so later repeats can refer to it;
            # an empty list here would make seq_md5[md5][0] fail below.
            seq_md5.setdefault(md5, []).append(seq.id)
        else:
            SBIg.alert('debug', self,
                       '{0} repeats of {1}'.format(seq.id, seq_md5[md5][0]))
            seq_md5[md5].append(seq.id)
    fasta = Fasta.build_multifasta(new_fasta_file, sequences, force)
    listfile = File(list_file, 'w')
    for md5 in seq_md5:
        listfile.write('\t'.join(seq_md5[md5]) + '\n')
    listfile.close()
    return fasta, listfile
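
# Usage sketch for reduce(): write a non-redundant fasta plus the list of
# identical-sequence groups. File names are illustrative.
fasta = Fasta(fasta_file='sequences.fa')
nr_fasta, repeats = fasta.reduce('sequences.nr.fa', 'sequences.repeats.lst')
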
def write(self, output_file=None, format='PDB', force=None, clean=False):
    """
    Writes the object in a specific format.

    @type  output_file: String
    @param output_file: File to write

    @type  format: String
    @param format: Format of the file to print
    """
    outfile = File(file_name=output_file, action='w',
                   overwrite=SBIg.decide_overwrite(force))
    if format == 'PDB':
        self._write_PDB_file(pdb_file=outfile, clean=clean)
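
# Usage sketch for write(): dump a structure back to disk as PDB. The
# structure object and output name are illustrative.
structure.write(output_file='out.pdb', format='PDB', force=True)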