def _parse(self):
    '''
    Read the DSSP output file (self._dsspfile) into self._dsspdata.

    Every line before the residue-table header is ignored. After the
    header, each line produces one {DSSP} entry built from fixed-width
    columns (amino acid, secondary structure, accessibility) plus the four
    hydrogen-link columns. self._gapped is set to _True_ when a line holds
    '!' in column 13 (reported as a truncated chain) or when two
    consecutive residue numbers are not contiguous.

    @raises: {ValueError} if a numeric column of a residue line cannot be
             converted to an integer.
    '''
    # The header is compared with its whitespace collapsed, so the column
    # padding used by the DSSP writer does not affect detection.
    header = "# RESIDUE AA STRUCTURE BP1 BP2 ACC"

    dssp_fd = File(self._dsspfile)
    in_table = False
    previous_residue = None  # residue number of the last parsed line
    try:
        for line in dssp_fd.read():
            if (line.lstrip().startswith('#')
                    and " ".join(line.split()).startswith(header)):
                in_table = True
                continue
            if not in_table:
                continue

            if line[13:14] == '!':
                msg = "truncated chain!{0}\n".format(self._dsspfile)
                sys.stderr.write(msg)
                SBIg.warn(self, msg)
                self._gapped = True
                continue

            res_num = int(line[6:10].strip())
            ss = line[16:17] if line[16:17] != ' ' else '-'
            buried = int(line[35:38].strip())
            aa = line[13:15].strip()

            entry = DSSP(secondary_structure = ss,
                         accessibility = buried,
                         amino = aa)
            self._dsspdata.append(entry)
            entry.add_hydrogen_links(line[39:50], line[50:61],
                                     line[61:72], line[72:84])

            # The first residue has nothing to be compared against.
            if previous_residue is not None and res_num != previous_residue + 1:
                self._gapped = True
            previous_residue = res_num
    finally:
        # Release the descriptor even when a malformed line aborts parsing.
        dssp_fd.close()
def subset(self, sequence_ids, new_fasta_file, all_but=False,
           prefix_size=None, index=False, force=None):
    '''
    Creates a new {Fasta} with the requested subset of sequences.

    @param:    sequence_ids
    @pdef:     sequence identifier(s); forwarded to self.retrieve
    @ptype:    {String}, {List} or {Set}

    @param:    new_fasta_file
    @pdef:     name of the new fasta file
    @ptype:    {String}

    @param:    all_but
    @pdef:     Flag. Instead of retrieving the given ids, we retrieve all
               except the given ids.
    @pdefault: _False_
    @ptype:    {Boolean}

    @param:    prefix_size
    @pdef:     maximum characters for the prefix. If _None_, all the
               characters are included.
    @pdefault: _None_
    @ptype:    {Integer}

    @param:    index
    @pdef:     also create the index file for the subset. Only honoured
               when this {Fasta} has an index itself.
    @pdefault: _False_
    @ptype:    {Boolean}

    @param:    force
    @pdef:     overwrite previous files with the same name; forwarded to
               Fasta.build_multifasta.
    @pdefault: _None_ (presumably resolved to SBIglobals.overwrite by the
               callee - confirm)
    @ptype:    {Boolean}

    @raises: {AttributeError} if sequence_ids is not a valid type
             (presumably raised by self.retrieve - confirm).
    @return: {Fasta}
    '''
    sequences = self.retrieve(sequence_ids, all_but, prefix_size)
    fasta_file = Fasta.build_multifasta(new_fasta_file, sequences, force)
    if not (self.has_index and index):
        return fasta_file

    source_index = File(self.index_file)
    target_index = File(fasta_file.file.full + '.idx', 'w')
    wanted = set(fasta_file.sequence_identifiers)
    try:
        for entry in source_index.read():
            fields = entry.split()
            # Blank lines carry no identifier; skip them instead of failing.
            if fields and fields[0].strip('>') in wanted:
                target_index.write(entry)
    finally:
        # Close both descriptors even if copying the index fails midway.
        try:
            source_index.close()
        finally:
            target_index.close()
    fasta_file.index_file = target_index.full
    return fasta_file
def read_compacted_blast(compacted_blast_file):
    '''
    Read data from a printed compacted blast into {BlastResult}.
    Not all options will be available in that new object.

    Lines starting with '#' are header lines; the last whitespace-separated
    word of each recognised header is kept. Every other non-blank line is
    read as one hit. The {BlastResult} is created when the first hit line
    is found, so a file without hit lines yields _None_.

    @param:    compacted_blast_file
    @pdef:     file of the compacted blast print
    @ptype:    {String}

    @raises: {IndexError} if a hit line has fewer than 11 columns.
    @return: {BlastResult}, or _None_ if the file contains no hit line.
    '''
    from BlastHit import BlastHit

    # Header prefix -> name of the value it provides. The prefixes are
    # mutually exclusive, so the first match is the only match.
    header_fields = (
        ('#Query:', 'query_name'),
        ('#Query Sequence:', 'query_sequence'),
        ('#Blast Version:', 'version'),
        ('#Search on matrix:', 'matrix'),
        ('#Gap open penalty:', 'gap_open'),
        ('#Gap extension penalty:', 'gap_extend'),
        ('#Database searched:', 'database'),
        ('#Self Hit is omitted:', 'self_hit'),
    )
    meta = dict((field, None) for _, field in header_fields)

    result = None
    blast_fd = File(compacted_blast_file)
    try:
        for line in blast_fd.read():
            if line.startswith('#'):
                for prefix, field in header_fields:
                    if line.startswith(prefix):
                        meta[field] = line.strip().split()[-1]
                        break
                continue

            columns = line.strip().split()
            if not columns:
                # Blank line: nothing to parse.
                continue

            if result is None:
                if meta['version'] is None:
                    header = None
                else:
                    header = BlastHeader(meta['version'], meta['matrix'],
                                         meta['gap_open'], meta['gap_extend'],
                                         meta['database'], meta['self_hit'])
                result = BlastResult(meta['query_name'],
                                     meta['query_sequence'], header)

            # Column 10 looks like "q:h,..."; only the first pair is used.
            starts = [int(x) for x in columns[10].split(',')[0].split(':')]
            result.add_hit(BlastHit([columns[2], columns[3]],
                                    [columns[8], columns[9]],
                                    starts,
                                    1,
                                    [columns[4], columns[5],
                                     columns[6], columns[7]]))
    finally:
        # Release the descriptor even when a malformed line aborts parsing.
        blast_fd.close()
    return result
def release(self):
    '''
    Retrieves release data for the database. Not according to the DB
    release, but to when we downloaded it.

    When the control file (self._CONTROL_FILE inside self.local) exists,
    its JSON content is returned; otherwise self._RELEASE is returned.

    @returns: decoded content of the control file, or self._RELEASE.
    '''
    control_path = os.path.join(self.local, self._CONTROL_FILE)
    if not os.path.isfile(control_path):
        return self._RELEASE

    control = File(control_path)
    try:
        # NOTE(review): json.loads expects text, while File.read() is
        # iterated line by line elsewhere - confirm what it returns here.
        return json.loads(control.read())
    finally:
        # Close the control file even if its content is not valid JSON.
        control.close()
def items(self):
    '''
    Loops through the items of the database.

    Each file listed in self._ITEM_FILES (relative to self.local) is read
    line by line, and every stripped line is handed to self._DBOBJECT.grab.

    @yields: whatever self._DBOBJECT.grab returns for each line.
    '''
    if not self.has_local:
        SBIg.throw(self, 'A local database needs to be build first', IOError)

    for item_file in self._ITEM_FILES:
        handle = File(os.path.join(self.local, item_file))
        try:
            for line in handle.read():
                yield self._DBOBJECT.grab(line.strip())
        finally:
            # Runs as well when the consumer stops iterating early, so the
            # descriptor is not left open by an abandoned generator.
            handle.close()
def correct_hit_count(self, count_hit_file=None, count_query_file=None,
                      return_correction_dict=False):
    '''
    Corrects the starting point of the hits and the query, if needed.

    Why? When blasting vs. PDB (for example), sometimes the hit positions
    given by blast are wrong, as blast always considers the first position
    of the hit sequence as 1 and PDB does not. Even more, the position
    reference doesn't even need to be a number. As the specific location in
    the PDB is important, we need to adapt our blasts so that we can read
    that data.
    Keep in mind that hits and query must be corrected together in this
    step, as this function cannot be called twice for the same instance.

    @param:    count_hit_file
    @pdef:     file containing the index data for the hit database. Each
               line holds a sequence identifier (optionally prefixed with
               '>'), a TAB, and the ';'-separated list of positions, e.g.
               >3K2K_A<TAB>-7 ;-6 ;-5 ;-4 ;-3 ;-2 ;-1 ;0 ;1 ;2 ...
    @ptype:    {String}

    @param:    count_query_file
    @pdef:     sometimes we might also need to correct the query (if PDB
               vs. PDB). Same format as count_hit_file. They might be the
               same file, in which case it is read only once.
    @ptype:    {String}

    @param:    return_correction_dict
    @pdef:     instead of correcting the hits, return the dictionary of
               positions for further use. The query index, when requested,
               has already been stored at that point.
    @pdefault: _False_
    @ptype:    {Boolean}

    @raises: {BlastError} if it has been called before for this instance.
    @raises: {KeyError} if the query or a hit is missing from the index
             file(s).
    @return: _None_, or the {Dictionary} of positions when
             return_correction_dict is _True_. If there are no hits, a
             warning is issued and _None_ is returned.
    '''
    if not self.has_hits:
        SBIg.warn(self,
                  "BlastResult of {0} has no hits to correct".format(self.query))
        return
    if self.are_hits_corrected:
        be = BlastExe.BlastError()
        raise be.corrected_hits()

    SBIg.alert('debug', self, 'Correcting indexes for {0}'.format(self.query))

    def load_indexes(index_file, wanted, target):
        # Store in target the position list of every wanted identifier.
        handle = File(index_file)
        try:
            for line in handle.read():
                if not line.strip():
                    continue
                fields = line.split('\t')
                code = fields[0].lstrip('>')
                if code in wanted:
                    target[code] = fields[1].strip().split(';')
        finally:
            handle.close()

    # Both the full identifier and its last '/'-separated part are
    # accepted, so that the PDB/PDB_ID -> PDB_ID fallback used below can
    # actually find its entry.
    codes_of_interest = set()
    for raw_hit in self.raw_hits:
        codes_of_interest.add(raw_hit.sequenceID)
        codes_of_interest.add(raw_hit.sequenceID.split("/")[-1])

    fix_query = False
    if count_query_file == count_hit_file:
        # Same file for hits and query: a single pass is enough.
        codes_of_interest.add(self.query)
        count_query_file = None
        fix_query = True

    start_index_dic = {}
    load_indexes(count_hit_file, codes_of_interest, start_index_dic)
    if count_query_file is not None:
        load_indexes(count_query_file, set([self.query]), start_index_dic)
        fix_query = True

    if fix_query:
        SBIg.alert('debug', self, '\tFixing Query {0}'.format(self.query))
        self._query_index = start_index_dic[self.query]

    if return_correction_dict:
        return start_index_dic

    for hit in self._hits:
        # This tests between the options PDB/PDB_ID or PDB_ID in case
        # the TAB file has different codification
        full_id = hit.sequenceID
        hit_ID = full_id if full_id in start_index_dic else full_id.split("/")[-1]
        SBIg.alert('debug', self, '\tFixing {0}'.format(hit_ID))
        hit.correct_hit_count(new_index=start_index_dic[hit_ID])
        if fix_query:
            SBIg.alert('debug', self, '\tFixing Query {0}'.format(self.query))
            hit.correct_query_count(new_index=start_index_dic[self.query])

    self._correctedHits = True
class CDhitList(StorableObject):
    '''
    List of cd-hit clusters.
    '''
    def __init__(self, cdhit_file=None):
        '''
        @param:    cdhit_file
        @pdef:     name of the cd-hit output file
        @pdefault: _None_. Create an empty list
        @ptype:    {String}
        '''
        self._clusters = []
        # sequence name -> position of its cluster in self._clusters
        self._allseqids = {}
        self._file = None
        if cdhit_file is not None:
            self._file = File(file_name=cdhit_file)
            self._parse_file()

    ##############
    # ATTRIBUTES #
    ##############
    @property
    def clusters(self):
        '''
        List of cd-hit clusters.

        @return: {List} of {CDhit}
        '''
        return self._clusters

    ###########
    # METHODS #
    ###########
    def get_cluster4sequence(self, sequence):
        '''
        Retrieve a cluster for a given sequence. _None_ if the sequence is
        not found.

        @param:    sequence
        @pdef:     name of the query sequence
        @ptype:    {String}

        @return: {CDhit}
        '''
        if sequence not in self._allseqids:
            return None
        return self._clusters[self._allseqids[sequence]]

    def is_in_cluster(self, sequence):
        '''
        Evaluate if the sequence is in a cluster.

        @param:    sequence
        @pdef:     name of the query sequence
        @ptype:    {String}

        @return: {String} as 'N' if no, 'H' if yes and 'M' if cluster master
        '''
        cluster = self.get_cluster4sequence(sequence)
        if cluster is None:
            return 'N'
        return 'M' if cluster.is_master(sequence) else 'H'

    def add_cluster(self, cluster):
        '''
        Add a cd-hit cluster to the object.

        @param:    cluster
        @pdef:     new cd-hit cluster to add
        @ptype:    {CDhit}
        '''
        self._clusters.append(cluster)

    def add_sequence2cluster(self, sequence, cluster_id=None):
        '''
        Add a new sequence to a given cluster. If no cluster has the
        requested identifier, nothing is added.

        @param:    sequence
        @pdef:     sequence to add; its name is used to index it
        @ptype:    {CDhitHomolog}

        @param:    cluster_id
        @pdef:     identifier of the cluster
        @pdefault: _None_. Refers to the last added cluster.
        @ptype:    {String}
        '''
        if cluster_id is None:
            self.clusters[-1].add_sequence(sequence)
            self._allseqids[sequence.name] = len(self.clusters) - 1
            return
        for position, cluster in enumerate(self._clusters):
            if cluster.identifier == cluster_id:
                cluster.add_sequence(sequence)
                self._allseqids[sequence.name] = position
                break

    def dictionary_role_summary(self):
        '''
        Creates a dictionary separating master sequences and homolog
        sequences.

        @return: {Dictionary} with the keys 'master' and 'homolog'
        '''
        data = {'master': [], 'homolog': []}
        for cluster in self.clusters:
            data['master'].append(cluster.master.name)
            data['homolog'].extend(cluster.sequences)
        return data

    def merge_clusters(self, cluster_file):
        '''
        When using an intermediate state to cluster by homology, the result
        of the second clustering is a clustering of clusters. We need to
        transform this into the original sequences.

        The content of this object is replaced by the merged clusters.

        @param:    cluster_file
        @pdef:     name of the second-step cluster output
        @ptype:    {String}
        '''
        second_step = CDhitList(cluster_file)
        merged = CDhitList()
        # Second-step names refer to first-step clusters by their number.
        cluster_re = re.compile(r'Cluster\s+(\d+)')

        for new_cluster in second_step.clusters:
            merged.add_cluster(CDhit(cluster_id=new_cluster.identifier))

            # The master of the second step stands for a whole first-step
            # cluster: all its members are carried over as they are.
            number = int(cluster_re.search(new_cluster.master.name).group(1))
            origin = self.clusters[number]
            merged.add_sequence2cluster(sequence=origin.master)
            for name in origin.sequences:
                merged.add_sequence2cluster(sequence=origin.sequences[name])

            # Every homolog of the second step is a first-step cluster too;
            # the homology of its members is rescaled before adding them.
            for cluster_name in new_cluster.sequences:
                link = new_cluster.sequences[cluster_name]
                number = int(cluster_re.search(link.name).group(1))
                origin = self.clusters[number]
                master = origin.master
                master.homology = link.homology
                merged.add_sequence2cluster(sequence=master)
                for name in origin.sequences:
                    homolog = origin.sequences[name]
                    homolog.homology = int(homolog.homology * float(link.homology) / 10)
                    merged.add_sequence2cluster(sequence=homolog)

        self._clusters = merged._clusters
        self._allseqids = merged._allseqids

    ###################
    # PRIVATE METHODS #
    ###################
    def _parse_file(self):
        '''
        Read the cd-hit output file into a {CDhitList}.

        Lines starting with '>' open a new cluster; any other non-blank
        line describes a sequence of the last opened cluster.
        '''
        homolog_re = re.compile(r'(\d+)aa,\s+\>([\s\w]+)\.{3}')
        try:
            for line in self._file.read():
                if line.startswith('>'):
                    self.add_cluster(CDhit(cluster_id=line.split()[-1].strip()))
                    continue
                if not line.strip():
                    # Blank line: there is no sequence to read.
                    continue
                data = homolog_re.search(line)
                fields = line.split()
                homolog = CDhitHomolog(name=data.group(2),
                                       length=data.group(1),
                                       homology=fields[-1])
                self.add_sequence2cluster(sequence=homolog)
        finally:
            # Release the descriptor even when a malformed line aborts parsing.
            self._file.close()

    def __len__(self):
        return len(self._clusters)

    def __repr__(self):
        return '\n'.join('{0}'.format(cluster) for cluster in self.clusters)
def parse(query_sequence, blast_output_file, self_hit, hitid_format):
    '''
    Processes a blast xml formated output into a {BlastResult} object.

    @param:    query_sequence
    @pdef:     sequence of the query protein/nucleotide.
    @ptype:    {String}

    @param:    blast_output_file
    @pdef:     output file from BLAST.
    @ptype:    {String}

    @param:    self_hit
    @pdef:     when _True_ if the query is found in the database, it is
               retrieved. Stored in the header and forwarded to
               BlastParser.same_query_hit_names.
    @ptype:    {Boolean}

    @param:    hitid_format
    @pdef:     format of the name of the hit; forwarded unchanged to
               BlastParser.hit_name.
    @poptions: 'single' -> first word of the name,
               'double' -> first two words of the hit name,
               'all'    -> all the text in the hit name
    @ptype:    {String}

    @raises: {BlastError} if any added hit reports a wrong fragmentation of
             its alignment.
    @returns: {BlastResult}
    '''
    xml_file = File(blast_output_file)
    try:
        # The document is parsed while the soup is built, so the file can
        # be released right afterwards.
        soup = BeautifulSoup(xml_file.read())
    finally:
        xml_file.close()

    header = BlastHeader(version    = str(soup.find('blastoutput_version').string),
                         matrix     = str(soup.find('parameters_matrix').string),
                         gap_open   = int(soup.find('parameters_gap-open').string),
                         gap_extend = int(soup.find('parameters_gap-extend').string),
                         database   = str(soup.find('blastoutput_db').string),
                         self_hit   = self_hit)
    result = BlastResult(query_name     = str(soup.find('blastoutput_query-def').string),
                         query_sequence = query_sequence,
                         header         = header)
    SBIg.alert('debug', BlastParser(), result.str_blast_header())

    error_str = []  # stays empty as long as every added hit is consistent
    for iteration in soup.find_all('iteration'):
        iternum = int(iteration.find('iteration_iter-num').string)
        for hit in iteration.find_all('hit'):
            hit_name = BlastParser.hit_name(str(hit.find('hit_def').string),
                                            hitid_format)
            hit_length = int(hit.find("hit_len").string)
            for subhit in hit.find_all("hsp"):
                data = BlastParser.parse_subhit(subhit)
                blast_hit = BlastHit(hit            = [hit_name, hit_length],
                                     sequences      = [data['qs'], data['hs'], data['sc']],
                                     sequence_inits = [data['qp'], data['hp']],
                                     iteration      = iternum,
                                     stats          = [data['hi'], data['h+'],
                                                       data['hg'], data['ev']])
                if BlastParser.same_query_hit_names(result.query, hit_name, self_hit):
                    continue
                dbug_info = 'Added hit {0} in iteration {1}'
                SBIg.alert('debug', BlastParser(),
                           dbug_info.format(hit_name, blast_hit.iteration))
                result.add_hit(blast_hit)
                if not blast_hit.are_segments_ok:
                    error_str.append("Check the alignment's fragmentation")
                    # str.format placeholders, so that the names are
                    # actually written into the message.
                    error_str.append("for the query {0} with {1}\n".format(result.query, hit_name))
                    error_str.append("{0}\n".format(blast_hit))

    result.set_last_iteration()
    if error_str:
        SBIg.warn(BlastParser(), error_str)
        be = BlastError()
        raise be.parse_error()
    return result