def test_no_protein_support(self):
    """Check that reading an EMBL protein record is rejected."""
    # TODO: add protein support

    # Every read attempt below must fail with this message.
    expected_msg = r"There's no protein support for EMBL record"

    # A fake protein ID line followed by the record terminator.
    embl_stream = io.StringIO(
        'ID   M14399; SV 1; linear; mRNA; STD; PRO; 63 AA.\n//\n')

    # Reading explicitly as a Protein object.
    with self.assertRaisesRegex(EMBLFormatError, expected_msg):
        Protein.read(embl_stream)

    # Rewind so the generic reader sees the same record from the start.
    embl_stream.seek(0)

    # Reading through the generic reader with the embl format.
    with self.assertRaisesRegex(EMBLFormatError, expected_msg):
        skbio.io.read(embl_stream, format='embl')
def test_no_protein_support(self):
    """Check that reading an EMBL protein record is rejected."""
    # TODO: add protein support

    error_type = EMBLFormatError
    error_regex = r"There's no protein support for EMBL record"

    # A fake protein ID line followed by the record terminator.
    fake_record = 'ID   M14399; SV 1; linear; mRNA; STD; PRO; 63 AA.\n//\n'
    record_handle = io.StringIO(fake_record)

    # First attempt: read a protein record.
    with self.assertRaisesRegex(error_type, error_regex):
        Protein.read(record_handle)

    # Return to position 0 before the second attempt.
    record_handle.seek(0)

    # Second attempt: read a generic record.
    with self.assertRaisesRegex(error_type, error_regex):
        skbio.io.read(record_handle, format='embl')
def mask_sequence(hhsuite_fp, fullsequence_fp, subsequences_fp=None,
                  min_prob=None, max_pvalue=None, max_evalue=None,
                  min_fragment_length=0, min_identity=0):
    """ Splits a protein sequence according to HHsuite results.

    The returned sub-sequences will seamlessly build the full sequence if
    re-concatenated.

    Parameters
    ----------
    hhsuite_fp : str
        Filepath to HHblits/HHsearch output.
    fullsequence_fp : str
        Filepath to the protein sequence of the original query.
    subsequences_fp : str
        Filepath to which sub-sequences are written as a multiple fasta file.
        Each sequence makes up one header line and one sequence line, i.e.
        sequences are not wrapped.
        Two files will be produced, suffixed by '.match' and '.non_match'. The
        first holds sub-sequences of hits, the second holds the sub-sequences
        not covered by any hit.
        Default: None, i.e. no file is written.
    min_prob: float
        Minimal probability of a hit to be included in the resulting list.
        Note: probabilities are in the range of 100.0 to 0.0.
        Default: None, i.e. no filtering on probability.
    max_pvalue: float
        Maximal P-value of a hit to be included in the resulting list.
        Default: None, i.e. no filtering on P-value.
    max_evalue: float
        Maximal E-value of a hit to be included in the resulting list.
        Default: None, i.e. no filtering on E-value.
    min_fragment_length: int
        Minimal fragment length of a hit to be included in the resulting list.
        The same value is also passed on when collecting the uncovered
        sub-sequences.
        Default: 0, i.e. no filtering on fragment length.
    min_identity: float
        Minimum pair-wise sequence identity of a hit to be included in the
        resulting list.
        Default: 0, i.e. no filtering on sequence identity.

    Returns
    -------
    {str: [(str, str)]}
        A dictionary with the two keys 'match' and 'non_match'. Each value is
        a list of (fasta header, fasta sequence) tuples, sorted by the start
        position of the sub-sequence.

    Raises
    ------
    IOError
        If the file cannot be written.

    Notes
    -----
    A hit must satisfy ALL filtering options (min_prob, max_pvalue,
    max_evalue, min_fragment_length, min_identity) to be included in the
    resulting list.
    """
    # parse hits from file
    hits = parse_pdb_match(hhsuite_fp)

    # filter hits; a threshold of None switches the respective filter off
    if min_prob is not None:
        hits = [hit for hit in hits if hit['Probab'] >= min_prob]
    if max_pvalue is not None:
        hits = [hit for hit in hits if hit['P-value'] <= max_pvalue]
    if max_evalue is not None:
        hits = [hit for hit in hits if hit['E-value'] <= max_evalue]
    if min_fragment_length is not None:
        hits = [hit for hit in hits if frag_size(hit) >= min_fragment_length]
    if min_identity is not None:
        hits = [hit for hit in hits if hit['Identities'] >= min_identity]

    # read the original protein file, used to run HHsearch
    p = Protein.read(fullsequence_fp, seq_num=1)
    query_id = p.metadata['id']
    query_desc = p.metadata['description']

    # every entry is (header, sequence, start); start is only kept for sorting
    results = {'match': [], 'non_match': []}

    # select non overlapping positive hits
    # NOTE(review): the huge threshold presumably keeps select_hits from
    # filtering on E-value a second time - confirm against select_hits.
    subseqs_pos = select_hits(hits, e_value_threshold=999999)
    for hit in subseqs_pos:
        query_aln = hit['alignment'][get_q_id(hit)]
        match_id = hit['Hit'].split()[0]
        header = "%s %s %s" % (
            correct_header_positions(query_id,
                                     query_aln['start'],
                                     query_aln['end']),
            '# %s' % match_id,
            query_desc)
        # drop alignment gap characters to obtain the plain sub-sequence
        seq = query_aln['sequence'].replace('-', '')
        results['match'].append((header, seq, query_aln['start']))

    # collect gaps between positive hits
    subseqs_neg = report_uncovered_subsequences(subseqs_pos, str(p),
                                                min_fragment_length)
    for hit in subseqs_neg:
        header = "%s %s" % (
            correct_header_positions(query_id, hit['start'], hit['end']),
            query_desc)
        results['non_match'].append((header, hit['sequence'], hit['start']))

    # sort by start position
    for type_ in results:
        results[type_] = sorted(results[type_], key=lambda x: x[2])

    # write sub-sequences to a multiple fasta file, sequences are un-wrapped
    if subsequences_fp is not None:
        try:
            for type_ in results:
                # the context manager closes the file even if a write fails
                with open('%s.%s' % (subsequences_fp, type_), 'w') as f:
                    for res in results[type_]:
                        f.write(">%s\n%s\n" % res[:2])
        except IOError:
            raise IOError('Cannot write to file "%s"' % subsequences_fp)

    # removing the start position component from all subsequences
    return {type_: [res[:2] for res in results[type_]]
            for type_ in results}
def mask_sequence(hhsuite_fp, fullsequence_fp, subsequences_fp=None,
                  min_prob=None, max_pvalue=None, max_evalue=None,
                  min_fragment_length=0):
    """ Splits a protein sequence according to HHsuite results.

    The returned sub-sequences will seamlessly build the full sequence if
    re-concatenated.

    Parameters
    ----------
    hhsuite_fp : str
        Filepath to HHblits/HHsearch output.
    fullsequence_fp : str
        Filepath to the protein sequence of the original query.
    subsequences_fp : str
        Filepath to which sub-sequences are written as a multiple fasta file.
        Each sequence makes up one header line and one sequence line, i.e.
        sequences are not wrapped.
        Two files will be produced, suffixed by '.match' and '.non_match'. The
        first holds sub-sequences of hits, the second holds the sub-sequences
        not covered by any hit.
        Default: None, i.e. no file is written.
    min_prob: float
        Minimal probability of a hit to be included in the resulting list.
        Note: probabilities are in the range of 100.0 to 0.0.
        Default: None, i.e. no filtering on probability.
    max_pvalue: float
        Maximal P-value of a hit to be included in the resulting list.
        Default: None, i.e. no filtering on P-value.
    max_evalue: float
        Maximal E-value of a hit to be included in the resulting list.
        Default: None, i.e. no filtering on E-value.
    min_fragment_length: int
        Minimal fragment length of a hit to be included in the resulting list.
        The same value is also passed on when collecting the uncovered
        sub-sequences.
        Default: 0, i.e. no filtering on fragment length.

    Returns
    -------
    {str: [(str, str)]}
        A dictionary with the two keys 'match' and 'non_match'. Each value is
        a list of (fasta header, fasta sequence) tuples, sorted by the start
        position of the sub-sequence.

    Raises
    ------
    IOError
        If the file cannot be written.

    Notes
    -----
    A hit must satisfy ALL filtering options (min_prob, max_pvalue,
    max_evalue, min_fragment_length) to be included in the resulting list.
    """
    # parse hits from file
    hits = parse_pdb_match(hhsuite_fp)

    # filter hits; a threshold of None switches the respective filter off
    if min_prob is not None:
        hits = [hit for hit in hits if hit['Probab'] >= min_prob]
    if max_pvalue is not None:
        hits = [hit for hit in hits if hit['P-value'] <= max_pvalue]
    if max_evalue is not None:
        hits = [hit for hit in hits if hit['E-value'] <= max_evalue]
    if min_fragment_length is not None:
        hits = [hit for hit in hits if frag_size(hit) >= min_fragment_length]

    # read the original protein file, used to run HHsearch
    p = Protein.read(fullsequence_fp, seq_num=1)
    query_id = p.metadata['id']
    query_desc = p.metadata['description']

    # every entry is (header, sequence, start); start is only kept for sorting
    results = {'match': [], 'non_match': []}

    # select non overlapping positive hits
    # NOTE(review): the huge threshold presumably keeps select_hits from
    # filtering on E-value a second time - confirm against select_hits.
    subseqs_pos = select_hits(hits, e_value_threshold=999999)
    for hit in subseqs_pos:
        query_aln = hit['alignment'][get_q_id(hit)]
        match_id = hit['Hit'].split()[0]
        header = "%s %s %s" % (
            correct_header_positions(query_id,
                                     query_aln['start'],
                                     query_aln['end']),
            '# %s' % match_id,
            query_desc)
        # drop alignment gap characters to obtain the plain sub-sequence
        seq = query_aln['sequence'].replace('-', '')
        results['match'].append((header, seq, query_aln['start']))

    # collect gaps between positive hits
    subseqs_neg = report_uncovered_subsequences(subseqs_pos, str(p),
                                                min_fragment_length)
    for hit in subseqs_neg:
        header = "%s %s" % (
            correct_header_positions(query_id, hit['start'], hit['end']),
            query_desc)
        results['non_match'].append((header, hit['sequence'], hit['start']))

    # sort by start position
    for type_ in results:
        results[type_] = sorted(results[type_], key=lambda x: x[2])

    # write sub-sequences to a multiple fasta file, sequences are un-wrapped
    if subsequences_fp is not None:
        try:
            for type_ in results:
                # the context manager closes the file even if a write fails
                with open('%s.%s' % (subsequences_fp, type_), 'w') as f:
                    for res in results[type_]:
                        f.write(">%s\n%s\n" % res[:2])
        except IOError:
            raise IOError('Cannot write to file "%s"' % subsequences_fp)

    # removing the start position component from all subsequences
    return {type_: [res[:2] for res in results[type_]]
            for type_ in results}