def create_target_similarity_network_normalised(target_seq, name):
    """Compute length-normalised pairwise similarity scores and pickle them.

    Every unordered pair of targets is aligned with
    ``skbio.alignment.local_pairwise_align_ssw`` using ``blosum50``. The raw
    score is divided by ``sqrt(len(seq1)) * sqrt(len(seq2))``.

    Parameters
    ----------
    target_seq : dict
        Maps a target name to its sequence; each value is passed to
        ``Protein(...)`` and to ``len()``.
    name : str
        Prefix of the output file. Results are written to
        ``'data/' + name + 'target_similarity.pkl'``.

    Returns
    -------
    None
        The list of ``[name_i, name_j, normalised_score]`` entries is saved
        through ``utils.save_any_obj_pkl`` rather than returned.
    """
    import math

    target_names = list(target_seq.keys())
    seq_info = []
    for i, first_name in enumerate(tqdm(target_names)):
        seq1 = target_seq[first_name]
        # Only pairs with j > i are aligned, so each pair is scored once.
        for second_name in target_names[i + 1:]:
            # The original read the undefined name ``target_seqs`` here.
            seq2 = target_seq[second_name]
            try:
                _, score, _ = skbio.alignment.local_pairwise_align_ssw(
                    Protein(seq1), Protein(seq2),
                    substitution_matrix=blosum50)
            except Exception:
                # Best effort: a pair that cannot be aligned scores 0.
                # ``Exception`` (not a bare except) lets Ctrl-C through.
                score = 0
            denominator = math.sqrt(len(seq1)) * math.sqrt(len(seq2))
            # Guard against empty sequences, which would divide by zero.
            new_score = float(score) / denominator if denominator else 0.0
            seq_info.append([first_name, second_name, new_score])

    name = 'data/' + name + 'target_similarity.pkl'
    utils.save_any_obj_pkl(seq_info, name)
def test_to_dict_non_empty(self):
    # to_dict() on a keyed MSA must map every key to its own sequence.
    first = Protein('PAW', metadata={'id': 42})
    second = Protein('WAP', metadata={'id': -999})
    msa = TabularMSA([first, second], key='id')

    expected = {42: first, -999: second}
    self.assertEqual(msa.to_dict(), expected)
def test_translate_start_with_start_codon(self):
    translate = self.sgc.translate

    # With start='require'/'optional': everything before the start codon is
    # trimmed and the start codon becomes M. Alternative start codons that
    # follow it must not be rewritten to M, and the default stop-codon
    # handling is kept.
    rna = RNA('CAUUUGCUGAAAUGA')
    trimmed = Protein('MLK*')
    for mode in {'require', 'optional'}:
        self.assertEqual(translate(rna, start=mode), trimmed)

    # start='ignore': plain translation, no trimming or replacement.
    self.assertEqual(translate(rna, start='ignore'), Protein('HLLK*'))

    # A lone start codon needs no replacement in any mode.
    rna = RNA('AUG')
    lone_m = Protein('M')
    for mode in {'require', 'optional', 'ignore'}:
        self.assertEqual(translate(rna, start=mode), lone_m)

    # A lone alternative start codon becomes M unless starts are ignored.
    rna = RNA('CUG')
    for mode in {'require', 'optional'}:
        self.assertEqual(translate(rna, start=mode), Protein('M'))
    self.assertEqual(translate(rna, start='ignore'), Protein('L'))
def proteinAlign(seq1, seq2, gap_open_penalty, gap_extend_penalty,
                 local=False):
    """Align two protein strings with blosum50 and summarise the result.

    Both inputs are upper-cased before being wrapped in ``Protein``. When
    ``local`` is truthy ``local_pairwise_align`` is used; otherwise
    ``global_pairwise_align`` is called with ``penalize_terminal_gaps=True``.

    Returns a dict with keys ``'aln1'`` and ``'aln2'`` (the aligned rows as
    strings), ``'score'`` (the alignment score) and ``'similarity'`` (the
    relative match frequency of the two rows times 100, rounded to two
    decimals via string formatting).
    """
    upper1 = seq1.upper()
    upper2 = seq2.upper()
    query, target = Protein(upper1), Protein(upper2)

    if not local:
        aln, score, _ = global_pairwise_align(
            query, target, gap_open_penalty, gap_extend_penalty, blosum50,
            penalize_terminal_gaps=True)
    else:
        aln, score, _ = local_pairwise_align(
            query, target, gap_open_penalty, gap_extend_penalty, blosum50)

    top_row, bottom_row = aln[0], aln[1]
    top_text = str(top_row)
    bottom_text = str(bottom_row)
    percent = top_row.match_frequency(bottom_row, relative=True) * 100
    return {
        'aln1': top_text,
        'aln2': bottom_text,
        'score': score,
        'similarity': float('{:.2f}'.format(percent)),
    }
def test_eq(self):
    aa = 'AMPM' * 16
    start_flags = '--M-' * 16

    # Three codes that must all compare equal:
    plain = GeneticCode(aa, start_flags)
    # ... the name is not part of equality
    named = GeneticCode(aa, start_flags, 'foo')
    # ... nor is (positional) metadata on Sequence-subclass inputs
    from_sequences = GeneticCode(
        Protein(aa, metadata={'foo': 'bar'}),
        Protein(start_flags, positional_metadata={'foo': range(64)}))
    codes = [plain, named, from_sequences]

    # Reflexivity: each code equals itself.
    for code in codes:
        self.assertTrue(code == code)
        self.assertFalse(code != code)

    # Permutations (not combinations) so both a == b and b == a are checked.
    for left, right in itertools.permutations(codes, 2):
        self.assertTrue(left == right)
        self.assertFalse(left != right)
def test_translate_trim_to_cds(self):
    translate = self.sgc.translate
    rna = RNA('UAAUUGCCUCAUUAAUAACAAUGA')

    # Locate the first start codon, drop what precedes it, turn the
    # alternative start codon into M, then cut at the first stop codon
    # after the start.
    cds = Protein('MPH')
    for mode in {'require', 'optional'}:
        self.assertEqual(translate(rna, start=mode, stop=mode), cds)

    # Ignoring both start and stop translates the whole sequence.
    self.assertEqual(translate(rna, start='ignore', stop='ignore'),
                     Protein('*LPH**Q*'))

    # Reading frame 2 disrupts the cds:
    #     AAUUGCCUCAUUAAUAACAAUGA
    #     NCLINNN
    with six.assertRaisesRegex(self, ValueError,
                               'reading_frame=2.*start=\'require\''):
        translate(rna, reading_frame=2, start='require')

    with six.assertRaisesRegex(self, ValueError,
                               'reading_frame=2.*stop=\'require\''):
        translate(rna, reading_frame=2, stop='require')

    shifted = Protein('NCLINNN')
    for mode in {'ignore', 'optional'}:
        translated = translate(rna, reading_frame=2, start=mode, stop=mode)
        self.assertEqual(translated, shifted)
def calculate_sim(target_protein, ):
    """Build raw and normalised pairwise local-alignment score matrices.

    ``target_protein.seq.tolist()`` supplies the sequences. Every ordered
    pair (including each sequence with itself) is aligned with
    ``local_pairwise_align_protein`` and the second element of its result is
    stored as the raw score. The normalised value for ``(i, j)`` is
    ``(raw[i, j] + raw[j, i]) / (raw[i, i] + raw[j, j])``.

    Progress and intermediate results are printed.

    Returns
    -------
    tuple
        ``(sim_matrix, sim_value)``: the raw and the normalised square
        numpy arrays.
    """
    sequences = target_protein.seq.tolist()
    count = len(sequences)
    sim_matrix = np.zeros(shape=[count, count])
    print(f'==Start== with protein : {count}')

    # Raw scores for every ordered pair.
    for row, row_seq in enumerate(sequences):
        for col, col_seq in enumerate(sequences):
            protein_similarity = local_pairwise_align_protein(
                seq1=Protein(row_seq),
                seq2=Protein(col_seq),
            )
            print(protein_similarity)
            sim_matrix[row, col] = protein_similarity[1]
    print(sim_matrix)

    # Normalise by the two self-alignment scores.
    sim_value = np.zeros(shape=sim_matrix.shape)
    for row in range(count):
        for col in range(count):
            pair_total = sim_matrix[row, col] + sim_matrix[col, row]
            self_total = sim_matrix[row, row] + sim_matrix[col, col]
            normalised = pair_total / self_total
            sim_value[row, col] = normalised
            sim_value[col, row] = normalised
    print(sim_value)
    return sim_matrix, sim_value
def test_translate_varied_genetic_codes(self):
    # Spot check: translate one sequence with a few NCBI codes and a
    # custom code.
    rna = RNA('AAUGAUGUGACUAUCAGAAGG')

    # table_id=2
    self.assertEqual(GeneticCode.from_ncbi(2).translate(rna),
                     Protein('NDVTI**'))
    self.assertEqual(
        GeneticCode.from_ncbi(2).translate(rna, start='require',
                                           stop='require'),
        Protein('MTI'))

    # table_id=22
    self.assertEqual(GeneticCode.from_ncbi(22).translate(rna),
                     Protein('NDVTIRR'))
    with six.assertRaisesRegex(self, ValueError,
                               'reading_frame=1.*start=\'require\''):
        GeneticCode.from_ncbi(22).translate(rna, start='require',
                                            stop='require')

    # Custom code that declares no start codons at all.
    startless = GeneticCode('MWN*' * 16, '-' * 64)
    self.assertEqual(startless.translate(rna), Protein('MM*MWN*'))
    with six.assertRaisesRegex(self, ValueError,
                               'reading_frame=1.*start=\'require\''):
        startless.translate(rna, start='require', stop='require')
def test_constructor_not_monomorphic(self):
    # Two different sequence classes in one MSA must be rejected.
    nucleotide_pattern = 'mixed types.*RNA.*DNA'
    with six.assertRaisesRegex(self, TypeError, nucleotide_pattern):
        TabularMSA([DNA(''), RNA('')])

    # A non-sequence value among sequences must be rejected as well.
    float_pattern = 'mixed types.*float.*Protein'
    with six.assertRaisesRegex(self, TypeError, float_pattern):
        TabularMSA([Protein(''), Protein(''), 42.0, Protein('')])
def test_all_gappy(self):
    # Two of the four rows consist only of gaps; with a 0.5 threshold
    # del_gappy_cols is expected to raise.
    rows = [Protein('---'), Protein('---'), Protein('ALR'), Protein('ELR')]
    msa = TabularMSA(rows)
    with pytest.raises(Exception):
        msa_fun.del_gappy_cols(msa, gap_threshold=0.5)
def test_translate_ncbi_table_id(self):
    inputs = (RNA('AAAUUUAUGCAU'), DNA('AAATTTATGCAT'))
    for nucleotides in inputs:
        # No argument: the default genetic code is used.
        self.assertEqual(nucleotides.translate(), Protein('KFMH'))

        # An integer argument (9) changes the first residue; per the test
        # name it is treated as an NCBI table id.
        self.assertEqual(nucleotides.translate(9), Protein('NFMH'))
def test_translate_six_frames_preserves_metadata(self):
    rna = RNA('AUG',
              metadata={'foo': 'bar', 'baz': 42},
              positional_metadata={'foo': range(3)})
    first_two = list(self.sgc.translate_six_frames(rna))[:2]

    # Metadata is carried over to each frame; positional metadata is not.
    expected = [
        Protein('M', metadata={'foo': 'bar', 'baz': 42}),
        Protein('', metadata={'foo': 'bar', 'baz': 42}),
    ]
    self.assertEqual(first_two, expected)
def test_global_pairwise_align_protein_penalize_terminal_gaps(self):
    # Global protein alignment with terminal gaps penalised: check the
    # alignment, its score and the reported start/end positions.
    query = Protein("HEAGAWGHEE")
    target = Protein("PAWHEAE")
    msa, score, start_end = global_pairwise_align_protein(
        query, target,
        gap_open_penalty=10.,
        gap_extend_penalty=5.,
        penalize_terminal_gaps=True)

    expected_msa = TabularMSA([Protein("HEAGAWGHEE"),
                               Protein("---PAWHEAE")])
    self.assertEqual(msa, expected_msa)
    self.assertEqual(score, 1.0)
    self.assertEqual(start_end, [(0, 9), (0, 6)])
def test_no_gappy_2(self):
    # Each column holds a single gap among four rows, so with a 0.5
    # threshold the output must equal the input and no column is reported.
    rows = [Protein('-LV'), Protein('A-L'), Protein('AL-'), Protein('ELR')]
    msa = TabularMSA(rows)

    trimmed, removed_columns = msa_fun.del_gappy_cols(msa, gap_threshold=0.5)

    assert trimmed == msa
    assert len(removed_columns) == 0
def test_alphabet(self):
    letters = set("ABCDEFGHIJKLMNOPQRSTUVWXYZ-.*")

    self.assertIs(type(Protein.alphabet), set)
    self.assertEqual(Protein.alphabet, letters)

    # Mutating the returned set must not alter what the class reports.
    Protein.alphabet.add("&")
    self.assertEqual(Protein.alphabet, letters)

    # Instances expose the same alphabet, and it cannot be reassigned.
    self.assertEqual(Protein('').alphabet, letters)
    with self.assertRaises(AttributeError):
        Protein('').alphabet = set("ABCD")
def test_same_as_using_StripedSmithWaterman_object_Protein(self):
    # local_pairwise_align_ssw must agree with calling a
    # StripedSmithWaterman object directly on the same protein pair.
    query_text, target_text = 'HEAGAWGHEE', 'PAWHEAE'

    ssw = StripedSmithWaterman(query_text, protein=True,
                               substitution_matrix=blosum50)
    from_object = ssw(target_text)

    from_function = local_pairwise_align_ssw(
        Protein(query_text), Protein(target_text),
        substitution_matrix=blosum50)

    self._check_TabularMSA_to_AlignmentStructure(from_function, from_object,
                                                 Protein)
def test_init_varied_equivalent_input(self):
    # str, Protein and Sequence inputs must all build the same code.
    all_m = 'M' * 64
    no_starts = '-' * 64
    equivalent_inputs = (
        (all_m, no_starts),
        (Protein(all_m), Protein(no_starts)),
        (Sequence(all_m), Sequence(no_starts)),
    )

    for amino_acids, starts in equivalent_inputs:
        gc = GeneticCode(amino_acids, starts)

        self.assertEqual(gc.name, '')
        self.assertEqual(gc._amino_acids, Protein('M' * 64))
        self.assertEqual(gc._starts, Protein('-' * 64))
        npt.assert_array_equal(gc._m_character_codon,
                               np.asarray([0, 0, 0], dtype=np.uint8))
        self.assertEqual(len(gc._start_codons), 0)
def test_stop_chars(self):
    stops = set('*')

    self.assertIs(type(Protein.stop_chars), set)
    self.assertEqual(Protein.stop_chars, stops)

    # Mutating the returned set must not alter what the class reports.
    Protein.stop_chars.add("JO")
    self.assertEqual(Protein.stop_chars, stops)

    # Instances expose the same set, and it cannot be reassigned.
    self.assertEqual(Protein('').stop_chars, stops)
    with self.assertRaises(AttributeError):
        Protein('').stop_chars = set("^&")
def test_motif_n_glycosylation(self):
    motif = "N-glycosylation"

    # No occurrence of the motif.
    protein = Protein("ACDFFACGNPSL")
    self.assertEqual(list(protein.find_motifs(motif)), [])

    # One occurrence, reported as a slice.
    protein = Protein("ACDFNFTACGNPSL")
    self.assertEqual(list(protein.find_motifs(motif)), [slice(4, 8)])

    # Same motif with gaps inside it; gaps are passed as `ignore`.
    protein = Protein("AC-DFN-FTACGNPSL")
    found = list(protein.find_motifs(motif, ignore=protein.gaps()))
    self.assertEqual(found, [slice(5, 10)])
def align(seq1, seq2, go, ge):
    '''
    Locally align two protein sequences with scikit-bio and return the score.

    ``read_seq(seq1, seq2)`` provides the two sequence strings, which are
    wrapped in ``Protein(..., lowercase=True)``. ``go`` and ``ge`` are passed
    on as the gap-open and gap-extend penalties. ``substitution_matrix=None``
    is passed, so the matrix is whatever scikit-bio uses for ``None``.

    The score is printed and returned.
    '''
    first, second = read_seq(seq1, seq2)
    query = Protein(first, lowercase=True)
    target = Protein(second, lowercase=True)
    _, score, _ = local_pairwise_align_protein(
        query, target,
        gap_open_penalty=go,
        gap_extend_penalty=ge,
        substitution_matrix=None)
    print("\nScore:", score)
    return score
def test_translate_six_frames_preserves_metadata(self):
    shared_md = {'foo': 'bar', 'baz': 42}
    shared_pmd = {'foo': range(3)}
    inputs = (
        RNA('AUG', metadata=shared_md, positional_metadata=shared_pmd),
        DNA('ATG', metadata=shared_md, positional_metadata=shared_pmd),
    )

    for nucleotides in inputs:
        first_two = list(nucleotides.translate_six_frames())[:2]

        # Metadata is carried over to each frame; positional metadata
        # is not.
        expected = [
            Protein('M', metadata={'foo': 'bar', 'baz': 42}),
            Protein('', metadata={'foo': 'bar', 'baz': 42}),
        ]
        self.assertEqual(first_two, expected)
def test_genbank_to_protein(self):
    # Read the first record (seq_num is 1-based here: index + 1) and
    # compare it with a Protein built from the matching fixture entry.
    index = 0
    record = self.multi[index]
    observed = _genbank_to_protein(self.multi_fp, seq_num=index + 1)
    expected = Protein(record[0],
                       metadata=record[1],
                       lowercase=True,
                       positional_metadata=record[2])
    self.assertEqual(expected, observed)
def test_process_1(self):
    def one_hot(position, width=20):
        # Row of `width` zeros with a single 1 at `position`.
        return [1 if k == position else 0 for k in range(width)]

    msa = TabularMSA([Protein('AL-'), Protein('VL-'), Protein('MLA')])
    threshold = 0.5

    # Only the first column is expected to survive: one numeric code and
    # one 20-wide one-hot row per sequence.
    expected_num = [[AA_TABLE[letter]] for letter in ('A', 'V', 'M')]
    expected_bin = [one_hot(0), one_hot(19), one_hot(12)]

    num_mtx, bin_mtx, gappy_idxs, constant_idxs = preprocess.process(
        msa, threshold, AA_TABLE)

    assert np.array_equal(expected_num, num_mtx)
    assert np.array_equal(expected_bin, bin_mtx)
    assert gappy_idxs == [2]
    assert constant_idxs == [1]
def test_translate_six_frames_passes_parameters_through(self):
    expected = Protein('MW')
    for nucleotides in (RNA('UUUAUGUGGUGA'), DNA('TTTATGTGGTGA')):
        # Positional genetic code, keyword start/stop.
        mixed = next(nucleotides.translate_six_frames(
            11, start='require', stop='require'))
        self.assertEqual(mixed, expected)

        # Everything as keywords.
        keywords_only = next(nucleotides.translate_six_frames(
            genetic_code=11, start='require', stop='require'))
        self.assertEqual(keywords_only, expected)

        # Everything positional.
        positional_only = next(nucleotides.translate_six_frames(
            11, 'require', 'require'))
        self.assertEqual(positional_only, expected)
def test_translate_reading_frame_non_empty_translation(self):
    rna = RNA('AUGGUGGAA')  # reverse complement: UUCCACCAU

    # Expected translation for each of the six reading frames.
    expected_by_frame = (
        (1, 'MVE'), (2, 'WW'), (3, 'GG'),
        (-1, 'FHH'), (-2, 'ST'), (-3, 'PP'),
    )
    for frame, letters in expected_by_frame:
        expected = Protein(letters)
        translated = self.sgc.translate(rna, reading_frame=frame)
        self.assertEqual(translated, expected)
def test_translate_preserves_metadata(self):
    rna = RNA('AUG',
              metadata={'foo': 'bar', 'baz': 42},
              positional_metadata={'foo': range(3)})
    translated = self.sgc.translate(rna)

    # Metadata is carried over; positional metadata is not.
    expected = Protein('M', metadata={'foo': 'bar', 'baz': 42})
    self.assertEqual(translated, expected)
def test_sam_to_protein(self):
    # Show full diffs on failure.
    self.maxDiff = None

    observed = _sam_to_protein(self.single_fp)
    sequence, metadata = self.single_exp[0], self.single_exp[1]
    expected = Protein(sequence, metadata)

    # Compare the metadata items and the sequence text separately.
    self.assertEqual(sorted(observed.metadata.items()),
                     sorted(expected.metadata.items()))
    self.assertEqual(str(observed), str(expected))
def test_process_2(self):
    def one_hot(position, width=20):
        # Row of `width` zeros with a single 1 at `position`.
        return [1 if k == position else 0 for k in range(width)]

    # Same data as the 'AL-' / 'VL-' / 'MLA' case, with the second and
    # third columns swapped.
    msa = TabularMSA([Protein('A-L'), Protein('V-L'), Protein('MAL')])
    threshold = 0.5

    expected_num = [[AA_TABLE[letter]] for letter in ('A', 'V', 'M')]
    expected_bin = [one_hot(0), one_hot(19), one_hot(12)]

    num_mtx, bin_mtx, gappy_idxs, constant_idxs = preprocess.process(
        msa, threshold, AA_TABLE)

    assert np.array_equal(expected_num, num_mtx)
    assert np.array_equal(expected_bin, bin_mtx)
    assert gappy_idxs == [1]
    assert constant_idxs == [1]
def test_degenerate_map(self):
    # Expected expansion of each degenerate character.
    expected = {
        'B': {'D', 'N'},
        'Z': {'E', 'Q'},
        'X': set('ACDEFGHIKLMNPQRSTVWY'),
    }
    # The map is available on instances and on the class itself.
    self.assertEqual(Protein("").degenerate_map, expected)
    self.assertEqual(Protein.degenerate_map, expected)
def test_metadata_setter_invalid_type(self):
    msa = TabularMSA([Protein('PAW')], metadata={123: 456})

    invalid_values = (None, 0, 'a', ('f', 'o', 'o'), np.array([]),
                      pd.DataFrame())
    for bad_value in invalid_values:
        with six.assertRaisesRegex(self, TypeError,
                                   'metadata must be a dict'):
            msa.metadata = bad_value
        # A rejected assignment must leave the existing metadata intact.
        self.assertEqual(msa.metadata, {123: 456})
def test_no_protein_support(self):
    """Testing no protein support for embl"""
    # TODO: add protein support

    error_pattern = (r"There's no protein support for EMBL "
                     "record")

    # A fake protein ID line followed by the record terminator.
    handle = io.StringIO('ID M14399; SV 1; linear; mRNA; STD; '
                         'PRO; 63 AA.\n//\n')

    # Reading it explicitly as a protein record must fail ...
    with self.assertRaisesRegex(EMBLFormatError, error_pattern):
        Protein.read(handle)

    # ... and so must a generic read, after rewinding the handle.
    handle.seek(0)
    with self.assertRaisesRegex(EMBLFormatError, error_pattern):
        skbio.io.read(handle, format='embl')
def test_motif_n_glycosylation(self):
    def motif_slices(protein, **kwargs):
        # Materialise the find_motifs generator for comparison.
        return list(protein.find_motifs("N-glycosylation", **kwargs))

    # Sequence without the motif.
    self.assertEqual(motif_slices(Protein("ACDFFACGNPSL")), [])

    # Sequence with one occurrence.
    self.assertEqual(motif_slices(Protein("ACDFNFTACGNPSL")),
                     [slice(4, 8)])

    # Gapped sequence; the gap positions are passed as `ignore`.
    gapped = Protein("AC-DFN-FTACGNPSL")
    self.assertEqual(motif_slices(gapped, ignore=gapped.gaps()),
                     [slice(5, 10)])
def mask_sequence(hhsuite_fp, fullsequence_fp, subsequences_fp=None,
                  min_prob=None, max_pvalue=None, max_evalue=None,
                  min_fragment_length=0):
    """ Splits a protein sequence according to HHsuite results.

    The returned sub-sequences will seamlessly build the full sequence if
    re-concatenated.

    Parameters
    ----------
    hhsuite_fp : str
        Filepath to HHblits/HHsearch output.
    fullsequence_fp : str
        Filepath to the protein sequence of the original query.
    subsequences_fp : str
        Filepath to which sub-sequences are written as a multiple fasta file.
        Each sequence makes up one header and one sequence line, i.e.
        sequences are not wrapped. Two files will be produced, suffixed by
        '.match' and '.non_match'. The first holds sub-sequences of hits, the
        second holds the sub-sequences not covered by any hit.
        Default: None, i.e. no file is written.
    min_prob: float
        Minimal probability of a hit to be included in the resulting list.
        Note: probabilities are in the range of 100.0 to 0.0.
        Default: None, i.e. no filtering on probability.
    max_pvalue: float
        Maximal P-value of a hit to be included in the resulting list.
        Default: None, i.e. no filtering on P-value.
    max_evalue: float
        Maximal E-value of a hit to be included in the resulting list.
        Default: None, i.e. no filtering on E-value.
    min_fragment_length: int
        Minimal fragment length of a hit to be included in the resulting list.
        Default: 0, i.e. no filtering on fragment length.

    Returns
    -------
    {str: [(str, str)]}
        Dictionary with the keys 'match' and 'non_match'. Each value is a
        list of (fasta header, fasta sequence) tuples, sorted by the start
        position of the sub-sequence.

    Raises
    ------
    IOError
        If the file cannot be written.

    Notes
    -----
    A hit must satisfy ALL filtering options (min_prob, max_pvalue, max_evalue,
    min_fragment_length) to be included in the resulting list.
    """
    # parse hits from file
    hits = parse_pdb_match(hhsuite_fp)

    # filter hits
    if min_prob is not None:
        hits = [hit for hit in hits if hit['Probab'] >= min_prob]
    if max_pvalue is not None:
        hits = [hit for hit in hits if hit['P-value'] <= max_pvalue]
    if max_evalue is not None:
        hits = [hit for hit in hits if hit['E-value'] <= max_evalue]
    if min_fragment_length is not None:
        hits = [hit for hit in hits if frag_size(hit) >= min_fragment_length]

    # read the original protein file, used to run HHsearch
    p = Protein.read(fullsequence_fp, seq_num=1)
    query_id = p.metadata['id']
    query_desc = p.metadata['description']

    # each entry is (header, sequence, start); start is only kept for sorting
    results = {'match': [], 'non_match': []}

    # select non overlapping positive hits
    subseqs_pos = select_hits(hits, e_value_threshold=999999)
    for hit in subseqs_pos:
        _id = get_q_id(hit)
        query_aln = hit['alignment'][_id]
        match_id = hit['Hit'].split()[0]
        header = "%s %s %s" % (
            correct_header_positions(query_id,
                                     query_aln['start'],
                                     query_aln['end']),
            '# %s' % match_id,
            query_desc)
        # gap characters belong to the alignment, not to the sequence
        seq = query_aln['sequence'].replace('-', '')
        results['match'].append((header, seq, query_aln['start']))

    # collect gaps between positive hits
    subseqs_neg = report_uncovered_subsequences(subseqs_pos, str(p),
                                                min_fragment_length)
    for hit in subseqs_neg:
        header = "%s %s" % (
            correct_header_positions(query_id, hit['start'], hit['end']),
            query_desc)
        results['non_match'].append((header, hit['sequence'], hit['start']))

    # sort by start position
    for type_ in results:
        results[type_] = sorted(results[type_], key=lambda x: x[2])

    # write sub-sequences to a multiple fasta file, sequences are un-wrapped
    if subsequences_fp is not None:
        try:
            for type_ in results:
                # the context manager closes the file even if a write fails
                with open('%s.%s' % (subsequences_fp, type_), 'w') as f:
                    for res in results[type_]:
                        f.write(">%s\n%s\n" % res[:2])
        except IOError as err:
            # keep the original error attached as the cause
            raise IOError(
                'Cannot write to file "%s"' % subsequences_fp) from err

    # removing the start position component from all subsequences
    return {type_: [res[:2] for res in results[type_]]
            for type_ in results}