def test_MinimalRfamParser_strict_invalid_structure(self):
    """MinimalRfamParser: strict toggle with an invalid structure line."""
    # strict = True (no strict argument given): consuming the parser
    # must raise RecordError.
    strict_parser = MinimalRfamParser(self._fake_record_bad_structure_1)
    self.assertRaises(RecordError, list, strict_parser)

    # strict = False: a record is still produced, but the structure slot
    # (index 2 of the record tuple) is None.
    lenient_records = list(MinimalRfamParser(
        self._fake_record_bad_structure_1, strict=False))
    first_record = lenient_records[0]
    self.assertEqual(first_record[2], None)
def test_MinimalRfamParser_strict_invalid_sequence(self):
    """MinimalRfamParser: toggle strict functions w/ invalid seq"""
    # strict = True (no strict argument given): consuming the parser
    # must raise RecordError.
    self.assertRaises(RecordError, list,
        MinimalRfamParser(self._fake_record_bad_sequence_1))

    # strict = False
    # you expect to get back as much information as possible, also
    # half records or sequences
    records = list(MinimalRfamParser(
        self._fake_record_bad_sequence_1, strict=False))
    # Index 1 of the record tuple is the alignment; three named
    # sequences are expected even though the record is malformed.
    alignment = records[0][1]
    self.assertEqual(len(alignment.NamedSeqs), 3)
def test_MinimalRfamParser_w_valid_data(self):
    """MinimalRfamParser: integrity of output"""
    # Some ugly constructions here, but this is what the output of
    # parsing fake_two_records should be
    headers = ['#=GF AC RF00014', '#=GF AU Mifsud W']
    sequences = {
        'U17136.1/898-984': ''.join([
            'AACACAUCAGAUUUCCUGGUGUAACGAAUUUUUUAAGUGCUUCUUGCUUA',
            'AGCAAGUUUCAUCCCGACCCCCUCAGGGUCGGGAUUU']),
        'M15749.1/155-239': ''.join([
            'AACGCAUCGGAUUUCCCGGUGUAACGAA-UUUUCAAGUGCUUCUUGCAUU',
            'AGCAAGUUUGAUCCCGACUCCUG-CGAGUCGGGAUUU']),
        'AF090431.1/222-139': ''.join([
            'CUCACAUCAGAUUUCCUGGUGUAACGAA-UUUUCAAGUGCUUCUUGCAUA',
            'AGCAAGUUUGAUCCCGACCCGU--AGGGCCGGGAUUU'])}
    structure = WussStructure(''.join([
        '...<<<<<<<.....>>>>>>>....................<<<<<...',
        '.>>>>>....<<<<<<<<<<.....>>>>>>>>>>..']))

    data = list(MinimalRfamParser(self._fake_two_records, strict=False))
    self.assertEqual(data[0], (headers, sequences, structure))
    # Use a unittest assertion rather than a bare assert: a bare assert
    # is stripped when Python runs with -O, silently skipping the check.
    self.assertTrue(isinstance(data[0][1], Alignment))
    # This line tests that invalid entries are ignored when strict=False
    # Note, there are two records in self._fake_two_records, but 2nd is
    # invalid
    self.assertEqual(len(data), 1)
def test_MinimalRfamParser_strict_missing_fields(self):
    """MinimalRfamParser: strict toggle with records lacking fields."""
    # strict = True (no strict argument given): a record without
    # sequences, or without a structure, must raise RecordError.
    strict_no_seqs = MinimalRfamParser(self._fake_record_no_sequences)
    self.assertRaises(RecordError, list, strict_no_seqs)
    strict_no_struct = MinimalRfamParser(self._fake_record_no_structure)
    self.assertRaises(RecordError, list, strict_no_struct)

    # strict = False
    # A record without headers is still returned, with an empty header
    # list.
    expected_no_headers = [
        ([], {'Z11765.1/1-89': 'GGUC'}, '............>>>')]
    parsed_no_headers = list(MinimalRfamParser(
        self._fake_record_no_headers, strict=False))
    self.assertEqual(parsed_no_headers, expected_no_headers)

    # A record missing its sequences or its structure yields nothing.
    parsed_no_seqs = list(MinimalRfamParser(
        self._fake_record_no_sequences, strict=False))
    self.assertEqual(parsed_no_seqs, [])
    parsed_no_struct = list(MinimalRfamParser(
        self._fake_record_no_structure, strict=False))
    self.assertEqual(parsed_no_struct, [])
def test_RfamParser_strict_invalid_sequences(self):
    """RfamParser: functions when toggling strict w/ record w/ bad seq"""
    # strict = True
    # Exercise RfamParser itself here: this test previously called
    # MinimalRfamParser for the strict check, so the strict path of
    # RfamParser was never covered by it. strict is passed explicitly so
    # the check does not depend on RfamParser's default.
    # NOTE(review): assumes RfamParser raises RecordError in strict mode
    # for this record, like MinimalRfamParser does -- confirm.
    self.assertRaises(RecordError, list,
        RfamParser(self._fake_record_bad_sequence_1, strict=True))

    # strict = False
    # in 'False' mode you expect to get back as much as possible, also
    # parts of sequences
    records = list(RfamParser(
        self._fake_record_bad_sequence_1, strict=False))
    self.assertEqual(len(records[0][1].NamedSeqs), 3)
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None, cmbuild_params=None, cmalign_params=None):
    """Align the sequences in seq_path against the template alignment.

    seq_path: path to a FASTA file holding the candidate sequences.
    result_path: if not None, the aligned candidates are written to this
     path (via Alignment.toFasta()) and None is returned; otherwise the
     Alignment object is returned.
    log_path: if not None, the cmbuild/cmalign parameters that were used
     are written to this path.
    failure_path: accepted for interface compatibility; not used by this
     method.
    cmbuild_params: optional dict of cmbuild options. '--gapthresh' is
     always set to 1.0. The caller's dict is not modified.
    cmalign_params: optional dict of cmalign options. '--sub' and
     '--gapthresh' are always set. The caller's dict is not modified.

    Raises ValueError if the template file (self.Params
    ['template_filepath']) cannot be parsed into a record, including when
    it contains no record at all.
    """
    log_params = []

    # load candidate sequences; dict() consumes the parser completely
    # inside the with-block, so the file can be closed right away
    with open(seq_path, 'U') as seq_file:
        candidate_sequences = dict(MinimalFastaParser(seq_file))

    # load template sequences (only the first record is used)
    try:
        with open(self.Params['template_filepath'], 'U') as template_file:
            info, template_alignment, struct = list(MinimalRfamParser(
                template_file, seq_constructor=ChangedSequence))[0]
    except (RecordError, IndexError):
        # IndexError: the parser yielded no record (e.g. empty file);
        # report it the same way as an unparsable template.
        raise ValueError("Template alignment must be in Stockholm format with corresponding secondary structure annotation when using InfernalAligner.")

    moltype = self.Params['moltype']

    # Need to make separate mapping for unaligned sequences
    unaligned = SequenceCollection(candidate_sequences, MolType=moltype)
    int_map, int_keys = unaligned.getIntMap(prefix='unaligned_')
    int_map = SequenceCollection(int_map, MolType=moltype)

    # Turn on --gapthresh option in cmbuild to force alignment to full
    # model. Work on a copy so the caller's dict is left untouched.
    cmbuild_params = {} if cmbuild_params is None else dict(cmbuild_params)
    cmbuild_params.update({'--gapthresh': 1.0})

    # record cmbuild parameters
    log_params.append('cmbuild parameters:')
    log_params.append(str(cmbuild_params))

    # Turn on --sub option in Infernal, since we know the unaligned
    # sequences are fragments.
    # Also turn on --gapthresh to use same gapthresh as was used to build
    # model
    cmalign_params = {} if cmalign_params is None else dict(cmalign_params)
    cmalign_params.update({'--sub': True, '--gapthresh': 1.0})

    # record cmalign parameters
    log_params.append('cmalign parameters:')
    log_params.append(str(cmalign_params))

    # Align sequences to alignment including alignment gaps.
    aligned, struct_string = cmalign_from_alignment(
        aln=template_alignment,
        structure_string=struct,
        seqs=int_map,
        moltype=moltype,
        include_aln=True,
        params=cmalign_params,
        cmbuild_params=cmbuild_params)

    # Pull out original sequences from full alignment, restoring the
    # original sequence ids in place of the temporary 'unaligned_' ids.
    aligned_dict = aligned.NamedSeqs
    infernal_aligned = {}
    for key in int_map.Names:
        infernal_aligned[int_keys.get(key, key)] = aligned_dict[key]

    # Create an Alignment object from alignment dict
    infernal_aligned = Alignment(infernal_aligned, MolType=moltype)

    if log_path is not None:
        with open(log_path, 'w') as log_file:
            log_file.write('\n'.join(log_params))

    if result_path is not None:
        with open(result_path, 'w') as result_file:
            result_file.write(infernal_aligned.toFasta())
        return None

    return infernal_aligned
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None, cmbuild_params=None, cmalign_params=None):
    """Align the sequences in seq_path against the template alignment.

    seq_path: path to a FASTA file holding the candidate sequences.
    result_path: if not None, the aligned candidates are written to this
     path (via Alignment.to_fasta()) and None is returned; otherwise the
     Alignment object is returned.
    log_path: if not None, the cmbuild/cmalign parameters that were used
     are written to this path.
    failure_path: accepted for interface compatibility; not used by this
     method.
    cmbuild_params: optional dict of cmbuild options. '--gapthresh' is
     always set to 1.0. The caller's dict is not modified.
    cmalign_params: optional dict of cmalign options. '--sub' and
     '--gapthresh' are always set. The caller's dict is not modified.

    Candidate and aligned sequences are always handled as DNASequence.

    Raises ValueError if the template file (self.Params
    ['template_filepath']) cannot be parsed into a record, including when
    it contains no record at all.
    """
    log_params = []

    # load candidate sequences; dict() consumes the parser completely
    # inside the with-block, so the file can be closed right away
    with open(seq_path, 'U') as seq_file:
        candidate_sequences = dict(parse_fasta(seq_file))

    # load template sequences (only the first record is used)
    try:
        with open(self.Params['template_filepath'], 'U') as template_file:
            info, template_alignment, struct = list(MinimalRfamParser(
                template_file, seq_constructor=ChangedSequence))[0]
    except (RecordError, IndexError):
        # IndexError: the parser yielded no record (e.g. empty file);
        # report it the same way as an unparsable template.
        raise ValueError(
            "Template alignment must be in Stockholm format with corresponding secondary structure annotation when using InfernalAligner.")

    # Need to make separate mapping for unaligned sequences
    unaligned = SequenceCollection.from_fasta_records(
        candidate_sequences.iteritems(), DNASequence)
    mapped_seqs, new_to_old_ids = unaligned.int_map(prefix='unaligned_')
    mapped_seq_tuples = [(k, str(v)) for k, v in mapped_seqs.iteritems()]

    # Turn on --gapthresh option in cmbuild to force alignment to full
    # model. Work on a copy so the caller's dict is left untouched.
    cmbuild_params = {} if cmbuild_params is None else dict(cmbuild_params)
    cmbuild_params.update({'--gapthresh': 1.0})

    # record cmbuild parameters
    log_params.append('cmbuild parameters:')
    log_params.append(str(cmbuild_params))

    # Turn on --sub option in Infernal, since we know the unaligned
    # sequences are fragments.
    # Also turn on --gapthresh to use same gapthresh as was used to build
    # model
    cmalign_params = {} if cmalign_params is None else dict(cmalign_params)
    cmalign_params.update({'--sub': True, '--gapthresh': 1.0})

    # record cmalign parameters
    log_params.append('cmalign parameters:')
    log_params.append(str(cmalign_params))

    # Align sequences to alignment including alignment gaps.
    aligned, struct_string = cmalign_from_alignment(
        aln=template_alignment,
        structure_string=struct,
        seqs=mapped_seq_tuples,
        include_aln=True,
        params=cmalign_params,
        cmbuild_params=cmbuild_params)

    # Pull out original sequences from full alignment.
    # Get a dict of the ids to sequences (note that this is a
    # cogent alignment object, hence the call to NamedSeqs)
    aligned_dict = aligned.NamedSeqs
    infernal_aligned = []
    for n, o in new_to_old_ids.iteritems():
        # n is the temporary 'unaligned_' id, o the original sequence id
        infernal_aligned.append((o, aligned_dict[n]))

    # Create an Alignment object from the (id, sequence) records
    infernal_aligned = Alignment.from_fasta_records(infernal_aligned,
                                                    DNASequence)

    if log_path is not None:
        with open(log_path, 'w') as log_file:
            log_file.write('\n'.join(log_params))

    if result_path is not None:
        with open(result_path, 'w') as result_file:
            result_file.write(infernal_aligned.to_fasta())
        return None

    return infernal_aligned