def predict(self):
    """Fill in ``self.h2_prediction`` from BLAST hits for ``FLJB_FASTA_PATH``.

    ``get_antigen_gene_blast_results`` populates the prediction object
    first; this method then decides the ``h2`` call:

    * no hits: ``h2`` is ``'-'``;
    * perfect match: ``h2`` is named from the reader's top result;
    * imperfect match: short top hits are rejected, the remaining hits are
      re-filtered and re-ranked, and ``h2`` is named from the best one. If
      nothing survives, the prediction is flagged missing and ``h2`` is
      ``'-'``.

    Returns:
        None. Everything is written to ``self.h2_prediction``.
    """
    prediction = self.get_antigen_gene_blast_results(
        self.h2_prediction, FLJB_FASTA_PATH)
    self.h2_prediction = prediction

    if prediction.is_missing:
        prediction.h2 = '-'
        return

    if not prediction.is_perfect_match:
        top_hit = prediction.top_result
        hit_length = top_hit['length']
        hit_pident = top_hit['pident']

        # short lower %ID matches are treated as missing or '-' for H2;
        # so are short matches that are not flagged as truncated
        if hit_length <= 600 and (hit_pident < 88.0
                                  or not prediction.is_trunc):
            prediction.h2 = '-'
            prediction.is_missing = True
            return

        hits = pd.DataFrame(prediction.blast_results)
        hits = hits[(hits['mismatch'] <= 50) & (hits['length'] >= 700)]
        if len(hits) == 0:
            prediction.is_missing = True
            prediction.top_result = None
            prediction.h2 = '-'
            return

        # Hits of >=1000 length with <=5 mismatches are ranked by fewest
        # mismatches; when there are none, fall back to highest bitscore.
        long_hits = hits[(hits['mismatch'] <= 5) & (hits['length'] >= 1000)]
        if len(long_hits) > 0:
            ranked = long_hits.sort_values(by='mismatch')
        else:
            ranked = hits.sort_values(by='bitscore', ascending=False)

        best_hit = BlastReader.df_first_row_to_dict(ranked)
        coord_keys = ('qstart', 'qend', 'sstart', 'send', 'qlen', 'slen')
        best_is_trunc = BlastReader.is_blast_result_trunc(
            **{key: best_hit[key] for key in coord_keys})
        prediction.top_result = best_hit
        prediction.is_trunc = best_is_trunc

    prediction.h2 = get_antigen_name(prediction.top_result['qseqid'])
def get_antigen_gene_blast_results(self, model_obj, antigen_gene_fasta):
    """BLAST an antigen gene FASTA and copy the reader's summary to a model.

    Args:
        model_obj: object that receives the results as attributes; it is
            modified in place.
        antigen_gene_fasta: passed to ``self.blast_runner.blast_against_query``.

    Returns:
        The same ``model_obj``. ``is_missing`` is always set;
        ``blast_results``, ``top_result``, ``is_perfect_match`` and
        ``is_trunc`` are set only when the reader reports hits.
    """
    reader = BlastReader(
        self.blast_runner.blast_against_query(antigen_gene_fasta))

    no_hits = reader.is_missing
    model_obj.is_missing = no_hits
    if no_hits:
        return model_obj

    model_obj.blast_results = reader.df_dict()
    model_obj.top_result = reader.top_result()
    model_obj.is_perfect_match = reader.is_perfect_match
    model_obj.is_trunc = reader.is_trunc
    return model_obj
def predict(self, filter=None):
    """Fill in ``self.h1_prediction`` from BLAST hits for ``FLIC_FASTA_PATH``.

    ``get_antigen_gene_blast_results`` populates the prediction object
    first. If it reports no hits, or leaves ``top_result`` as ``None``,
    nothing else is changed. For a perfect match ``h1`` is named from the
    reader's top result. Otherwise the hits are re-filtered and re-ranked
    and ``h1`` is named from the best one; if no hit survives, the
    prediction is flagged missing and ``top_result`` and ``h1`` are set to
    ``None``.

    Args:
        filter: third argument for ``get_antigen_gene_blast_results``.
            When omitted (or ``None``) a fresh ``['N/A']`` list is used.
            The name shadows the builtin but is kept so keyword callers
            keep working.
            NOTE(review): presumably a list of query IDs to exclude;
            confirm against ``get_antigen_gene_blast_results``.

    Returns:
        None. Everything is written to ``self.h1_prediction``.
    """
    if filter is None:
        # A list literal in the signature is created once and shared by
        # every call, so any downstream mutation would leak between calls.
        filter = ['N/A']

    prediction = self.get_antigen_gene_blast_results(
        self.h1_prediction, FLIC_FASTA_PATH, filter)
    self.h1_prediction = prediction

    if prediction.is_missing or prediction.top_result is None:
        return

    if not prediction.is_perfect_match:
        all_hits = pd.DataFrame(prediction.blast_results)

        hits = all_hits[(all_hits['mismatch'] <= 25)
                        & (all_hits['length'] >= 700)]
        if hits.shape[0] == 0:
            # Nothing long enough: accept shorter hits, but only with
            # zero mismatches.
            hits = all_hits[(all_hits['mismatch'] <= 0)
                            & (all_hits['length'] >= 400)]
            if hits.shape[0] == 0:
                prediction.is_missing = True
                prediction.top_result = None
                prediction.h1 = None
                return

        # Hits of >=1000 length with <=5 mismatches are ranked by fewest
        # mismatches; when there are none, fall back to highest bitscore.
        long_hits = hits[(hits['mismatch'] <= 5) & (hits['length'] >= 1000)]
        if long_hits.shape[0] > 0:
            ranked = long_hits.sort_values(by='mismatch')
        else:
            ranked = hits.sort_values(by='bitscore', ascending=False)

        result_dict = BlastReader.df_first_row_to_dict(ranked)
        result_trunc = BlastReader.is_blast_result_trunc(
            qstart=result_dict['qstart'],
            qend=result_dict['qend'],
            sstart=result_dict['sstart'],
            send=result_dict['send'],
            qlen=result_dict['qlen'],
            slen=result_dict['slen'])
        prediction.top_result = result_dict
        prediction.is_trunc = result_trunc

    prediction.h1 = get_antigen_name(prediction.top_result['qseqid'])
def test_BlastReader_is_blast_result_trunc():
    """Check ``BlastReader.is_blast_result_trunc`` on hand-built coordinates.

    Every case uses ``qlen=100`` and ``slen=1000``; only the match
    coordinates differ.
    """
    not_truncated_cases = (
        # match found in the middle of the subject sequence
        dict(qstart=1, qend=100, sstart=101, send=200),
        # shorter match (-10bp) found in the middle of the subject sequence
        dict(qstart=1, qend=90, sstart=101, send=190),
        # shorter match (-20bp) found in the middle of the subject sequence
        dict(qstart=1, qend=80, sstart=101, send=180),
    )
    truncated_cases = (
        # truncated at the start of the subject
        dict(qstart=51, qend=100, sstart=1, send=50),
        # truncated at the end of the subject
        dict(qstart=51, qend=100, sstart=951, send=1000),
    )

    for coords in not_truncated_cases:
        assert not BlastReader.is_blast_result_trunc(
            qlen=100, slen=1000, **coords)

    for coords in truncated_cases:
        assert BlastReader.is_blast_result_trunc(
            qlen=100, slen=1000, **coords)
def run_cgmlst(blast_runner, full=False):
    """Perform in silico cgMLST on an input genome

    Args:
        blast_runner (sistr.src.blast_wrapper.BlastRunner): blastn runner
            object with genome fasta initialized
        full (bool): when true, BLAST against ``CGMLST_FULL_FASTA_PATH``
            instead of ``CGMLST_CENTROID_FASTA_PATH``; also forwarded to
            ``get_allele_sequences``

    Returns:
        dict: cgMLST ref genome match, distance to closest ref genome,
            subspecies and serovar predictions. Keys: ``distance``,
            ``genome_match``, ``serovar``, ``matching_alleles``,
            ``found_loci``, ``subspecies``, ``cgmlst330_ST``.
        dict: marker allele match results (seq, allele name, blastn
            results); empty when BLAST produced no results.
    """
    from sistr.src.serovar_prediction.constants import genomes_to_serovar

    df_cgmlst_profiles = ref_cgmlst_profiles()
    logging.debug('{} distinct cgMLST330 profiles'.format(
        df_cgmlst_profiles.shape[0]))

    logging.info('Running BLAST on serovar predictive cgMLST330 alleles')
    cgmlst_fasta_path = (CGMLST_CENTROID_FASTA_PATH
                         if not full else CGMLST_FULL_FASTA_PATH)
    blast_outfile = blast_runner.blast_against_query(cgmlst_fasta_path)
    logging.info('Reading BLAST output file "{}"'.format(blast_outfile))
    blast_reader = BlastReader(blast_outfile)

    if blast_reader.df is None:
        logging.error('No cgMLST330 alleles found!')
        return (
            {
                'distance': 1.0,
                'genome_match': None,
                'serovar': None,
                'matching_alleles': 0,
                # Same key set as the normal return below.
                'found_loci': 0,
                'subspecies': None,
                'cgmlst330_ST': None,
            },
            {},
        )

    logging.info('Found {} cgMLST330 allele BLAST results'.format(
        blast_reader.df.shape[0]))

    df_cgmlst_blastn = process_cgmlst_results(blast_reader.df)
    marker_match_results = matches_to_marker_results(
        df_cgmlst_blastn[df_cgmlst_blastn.is_match])
    contig_blastn_records = alleles_to_retrieve(df_cgmlst_blastn)
    retrieved_marker_alleles = get_allele_sequences(blast_runner.fasta_path,
                                                    contig_blastn_records,
                                                    full=full)
    logging.info('Type retrieved_marker_alleles %s',
                 type(retrieved_marker_alleles))

    # Merge direct matches with retrieved alleles, then add an empty
    # placeholder for every profile marker that has no result at all.
    all_marker_results = marker_match_results.copy()
    for marker, res in retrieved_marker_alleles.items():
        all_marker_results[marker] = res
    for marker in df_cgmlst_profiles.columns:
        if marker not in all_marker_results:
            all_marker_results[marker] = {
                'blast_result': None,
                'name': None,
                'seq': None,
            }

    # Keep only markers whose allele name converts to an integer.
    cgmlst_results = {}
    found_cgmlst_genes = 0
    for marker, res in all_marker_results.items():
        try:
            cgmlst_results[marker] = int(res['name'])
        except (KeyError, TypeError, ValueError, OverflowError):
            # Missing or non-integer allele name: log it and move on.
            logging.error('Missing cgmlst_results for %s', marker)
            logging.debug(res)
        else:
            found_cgmlst_genes += 1

    logging.info(
        'Calculating number of matching alleles to serovar predictive cgMLST330 profiles'
    )
    df_relatives = find_closest_related_genome(cgmlst_results,
                                               df_cgmlst_profiles)
    genome_serovar_dict = genomes_to_serovar()
    df_relatives['serovar'] = [
        genome_serovar_dict[genome] for genome in df_relatives.index
    ]
    logging.debug('Top 5 serovar predictive cgMLST profiles:\n{}'.format(
        df_relatives.head()))

    spp = None
    subspeciation_tuple = cgmlst_subspecies_call(df_relatives)
    if subspeciation_tuple is not None:
        spp, distance, spp_counter = subspeciation_tuple
        logging.info(
            'Top subspecies by cgMLST is "{}" (min dist={}, Counter={})'.
            format(spp, distance, spp_counter))
    else:
        logging.warning('Subspeciation by cgMLST was not possible!')

    cgmlst_serovar = None
    cgmlst_matching_genome = None
    cgmlst_matching_alleles = 0
    cgmlst_distance = 1.0
    # Does not depend on the closest relative, so it is assigned up front;
    # otherwise it would be unbound when df_relatives has no rows.
    cgmlst_found_loci = found_cgmlst_genes

    # Only the first row of df_relatives is used.
    top_relative = next(df_relatives.iterrows(), None)
    if top_relative is not None:
        idx, row = top_relative
        cgmlst_distance = row['distance']
        cgmlst_matching_alleles = row['matching']
        if cgmlst_distance <= 1.0:
            cgmlst_serovar = row['serovar']
            cgmlst_matching_genome = idx
        logging.info(
            'Top serovar by cgMLST profile matching: "{}" with {} matching alleles, distance={:.1%}'
            .format(cgmlst_serovar, cgmlst_matching_alleles, cgmlst_distance))

    # The sequence type needs an allele name for every marker, taken in
    # sorted marker order; stop at the first marker without one.
    cgmlst_st = None
    cgmlst_markers_sorted = sorted(all_marker_results.keys())
    cgmlst_allele_names = []
    marker = None
    for marker in cgmlst_markers_sorted:
        try:
            aname = all_marker_results[marker]['name']
            has_name = bool(aname)
        except (KeyError, TypeError, ValueError):
            break
        if not has_name:
            break
        cgmlst_allele_names.append(str(aname))

    if len(cgmlst_allele_names) == len(cgmlst_markers_sorted):
        cgmlst_st = allele_name('-'.join(cgmlst_allele_names))
        logging.info('cgMLST330 Sequence Type=%s', cgmlst_st)
    else:
        logging.warning(
            'Could not compute cgMLST330 Sequence Type due to missing data (marker %s)',
            marker)

    return (
        {
            'distance': cgmlst_distance,
            'genome_match': cgmlst_matching_genome,
            'serovar': cgmlst_serovar,
            'matching_alleles': cgmlst_matching_alleles,
            'found_loci': cgmlst_found_loci,
            'subspecies': spp,
            'cgmlst330_ST': cgmlst_st,
        },
        all_marker_results,
    )
def test_BlastReader(blast_runner):
    """The top hit for ``WZX_FASTA_PATH`` must resolve to antigen 'O58'.

    Args:
        blast_runner: object providing ``run_blast``.
            NOTE(review): presumably a pytest fixture defined elsewhere;
            confirm.
    """
    reader = BlastReader(blast_runner.run_blast(WZX_FASTA_PATH))
    best_hit = reader.top_result()
    antigen = get_antigen_name(best_hit['qseqid'])
    assert antigen == 'O58'