Esempio n. 1
0
    def get_antigen_gene_blast_results(self, model_obj, antigen_gene_fasta):
        """BLAST an antigen gene fasta and copy the outcome onto `model_obj`.

        Runs `self.blast_runner.blast_against_query` on `antigen_gene_fasta`
        and wraps the output file in a `BlastReader`.

        Args:
            model_obj: object that receives the result attributes
                (`is_missing`, and when a hit exists `blast_results`,
                `top_result`, `is_perfect_match`, `is_trunc`).
            antigen_gene_fasta: query passed to the BLAST runner.

        Returns:
            The same `model_obj`, mutated in place.
        """
        reader = BlastReader(
            self.blast_runner.blast_against_query(antigen_gene_fasta))

        missing = reader.is_missing
        model_obj.is_missing = missing
        if missing:
            # Nothing was found: the remaining attributes are left untouched.
            return model_obj

        model_obj.blast_results = reader.df_dict()
        model_obj.top_result = reader.top_result()
        model_obj.is_perfect_match = reader.is_perfect_match
        model_obj.is_trunc = reader.is_trunc
        return model_obj
Esempio n. 2
0
def run_cgmlst(blast_runner, full=False):
    """Perform in silico cgMLST on an input genome

    Args:
        blast_runner (sistr.src.blast_wrapper.BlastRunner): blastn runner object with genome fasta initialized
        full (bool): when true, BLAST against `CGMLST_FULL_FASTA_PATH` instead of
            `CGMLST_CENTROID_FASTA_PATH`; the flag is also forwarded to
            `get_allele_sequences`

    Returns:
        dict: cgMLST ref genome match, distance to closest ref genome, subspecies and serovar predictions
            (keys: distance, genome_match, serovar, matching_alleles, found_loci, subspecies, cgmlst330_ST)
        dict: marker allele match results (seq, allele name, blastn results)
    """
    # Imported locally, as in the original code, rather than at module level.
    from sistr.src.serovar_prediction.constants import genomes_to_serovar

    df_cgmlst_profiles = ref_cgmlst_profiles()

    logging.debug('{} distinct cgMLST330 profiles'.format(
        df_cgmlst_profiles.shape[0]))

    logging.info('Running BLAST on serovar predictive cgMLST330 alleles')
    cgmlst_fasta_path = CGMLST_FULL_FASTA_PATH if full else CGMLST_CENTROID_FASTA_PATH
    blast_outfile = blast_runner.blast_against_query(cgmlst_fasta_path)
    logging.info('Reading BLAST output file "{}"'.format(blast_outfile))
    blast_reader = BlastReader(blast_outfile)
    if blast_reader.df is None:
        logging.error('No cgMLST330 alleles found!')
        # Same key set as the normal return so callers can read 'found_loci'
        # without special-casing the "no BLAST hits" outcome.
        return (
            {
                'distance': 1.0,
                'genome_match': None,
                'serovar': None,
                'matching_alleles': 0,
                'found_loci': 0,
                'subspecies': None,
                'cgmlst330_ST': None,
            },
            {},
        )
    logging.info('Found {} cgMLST330 allele BLAST results'.format(
        blast_reader.df.shape[0]))

    df_cgmlst_blastn = process_cgmlst_results(blast_reader.df)

    marker_match_results = matches_to_marker_results(
        df_cgmlst_blastn[df_cgmlst_blastn.is_match])
    contig_blastn_records = alleles_to_retrieve(df_cgmlst_blastn)
    retrieved_marker_alleles = get_allele_sequences(blast_runner.fasta_path,
                                                    contig_blastn_records,
                                                    full=full)
    logging.info('Type retrieved_marker_alleles %s',
                 type(retrieved_marker_alleles))

    # Retrieved alleles override direct matches for the same marker.
    all_marker_results = marker_match_results.copy()
    for marker, res in retrieved_marker_alleles.items():
        all_marker_results[marker] = res
    # Every profile marker gets an entry, even when nothing was found for it.
    for marker in df_cgmlst_profiles.columns:
        if marker not in all_marker_results:
            all_marker_results[marker] = {
                'blast_result': None,
                'name': None,
                'seq': None,
            }

    # Keep only markers whose allele name converts to an integer.
    cgmlst_results = {}
    for marker, res in all_marker_results.items():
        try:
            allele_number = int(res['name'])
        except (KeyError, TypeError, ValueError):
            # Missing 'name', a None result/name, or a non-numeric name.
            logging.error('Missing cgmlst_results for %s', marker)
            logging.debug(res)
        else:
            cgmlst_results[marker] = allele_number
    found_cgmlst_genes = len(cgmlst_results)

    logging.info(
        'Calculating number of matching alleles to serovar predictive cgMLST330 profiles'
    )
    df_relatives = find_closest_related_genome(cgmlst_results,
                                               df_cgmlst_profiles)
    genome_serovar_dict = genomes_to_serovar()
    df_relatives['serovar'] = [
        genome_serovar_dict[genome] for genome in df_relatives.index
    ]
    logging.debug('Top 5 serovar predictive cgMLST profiles:\n{}'.format(
        df_relatives.head()))

    spp = None
    subspeciation_tuple = cgmlst_subspecies_call(df_relatives)
    if subspeciation_tuple is not None:
        spp, distance, spp_counter = subspeciation_tuple
        logging.info(
            'Top subspecies by cgMLST is "{}" (min dist={}, Counter={})'.
            format(spp, distance, spp_counter))
    else:
        logging.warning('Subspeciation by cgMLST was not possible!')

    # Defaults are reported unchanged when df_relatives has no rows.
    cgmlst_serovar = None
    cgmlst_matching_genome = None
    cgmlst_matching_alleles = 0
    cgmlst_distance = 1.0
    # Only the first row of df_relatives is used as the top match.
    top_relative = next(df_relatives.iterrows(), None)
    if top_relative is not None:
        idx, row = top_relative
        cgmlst_distance = row['distance']
        cgmlst_matching_alleles = row['matching']
        cgmlst_serovar = row['serovar'] if cgmlst_distance <= 1.0 else None
        cgmlst_matching_genome = idx if cgmlst_distance <= 1.0 else None
        logging.info(
            'Top serovar by cgMLST profile matching: "{}" with {} matching alleles, distance={:.1%}'
            .format(cgmlst_serovar, cgmlst_matching_alleles, cgmlst_distance))

    # The sequence type is derived from the allele names of ALL markers in
    # sorted marker order; stop at the first marker without a usable name.
    cgmlst_st = None
    cgmlst_markers_sorted = sorted(all_marker_results.keys())
    cgmlst_allele_names = []
    marker = None
    for marker in cgmlst_markers_sorted:
        try:
            aname = all_marker_results[marker]['name']
        except (KeyError, TypeError):
            break
        if not aname:
            break
        cgmlst_allele_names.append(str(aname))
    if len(cgmlst_allele_names) == len(cgmlst_markers_sorted):
        cgmlst_st = allele_name('-'.join(cgmlst_allele_names))
        logging.info('cgMLST330 Sequence Type=%s', cgmlst_st)
    else:
        # `marker` is the first marker (in sorted order) lacking a name.
        logging.warning(
            'Could not compute cgMLST330 Sequence Type due to missing data (marker %s)',
            marker)
    return (
        {
            'distance': cgmlst_distance,
            'genome_match': cgmlst_matching_genome,
            'serovar': cgmlst_serovar,
            'matching_alleles': cgmlst_matching_alleles,
            # Read directly from the counter, so it is defined even when
            # df_relatives is empty.
            'found_loci': found_cgmlst_genes,
            'subspecies': spp,
            'cgmlst330_ST': cgmlst_st,
        },
        all_marker_results,
    )
Esempio n. 3
0
def test_BlastReader(blast_runner):
    """BLAST `WZX_FASTA_PATH` and check the top result's qseqid maps to 'O58'."""
    reader = BlastReader(blast_runner.run_blast(WZX_FASTA_PATH))
    best_hit = reader.top_result()
    antigen = get_antigen_name(best_hit['qseqid'])
    assert antigen == 'O58'