Esempio n. 1
0
def cgmlst_profiles_df(fastas, cgmlst_results):
    """Build a genome-by-marker table of cgMLST allele designations.

    Args:
        fastas: FASTA paths, paired positionally with ``cgmlst_results``;
            each path is turned into a genome name with
            ``genome_name_from_fasta_path``.
        cgmlst_results: one mapping per FASTA of marker -> result dict. Only
            the ``'name'`` entry of each result dict is read.

    Returns:
        pandas.DataFrame: one row per genome name and one column per marker.
        Each cell is ``int(name)``, or ``None`` when the result's ``'name'``
        is ``None``. If two paths give the same genome name, the later one
        replaces the earlier one.
    """

    def _allele_number(marker_result):
        # A marker without an allele designation stays None instead of
        # being converted to an integer.
        allele_name = marker_result['name']
        if allele_name is None:
            return None
        return int(allele_name)

    profiles = {}
    for fasta_path, marker_results in zip(fastas, cgmlst_results):
        genome = genome_name_from_fasta_path(fasta_path)
        profiles[genome] = {
            marker: _allele_number(marker_result)
            for marker, marker_result in marker_results.items()
        }
    # A dict of dicts yields genomes as columns; transpose so genomes are rows.
    return pd.DataFrame(profiles).transpose()
Esempio n. 2
0
def cgmlst_profiles_df(fastas, cgmlst_results):
    """Build a genome-by-marker table of cgMLST allele designations.

    Args:
        fastas: FASTA paths, paired positionally with ``cgmlst_results``;
            each path is turned into a genome name with
            ``genome_name_from_fasta_path``.
        cgmlst_results: one mapping per FASTA of marker -> result dict. Only
            the ``'name'`` entry of each result dict is read.

    Returns:
        pandas.DataFrame: one row per genome name and one column per marker.
        Each cell is ``int(name)``, or ``None`` when the result's ``'name'``
        is ``None``. If two paths give the same genome name, the later one
        replaces the earlier one.
    """

    def _allele_number(marker_result):
        # A marker without an allele designation stays None instead of
        # being converted to an integer.
        allele_name = marker_result['name']
        if allele_name is None:
            return None
        return int(allele_name)

    profiles = {}
    for fasta_path, marker_results in zip(fastas, cgmlst_results):
        genome = genome_name_from_fasta_path(fasta_path)
        profiles[genome] = {
            marker: _allele_number(marker_result)
            for marker, marker_result in marker_results.items()
        }
    # A dict of dicts yields genomes as columns; transpose so genomes are rows.
    return pd.DataFrame(profiles).transpose()
Esempio n. 3
0
def sketch_fasta(fasta_path, outdir):
    """Sketch a genome FASTA file with Mash and return the sketch file path.

    Args:
        fasta_path (str): path of the FASTA file to sketch. The genome name
            is derived from it with ``genome_name_from_fasta_path``.
        outdir (str): directory the sketch file is written to

    Returns:
        str: path of the sketch file, ``<outdir>/<genome name>.msh``

    Raises:
        AssertionError: if the sketch file does not exist after Mash exits
    """
    genome = genome_name_from_fasta_path(fasta_path)
    # Mash is given the path without extension; the '.msh' suffix is
    # appended below when looking for the file it wrote.
    sketch_prefix = os.path.join(outdir, genome)
    command = ['mash', 'sketch', '-o', sketch_prefix, fasta_path]
    logging.info('Running Mash sketch with command: %s', ' '.join(command))
    # Block until Mash exits. Its exit status is not inspected; success is
    # judged only by the presence of the sketch file.
    Popen(command).wait()
    sketch_file = sketch_prefix + '.msh'
    assert os.path.exists(sketch_file), (
        'Mash sketch for genome {} was not created at {}'.format(genome, sketch_file))
    return sketch_file
Esempio n. 4
0
def sketch_fasta(fasta_path, outdir):
    """Sketch a genome FASTA file with Mash and return the sketch file path.

    Args:
        fasta_path (str): path of the FASTA file to sketch. The genome name
            is derived from it with ``genome_name_from_fasta_path``.
        outdir (str): directory the sketch file is written to

    Returns:
        str: path of the sketch file, ``<outdir>/<genome name>.msh``

    Raises:
        AssertionError: if the sketch file does not exist after Mash exits
    """
    genome = genome_name_from_fasta_path(fasta_path)
    # Mash is given the path without extension; the '.msh' suffix is
    # appended below when looking for the file it wrote.
    sketch_prefix = os.path.join(outdir, genome)
    command = ['mash', 'sketch', '-o', sketch_prefix, fasta_path]
    logging.info('Running Mash sketch with command: %s', ' '.join(command))
    # Block until Mash exits. Its exit status is not inspected; success is
    # judged only by the presence of the sketch file.
    Popen(command).wait()
    sketch_file = sketch_prefix + '.msh'
    assert os.path.exists(sketch_file), (
        'Mash sketch for genome {} was not created at {}'.format(genome, sketch_file))
    return sketch_file
Esempio n. 5
0
def run_sistr(input_fasta, tmp_dir):
    """Run the cgMLST and antigen-BLAST serovar prediction on one genome FASTA.

    A working directory path of the form
    ``<tmp_dir>/<YYYYmmddHHMMSS>-SISTR-<genome name>`` is handed to
    ``BlastRunner``. The runner's ``cleanup()`` is called whether the
    analysis succeeds or fails, provided the runner was constructed.

    Args:
        input_fasta (str): path of the genome FASTA file to analyse
        tmp_dir (str): parent directory for the per-genome working directory

    Returns:
        tuple: ``(prediction, cgmlst_results)``. ``prediction`` is the object
        returned by ``SerovarPredictor.get_serovar_prediction()`` after
        ``merge_cgmlst_prediction`` and ``overall_serovar_call`` have been
        applied to it. ``cgmlst_results`` is the second value returned by
        ``run_cgmlst``.

    Raises:
        AssertionError: if ``input_fasta`` does not exist
    """
    blast_runner = None
    try:
        assert os.path.exists(
            input_fasta), "Input fasta file '%s' must exist!" % input_fasta
        fasta_filename = os.path.basename(input_fasta)
        genome_name = genome_name_from_fasta_path(input_fasta)
        dtnow = datetime.now()
        genome_tmp_dir = os.path.join(
            tmp_dir,
            dtnow.strftime("%Y%m%d%H%M%S") + '-' + 'SISTR' + '-' + genome_name)
        blast_runner = BlastRunner(input_fasta, genome_tmp_dir)
        logging.info(
            'Initializing temporary analysis directory "%s" and preparing for BLAST searching.',
            genome_tmp_dir)
        blast_runner.prep_blast()
        logging.info('Temporary FASTA file copied to %s',
                     blast_runner.tmp_fasta_path)

        cgmlst_prediction, cgmlst_results = run_cgmlst(blast_runner)

        # The cgMLST subspecies call is passed on to the serovar predictor.
        spp = cgmlst_prediction['subspecies']

        serovar_predictor = SerovarPredictor(blast_runner, spp)
        serovar_predictor.predict_serovar_from_antigen_blast()
        prediction = serovar_predictor.get_serovar_prediction()
        merge_cgmlst_prediction(prediction, cgmlst_prediction)
        overall_serovar_call(prediction, serovar_predictor)
        logging.info(
            '%s | Antigen gene BLAST serovar prediction: "%s" serogroup=%s:H1=%s:H2=%s',
            fasta_filename, prediction.serovar_antigen, prediction.serogroup,
            prediction.h1, prediction.h2)
        logging.info('%s | Subspecies prediction: %s', fasta_filename, spp)
        logging.info('%s | Overall serovar prediction: %s', fasta_filename,
                     prediction.serovar)
    finally:
        # blast_runner is still None when the failure happened before it was
        # constructed (e.g. the input file is missing). Touching it then
        # would raise AttributeError and hide the original error.
        if blast_runner is not None:
            logging.info('Deleting temporary working directory at %s',
                         blast_runner.tmp_work_dir)
            blast_runner.cleanup()
    return prediction, cgmlst_results
Esempio n. 6
0
def run_sistr(input_fasta, tmp_dir):
    """Run the cgMLST and antigen-BLAST serovar prediction on one genome FASTA.

    A working directory path of the form
    ``<tmp_dir>/<YYYYmmddHHMMSS>-SISTR-<genome name>`` is handed to
    ``BlastRunner``. The runner's ``cleanup()`` is called whether the
    analysis succeeds or fails, provided the runner was constructed.

    Args:
        input_fasta (str): path of the genome FASTA file to analyse
        tmp_dir (str): parent directory for the per-genome working directory

    Returns:
        tuple: ``(prediction, cgmlst_results)``. ``prediction`` is the object
        returned by ``SerovarPredictor.get_serovar_prediction()`` after
        ``merge_cgmlst_prediction`` and ``overall_serovar_call`` have been
        applied to it. ``cgmlst_results`` is the second value returned by
        ``run_cgmlst``.

    Raises:
        AssertionError: if ``input_fasta`` does not exist
    """
    blast_runner = None
    try:
        assert os.path.exists(input_fasta), "Input fasta file '%s' must exist!" % input_fasta
        fasta_filename = os.path.basename(input_fasta)
        genome_name = genome_name_from_fasta_path(input_fasta)
        dtnow = datetime.now()
        genome_tmp_dir = os.path.join(tmp_dir, dtnow.strftime("%Y%m%d%H%M%S") + '-' + 'SISTR' + '-' + genome_name)
        blast_runner = BlastRunner(input_fasta, genome_tmp_dir)
        logging.info('Initializing temporary analysis directory "%s" and preparing for BLAST searching.',
                     genome_tmp_dir)
        blast_runner.prep_blast()
        logging.info('Temporary FASTA file copied to %s', blast_runner.tmp_fasta_path)

        cgmlst_prediction, cgmlst_results = run_cgmlst(blast_runner)

        # The cgMLST subspecies call is passed on to the serovar predictor.
        spp = cgmlst_prediction['subspecies']

        serovar_predictor = SerovarPredictor(blast_runner, spp)
        serovar_predictor.predict_serovar_from_antigen_blast()
        prediction = serovar_predictor.get_serovar_prediction()
        merge_cgmlst_prediction(prediction, cgmlst_prediction)
        overall_serovar_call(prediction, serovar_predictor)
        logging.info('%s | Antigen gene BLAST serovar prediction: "%s" serogroup=%s:H1=%s:H2=%s',
                     fasta_filename,
                     prediction.serovar_antigen,
                     prediction.serogroup,
                     prediction.h1,
                     prediction.h2)
        logging.info('%s | Subspecies prediction: %s',
                     fasta_filename,
                     spp)
        logging.info('%s | Overall serovar prediction: %s',
                     fasta_filename,
                     prediction.serovar)
    finally:
        # blast_runner is still None when the failure happened before it was
        # constructed (e.g. the input file is missing). Touching it then
        # would raise AttributeError and hide the original error.
        if blast_runner is not None:
            logging.info('Deleting temporary working directory at %s', blast_runner.tmp_work_dir)
            blast_runner.cleanup()
    return prediction, cgmlst_results
Esempio n. 7
0
def main():
    """Command-line entry point: add user genomes to the sistr_cmd data files.

    Steps, in order:

    1. Parse the command-line arguments and initialise console logging.
    2. Check that every input FASTA exists and prepare the output directory.
       An existing directory is only accepted when ``force`` is set.
    3. If a serovar table path was given, read it (CSV when the path ends in
       ``.csv``, tab-delimited otherwise), check that it has the ``genome``
       and ``serovar`` columns and lists every input genome, and warn about
       serovars that are absent from the table at ``SEROVAR_TABLE_PATH``.
    4. Call ``run_sistr`` for each genome: serially when ``threads == 1``,
       through a ``multiprocessing.Pool`` otherwise.
    5. Pass the results to ``write_cgmlst_fasta``,
       ``write_cgmlst_profiles_csv``, ``write_serovar_and_spp_tables`` and
       ``create_merge_mash_sketches``, then call ``run_allele_reduction``
       with ``cgmlst-centroid.fasta`` as the output path.

    Raises:
        AssertionError: if no FASTA was given, an input FASTA or the serovar
            table is missing, the output directory could not be created, or
            the serovar table lacks an expected column.
        Exception: if the output directory exists and ``force`` is not set,
            or an input genome is not listed in the serovar table.
    """
    parser = init_parser()
    args = parser.parse_args()
    init_console_logger(args.verbose)
    logging.debug(args)
    input_fastas = args.fastas
    outdir = args.outdir
    tmp_dir = args.tmp_dir
    serovar_table_path = args.serovar_table
    threads = args.threads
    force = args.force

    assert len(input_fastas) > 0, 'No FASTA files specified!'
    for input_fasta in input_fastas:
        assert os.path.exists(input_fasta), 'Genome FASTA file does not exist at "{}"'.format(input_fasta)
    genome_names = [genome_name_from_fasta_path(x) for x in input_fastas]
    logging.info('You have specified %s genomes to add to current sistr_cmd data files! %s',
                 len(genome_names),
                 genome_names)

    if os.path.exists(outdir):
        if not force:
            raise Exception('Output directory already exists at {}!'.format(outdir))
        logging.warning('Using existing output directory at %s', outdir)
    else:
        try:
            os.makedirs(outdir)
        except OSError as err:
            # Record why creation failed; the assertion below decides
            # whether the run can continue.
            logging.error('Could not create output directory "%s": %s', outdir, err)
    assert os.path.exists(outdir), 'Output directory could not be created!'

    if serovar_table_path:
        assert os.path.exists(serovar_table_path), 'Provided serovar table path does not exist! {}'.format(
            serovar_table_path)
        logging.info('Parsing serovar table from "%s"', serovar_table_path)
        # Plain suffix test: the previous regex left the dot unescaped and so
        # also matched names such as "tablecsv".
        if serovar_table_path.endswith('.csv'):
            logging.info('Trying to read serovar table "%s" as CSV', serovar_table_path)
            df_serovar = pd.read_csv(serovar_table_path)
        else:
            logging.info('Trying to read serovar table "%s" as tab-delimited', serovar_table_path)
            df_serovar = pd.read_table(serovar_table_path)
        expected_columns = ['genome', 'serovar']
        # Require the expected columns to be present; additional columns
        # (such as the optional "subspecies" checked below) are allowed.
        missing_columns = [x for x in expected_columns if x not in df_serovar.columns]
        assert not missing_columns, 'User serovar table did not contain expected columns {}'.format(
            expected_columns)
        if 'subspecies' not in df_serovar.columns:
            logging.warning(
                'User serovar table did not contain "subspecies" column so the sistr_cmd subspecies prediction will be used!')
        genome_names_series = pd.Series(genome_names)
        genomes_in_serovar_table = genome_names_series.isin(df_serovar.genome)
        if not np.all(genomes_in_serovar_table):
            missing_genomes = ', '.join([x for x in genome_names_series[~genomes_in_serovar_table]])
            logging.error('The following genomes were not found in the serovar table: %s', missing_genomes)
            raise Exception('Not all user provided genome FASTA files in the provided serovar table!')

        df_wklm = pd.read_csv(SEROVAR_TABLE_PATH)
        logging.info('Checking for non-standard serovar designations')
        serovars_not_in_wklm = df_serovar.serovar[~df_serovar.serovar.isin(df_wklm.Serovar)]
        # Series.iteritems() and DataFrame.ix no longer exist in current
        # pandas; items() and label-based .loc are the replacements.
        for row_idx, serovar in serovars_not_in_wklm.items():
            logging.warning('Non-standard serovar %s at row %s for genome %s!', serovar, row_idx,
                            df_serovar.loc[row_idx, 'genome'])
    else:
        logging.warning('No genome to serovar table specified! Using SISTR serovar predictions')
        df_serovar = None

    if threads == 1:
        logging.info('Serial single threaded run mode on %s genomes', len(input_fastas))
        outputs = [run_sistr(input_fasta, tmp_dir) for input_fasta in input_fastas]
    else:
        from multiprocessing import Pool

        logging.info('Initializing thread pool with %s threads', threads)
        pool = Pool(processes=threads)
        try:
            logging.info('Running SISTR analysis asynchronously on %s genomes', len(input_fastas))
            res = [pool.apply_async(run_sistr, (input_fasta, tmp_dir)) for input_fasta in input_fastas]

            logging.info('Getting SISTR analysis results')
            outputs = [x.get() for x in res]
        finally:
            # Release the worker processes even if one of the analyses failed.
            pool.close()
            pool.join()
    # collect results from sistr analysis
    prediction_outputs = [x for x, y in outputs]
    cgmlst_results = [y for x, y in outputs]
    # create some output dirs
    data_outdir = create_subdirs(outdir, 'data')
    cgmlst_outdir = create_subdirs(outdir, 'data', 'cgmlst')
    sketch_outdir = create_subdirs(outdir, 'mash-sketches')
    # write files with new and old data
    cgmlst_fasta = write_cgmlst_fasta(cgmlst_outdir, cgmlst_results)
    write_cgmlst_profiles_csv(cgmlst_outdir, cgmlst_results, genome_names)
    write_serovar_and_spp_tables(data_outdir, df_serovar, prediction_outputs, genome_names)
    create_merge_mash_sketches(input_fastas, data_outdir, sketch_outdir)

    centroid_alleles_path = os.path.join(cgmlst_outdir, 'cgmlst-centroid.fasta')
    run_allele_reduction(cgmlst_fasta, centroid_alleles_path, threads=threads)

    logging.info('Done!')
Esempio n. 8
0
def main():
    """Command-line entry point: add user genomes to the sistr_cmd data files.

    Steps, in order:

    1. Parse the command-line arguments and initialise console logging.
    2. Check that every input FASTA exists and prepare the output directory.
       An existing directory is only accepted when ``force`` is set.
    3. If a serovar table path was given, read it (CSV when the path ends in
       ``.csv``, tab-delimited otherwise), check that it has the ``genome``
       and ``serovar`` columns and lists every input genome, and warn about
       serovars that are absent from the table at ``SEROVAR_TABLE_PATH``.
    4. Call ``run_sistr`` for each genome: serially when ``threads == 1``,
       through a ``multiprocessing.Pool`` otherwise.
    5. Pass the results to ``write_cgmlst_fasta``,
       ``write_cgmlst_profiles_csv``, ``write_serovar_and_spp_tables`` and
       ``create_merge_mash_sketches``, then call ``run_allele_reduction``
       with ``cgmlst-centroid.fasta`` as the output path.

    Raises:
        AssertionError: if no FASTA was given, an input FASTA or the serovar
            table is missing, the output directory could not be created, or
            the serovar table lacks an expected column.
        Exception: if the output directory exists and ``force`` is not set,
            or an input genome is not listed in the serovar table.
    """
    parser = init_parser()
    args = parser.parse_args()
    init_console_logger(args.verbose)
    logging.debug(args)
    input_fastas = args.fastas
    outdir = args.outdir
    tmp_dir = args.tmp_dir
    serovar_table_path = args.serovar_table
    threads = args.threads
    force = args.force

    assert len(input_fastas) > 0, 'No FASTA files specified!'
    for input_fasta in input_fastas:
        assert os.path.exists(
            input_fasta), 'Genome FASTA file does not exist at "{}"'.format(
                input_fasta)
    genome_names = [genome_name_from_fasta_path(x) for x in input_fastas]
    logging.info(
        'You have specified %s genomes to add to current sistr_cmd data files! %s',
        len(genome_names), genome_names)

    if os.path.exists(outdir):
        if not force:
            raise Exception(
                'Output directory already exists at {}!'.format(outdir))
        logging.warning('Using existing output directory at %s', outdir)
    else:
        try:
            os.makedirs(outdir)
        except OSError as err:
            # Record why creation failed; the assertion below decides
            # whether the run can continue.
            logging.error('Could not create output directory "%s": %s',
                          outdir, err)
    assert os.path.exists(outdir), 'Output directory could not be created!'

    if serovar_table_path:
        assert os.path.exists(
            serovar_table_path
        ), 'Provided serovar table path does not exist! {}'.format(
            serovar_table_path)
        logging.info('Parsing serovar table from "%s"', serovar_table_path)
        # Plain suffix test: the previous regex left the dot unescaped and so
        # also matched names such as "tablecsv".
        if serovar_table_path.endswith('.csv'):
            logging.info('Trying to read serovar table "%s" as CSV',
                         serovar_table_path)
            df_serovar = pd.read_csv(serovar_table_path)
        else:
            logging.info('Trying to read serovar table "%s" as tab-delimited',
                         serovar_table_path)
            df_serovar = pd.read_table(serovar_table_path)
        expected_columns = ['genome', 'serovar']
        # Require the expected columns to be present; additional columns
        # (such as the optional "subspecies" checked below) are allowed.
        missing_columns = [
            x for x in expected_columns if x not in df_serovar.columns
        ]
        assert not missing_columns, 'User serovar table did not contain expected columns {}'.format(
            expected_columns)
        if 'subspecies' not in df_serovar.columns:
            logging.warning(
                'User serovar table did not contain "subspecies" column so the sistr_cmd subspecies prediction will be used!'
            )
        genome_names_series = pd.Series(genome_names)
        genomes_in_serovar_table = genome_names_series.isin(df_serovar.genome)
        if not np.all(genomes_in_serovar_table):
            missing_genomes = ', '.join(
                [x for x in genome_names_series[~genomes_in_serovar_table]])
            logging.error(
                'The following genomes were not found in the serovar table: %s',
                missing_genomes)
            raise Exception(
                'Not all user provided genome FASTA files in the provided serovar table!'
            )

        df_wklm = pd.read_csv(SEROVAR_TABLE_PATH)
        logging.info('Checking for non-standard serovar designations')
        serovars_not_in_wklm = df_serovar.serovar[~df_serovar.serovar.
                                                  isin(df_wklm.Serovar)]
        # Series.iteritems() and DataFrame.ix no longer exist in current
        # pandas; items() and label-based .loc are the replacements.
        for row_idx, serovar in serovars_not_in_wklm.items():
            logging.warning('Non-standard serovar %s at row %s for genome %s!',
                            serovar, row_idx,
                            df_serovar.loc[row_idx, 'genome'])
    else:
        logging.warning(
            'No genome to serovar table specified! Using SISTR serovar predictions'
        )
        df_serovar = None

    if threads == 1:
        logging.info('Serial single threaded run mode on %s genomes',
                     len(input_fastas))
        outputs = [
            run_sistr(input_fasta, tmp_dir) for input_fasta in input_fastas
        ]
    else:
        from multiprocessing import Pool

        logging.info('Initializing thread pool with %s threads', threads)
        pool = Pool(processes=threads)
        try:
            logging.info('Running SISTR analysis asynchronously on %s genomes',
                         len(input_fastas))
            res = [
                pool.apply_async(run_sistr, (input_fasta, tmp_dir))
                for input_fasta in input_fastas
            ]

            logging.info('Getting SISTR analysis results')
            outputs = [x.get() for x in res]
        finally:
            # Release the worker processes even if one of the analyses failed.
            pool.close()
            pool.join()
    # collect results from sistr analysis
    prediction_outputs = [x for x, y in outputs]
    cgmlst_results = [y for x, y in outputs]
    # create some output dirs
    data_outdir = create_subdirs(outdir, 'data')
    cgmlst_outdir = create_subdirs(outdir, 'data', 'cgmlst')
    sketch_outdir = create_subdirs(outdir, 'mash-sketches')
    # write files with new and old data
    cgmlst_fasta = write_cgmlst_fasta(cgmlst_outdir, cgmlst_results)
    write_cgmlst_profiles_csv(cgmlst_outdir, cgmlst_results, genome_names)
    write_serovar_and_spp_tables(data_outdir, df_serovar, prediction_outputs,
                                 genome_names)
    create_merge_mash_sketches(input_fastas, data_outdir, sketch_outdir)

    centroid_alleles_path = os.path.join(cgmlst_outdir,
                                         'cgmlst-centroid.fasta')
    run_allele_reduction(cgmlst_fasta, centroid_alleles_path, threads=threads)

    logging.info('Done!')