def main():
    """Command-line entry point for cgMLST allele reduction.

    Parses the command line, configures console logging, checks the input
    and output paths, and then calls ``run_allele_reduction`` with the
    thread count, word size and cluster threshold taken from the arguments.

    When no output path is given, one is derived from the input path by
    stripping the trailing extension and appending ``-centroid.fasta``; the
    file is placed in the same directory as the input.

    Raises:
        Exception: if the input FASTA does not exist, or if something
            already exists at the output path.
    """
    cli_args = init_arg_parser().parse_args()
    init_console_logger(cli_args.verbose)

    fasta_path = cli_args.input
    if not os.path.exists(fasta_path):
        message = 'Input fasta file does not exist at {}. cgMLST fasta file required!'.format(
            fasta_path)
        logging.error(message)
        raise Exception(message)

    destination = cli_args.output
    if destination is None:
        # Build "<input name without extension>-centroid.fasta" beside the input.
        parent_dir, fasta_name = os.path.split(fasta_path)
        stem = re.sub(r'(\.fasta$)|(\.\w{1,}$)', '', fasta_name)
        destination = os.path.join(parent_dir, stem + '-centroid.fasta')
        logging.info('No output path specified. Using "%s"', destination)

    # Refuse to overwrite anything that is already there.
    if os.path.exists(destination):
        message = 'File already exists at the output path "{}"! Specify a different output path.'.format(
            destination)
        logging.error(message)
        raise Exception(message)

    run_allele_reduction(
        fasta_path,
        destination,
        threads=cli_args.threads,
        word_size=cli_args.word_size,
        cluster_threshold=cli_args.cluster_threshold,
    )
Esempio n. 2
0
def main():
    """Command-line entry point: hierarchical clustering of cgMLST profiles.

    Reads a profiles file, collapses identical profiles, computes a Hamming
    distance matrix, builds a complete-linkage clustering and writes the flat
    clusters obtained at each threshold to a CSV file.

    Raises:
        Exception: if the input profiles file does not exist or something
            already exists at the output path.
        AssertionError: if the begin/end/step threshold arguments are out of
            range (see ``_check_threshold_args``).
    """
    parser = init_arg_parser()
    args = parser.parse_args()
    init_console_logger(args.verbose)

    input_profiles = args.input
    if not os.path.exists(input_profiles):
        err_msg = 'Input cgMLST profiles file does not exist at {}. cgMLST profiles file required!'.format(input_profiles)
        logging.error(err_msg)
        raise Exception(err_msg)

    output_path = args.output
    if os.path.exists(output_path):
        err_msg = 'File already exists at the output path "{}"! Specify a different output path.'.format(output_path)
        logging.error(err_msg)
        raise Exception(err_msg)

    begin_threshold = args.begin_threshold
    end_threshold = args.end_threshold
    step_threshold = args.step_threshold
    _check_threshold_args(begin_threshold, end_threshold, step_threshold)

    logging.info('Reading profiles from %s', input_profiles)
    profiles_matrix, genomes, markers = profiles_to_np_array(input_profiles)
    logging.info('Profiles matrix shape: %s', profiles_matrix.shape)
    logging.debug('Genomes: %s', genomes)
    logging.debug('Markers: %s', markers)

    logging.info('Finding non-redundant profiles. Grouping genomes by distinct profiles.')
    nr_profiles_matrix, genome_groups = nr_profiles(profiles_matrix, genomes)
    logging.info('Non-redundant profiles matrix shape: %s', nr_profiles_matrix.shape)
    logging.debug('Genome groups: %s', genome_groups)

    logging.info('Computing Hamming distance matrix from profiles')
    dm = dist_matrix_hamming(nr_profiles_matrix)
    logging.info('Complete linkage of Hamming distance matrix')
    hc = complete_linkage(dm)

    # The stop value is padded by one step so that end_threshold itself is
    # normally included.
    # NOTE(review): with a float step np.arange can emit one extra value just
    # past end_threshold because of rounding -- confirm cutree tolerates it.
    thresholds = np.arange(begin_threshold, end_threshold + step_threshold, step_threshold)
    logging.info('Generating flat clusters from %s to %s with step %s',
                 begin_threshold,
                 end_threshold,
                 step_threshold)
    logging.debug('Thresholds: %s', thresholds)
    df_clusters = cutree(hc, thresholds)
    logging.info('Flat clusters generated for non-redundant profiles. Expanding to all profiles.')
    logging.debug('df_clusters: %s', df_clusters)
    df_clusters = expand_clusters_dataframe(df_clusters, genome_groups)
    df_clusters.to_csv(output_path)
    logging.info('HC flat clusters written to "%s"', output_path)
    logging.info('Done!')


def _check_threshold_args(begin_threshold, end_threshold, step_threshold):
    """Validate the threshold range used to cut the clustering tree.

    The checks run in a fixed order and the first failing one is reported.
    ``AssertionError`` is raised explicitly (rather than via ``assert``) so
    the validation still runs under ``python -O`` while the exception type
    stays the same as before.

    Raises:
        AssertionError: if begin/end are outside [0, 1], begin is not less
            than end, or step is outside (0, 1].
    """
    checks = (
        (begin_threshold >= 0.0, "-b/--begin-threshold must be positive number!"),
        (begin_threshold <= 1.0, "-b/--begin-threshold must be less than or equal to 1.0"),
        (begin_threshold < end_threshold, "-b/--begin-threshold must be less than -e/--end-threshold"),
        (end_threshold >= 0.0, "-e/--end-threshold must be positive number!"),
        (end_threshold <= 1.0, "-e/--end-threshold must be less than or equal to 1.0"),
        (step_threshold <= 1.0, "-s/--step-threshold must be less than or equal to 1.0"),
        # A step of exactly 0 used to pass validation and then made
        # np.arange fail; it has to be strictly positive.
        (step_threshold > 0.0, "-s/--step-threshold must be greater than 0.0"),
    )
    for is_valid, message in checks:
        if not is_valid:
            raise AssertionError(message)
Esempio n. 3
0
def main():
    """Command-line entry point for sistr_cmd predictions.

    Collects the input FASTA paths and genome names from the command line,
    runs ``sistr_predict`` on every genome (serially when ``--threads`` is 1,
    otherwise in a ``multiprocessing.Pool``), then writes the prediction
    results to the requested output file or, when none is given, prints a
    JSON summary to stdout. Optional cgMLST profile, allele JSON and novel
    allele outputs are written when the corresponding arguments are set.

    Raises:
        Exception: if no FASTA file was specified, or the combination of
            FASTA arguments cannot be handled.
    """
    parser = init_parser()
    args = parser.parse_args()
    init_console_logger(args.verbose)
    logging.info('Running sistr_cmd %s', __version__)

    input_fastas = args.fastas
    paths_names = args.input_fasta_genome_name
    if len(input_fastas) == 0 and (paths_names is None
                                   or len(paths_names) == 0):
        raise Exception('No FASTA files specified!')
    if paths_names is None:
        genome_names = [genome_name_from_fasta_path(x) for x in input_fastas]
    else:
        if len(input_fastas) == 0 and len(paths_names) > 0:
            input_fastas = [x for x, y in paths_names]
            genome_names = [y for x, y in paths_names]
        elif len(input_fastas) > 0 and len(paths_names) > 0:
            # Explicitly named genomes come first, followed by the plain
            # FASTA paths whose names are derived from the file path.
            tmp = input_fastas
            input_fastas = [x for x, y in paths_names] + tmp
            genome_names = [y for x, y in paths_names
                            ] + [genome_name_from_fasta_path(x) for x in tmp]
        else:
            raise Exception(
                'Unhandled fasta input args: input_fastas="{}" | input_fasta_genome_name="{}"'
                .format(input_fastas, paths_names))

    tmp_dir = args.tmp_dir
    keep_tmp = args.keep_tmp
    output_format = args.output_format
    output_path = args.output_prediction

    n_threads = args.threads
    if n_threads == 1:
        logging.info('Serial single threaded run mode on %s genomes',
                     len(input_fastas))
        outputs = [
            sistr_predict(input_fasta, genome_name, tmp_dir, keep_tmp, args)
            for input_fasta, genome_name in zip(input_fastas, genome_names)
        ]
    else:
        from multiprocessing import Pool
        logging.info('Initializing thread pool with %s threads', n_threads)
        pool = Pool(processes=n_threads)
        try:
            logging.info('Running SISTR analysis asynchronously on %s genomes',
                         len(input_fastas))
            res = [
                pool.apply_async(
                    sistr_predict,
                    (input_fasta, genome_name, tmp_dir, keep_tmp, args))
                for input_fasta, genome_name in zip(input_fastas, genome_names)
            ]

            logging.info('Getting SISTR analysis results')
            outputs = [x.get() for x in res]
        finally:
            # Release the worker processes whether or not a task failed. On
            # the success path every result has already been collected, so
            # nothing is lost by terminating.
            pool.terminate()
            pool.join()

    prediction_outputs = [x for x, y in outputs]
    cgmlst_results = [y for x, y in outputs]

    if output_path:
        from sistr.src.writers import write
        logging.info('Writing results with %s verbosity', args.more_results)
        write(output_path,
              output_format,
              prediction_outputs,
              more_results=args.more_results)
    else:
        import json
        from sistr.src.writers import to_dict
        logging.warning(
            'No prediction results output file written! Writing results summary to stdout as JSON'
        )
        # Higher verbosity keeps more keys in the JSON summary.
        if args.more_results >= 2:
            exclude_keys_in_output = set()
        elif args.more_results == 1:
            exclude_keys_in_output = {'blast_results'}
        else:
            exclude_keys_in_output = {'blast_results', 'sseq'}
        outs = [
            to_dict(x, 0, exclude_keys=exclude_keys_in_output)
            for x in prediction_outputs
        ]
        print(json.dumps(outs))

    if args.cgmlst_profiles:
        write_cgmlst_profiles(genome_names, cgmlst_results,
                              args.cgmlst_profiles)
        logging.info('cgMLST allelic profiles written to %s',
                     args.cgmlst_profiles)
    if args.alleles_output:
        write_cgmlst_results_json(genome_names, cgmlst_results,
                                  args.alleles_output)
        logging.info(
            'JSON of allele data written to %s for %s cgMLST allele results',
            args.alleles_output, len(cgmlst_results))
    if args.novel_alleles:
        count = write_novel_alleles(cgmlst_results, args.novel_alleles)
        logging.info('Wrote %s alleles to %s', count, args.novel_alleles)
Esempio n. 4
0
def main():
    """Run sistr_cmd from the command line.

    Steps, in order:
      1. Parse arguments and set up console logging.
      2. Build the parallel lists of FASTA paths and genome names from
         ``args.fastas`` and ``args.input_fasta_genome_name``.
      3. Call ``sistr_predict`` for each genome, either serially
         (``--threads`` equal to 1) or through a ``multiprocessing.Pool``.
      4. Write the predictions to ``args.output_prediction`` or, if that is
         not set, print a JSON summary to stdout.
      5. Write the optional cgMLST profiles, allele JSON and novel alleles.

    Raises:
        Exception: if no FASTA file was specified, or the combination of
            FASTA arguments cannot be handled.
    """
    parser = init_parser()
    args = parser.parse_args()
    init_console_logger(args.verbose)
    logging.info('Running sistr_cmd %s', __version__)
    input_fastas = args.fastas
    paths_names = args.input_fasta_genome_name
    if len(input_fastas) == 0 and (paths_names is None or len(paths_names) == 0):
        raise Exception('No FASTA files specified!')
    if paths_names is None:
        genome_names = [genome_name_from_fasta_path(x) for x in input_fastas]
    else:
        if len(input_fastas) == 0 and len(paths_names) > 0:
            input_fastas = [x for x, y in paths_names]
            genome_names = [y for x, y in paths_names]
        elif len(input_fastas) > 0 and len(paths_names) > 0:
            # (path, name) pairs go first; remaining FASTA paths get a name
            # derived from the path.
            tmp = input_fastas
            input_fastas = [x for x, y in paths_names] + tmp
            genome_names = [y for x, y in paths_names] + [genome_name_from_fasta_path(x) for x in tmp]
        else:
            raise Exception('Unhandled fasta input args: input_fastas="{}" | input_fasta_genome_name="{}"'.format(
                input_fastas,
                paths_names))

    tmp_dir = args.tmp_dir
    keep_tmp = args.keep_tmp
    output_format = args.output_format
    output_path = args.output_prediction

    n_threads = args.threads
    jobs = list(zip(input_fastas, genome_names))
    if n_threads == 1:
        logging.info('Serial single threaded run mode on %s genomes', len(input_fastas))
        outputs = [sistr_predict(input_fasta, genome_name, tmp_dir, keep_tmp, args)
                   for input_fasta, genome_name in jobs]
    else:
        from multiprocessing import Pool
        logging.info('Initializing thread pool with %s threads', n_threads)
        pool = Pool(processes=n_threads)
        try:
            logging.info('Running SISTR analysis asynchronously on %s genomes', len(input_fastas))
            res = [pool.apply_async(sistr_predict, (input_fasta, genome_name, tmp_dir, keep_tmp, args))
                   for input_fasta, genome_name in jobs]

            logging.info('Getting SISTR analysis results')
            outputs = [x.get() for x in res]
        finally:
            # The pool was previously never shut down, leaving its worker
            # processes behind. All results are in hand on the success path,
            # so terminating here discards nothing.
            pool.terminate()
            pool.join()

    prediction_outputs = [x for x, y in outputs]
    cgmlst_results = [y for x, y in outputs]

    if output_path:
        from sistr.src.writers import write
        logging.info('Writing results with %s verbosity',
                     args.more_results)
        write(output_path, output_format, prediction_outputs, more_results=args.more_results)
    else:
        import json
        from sistr.src.writers import to_dict
        logging.warning('No prediction results output file written! Writing results summary to stdout as JSON')
        # Start from the most restrictive set and re-admit keys as the
        # requested verbosity increases.
        exclude_keys_in_output = {'blast_results', 'sseq'}
        if args.more_results >= 1:
            exclude_keys_in_output.discard('sseq')
        if args.more_results >= 2:
            exclude_keys_in_output.discard('blast_results')
        outs = [to_dict(x, 0, exclude_keys=exclude_keys_in_output) for x in prediction_outputs]
        print(json.dumps(outs))
    if args.cgmlst_profiles:
        write_cgmlst_profiles(genome_names, cgmlst_results, args.cgmlst_profiles)
        logging.info('cgMLST allelic profiles written to %s', args.cgmlst_profiles)
    if args.alleles_output:
        write_cgmlst_results_json(genome_names, cgmlst_results, args.alleles_output)
        logging.info('JSON of allele data written to %s for %s cgMLST allele results', args.alleles_output, len(cgmlst_results))
    if args.novel_alleles:
        count = write_novel_alleles(cgmlst_results, args.novel_alleles)
        logging.info('Wrote %s alleles to %s', count, args.novel_alleles)
Esempio n. 5
0
def main():
    """Command-line entry point: add genomes to the sistr_cmd data files.

    Validates the input FASTA files and the output directory, optionally
    loads and checks a user-supplied genome-to-serovar table, runs
    ``run_sistr`` on every genome (serially or in a ``multiprocessing.Pool``),
    then calls the writer helpers for the cgMLST FASTA, cgMLST profiles,
    serovar/subspecies tables and Mash sketches, and finally runs
    ``run_allele_reduction`` on the cgMLST FASTA.

    Validation failures that used to be ``assert`` statements are raised as
    ``AssertionError`` explicitly so they still fire under ``python -O``.

    Raises:
        AssertionError: if no FASTA was given, a FASTA or the serovar table
            does not exist, the output directory cannot be created, or the
            serovar table lacks the expected columns.
        Exception: if the output directory already exists and ``--force``
            was not given, or a genome is missing from the serovar table.
    """
    parser = init_parser()
    args = parser.parse_args()
    init_console_logger(args.verbose)
    logging.debug(args)
    input_fastas = args.fastas
    outdir = args.outdir
    tmp_dir = args.tmp_dir
    serovar_table_path = args.serovar_table
    threads = args.threads
    force = args.force

    if not len(input_fastas) > 0:
        raise AssertionError('No FASTA files specified!')
    for input_fasta in input_fastas:
        if not os.path.exists(input_fasta):
            raise AssertionError('Genome FASTA file does not exist at "{}"'.format(input_fasta))
    genome_names = [genome_name_from_fasta_path(x) for x in input_fastas]
    logging.info('You have specified %s genomes to add to current sistr_cmd data files! %s',
                 len(genome_names),
                 genome_names)

    if os.path.exists(outdir):
        if not force:
            raise Exception('Output directory already exists at {}!'.format(outdir))
        logging.warning('Using existing output directory at %s', outdir)
    try:
        os.makedirs(outdir)
    except OSError as err:
        # Expected when the directory already exists (the --force case); any
        # real failure is reported by the directory check just below.
        logging.debug('os.makedirs("%s") did not create a directory: %s', outdir, err)
    if not os.path.isdir(outdir):
        raise AssertionError('Output directory could not be created!')

    if serovar_table_path:
        if not os.path.exists(serovar_table_path):
            raise AssertionError('Provided serovar table path does not exist! {}'.format(
                serovar_table_path))
        logging.info('Parsing serovar table from "%s"', serovar_table_path)
        # The old pattern r'.*.csv$' had an unescaped dot and so also matched
        # names merely ending in "csv" without the extension separator.
        if serovar_table_path.endswith('.csv'):
            logging.info('Trying to read serovar table "%s" as CSV', serovar_table_path)
            df_serovar = pd.read_csv(serovar_table_path)
        else:
            logging.info('Trying to read serovar table "%s" as tab-delimited', serovar_table_path)
            df_serovar = pd.read_table(serovar_table_path)
        expected_columns = ['genome', 'serovar']
        # Require the expected columns to be present; extra columns (such as
        # the optional "subspecies" one checked below) are allowed. The old
        # check was inverted and rejected any table with additional columns.
        missing_columns = [col for col in expected_columns if col not in df_serovar.columns]
        if missing_columns:
            raise AssertionError('User serovar table did not contain expected columns {}'.format(
                expected_columns))
        if 'subspecies' not in df_serovar.columns:
            logging.warning(
                'User serovar table did not contain "subspecies" column so the sistr_cmd subspecies prediction will be used!')
        genome_names_series = pd.Series(genome_names)
        genomes_in_serovar_table = genome_names_series.isin(df_serovar.genome)
        if not np.all(genomes_in_serovar_table):
            missing_genomes = ', '.join(genome_names_series[~genomes_in_serovar_table])
            logging.error('The following genomes were not found in the serovar table: %s', missing_genomes)
            raise Exception('Not all user provided genome FASTA files in the provided serovar table!')

        df_wklm = pd.read_csv(SEROVAR_TABLE_PATH)
        logging.info('Checking for non-standard serovar designations')
        serovars_not_in_wklm = df_serovar.serovar[~df_serovar.serovar.isin(df_wklm.Serovar)]
        # Series.items / DataFrame.loc replace the removed iteritems / .ix.
        for row_idx, serovar in serovars_not_in_wklm.items():
            logging.warning('Non-standard serovar %s at row %s for genome %s!', serovar, row_idx,
                            df_serovar.loc[row_idx, 'genome'])
    else:
        logging.warning('No genome to serovar table specified! Using SISTR serovar predictions')
        df_serovar = None

    if threads == 1:
        logging.info('Serial single threaded run mode on %s genomes', len(input_fastas))
        outputs = [run_sistr(input_fasta, tmp_dir) for input_fasta in input_fastas]
    else:
        from multiprocessing import Pool

        logging.info('Initializing thread pool with %s threads', threads)
        pool = Pool(processes=threads)
        try:
            logging.info('Running SISTR analysis asynchronously on %s genomes', len(input_fastas))
            res = [pool.apply_async(run_sistr, (input_fasta, tmp_dir)) for input_fasta in input_fastas]

            logging.info('Getting SISTR analysis results')
            outputs = [x.get() for x in res]
        finally:
            # Shut the workers down so they do not outlive the analysis; all
            # results have been collected by now on the success path.
            pool.terminate()
            pool.join()
    # collect results from sistr analysis
    prediction_outputs = [x for x, y in outputs]
    cgmlst_results = [y for x, y in outputs]
    # create some output dirs
    data_outdir = create_subdirs(outdir, 'data')
    cgmlst_outdir = create_subdirs(outdir, 'data', 'cgmlst')
    sketch_outdir = create_subdirs(outdir, 'mash-sketches')
    # write files with new and old data
    cgmlst_fasta = write_cgmlst_fasta(cgmlst_outdir, cgmlst_results)
    write_cgmlst_profiles_csv(cgmlst_outdir, cgmlst_results, genome_names)
    write_serovar_and_spp_tables(data_outdir, df_serovar, prediction_outputs, genome_names)
    create_merge_mash_sketches(input_fastas, data_outdir, sketch_outdir)

    centroid_alleles_path = os.path.join(cgmlst_outdir, 'cgmlst-centroid.fasta')
    run_allele_reduction(cgmlst_fasta, centroid_alleles_path, threads=threads)

    logging.info('Done!')
Esempio n. 6
0
def main():
    """Add user genomes to the sistr_cmd data files (CLI entry point).

    Flow:
      1. Parse arguments, set up logging, check that every input FASTA
         exists and that the output directory is usable.
      2. If a serovar table was supplied, read it (CSV when the path ends in
         ``.csv``, tab-delimited otherwise), check its columns, check that
         every input genome is listed, and warn about serovars that are not
         in the table at ``SEROVAR_TABLE_PATH``.
      3. Run ``run_sistr`` per genome, serially or via a
         ``multiprocessing.Pool``.
      4. Call the writer helpers and ``run_allele_reduction``.

    Checks that were previously ``assert`` statements now raise
    ``AssertionError`` explicitly, so they are not skipped under
    ``python -O`` and the exception type is unchanged.

    Raises:
        AssertionError: on missing inputs, an output directory that cannot
            be created, or a serovar table without the expected columns.
        Exception: if the output directory exists without ``--force``, or a
            genome is missing from the serovar table.
    """
    parser = init_parser()
    args = parser.parse_args()
    init_console_logger(args.verbose)
    logging.debug(args)
    input_fastas = args.fastas
    outdir = args.outdir
    tmp_dir = args.tmp_dir
    serovar_table_path = args.serovar_table
    threads = args.threads
    force = args.force

    if not len(input_fastas) > 0:
        raise AssertionError('No FASTA files specified!')
    for input_fasta in input_fastas:
        if not os.path.exists(input_fasta):
            raise AssertionError(
                'Genome FASTA file does not exist at "{}"'.format(
                    input_fasta))
    genome_names = [genome_name_from_fasta_path(x) for x in input_fastas]
    logging.info(
        'You have specified %s genomes to add to current sistr_cmd data files! %s',
        len(genome_names), genome_names)

    if os.path.exists(outdir):
        if not force:
            raise Exception(
                'Output directory already exists at {}!'.format(outdir))
        logging.warning('Using existing output directory at %s', outdir)
    try:
        os.makedirs(outdir)
    except OSError as err:
        # Reusing an existing directory lands here; genuine failures are
        # reported by the isdir check that follows instead of being hidden
        # by a bare ``except``.
        logging.debug('os.makedirs("%s") did not create a directory: %s',
                      outdir, err)
    if not os.path.isdir(outdir):
        raise AssertionError('Output directory could not be created!')

    if serovar_table_path:
        if not os.path.exists(serovar_table_path):
            raise AssertionError(
                'Provided serovar table path does not exist! {}'.format(
                    serovar_table_path))
        logging.info('Parsing serovar table from "%s"', serovar_table_path)
        # Escaped dot: the previous r'.*.csv$' treated the "." as a wildcard.
        if re.match(r'.*\.csv$', serovar_table_path):
            logging.info('Trying to read serovar table "%s" as CSV',
                         serovar_table_path)
            df_serovar = pd.read_csv(serovar_table_path)
        else:
            logging.info('Trying to read serovar table "%s" as tab-delimited',
                         serovar_table_path)
            df_serovar = pd.read_table(serovar_table_path)
        expected_columns = ['genome', 'serovar']
        # Every expected column must be present in the table. The earlier
        # test ran the other way round (every table column had to be
        # expected), which rejected tables carrying the optional
        # "subspecies" column handled right below.
        has_expected_columns = pd.Series(expected_columns).isin(
            df_serovar.columns)
        if not np.all(has_expected_columns):
            raise AssertionError(
                'User serovar table did not contain expected columns {}'.format(
                    expected_columns))
        if 'subspecies' not in df_serovar.columns:
            logging.warning(
                'User serovar table did not contain "subspecies" column so the sistr_cmd subspecies prediction will be used!'
            )
        genome_names_series = pd.Series(genome_names)
        genomes_in_serovar_table = genome_names_series.isin(df_serovar.genome)
        if not np.all(genomes_in_serovar_table):
            missing_genomes = ', '.join(
                genome_names_series[~genomes_in_serovar_table])
            logging.error(
                'The following genomes were not found in the serovar table: %s',
                missing_genomes)
            raise Exception(
                'Not all user provided genome FASTA files in the provided serovar table!'
            )

        df_wklm = pd.read_csv(SEROVAR_TABLE_PATH)
        logging.info('Checking for non-standard serovar designations')
        is_standard = df_serovar.serovar.isin(df_wklm.Serovar)
        serovars_not_in_wklm = df_serovar.serovar[~is_standard]
        # ``Series.iteritems`` and ``DataFrame.ix`` no longer exist in
        # current pandas; ``items`` and label-based ``loc`` are the
        # equivalents.
        for row_idx, serovar in serovars_not_in_wklm.items():
            logging.warning('Non-standard serovar %s at row %s for genome %s!',
                            serovar, row_idx, df_serovar.loc[row_idx, 'genome'])
    else:
        logging.warning(
            'No genome to serovar table specified! Using SISTR serovar predictions'
        )
        df_serovar = None

    if threads == 1:
        logging.info('Serial single threaded run mode on %s genomes',
                     len(input_fastas))
        outputs = [
            run_sistr(input_fasta, tmp_dir) for input_fasta in input_fastas
        ]
    else:
        from multiprocessing import Pool

        logging.info('Initializing thread pool with %s threads', threads)
        pool = Pool(processes=threads)
        try:
            logging.info('Running SISTR analysis asynchronously on %s genomes',
                         len(input_fastas))
            res = [
                pool.apply_async(run_sistr, (input_fasta, tmp_dir))
                for input_fasta in input_fastas
            ]

            logging.info('Getting SISTR analysis results')
            outputs = [x.get() for x in res]
        finally:
            # Always release the worker processes; on success every result
            # has already been fetched, so terminating discards nothing.
            pool.terminate()
            pool.join()
    # collect results from sistr analysis
    prediction_outputs = [x for x, y in outputs]
    cgmlst_results = [y for x, y in outputs]
    # create some output dirs
    data_outdir = create_subdirs(outdir, 'data')
    cgmlst_outdir = create_subdirs(outdir, 'data', 'cgmlst')
    sketch_outdir = create_subdirs(outdir, 'mash-sketches')
    # write files with new and old data
    cgmlst_fasta = write_cgmlst_fasta(cgmlst_outdir, cgmlst_results)
    write_cgmlst_profiles_csv(cgmlst_outdir, cgmlst_results, genome_names)
    write_serovar_and_spp_tables(data_outdir, df_serovar, prediction_outputs,
                                 genome_names)
    create_merge_mash_sketches(input_fastas, data_outdir, sketch_outdir)

    centroid_alleles_path = os.path.join(cgmlst_outdir,
                                         'cgmlst-centroid.fasta')
    run_allele_reduction(cgmlst_fasta, centroid_alleles_path, threads=threads)

    logging.info('Done!')