Example #1
def main():
    program_name = 'seq_typing.py'

    if sys.version_info[0] < 3:
        sys.exit('Must be using Python 3. Try calling "python3 {}"'.format(
            program_name))

    parser, _, _, _, _ = python_arguments(program_name, __version__)
    args = parser.parse_args()

    start_time = time.time()

    args.outdir = os.path.abspath(args.outdir)
    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)

    # Start logger
    logfile, time_str = utils.start_logger(args.outdir)

    script_path = utils.general_information(script_name=program_name,
                                            logfile=logfile,
                                            version=__version__,
                                            outdir=args.outdir,
                                            time_str=time_str)
    del script_path
    print('\n')

    folders_2_remove = []

    # Create modules pickles folder
    pickles_folder = os.path.join(args.outdir, 'pickles', '')
    if not os.path.isdir(pickles_folder):
        os.makedirs(pickles_folder)
    folders_2_remove.append(pickles_folder)

    # Run functions
    folders_2_remove_func, references_results, reference, references_headers = args.func(
        args)
    folders_2_remove.extend(folders_2_remove_func)

    # Parse results
    _, _, _, _, _ = parse_results.parse_results(
        references_results, reference, references_headers, args.outdir,
        args.minGeneCoverage, args.minDepthCoverage, args.typeSeparator)

    if not args.debug:
        for folder in folders_2_remove:
            utils.removeDirectory(folder)

    _ = utils.runTime(start_time)
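
The call to args.func(args) above relies on argparse sub-command dispatch: python_arguments() registers a handler on each subparser with set_defaults(func=...), and main() simply invokes whatever handler the chosen subcommand installed. A minimal, self-contained sketch of that pattern (the 'reads' subcommand and run_reads handler here are hypothetical, not the actual seq_typing code):

import argparse


def run_reads(args):
    # Hypothetical handler; the real seq_typing handlers return
    # (folders_2_remove, references_results, reference, references_headers).
    print('typing reads in', args.fastq)


def build_parser():
    parser = argparse.ArgumentParser(prog='seq_typing.py')
    subparsers = parser.add_subparsers(title='Subcommands')
    parser_reads = subparsers.add_parser('reads', help='Type from fastq reads')
    parser_reads.add_argument('-f', '--fastq', nargs='+', required=True)
    parser_reads.set_defaults(func=run_reads)  # this is what args.func resolves to
    return parser


if __name__ == '__main__':
    cli = build_parser()
    cli_args = cli.parse_args(['reads', '--fastq', 'sample_1.fq.gz'])
    cli_args.func(cli_args)  # dispatches to run_reads
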
Example #2
def main():
    parser = argparse.ArgumentParser(prog='patho_typing.py',
                                     description='In silico pathogenic typing directly from raw Illumina reads',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version', help='Version information', action='version',
                        version='{prog} v{version}'.format(prog=parser.prog, version=__version__))

    parser_required = parser.add_argument_group('Required options')
    parser_required.add_argument('-f', '--fastq', nargs='+', action=utils.required_length((1, 2), '--fastq'),
                                 type=argparse.FileType('r'), metavar=('/path/to/input/file.fq.gz'),
                                 help='Path to single OR paired-end fastq files. If two files are passed, they will be'
                                      ' assumed to be the paired fastq files', required=True)
    parser_required.add_argument('-s', '--species', nargs=2, type=str, metavar=('Yersinia', 'enterocolitica'),
                                 help='Species name', required=True)

    parser_optional_general = parser.add_argument_group('General facultative options')
    parser_optional_general.add_argument('-o', '--outdir', type=str, metavar='/path/to/output/directory/',
                                         help='Path to the directory where the information will be stored',
                                         required=False, default='.')
    parser_optional_general.add_argument('-j', '--threads', type=int, metavar='N', help='Number of threads to use',
                                         required=False, default=1)
    parser_optional_general.add_argument('--trueCoverage', action='store_true',
                                         help='Assess true coverage before continuing typing')
    parser_optional_general.add_argument('--noCheckPoint', action='store_true',
                                         help='Ignore the true coverage checking point')
    parser_optional_general.add_argument('--minGeneCoverage', type=int, metavar='N',
                                         help='Minimum typing percentage of target reference gene sequence covered to'
                                              ' consider a gene to be present (value between [0, 100])', required=False)
    parser_optional_general.add_argument('--minGeneIdentity', type=int, metavar='N',
                                         help='Minimum typing percentage of identity of reference gene sequence covered'
                                              ' to consider a gene to be present (value between [0, 100]). One INDEL'
                                              ' will be considered as one difference', required=False)
    parser_optional_general.add_argument('--minGeneDepth', type=int, metavar='N',
                                         help='Minimum typing gene average coverage depth of present positions to'
                                              ' consider a gene to be present (default is 1/3 of average sample'
                                              ' coverage or 15x)', required=False)
    parser_optional_general.add_argument('--doNotRemoveConsensus', action='store_true',
                                         help='Do not remove ReMatCh consensus sequences')
    parser_optional_general.add_argument('--debug', action='store_true',
                                         help='DeBug Mode: do not remove temporary files')

    args = parser.parse_args()

    if args.minGeneCoverage is not None and (args.minGeneCoverage < 0 or args.minGeneCoverage > 100):
        parser.error('--minGeneCoverage should be a value between [0, 100]')
    if args.minGeneIdentity is not None and (args.minGeneIdentity < 0 or args.minGeneIdentity > 100):
        parser.error('--minGeneIdentity should be a value between [0, 100]')

    start_time = time.time()

    args.outdir = os.path.abspath(args.outdir)
    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)

    # Start logger
    logfile, time_str = utils.start_logger(args.outdir)

    script_path = utils.general_information(logfile, __version__, args.outdir, time_str)
    print('\n')

    rematch = include_rematch_dependencies_path()

    args.fastq = [fastq.name for fastq in args.fastq]

    reference_file, trueCoverage_file, trueCoverage_sequences, trueCoverage_headers, trueCoverage_config, typing_file, \
    typing_sequences, typing_headers, typing_rules, typing_config = \
        set_reference(args.species, args.outdir, script_path, args.trueCoverage)
    original_reference_file = str(reference_file)

    confirm_genes_fasta_rules(typing_headers, typing_rules)

    run_successfully, bam_file = mapping_reads(args.fastq, reference_file, args.threads, args.outdir, False, 1)
    if run_successfully:
        rematch_dir = os.path.join(args.outdir, 'rematch', '')
        if not os.path.isdir(rematch_dir):
            os.makedirs(rematch_dir)

        if args.trueCoverage:
            if trueCoverage_file is not None:
                trueCoverage_dir = os.path.join(rematch_dir, 'trueCoverage', '')
                if not os.path.isdir(trueCoverage_dir):
                    os.makedirs(trueCoverage_dir)

                print('\n')
                run_successfully, trueCoverage_bam = split_bam(bam_file, trueCoverage_headers, trueCoverage_dir,
                                                               args.threads)
                if run_successfully:
                    run_successfully = indexAlignment(trueCoverage_bam)
                    if run_successfully:
                        reference_file = os.path.join(trueCoverage_dir, 'reference.fasta')
                        write_sequeces(reference_file, trueCoverage_sequences)
                        index_fasta_samtools(reference_file, None, None, True)
                        config = parse_config(trueCoverage_config)
                        runtime, run_successfully, sample_data_general, data_by_gene = \
                            run_rematch.run_rematch(rematch, trueCoverage_dir, reference_file, trueCoverage_bam,
                                                    args.threads, config['length_extra_seq'],
                                                    config['minimum_depth_presence'], config['minimum_depth_call'],
                                                    config['minimum_depth_frequency_dominant_allele'],
                                                    config['minimum_gene_coverage'], config['minimum_gene_identity'],
                                                    args.debug, args.doNotRemoveConsensus)

                        if run_successfully and sample_data_general['mean_sample_coverage'] is not None and \
                                sample_data_general['number_absent_genes'] is not None and \
                                sample_data_general['number_genes_multiple_alleles'] is not None:
                            if args.minGeneDepth is None:
                                args.minGeneDepth = sample_data_general['mean_sample_coverage'] / 3 if \
                                                    sample_data_general['mean_sample_coverage'] / 3 > 15 else \
                                                    15

                            exit_info = []
                            if sample_data_general['mean_sample_coverage'] < config['minimum_read_coverage']:
                                exit_info.append('Sample coverage ({mean}) lower than the minimum'
                                                 ' required ({minimum})'
                                                 ''.format(mean=sample_data_general['mean_sample_coverage'],
                                                           minimum=config['minimum_read_coverage']))
                            if sample_data_general['number_absent_genes'] > config['maximum_number_absent_genes']:
                                exit_info.append('Number of absent genes ({number}) higher than the'
                                                 ' maximum allowed ({maximum})'
                                                 ''.format(number=sample_data_general['number_absent_genes'],
                                                           maximum=config['maximum_number_absent_genes']))
                            if sample_data_general['number_genes_multiple_alleles'] > \
                                    config['maximum_number_genes_multiple_alleles']:
                                exit_info.append('Number of genes with multiple alleles'
                                                 ' ({number}) higher than the maximum'
                                                 ' allowed ({maximum})'
                                                 ''.format(number=sample_data_general['number_genes_multiple_alleles'],
                                                           maximum=config['maximum_number_genes_multiple_alleles']))

                            if len(exit_info) > 0:
                                print('\n' + '\n'.join(exit_info) + '\n')
                                e = 'TrueCoverage requirements not fulfilled'
                                print('\n' + e + '\n')
                                if not args.noCheckPoint:
                                    clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
                                    _ = utils.runTime(start_time)
                                    sys.exit(e)
                        else:
                            e = 'TrueCoverage module did not run successfully'
                            print('\n' + e + '\n')
                            if not args.noCheckPoint:
                                clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
                                _ = utils.runTime(start_time)
                                sys.exit(e)

                        print('\n')
                        typing_dir = os.path.join(rematch_dir, 'typing', '')
                        if not os.path.isdir(typing_dir):
                            os.makedirs(typing_dir)
                        run_successfully, bam_file = split_bam(bam_file, typing_headers, typing_dir, args.threads)
                        if run_successfully:
                            run_successfully = indexAlignment(bam_file)
                            if run_successfully:
                                reference_file = os.path.join(typing_dir, 'reference.fasta')
                                write_sequeces(reference_file, typing_sequences)
                                index_fasta_samtools(reference_file, None, None, True)
                                rematch_dir = str(typing_dir)
                if not run_successfully:
                    if args.noCheckPoint:
                        clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
                        _ = utils.runTime(start_time)
                        sys.exit('Something in the required TrueCoverage analysis went wrong')
            else:
                print('\n'
                      'WARNING: trueCoverage target files were not found. trueCoverage will not run.'
                      '\n')

        if run_successfully:
            config = parse_config(typing_config)
            if args.minGeneCoverage is not None:
                config['minimum_gene_coverage'] = args.minGeneCoverage
            if args.minGeneIdentity is not None:
                config['minimum_gene_identity'] = args.minGeneIdentity

            runtime, run_successfully, sample_data_general, data_by_gene = \
                run_rematch.run_rematch(rematch, rematch_dir, reference_file, bam_file, args.threads,
                                        config['length_extra_seq'], config['minimum_depth_presence'],
                                        config['minimum_depth_call'], config['minimum_depth_frequency_dominant_allele'],
                                        config['minimum_gene_coverage'], config['minimum_gene_identity'],
                                        args.debug, args.doNotRemoveConsensus)
            if run_successfully and data_by_gene is not None:
                if args.minGeneDepth is None:
                    args.minGeneDepth = sample_data_general['mean_sample_coverage'] / 3 if \
                                        sample_data_general['mean_sample_coverage'] / 3 > 15 else \
                                        15

                _, _, _ = typing.typing(data_by_gene, typing_rules, config['minimum_gene_coverage'],
                                        config['minimum_gene_identity'], args.minGeneDepth, args.outdir)
            else:
                clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
                _ = utils.runTime(start_time)
                sys.exit('ReMatCh run for pathotyping did not run successfully')
        else:
            clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
            _ = utils.runTime(start_time)
            sys.exit('Something did not run successfully')

    clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)

    print('\n')
    _ = utils.runTime(start_time)
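
utils.required_length((1, 2), '--fastq') above is a custom argparse action that only accepts one or two fastq files for --fastq. A hedged sketch of how such an action can be built (the real patho_typing helper may differ in its details):

import argparse


def required_length(allowed_lengths, option_name):
    # Factory returning an argparse.Action that rejects an unexpected
    # number of values for the given option.
    class RequiredLength(argparse.Action):
        def __call__(self, parser, namespace, values, option_string=None):
            if len(values) not in allowed_lengths:
                parser.error('{option} requires {allowed} argument(s)'.format(
                    option=option_name,
                    allowed=' or '.join(map(str, allowed_lengths))))
            setattr(namespace, self.dest, values)
    return RequiredLength


parser = argparse.ArgumentParser()
parser.add_argument('-f', '--fastq', nargs='+',
                    action=required_length((1, 2), '--fastq'))
print(parser.parse_args(['-f', 'reads_1.fq.gz', 'reads_2.fq.gz']).fastq)
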
Example #3
def runRematch(args):
    workdir = os.path.abspath(args.workdir)
    if not os.path.isdir(workdir):
        os.makedirs(workdir)

    asperaKey = os.path.abspath(
        args.asperaKey.name) if args.asperaKey is not None else None

    # Start logger
    logfile, time_str = utils.start_logger(workdir)

    # Get general information
    utils.general_information(logfile, version, workdir, time_str,
                              args.doNotUseProvidedSoftware, asperaKey,
                              args.downloadCramBam)

    # Set listIDs
    listIDs, searched_fastq_files = getListIDs(
        workdir, args.listIDs.name if args.listIDs is not None else None,
        args.taxon)

    # Run ReMatCh for each sample
    print('\n' + 'STARTING ReMatCh' + '\n')

    # Clean sequences headers
    reference_file, gene_list_reference = clean_headers_reference_file(
        os.path.abspath(args.reference.name), workdir, args.extraSeq)

    if len(gene_list_reference) == 0:
        sys.exit('No sequences left')

    # To use in combined report

    number_samples_successfully = 0
    for sample in listIDs:
        sample_start_time = time.time()
        print('\n\n' + 'Sample ID: ' + sample)

        # Create sample outdir
        sample_outdir = os.path.join(workdir, sample, '')
        if not os.path.isdir(sample_outdir):
            os.mkdir(sample_outdir)

        run_successfully_fastq = None
        time_taken_fastq = 0
        sequencingInformation = {
            'run_accession': None,
            'instrument_platform': None,
            'instrument_model': None,
            'library_layout': None,
            'library_source': None,
            'extra_run_accession': None,
            'date_download': None
        }
        if not searched_fastq_files:
            # Download Files
            time_taken_fastq, run_successfully_fastq, fastq_files, sequencingInformation = download.runDownload(
                sample, args.downloadLibrariesType, asperaKey, sample_outdir,
                args.downloadCramBam, args.threads,
                args.downloadInstrumentPlatform)
        else:
            fastq_files = listIDs[sample]

        fileSize = None

        run_successfully_rematch_first = None
        run_successfully_rematch_second = None
        time_taken_rematch_first = 0
        time_taken_rematch_second = 0
        if run_successfully_fastq is not False:
            fileSize = sum(os.path.getsize(fastq) for fastq in fastq_files)
            # Run ReMatCh
            time_taken_rematch_first, run_successfully_rematch_first, data_by_gene, sample_data_general_first, consensus_files = rematch_module.runRematchModule(
                sample, fastq_files, reference_file, args.threads,
                sample_outdir, args.extraSeq, args.minCovPresence,
                args.minCovCall, args.minFrequencyDominantAllele,
                args.minGeneCoverage, args.conservedSeq, args.debug,
                args.numMapLoc, args.minGeneIdentity)
            if run_successfully_rematch_first:
                write_data_by_gene(gene_list_reference, args.minGeneCoverage,
                                   sample, data_by_gene, workdir, time_str,
                                   'first_run', args.minGeneIdentity)
                if args.doubleRun:
                    rematch_second_outdir = os.path.join(
                        sample_outdir, 'rematch_second_run', '')
                    if not os.path.isdir(rematch_second_outdir):
                        os.mkdir(rematch_second_outdir)
                    consensus_concatenated_fasta, consensus_concatenated_gene_list = concatenate_extraSeq_2_consensus(
                        consensus_files['noMatter'], reference_file,
                        args.extraSeq, rematch_second_outdir)
                    if len(consensus_concatenated_gene_list) > 0:
                        time_taken_rematch_second, run_successfully_rematch_second, data_by_gene, sample_data_general_second, consensus_files = rematch_module.runRematchModule(
                            sample, fastq_files, consensus_concatenated_fasta,
                            args.threads, rematch_second_outdir, args.extraSeq,
                            args.minCovPresence, args.minCovCall,
                            args.minFrequencyDominantAllele,
                            args.minGeneCoverage, args.conservedSeq,
                            args.debug, args.numMapLoc, args.minGeneIdentity)
                        if not args.debug:
                            os.remove(consensus_concatenated_fasta)
                        if run_successfully_rematch_second:
                            write_data_by_gene(gene_list_reference,
                                               args.minGeneCoverage, sample,
                                               data_by_gene, workdir, time_str,
                                               'second_run',
                                               args.minGeneIdentity)
                    else:
                        print('No sequences left after ReMatCh module first run. Second run will not be performed')

        if not searched_fastq_files and not args.keepDownloadedFastq and fastq_files is not None:
            for fastq in fastq_files:
                if os.path.isfile(fastq):
                    os.remove(fastq)

        time_taken = utils.runTime(sample_start_time)

        write_sample_report(
            sample, workdir, time_str, fileSize, run_successfully_fastq,
            run_successfully_rematch_first, run_successfully_rematch_second,
            time_taken_fastq, time_taken_rematch_first,
            time_taken_rematch_second, time_taken, sequencingInformation,
            sample_data_general_first if run_successfully_rematch_first else {
                'number_absent_genes': None,
                'number_genes_multiple_alleles': None,
                'mean_sample_coverage': None
            }, sample_data_general_second
            if run_successfully_rematch_second else {
                'number_absent_genes': None,
                'number_genes_multiple_alleles': None,
                'mean_sample_coverage': None
            }, fastq_files if fastq_files is not None else '')

        if all([
                run_successfully_fastq is not False,
                run_successfully_rematch_first is not False,
                run_successfully_rematch_second is not False
        ]):
            number_samples_successfully += 1

    return number_samples_successfully, len(listIDs)
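
The final all([... is not False ...]) check above implements three-state bookkeeping: each run_successfully_* flag is None when a step was not attempted, True when it succeeded and False when it failed, so a sample only counts as unsuccessful if some step explicitly returned False. A small illustration:

def sample_ok(*step_flags):
    # None (skipped) and True (succeeded) both pass; only an explicit False fails.
    return all(flag is not False for flag in step_flags)


print(sample_ok(None, True, None))   # True: second run skipped, nothing failed
print(sample_ok(True, False, None))  # False: the first ReMatCh run failed
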
Example #4
def run_rematch(args):
    workdir = os.path.abspath(args.workdir)
    if not os.path.isdir(workdir):
        os.makedirs(workdir)

    aspera_key = os.path.abspath(args.asperaKey.name) if args.asperaKey is not None else None

    # Start logger
    logfile, time_str = utils.start_logger(workdir)

    # Get general information
    script_path = utils.general_information(logfile, __version__, workdir, time_str, args.doNotUseProvidedSoftware,
                                            aspera_key, args.downloadCramBam, args.SRA, args.SRAopt)

    # Set list_ids
    list_ids, searched_fastq_files = get_list_ids(workdir, args.listIDs.name if args.listIDs is not None else None,
                                                  args.taxon)

    mlst_sequences = None
    mlst_dicts = None
    if args.mlst is not None:
        time_taken_pub_mlst, mlst_dicts, mlst_sequences = check_mlst.download_pub_mlst_xml(args.mlst,
                                                                                           args.mlstSchemaNumber,
                                                                                           workdir)
        args.softClip_recodeRun = 'first'

    if args.reference is None:
        if args.mlst is not None:
            reference_file = check_mlst.check_existing_schema(args.mlst, args.mlstSchemaNumber, script_path)
            args.extraSeq = 200
            if reference_file is None:
                print('Provided MLST scheme sequences were not found for ' + args.mlst)
                print('Trying to obtain reference MLST sequences from PubMLST')
                if len(mlst_sequences) > 0:
                    reference_file = check_mlst.write_mlst_reference(args.mlst, mlst_sequences, workdir, time_str)
                    args.extraSeq = 0
                else:
                    sys.exit('It was not possible to download MLST sequences from PubMLST!')
            else:
                print('Using provided scheme as reference: ' + reference_file)
        else:
            sys.exit('Need to provide at least one of the following options: "--reference" and "--mlst"')
    else:
        reference_file = os.path.abspath(args.reference.name)

    # Run ReMatCh for each sample
    print('\n' + 'STARTING ReMatCh' + '\n')

    # Clean sequences headers
    reference_file, gene_list_reference, reference_dict = clean_headers_reference_file(reference_file, workdir,
                                                                                       args.extraSeq)

    if args.mlst is not None:
        problem_genes = False
        for header in mlst_sequences:
            if header not in gene_list_reference:
                print('MLST gene {header} not found among reference sequences'.format(header=header))
                problem_genes = True
        if problem_genes:
            sys.exit('Missing MLST genes from reference sequences (at least sequence names do not match)!')

    if len(gene_list_reference) == 0:
        sys.exit('No sequences left')

    # To use in combined report

    number_samples_successfully = 0
    genes_present_coverage_depth = {}
    genes_present_sequence_coverage = {}
    for sample in list_ids:
        sample_start_time = time.time()
        print('\n\n' + 'Sample ID: ' + sample)

        # Create sample outdir
        sample_outdir = os.path.join(workdir, sample, '')
        if not os.path.isdir(sample_outdir):
            os.mkdir(sample_outdir)

        run_successfully_fastq = None
        time_taken_fastq = 0
        sequencing_information = {'run_accession': None, 'instrument_platform': None, 'instrument_model': None,
                                  'library_layout': None, 'library_source': None, 'extra_run_accession': None,
                                  'nominal_length': None, 'read_count': None, 'base_count': None, 'date_download': None}
        if not searched_fastq_files:
            # Download Files
            time_taken_fastq, run_successfully_fastq, fastq_files, sequencing_information = \
                download.run_download(sample, args.downloadLibrariesType, aspera_key, sample_outdir,
                                      args.downloadCramBam, args.threads, args.downloadInstrumentPlatform, args.SRA,
                                      args.SRAopt)
        else:
            fastq_files = list_ids[sample]

        file_size = None

        run_successfully_rematch_first = None
        run_successfully_rematch_second = None
        time_taken_rematch_first = 0
        time_taken_rematch_second = 0
        sample_data_general_first = None
        sample_data_general_second = None
        if run_successfully_fastq is not False:
            file_size = sum(os.path.getsize(fastq) for fastq in fastq_files)
            # Run ReMatCh
            time_taken_rematch_first, run_successfully_rematch_first, data_by_gene, sample_data_general_first, \
            consensus_files, consensus_sequences = \
                rematch_module.run_rematch_module(sample, fastq_files, reference_file, args.threads, sample_outdir,
                                                  args.extraSeq, args.minCovPresence, args.minCovCall,
                                                  args.minFrequencyDominantAllele, args.minGeneCoverage,
                                                  args.debug, args.numMapLoc, args.minGeneIdentity,
                                                  'first', args.softClip_baseQuality, args.softClip_recodeRun,
                                                  reference_dict, args.softClip_cigarFlagRecode,
                                                  args.bowtieAlgo, args.bowtieOPT,
                                                  gene_list_reference, args.notWriteConsensus, clean_run=True)
            if run_successfully_rematch_first:
                if args.mlst is not None and (args.mlstRun == 'first' or args.mlstRun == 'all'):
                    run_get_st(sample, mlst_dicts, consensus_sequences, args.mlstConsensus, 'first', workdir, time_str)
                genes_present_coverage_depth = write_data_by_gene(gene_list_reference, args.minGeneCoverage, sample,
                                                                  data_by_gene, workdir, time_str, 'first_run',
                                                                  args.minGeneIdentity, 'coverage_depth', args.summary,
                                                                  genes_present_coverage_depth)
                if args.reportSequenceCoverage:
                    genes_present_sequence_coverage = write_data_by_gene(gene_list_reference, args.minGeneCoverage,
                                                                         sample, data_by_gene, workdir, time_str,
                                                                         'first_run', args.minGeneIdentity,
                                                                         'sequence_coverage', args.summary,
                                                                         genes_present_sequence_coverage)
                if args.doubleRun:
                    rematch_second_outdir = os.path.join(sample_outdir, 'rematch_second_run', '')
                    if not os.path.isdir(rematch_second_outdir):
                        os.mkdir(rematch_second_outdir)
                    consensus_concatenated_fasta, consensus_concatenated_gene_list, consensus_concatenated_dict, \
                    number_consensus_with_sequences = \
                        concatenate_extra_seq_2_consensus(consensus_files['noMatter'], reference_file, args.extraSeq,
                                                          rematch_second_outdir)
                    if len(consensus_concatenated_gene_list) > 0:
                        if args.mlst is None or \
                                (args.mlst is not None and number_consensus_with_sequences == len(gene_list_reference)):
                            time_taken_rematch_second, run_successfully_rematch_second, data_by_gene, \
                            sample_data_general_second, consensus_files, consensus_sequences = \
                                rematch_module.run_rematch_module(sample, fastq_files, consensus_concatenated_fasta,
                                                                  args.threads, rematch_second_outdir, args.extraSeq,
                                                                  args.minCovPresence, args.minCovCall,
                                                                  args.minFrequencyDominantAllele, args.minGeneCoverage,
                                                                  args.debug, args.numMapLoc,
                                                                  args.minGeneIdentity, 'second',
                                                                  args.softClip_baseQuality, args.softClip_recodeRun,
                                                                  consensus_concatenated_dict,
                                                                  args.softClip_cigarFlagRecode,
                                                                  args.bowtieAlgo, args.bowtieOPT,
                                                                  gene_list_reference, args.notWriteConsensus,
                                                                  clean_run=True)
                            if not args.debug:
                                os.remove(consensus_concatenated_fasta)
                            if run_successfully_rematch_second:
                                if args.mlst is not None and (args.mlstRun == 'second' or args.mlstRun == 'all'):
                                    run_get_st(sample, mlst_dicts, consensus_sequences, args.mlstConsensus, 'second',
                                               workdir, time_str)
                                _ = write_data_by_gene(gene_list_reference, args.minGeneCoverage, sample, data_by_gene,
                                                       workdir, time_str, 'second_run', args.minGeneIdentity,
                                                       'coverage_depth', False, {})
                                if args.reportSequenceCoverage:
                                    _ = write_data_by_gene(gene_list_reference, args.minGeneCoverage, sample,
                                                           data_by_gene, workdir, time_str, 'second_run',
                                                           args.minGeneIdentity, 'sequence_coverage', False, {})
                        else:
                            print('Some sequences missing after ReMatCh module first run. Second run will not be'
                                  ' performed')
                            if os.path.isfile(consensus_concatenated_fasta):
                                os.remove(consensus_concatenated_fasta)
                            if os.path.isdir(rematch_second_outdir):
                                utils.remove_directory(rematch_second_outdir)
                    else:
                        print('No sequences left after ReMatCh module first run. Second run will not be performed')
                        if os.path.isfile(consensus_concatenated_fasta):
                            os.remove(consensus_concatenated_fasta)
                        if os.path.isdir(rematch_second_outdir):
                            utils.remove_directory(rematch_second_outdir)

        if not searched_fastq_files and not args.keepDownloadedFastq and fastq_files is not None:
            for fastq in fastq_files:
                if os.path.isfile(fastq):
                    os.remove(fastq)

        time_taken = utils.run_time(sample_start_time)

        write_sample_report(sample, workdir, time_str, file_size, run_successfully_fastq,
                            run_successfully_rematch_first, run_successfully_rematch_second, time_taken_fastq,
                            time_taken_rematch_first, time_taken_rematch_second, time_taken, sequencing_information,
                            sample_data_general_first if run_successfully_rematch_first else
                            {'number_absent_genes': None, 'number_genes_multiple_alleles': None,
                             'mean_sample_coverage': None},
                            sample_data_general_second if run_successfully_rematch_second else
                            {'number_absent_genes': None, 'number_genes_multiple_alleles': None,
                             'mean_sample_coverage': None},
                            fastq_files if fastq_files is not None else '')

        if all([run_successfully_fastq is not False,
                run_successfully_rematch_first is not False,
                run_successfully_rematch_second is not False]):
            number_samples_successfully += 1

    if args.summary:
        write_summary_report(workdir, 'coverage_depth', time_str, gene_list_reference, genes_present_coverage_depth)
        if args.reportSequenceCoverage:
            write_summary_report(workdir, 'sequence_coverage', time_str, gene_list_reference,
                                 genes_present_sequence_coverage)

    return number_samples_successfully, len(list_ids)
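
The --minGeneDepth default computed in Examples #2 and #4 (one third of the mean sample coverage, but never below 15x) is just a maximum written as a conditional expression; an equivalent one-liner:

def default_min_gene_depth(mean_sample_coverage):
    # Same result as: mean/3 if mean/3 > 15 else 15
    return max(mean_sample_coverage / 3, 15)


print(default_min_gene_depth(30))  # 15 (30 / 3 = 10, below the 15x floor)
print(default_min_gene_depth(90))  # 30.0
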
Example #5
def main():
    program_name = 'ecoli_stx_subtyping.py'

    if sys.version_info[0] < 3:
        sys.exit('Must be using Python 3. Try calling "python3 {}"'.format(
            program_name))

    parser, parser_reads, _, parser_assembly, _ = python_arguments(
        program_name=program_name, version=version)
    parser.description = 'Gets E. coli stx subtypes'

    # Add specific arguments
    parser_reads.add_argument(
        '--stx2covered',
        type=float,
        metavar='N',
        help='Minimal percentage of sequence covered to consider extra stx2'
        ' subtypes (value between [0, 100]) (default: 100)',
        required=False,
        default=100)
    parser_reads.add_argument(
        '--stx2identity',
        type=float,
        metavar='N',
        help='Minimal sequence identity to consider extra stx2'
        ' subtypes (value between [0, 100]) (default: 99.5)',
        required=False,
        default=99.5)

    parser_assembly.add_argument(
        '--stx2covered',
        type=float,
        metavar='N',
        help='Minimal percentage of sequence covered to consider extra stx2'
        ' subtypes (value between [0, 100]) (default: 100)',
        required=False,
        default=100)
    parser_assembly.add_argument(
        '--stx2identity',
        type=float,
        metavar='N',
        help='Minimal sequence identity to consider extra stx2'
        ' subtypes (value between [0, 100]) (default: 99.5)',
        required=False,
        default=99.5)

    args = parser.parse_args()

    msg = []
    if args.minGeneCoverage < 0 or args.minGeneCoverage > 100:
        msg.append('--minGeneCoverage should be a value between [0, 100]')
    if args.minGeneIdentity < 0 or args.minGeneIdentity > 100:
        msg.append('--minGeneIdentity should be a value between [0, 100]')
    if args.stx2covered < 0 or args.stx2covered > 100:
        msg.append('--stx2covered should be a value between [0, 100]')
    if args.stx2identity < 0 or args.stx2identity > 100:
        msg.append('--stx2identity should be a value between [0, 100]')
    if args.org != ['stx', 'subtyping']:
        msg.append('Use "--org stx subtyping" with {}'.format(program_name))

    if len(msg) > 0:
        argparse.ArgumentParser(prog='{} options'.format(program_name)).error(
            '\n'.join(msg))

    start_time = time.time()

    args.outdir = os.path.abspath(args.outdir)
    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)

    # Start logger
    logfile, time_str = utils.start_logger(args.outdir)

    _ = utils.general_information(script_name=program_name,
                                  logfile=logfile,
                                  version=version,
                                  outdir=args.outdir,
                                  time_str=time_str)
    print('\n')

    folders_2_remove = []

    # Create modules pickles folder
    pickles_folder = os.path.join(args.outdir, 'pickles', '')
    if not os.path.isdir(pickles_folder):
        os.makedirs(pickles_folder)
    folders_2_remove.append(pickles_folder)

    # Run functions
    folders_2_remove_func, references_results, reference, references_headers = args.func(
        args)
    folders_2_remove.extend(folders_2_remove_func)

    # Parse results
    _, _, _, _, _ = parse_results.parse_results(
        references_results, reference, references_headers, args.outdir,
        args.minGeneCoverage, args.minDepthCoverage, args.typeSeparator)

    stx1_result, stx2_result = stx_subtype_parser(
        os.path.join(args.outdir, 'seq_typing.report_types.tab'), [
            ref_file for ref_file in reference
            if 'stx1' in os.path.basename(ref_file).lower()
        ][0], [
            ref_file for ref_file in reference
            if 'stx2' in os.path.basename(ref_file).lower()
        ][0], args.stx2covered, args.stx2identity)

    # Rename the file to keep ecoli_stx_subtyping stamp
    if os.path.isfile(os.path.join(args.outdir,
                                   'seq_typing.report_types.tab')):
        os.rename(
            os.path.join(args.outdir, 'seq_typing.report_types.tab'),
            os.path.join(args.outdir,
                         'seq_typing.ecoli_stx_subtyping.report_types.tab'))

    # Remove the file to only keep the ecoli_stx_subtyping one
    if os.path.isfile(os.path.join(args.outdir, 'seq_typing.report.txt')):
        os.remove(os.path.join(args.outdir, 'seq_typing.report.txt'))

    print('\n'
          'E. coli stx_subtyping - {stx1_result}:{stx2_result}\n'
          '\n'.format(stx1_result=stx1_result, stx2_result=stx2_result))
    with open(os.path.join(args.outdir, 'seq_typing.ecoli_stx_subtyping.txt'),
              'wt') as writer:
        writer.write(':'.join([stx1_result, stx2_result]))

    if not args.debug:
        for folder in folders_2_remove:
            utils.removeDirectory(folder)

    _ = utils.runTime(start_time)
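
The stx_subtype_parser call above picks the stx1 and stx2 reference files by searching for 'stx1' and 'stx2' in the basenames of the reference list returned by args.func(args); the first match wins, and an IndexError would be raised if no file matches. A small illustration with hypothetical file names:

import os

reference = ['/db/1_stx1_subtypes.fasta', '/db/2_stx2_subtypes.fasta']
stx1_reference = [f for f in reference if 'stx1' in os.path.basename(f).lower()][0]
stx2_reference = [f for f in reference if 'stx2' in os.path.basename(f).lower()][0]
print(stx1_reference, stx2_reference)
# The final report is then written as 'stx1_result:stx2_result',
# e.g. 'stx1a:stx2c'.
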