Example #1
def main():
    program_name = 'seq_typing.py'

    if sys.version_info[0] < 3:
        sys.exit('Must be using Python 3. Try calling "python3 {}"'.format(
            program_name))

    parser, _, _, _, _ = python_arguments(program_name, __version__)
    args = parser.parse_args()

    start_time = time.time()

    args.outdir = os.path.abspath(args.outdir)
    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)

    # Start logger
    logfile, time_str = utils.start_logger(args.outdir)

    script_path = utils.general_information(script_name=program_name,
                                            logfile=logfile,
                                            version=__version__,
                                            outdir=args.outdir,
                                            time_str=time_str)
    del script_path
    print('\n')

    folders_2_remove = []

    # Create modules pickles folder
    pickles_folder = os.path.join(args.outdir, 'pickles', '')
    if not os.path.isdir(pickles_folder):
        os.makedirs(pickles_folder)
    folders_2_remove.append(pickles_folder)

    # Run functions
    folders_2_remove_func, references_results, reference, references_headers = args.func(
        args)
    folders_2_remove.extend(folders_2_remove_func)

    # Parse results
    _, _, _, _, _ = parse_results.parse_results(
        references_results, reference, references_headers, args.outdir,
        args.minGeneCoverage, args.minDepthCoverage, args.typeSeparator)

    if not args.debug:
        for folder in folders_2_remove:
            utils.removeDirectory(folder)

    _ = utils.runTime(start_time)
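
Example #1 opens with two idioms worth noting: a guard that aborts under Python 2 before any Python-3-only code runs, and normalisation plus creation of the output directory. Below is a minimal, self-contained sketch of that startup pattern; the helper ensure_outdir is a hypothetical name, not part of seq_typing.

import os
import sys


def ensure_outdir(path):
    """Return the absolute output directory, creating it when missing."""
    path = os.path.abspath(path)
    os.makedirs(path, exist_ok=True)  # exist_ok avoids a separate isdir() pre-check
    return path


if sys.version_info[0] < 3:
    sys.exit('Must be using Python 3.')

outdir = ensure_outdir('./results')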
Example #2
def main():
    version = '3.1'
    args = utils.parseArguments(version)

    general_start_time = time.time()
    time_str = time.strftime("%Y%m%d-%H%M%S")

    # Check if output directory exists
    outdir = os.path.abspath(os.path.join(args.outdir, ''))
    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    # Start logger
    if not args.noLog:
        sys.stdout = utils.Logger(outdir, time_str)

    print '\n' + '==========> INNUca.py <=========='
    print '\n' + 'Program start: ' + time.ctime()

    # Tells where the logfile will be stored
    if not args.noLog:
        print '\n' + 'LOGFILE:'
        print sys.stdout.getLogFile()

    # Print command
    print '\n' + 'COMMAND:'
    script_path = os.path.abspath(sys.argv[0])
    print sys.executable + ' ' + script_path + ' ' + ' '.join(sys.argv[1:])

    # Print the directory where the program was launched
    print '\n' + 'PRESENT DIRECTORY:'
    print os.getcwd()

    # Print program version
    print '\n' + 'VERSION INNUca.py:'
    utils.scriptVersionGit(version, os.getcwd(), script_path, args.noGitInfo)

    # Get CPU information
    utils.get_cpu_information(outdir, time_str)

    # Get trueCoverage_ReMatCh settings
    trueCoverage_config = get_trueCoverage_config(args.skipTrueCoverage, args.trueConfigFile.name if args.trueConfigFile is not None else None, args.speciesExpected, script_path)

    # Check programs
    programs_version_dictionary = {}
    programs_version_dictionary['gunzip'] = ['--version', '>=', '1.6']

    # Check Java first, since the Java-dependent programs are checked next
    if not (args.skipFastQC and args.skipTrimmomatic and (args.skipPilon or args.skipSPAdes)):
        # programs_version_dictionary['java'] = ['-version', '>=', '1.8']
        programs_version_dictionary['java'] = [None, '>=', '1.8']  # For OpenJDK compatibility
    missingPrograms, programs_version_dictionary = utils.checkPrograms(programs_version_dictionary)
    if len(missingPrograms) > 0:
        sys.exit('\n' + 'Errors:' + '\n' + '\n'.join(missingPrograms))

    if not args.skipTrueCoverage or trueCoverage_config is not None:
        include_rematch_dependencies_path(args.doNotUseProvidedSoftware)
        programs_version_dictionary['rematch.py'] = ['--version', '>=', '3.2']
        programs_version_dictionary['bcftools'] = ['--version', '==', '1.3.1']
    if not (args.skipTrueCoverage and ((args.skipAssemblyMapping and args.skipPilon) or args.skipSPAdes)):
        programs_version_dictionary['bowtie2'] = ['--version', '>=', '2.2.9']
        programs_version_dictionary['samtools'] = ['--version', '==', '1.3.1']
    if not args.skipFastQC:
        programs_version_dictionary['fastqc'] = ['--version', '==', '0.11.5']
    if not args.skipTrimmomatic:
        programs_version_dictionary['trimmomatic-0.36.jar'] = ['-version', '==', '0.36']
    if args.runPear:
        programs_version_dictionary['pear'] = ['--version', '>=', '0.9.10']
    if not args.skipSPAdes:
        programs_version_dictionary['spades.py'] = ['--version', '>=', '3.9.0']
    if not (args.skipPilon or args.skipSPAdes):
        programs_version_dictionary['pilon-1.18.jar'] = ['--version', '==', '1.18']
    if not (args.skipMLST or args.skipSPAdes):
        programs_version_dictionary['mlst'] = ['--version', '>=', '2.4']

    # Set and print PATH variable
    utils.setPATHvariable(args, script_path)

    missingPrograms, programs_version_dictionary = utils.checkPrograms(programs_version_dictionary)
    if len(missingPrograms) > 0:
        sys.exit('\n' + 'Errors:' + '\n' + '\n'.join(missingPrograms))

    # .jar paths
    jar_path_trimmomatic = None
    if not args.skipTrimmomatic:
        jar_path_trimmomatic = programs_version_dictionary['trimmomatic-0.36.jar'][3]

    jar_path_pilon = None
    if not args.skipPilon and not args.skipSPAdes:
        jar_path_pilon = programs_version_dictionary['pilon-1.18.jar'][3]

    rematch_script = None
    # ReMatCh path
    if not args.skipTrueCoverage:
        rematch_script = programs_version_dictionary['rematch.py'][3]

    # pairEnd_filesSeparation_list = args.pairEnd_filesSeparation
    pairEnd_filesSeparation_list = None
    samples, inputDirectory, removeCreatedSamplesDirectories, indir_same_outdir = get_samples(args.inputDirectory, args.fastq, outdir, pairEnd_filesSeparation_list)

    # Start running the analysis
    print '\n' + 'RUNNING INNUca.py'

    # Prepare run report file
    samples_report_path = os.path.join(outdir, 'samples_report.' + time_str + '.tab')
    utils.start_sample_report_file(samples_report_path)

    number_samples_successfully = 0
    number_samples_pass = 0
    number_samples_warning = 0

    # Get MLST scheme to use
    scheme = 'unknown'
    species_genus, mlst_scheme_genus = None, None
    if not args.skipMLST and not args.skipSPAdes:
        scheme, species_genus, mlst_scheme_genus = mlst.getScheme(args.speciesExpected)
        # Print path to blastn
        mlst.getBlastPath()

    # Memory
    available_memory_GB = utils.get_free_memory() / (1024.0 ** 2)
    # Determine SPAdes maximum memory
    spadesMaxMemory = None
    if not args.skipSPAdes:
        print ''
        spadesMaxMemory = spades.define_memory(args.spadesMaxMemory, args.threads, available_memory_GB)
    # Determine .jar maximum memory
    jarMaxMemory = 'off'
    if not (args.skipTrimmomatic and (args.skipSPAdes or args.skipPilon)):
        print ''
        jarMaxMemory = utils.define_jar_max_memory(args.jarMaxMemory, args.threads, available_memory_GB)

    # Run INNUca for each sample
    sample_report_json = {}
    for sample in samples:
        sample_start_time = time.time()

        print '\n' + 'Sample: ' + sample + '\n'

        # Create sample outdir
        sample_outdir = os.path.abspath(os.path.join(outdir, sample, ''))
        if not os.path.isdir(sample_outdir):
            os.makedirs(sample_outdir)

        # Get fastq files
        fastq_files = utils.searchFastqFiles(os.path.join(inputDirectory, sample, ''), pairEnd_filesSeparation_list, False)
        if len(fastq_files) == 1:
            print 'Only one fastq file was found: ' + str(fastq_files)
            print 'Paired-end sequencing is required. Moving to the next sample'
            continue
        elif len(fastq_files) == 0:
            print 'No compressed fastq files were found. Continuing to the next sample'
            continue

        print 'The following files will be used:'
        print str(fastq_files) + '\n'

        # Run INNUca.py analysis
        run_successfully, pass_qc, run_report = run_INNUca(sample, sample_outdir, fastq_files, args, script_path, scheme, spadesMaxMemory, jar_path_trimmomatic, jar_path_pilon, jarMaxMemory, trueCoverage_config, rematch_script, species_genus, mlst_scheme_genus)

        # Save sample fail report
        utils.write_fail_report(os.path.join(sample_outdir, 'fail_report.txt'), run_report)

        # Save warning report
        write_warning_report(os.path.join(sample_outdir, 'warning_report.txt'), run_report)

        # Get raw reads files size
        fileSize = sum(os.path.getsize(fastq) for fastq in fastq_files)

        # Remove sample directory if it was created during the process
        if removeCreatedSamplesDirectories and not indir_same_outdir:
            utils.removeDirectory(os.path.join(inputDirectory, sample, ''))

        print 'END ' + sample + ' analysis'
        time_taken = utils.runTime(sample_start_time)

        # Save run report
        warning, json_pass_qc = utils.write_sample_report(samples_report_path, sample, run_successfully, pass_qc, time_taken, fileSize, run_report)

        # Save runs statistics
        if run_successfully:
            number_samples_successfully += 1
        if pass_qc:
            if warning:
                number_samples_warning += 1
            else:
                number_samples_pass += 1

        sample_report_json[sample] = {'run_successfully': run_successfully, 'pass_qc': json_pass_qc, 'modules_run_report': run_report}

    # Save combine_samples_reports
    combine_reports.combine_reports(outdir, outdir, args.json, time_str, len(samples))

    # Save sample_report in json
    if args.json:
        import json
        with open(os.path.join(outdir, 'samples_report.' + time_str + '.json'), 'wt') as writer:
            json.dump(sample_report_json, writer)

    # Remove the temporary folder with symlinks to the fastq files when --fastq was used
    if args.inputDirectory is None and args.fastq is not None:
        utils.removeDirectory(os.path.join(inputDirectory, ''))

    # Run report
    print '\n' + 'END INNUca.py'
    print '\n' + 'Pipeline problems: {not_run_successfully} samples'.format(not_run_successfully=(len(samples) - number_samples_successfully))
    print '\n' + 'FAIL: {number_samples_fail} samples'.format(number_samples_fail=(len(samples) - number_samples_pass - number_samples_warning))
    print '\n' + 'WARNING: {number_samples_warning} samples'.format(number_samples_warning=number_samples_warning)
    print '\n' + 'PASS: {number_samples_pass} samples'.format(number_samples_pass=number_samples_pass)
    time_taken = utils.runTime(general_start_time)
    del time_taken

    # Check whether INNUca.py ran at least one sample successfully
    if number_samples_successfully == 0:
        sys.exit('No samples run successfully!')
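
utils.checkPrograms is called with a dictionary mapping each executable to [version_flag, comparator, required_version] and returns the missing programs plus an augmented dictionary (index 3 later holds the resolved .jar path). Its real implementation is not shown here; the sketch below is one plausible way a single such check could work, with check_program and version_tuple as hypothetical names.

import re
import shutil
import subprocess


def version_tuple(text):
    """First dotted version number found in `text`, as a tuple of ints."""
    match = re.search(r'\d+(?:\.\d+)+', text)
    return tuple(int(p) for p in match.group().split('.')) if match else None


def check_program(name, flag, comparator, wanted):
    """Return an error string, or None if `name` satisfies the requirement."""
    path = shutil.which(name)
    if path is None:
        return '{0}: not found in PATH'.format(name)
    proc = subprocess.run([path, flag], capture_output=True, text=True)
    found = version_tuple(proc.stdout + proc.stderr)  # tools report on either stream
    if found is None:
        return '{0}: could not determine version'.format(name)
    ok = found >= version_tuple(wanted) if comparator == '>=' else \
        found == version_tuple(wanted)
    return None if ok else '{0}: version {1} {2} required'.format(name, comparator, wanted)


error = check_program('gunzip', '--version', '>=', '1.6')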
Example #3
def main():
    version = '3.1'
    args = utils.parseArguments(version)

    general_start_time = time.time()
    time_str = time.strftime("%Y%m%d-%H%M%S")

    # Check if output directory exists
    outdir = os.path.abspath(os.path.join(args.outdir, ''))
    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    # Start logger
    if not args.noLog:
        sys.stdout = utils.Logger(outdir, time_str)

    print '\n' + '==========> INNUca.py <=========='
    print '\n' + 'Program start: ' + time.ctime()

    # Tells where the logfile will be stored
    if not args.noLog:
        print '\n' + 'LOGFILE:'
        print sys.stdout.getLogFile()

    # Print command
    print '\n' + 'COMMAND:'
    script_path = os.path.abspath(sys.argv[0])
    print sys.executable + ' ' + script_path + ' ' + ' '.join(sys.argv[1:])

    # Print the directory where the program was launched
    print '\n' + 'PRESENT DIRECTORY:'
    print os.getcwd()

    # Print program version
    print '\n' + 'VERSION INNUca.py:'
    utils.scriptVersionGit(version, os.getcwd(), script_path, args.noGitInfo)

    # Get CPU information
    utils.get_cpu_information(outdir, time_str)

    # Get trueCoverage_ReMatCh settings
    trueCoverage_config = get_trueCoverage_config(
        args.skipTrueCoverage,
        args.trueConfigFile.name if args.trueConfigFile is not None else None,
        args.speciesExpected, script_path)

    # Check programs
    programs_version_dictionary = {}
    programs_version_dictionary['gunzip'] = ['--version', '>=', '1.6']

    # Check Java first, since the Java-dependent programs are checked next
    if not (args.skipFastQC and args.skipTrimmomatic and
            (args.skipPilon or args.skipSPAdes)):
        # programs_version_dictionary['java'] = ['-version', '>=', '1.8']
        programs_version_dictionary['java'] = [None, '>=', '1.8']  # For OpenJDK compatibility
    missingPrograms, programs_version_dictionary = utils.checkPrograms(
        programs_version_dictionary)
    if len(missingPrograms) > 0:
        sys.exit('\n' + 'Errors:' + '\n' + '\n'.join(missingPrograms))

    if not args.skipTrueCoverage or trueCoverage_config is not None:
        include_rematch_dependencies_path(args.doNotUseProvidedSoftware)
        programs_version_dictionary['rematch.py'] = ['--version', '>=', '3.2']
        programs_version_dictionary['bcftools'] = ['--version', '==', '1.3.1']
    if not (args.skipTrueCoverage and (
        (args.skipAssemblyMapping and args.skipPilon) or args.skipSPAdes)):
        programs_version_dictionary['bowtie2'] = ['--version', '>=', '2.2.9']
        programs_version_dictionary['samtools'] = ['--version', '==', '1.3.1']
    if not args.skipFastQC:
        programs_version_dictionary['fastqc'] = ['--version', '==', '0.11.5']
    if not args.skipTrimmomatic:
        programs_version_dictionary['trimmomatic-0.36.jar'] = [
            '-version', '==', '0.36'
        ]
    if args.runPear:
        programs_version_dictionary['pear'] = ['--version', '>=', '0.9.10']
    if not args.skipSPAdes:
        programs_version_dictionary['spades.py'] = ['--version', '>=', '3.9.0']
    if not (args.skipPilon or args.skipSPAdes):
        programs_version_dictionary['pilon-1.18.jar'] = [
            '--version', '==', '1.18'
        ]
    if not (args.skipMLST or args.skipSPAdes):
        programs_version_dictionary['mlst'] = ['--version', '>=', '2.4']

    # Set and print PATH variable
    utils.setPATHvariable(args, script_path)

    missingPrograms, programs_version_dictionary = utils.checkPrograms(
        programs_version_dictionary)
    if len(missingPrograms) > 0:
        sys.exit('\n' + 'Errors:' + '\n' + '\n'.join(missingPrograms))

    # .jar paths
    jar_path_trimmomatic = None
    if not args.skipTrimmomatic:
        jar_path_trimmomatic = programs_version_dictionary[
            'trimmomatic-0.36.jar'][3]

    jar_path_pilon = None
    if not args.skipPilon and not args.skipSPAdes:
        jar_path_pilon = programs_version_dictionary['pilon-1.18.jar'][3]

    rematch_script = None
    # ReMatCh path
    if not args.skipTrueCoverage:
        rematch_script = programs_version_dictionary['rematch.py'][3]

    # pairEnd_filesSeparation_list = args.pairEnd_filesSeparation
    pairEnd_filesSeparation_list = None
    samples, inputDirectory, removeCreatedSamplesDirectories, indir_same_outdir = get_samples(
        args.inputDirectory, args.fastq, outdir, pairEnd_filesSeparation_list)

    # Start running the analysis
    print '\n' + 'RUNNING INNUca.py'

    # Prepare run report file
    samples_report_path = os.path.join(outdir,
                                       'samples_report.' + time_str + '.tab')
    utils.start_sample_report_file(samples_report_path)

    number_samples_successfully = 0
    number_samples_pass = 0
    number_samples_warning = 0

    # Get MLST scheme to use
    scheme = 'unknown'
    species_genus, mlst_scheme_genus = None, None
    if not args.skipMLST and not args.skipSPAdes:
        scheme, species_genus, mlst_scheme_genus = mlst.getScheme(
            args.speciesExpected)
        # Print path to blastn
        mlst.getBlastPath()

    # Memory
    available_memory_GB = utils.get_free_memory() / (1024.0**2)
    # Determine SPAdes maximum memory
    spadesMaxMemory = None
    if not args.skipSPAdes:
        print ''
        spadesMaxMemory = spades.define_memory(args.spadesMaxMemory,
                                               args.threads,
                                               available_memory_GB)
    # Determine .jar maximum memory
    jarMaxMemory = 'off'
    if not (args.skipTrimmomatic and (args.skipSPAdes or args.skipPilon)):
        print ''
        jarMaxMemory = utils.define_jar_max_memory(args.jarMaxMemory,
                                                   args.threads,
                                                   available_memory_GB)

    # Run INNUca for each sample
    sample_report_json = {}
    for sample in samples:
        sample_start_time = time.time()

        print '\n' + 'Sample: ' + sample + '\n'

        # Create sample outdir
        sample_outdir = os.path.abspath(os.path.join(outdir, sample, ''))
        if not os.path.isdir(sample_outdir):
            os.makedirs(sample_outdir)

        # Get fastq files
        fastq_files = utils.searchFastqFiles(
            os.path.join(inputDirectory, sample, ''),
            pairEnd_filesSeparation_list, False)
        if len(fastq_files) == 1:
            print 'Only one fastq file was found: ' + str(fastq_files)
            print 'Paired-end sequencing is required. Moving to the next sample'
            continue
        elif len(fastq_files) == 0:
            print 'No compressed fastq files were found. Continuing to the next sample'
            continue

        print 'The following files will be used:'
        print str(fastq_files) + '\n'

        # Run INNUca.py analysis
        run_successfully, pass_qc, run_report = run_INNUca(
            sample, sample_outdir, fastq_files, args, script_path, scheme,
            spadesMaxMemory, jar_path_trimmomatic, jar_path_pilon,
            jarMaxMemory, trueCoverage_config, rematch_script, species_genus,
            mlst_scheme_genus)

        # Save sample fail report
        utils.write_fail_report(os.path.join(sample_outdir, 'fail_report.txt'),
                                run_report)

        # Save warning report
        write_warning_report(os.path.join(sample_outdir, 'warning_report.txt'),
                             run_report)

        # Get raw reads files size
        fileSize = sum(os.path.getsize(fastq) for fastq in fastq_files)

        # Remove sample directory if it was created during the process
        if removeCreatedSamplesDirectories and not indir_same_outdir:
            utils.removeDirectory(os.path.join(inputDirectory, sample, ''))

        print 'END ' + sample + ' analysis'
        time_taken = utils.runTime(sample_start_time)

        # Save run report
        warning, json_pass_qc = utils.write_sample_report(
            samples_report_path, sample, run_successfully, pass_qc, time_taken,
            fileSize, run_report)

        # Save runs statistics
        if run_successfully:
            number_samples_successfully += 1
        if pass_qc:
            if warning:
                number_samples_warning += 1
            else:
                number_samples_pass += 1

        sample_report_json[sample] = {
            'run_successfully': run_successfully,
            'pass_qc': json_pass_qc,
            'modules_run_report': run_report
        }

    # Save combine_samples_reports
    combine_reports.combine_reports(outdir, outdir, args.json, time_str,
                                    len(samples))

    # Save sample_report in json
    if args.json:
        import json
        with open(os.path.join(outdir, 'samples_report.' + time_str + '.json'),
                  'wt') as writer:
            json.dump(sample_report_json, writer)

    # Remove the temporary folder with symlinks to the fastq files when --fastq was used
    if args.inputDirectory is None and args.fastq is not None:
        utils.removeDirectory(os.path.join(inputDirectory, ''))

    # Run report
    print '\n' + 'END INNUca.py'
    print '\n' + 'Pipeline problems: {not_run_successfully} samples'.format(
        not_run_successfully=(len(samples) - number_samples_successfully))
    print '\n' + 'FAIL: {number_samples_fail} samples'.format(
        number_samples_fail=(len(samples) - number_samples_pass -
                             number_samples_warning))
    print '\n' + 'WARNING: {number_samples_warning} samples'.format(
        number_samples_warning=number_samples_warning)
    print '\n' + 'PASS: {number_samples_pass} samples'.format(
        number_samples_pass=number_samples_pass)
    time_taken = utils.runTime(general_start_time)
    del time_taken

    # Check whether INNUca.py ran at least one sample successfully
    if number_samples_successfully == 0:
        sys.exit('No samples run successfully!')
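
Both INNUca examples redirect sys.stdout to utils.Logger(outdir, time_str) so that every print statement is also written to a timestamped logfile, whose path getLogFile() later reports. INNUca's actual class is not reproduced here; the following is a guessed tee-style sketch of that behaviour.

import os
import sys
import time


class Logger(object):
    """Tee: duplicate everything sent to stdout into a timestamped logfile."""

    def __init__(self, outdir, time_str):
        self.terminal = sys.stdout
        self.logfile = os.path.join(outdir, 'run.' + time_str + '.log')

    def write(self, message):
        self.terminal.write(message)
        with open(self.logfile, 'a') as handle:
            handle.write(message)

    def flush(self):
        self.terminal.flush()

    def getLogFile(self):
        return self.logfile


sys.stdout = Logger('.', time.strftime("%Y%m%d-%H%M%S"))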
Example #4
def main():
    version = '2.0'
    args = utils.parseArguments(version)

    general_start_time = time.time()
    time_str = time.strftime("%Y%m%d-%H%M%S")

    # Check if output directory exists
    outdir = os.path.abspath(os.path.join(args.outdir, ''))
    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    # Start logger
    sys.stdout = utils.Logger(outdir, time_str)

    print '\n' + '==========> INNUca.py <=========='
    print '\n' + 'Program start: ' + time.ctime()

    # Tells where the logfile will be stored
    print '\n' + 'LOGFILE:'
    print sys.stdout.getLogFile()

    # Print command
    print '\n' + 'COMMAND:'
    script_path = os.path.abspath(sys.argv[0])
    print sys.executable + ' ' + script_path + ' ' + ' '.join(sys.argv[1:])

    # Print the directory where the program was launched
    print '\n' + 'PRESENT DIRECTORY:'
    print os.getcwd()

    # Print program version
    print '\n' + 'VERSION INNUca.py:'
    utils.scriptVersionGit(version, os.getcwd(), script_path)

    # Get CPU information
    utils.get_cpu_information(outdir, time_str)

    # Set and print PATH variable
    utils.setPATHvariable(args.doNotUseProvidedSoftware, script_path)

    # Check programs
    programs_version_dictionary = {}
    programs_version_dictionary['gunzip'] = ['--version', '>=', '1.6']
    if (not args.skipTrueCoverage
            or (not args.skipPilon and not args.skipSPAdes)):
        programs_version_dictionary['bowtie2'] = ['--version', '>=', '2.2.9']
        programs_version_dictionary['samtools'] = ['--version', '==', '1.3.1']
    if not (args.skipFastQC and args.skipTrimmomatic and
            (args.skipPilon or args.skipSPAdes)):
        programs_version_dictionary['java'] = ['-version', '>=', '1.8']
    if not args.skipFastQC:
        programs_version_dictionary['fastqc'] = ['--version', '==', '0.11.5']
    if not args.skipTrimmomatic:
        programs_version_dictionary['trimmomatic-0.36.jar'] = [
            '-version', '==', '0.36'
        ]
    if not args.skipSPAdes:
        programs_version_dictionary['spades.py'] = ['--version', '>=', '3.9.0']
    if not args.skipPilon and not args.skipSPAdes:
        programs_version_dictionary['pilon-1.18.jar'] = [
            '--version', '==', '1.18'
        ]
    if not args.skipMLST and not args.skipSPAdes:
        programs_version_dictionary['mlst'] = ['--version', '>=', '2.4']
    missingPrograms, programs_version_dictionary = utils.checkPrograms(
        programs_version_dictionary)
    if len(missingPrograms) > 0:
        sys.exit('\n' + 'Errors:' + '\n' + '\n'.join(missingPrograms))

    # .jar paths
    jar_path_trimmomatic = None
    if not args.skipTrimmomatic:
        jar_path_trimmomatic = programs_version_dictionary[
            'trimmomatic-0.36.jar'][3]

    jar_path_pilon = None
    if not args.skipPilon and not args.skipSPAdes:
        jar_path_pilon = programs_version_dictionary['pilon-1.18.jar'][3]

    # Check if input directory exists with fastq files and store samples name that have fastq files
    inputDirectory = os.path.abspath(os.path.join(args.inputDirectory, ''))
    # pairEnd_filesSeparation_list = args.pairEnd_filesSeparation
    pairEnd_filesSeparation_list = None
    print ''
    samples, removeCreatedSamplesDirectories, indir_same_outdir = utils.checkSetInputDirectory(
        inputDirectory, outdir, pairEnd_filesSeparation_list)

    # Start running the analysis
    print '\n' + 'RUNNING INNUca.py'

    # Prepare run report file
    samples_report_path = os.path.join(outdir,
                                       'samples_report.' + time_str + '.tab')
    utils.start_sample_report_file(samples_report_path)

    number_samples_successfully = 0
    number_samples_pass = 0

    # Get MLST scheme to use
    scheme = 'unknown'
    if not args.skipMLST and not args.skipSPAdes:
        scheme = mlst.getScheme(args.speciesExpected)

    # Get path to blastn
    mlst.getBlastPath()

    # Get trueCoverage_ReMatCh settings
    trueCoverage_config = None
    if not args.skipTrueCoverage:
        trueCoverage_reference = None
        trueCoverage_config_file = None
        trueCoverage_config = None

        if args.trueConfigFile is None:
            print 'No trueCoverage_ReMatCh config file was provided. Searching for default files'
            trueCoverage_config_file, trueCoverage_reference = trueCoverage.check_existing_default_config(
                args.speciesExpected, script_path)
        else:
            trueCoverage_config_file = args.trueConfigFile.name

        if trueCoverage_config_file is not None:
            trueCoverage_config = trueCoverage.parse_config(
                trueCoverage_config_file)
        if args.trueConfigFile is None and trueCoverage_config is not None:
            trueCoverage_config['reference_file'] = trueCoverage_reference

        if trueCoverage_config is not None:
            print 'The following trueCoverage_ReMatCh config file will be used: ' + trueCoverage_config_file
            print 'The following trueCoverage_ReMatCh reference file will be used: ' + trueCoverage_config[
                'reference_file'] + '\n'
        else:
            print 'No trueCoverage_ReMatCh config file was found'

    # Memory
    available_memory_GB = utils.get_free_memory() / (1024.0**2)
    # Determine SPAdes maximum memory
    spadesMaxMemory = None
    if not args.skipSPAdes:
        print ''
        spadesMaxMemory = spades.define_memory(args.spadesMaxMemory,
                                               args.threads,
                                               available_memory_GB)
    # Determine .jar maximum memory
    jarMaxMemory = 'off'
    if not (args.skipTrimmomatic and (args.skipSPAdes or args.skipPilon)):
        print ''
        jarMaxMemory = utils.define_jar_max_memory(args.jarMaxMemory,
                                                   args.threads,
                                                   available_memory_GB)

    # Run INNUca for each sample
    for sample in samples:
        sample_start_time = time.time()

        print '\n' + 'Sample: ' + sample + '\n'

        # Create sample outdir
        sample_outdir = os.path.abspath(os.path.join(outdir, sample, ''))
        if not os.path.isdir(sample_outdir):
            os.makedirs(sample_outdir)

        # Get fastq files
        fastq_files = utils.searchFastqFiles(
            os.path.join(inputDirectory, sample, ''),
            pairEnd_filesSeparation_list, False)
        if len(fastq_files) == 1:
            print 'Only one fastq file was found: ' + str(fastq_files)
            print 'Paired-end sequencing is required. Moving to the next sample'
            continue

        print 'The following files will be used:'
        print str(fastq_files) + '\n'

        # Run INNUca.py analysis
        run_successfully, pass_qc, run_report = run_INNUca(
            sample, sample_outdir, fastq_files, args, script_path, scheme,
            spadesMaxMemory, jar_path_trimmomatic, jar_path_pilon,
            jarMaxMemory, trueCoverage_config)

        # Save sample fail report
        fail_report_path = os.path.join(sample_outdir, 'fail_report.txt')
        utils.write_fail_report(fail_report_path, run_report)

        # Save runs statistics
        if run_successfully:
            number_samples_successfully += 1
        if pass_qc:
            number_samples_pass += 1

        # Get raw reads files size
        fileSize = sum(os.path.getsize(fastq) for fastq in fastq_files)

        # Remove sample directory if it was created during the process
        if removeCreatedSamplesDirectories and not indir_same_outdir:
            utils.removeDirectory(os.path.join(inputDirectory, sample, ''))

        print 'END ' + sample + ' analysis'
        time_taken = utils.runTime(sample_start_time)

        # Save run report
        utils.write_sample_report(samples_report_path, sample,
                                  run_successfully, pass_qc, time_taken,
                                  fileSize, run_report)

    # Run report
    print '\n' + 'END INNUca.py'
    print '\n' + str(number_samples_successfully) + ' samples out of ' + str(
        len(samples)) + ' run successfully'
    print '\n' + str(number_samples_pass) + ' samples out of ' + str(
        number_samples_successfully
    ) + ' (run successfully) PASS INNUca.py analysis'
    time_taken = utils.runTime(general_start_time)
    del time_taken

    # Check whether INNUca.py ran at least one sample successfully
    if number_samples_successfully == 0:
        sys.exit('No samples run successfully!')
Example #5
def main():
    parser = argparse.ArgumentParser(prog='patho_typing.py',
                                     description='In silico pathogenic typing directly from raw Illumina reads',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version', help='Version information', action='version',
                        version='{prog} v{version}'.format(prog=parser.prog, version=__version__))

    parser_required = parser.add_argument_group('Required options')
    parser_required.add_argument('-f', '--fastq', nargs='+', action=utils.required_length((1, 2), '--fastq'),
                                 type=argparse.FileType('r'), metavar=('/path/to/input/file.fq.gz'),
                                 help='Path to single OR paired-end fastq files. If two files are passed, they will be'
                                      ' assumed to be the paired fastq files', required=True)
    parser_required.add_argument('-s', '--species', nargs=2, type=str, metavar=('Yersinia', 'enterocolitica'),
                                 help='Species name', required=True)

    parser_optional_general = parser.add_argument_group('General facultative options')
    parser_optional_general.add_argument('-o', '--outdir', type=str, metavar='/path/to/output/directory/',
                                         help='Path to the directory where the information will be stored',
                                         required=False, default='.')
    parser_optional_general.add_argument('-j', '--threads', type=int, metavar='N', help='Number of threads to use',
                                         required=False, default=1)
    parser_optional_general.add_argument('--trueCoverage', action='store_true',
                                         help='Assess true coverage before continuing with typing')
    parser_optional_general.add_argument('--noCheckPoint', action='store_true',
                                         help='Ignore the true coverage checking point')
    parser_optional_general.add_argument('--minGeneCoverage', type=int, metavar='N',
                                         help='Minimum typing percentage of target reference gene sequence covered to'
                                              ' consider a gene to be present (value between [0, 100])', required=False)
    parser_optional_general.add_argument('--minGeneIdentity', type=int, metavar='N',
                                         help='Minimum typing percentage of identity of reference gene sequence covered'
                                              ' to consider a gene to be present (value between [0, 100]). One INDEL'
                                              ' will be considered as one difference', required=False)
    parser_optional_general.add_argument('--minGeneDepth', type=int, metavar='N',
                                         help='Minimum typing gene average coverage depth of present positions to'
                                              ' consider a gene to be present (default is 1/3 of average sample'
                                              ' coverage or 15x)', required=False)
    parser_optional_general.add_argument('--doNotRemoveConsensus', action='store_true',
                                         help='Do not remove ReMatCh consensus sequences')
    parser_optional_general.add_argument('--debug', action='store_true',
                                         help='DeBug Mode: do not remove temporary files')

    args = parser.parse_args()

    if args.minGeneCoverage is not None and (args.minGeneCoverage < 0 or args.minGeneCoverage > 100):
        parser.error('--minGeneCoverage should be a value between [0, 100]')
    if args.minGeneIdentity is not None and (args.minGeneIdentity < 0 or args.minGeneIdentity > 100):
        parser.error('--minGeneIdentity should be a value between [0, 100]')

    start_time = time.time()

    args.outdir = os.path.abspath(args.outdir)
    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)

    # Start logger
    logfile, time_str = utils.start_logger(args.outdir)

    script_path = utils.general_information(logfile, __version__, args.outdir, time_str)
    print('\n')

    rematch = include_rematch_dependencies_path()

    args.fastq = [fastq.name for fastq in args.fastq]

    reference_file, trueCoverage_file, trueCoverage_sequences, trueCoverage_headers, trueCoverage_config, typing_file, \
    typing_sequences, typing_headers, typing_rules, typing_config = \
        set_reference(args.species, args.outdir, script_path, args.trueCoverage)
    original_reference_file = str(reference_file)

    confirm_genes_fasta_rules(typing_headers, typing_rules)

    run_successfully, bam_file = mapping_reads(args.fastq, reference_file, args.threads, args.outdir, False, 1)
    if run_successfully:
        rematch_dir = os.path.join(args.outdir, 'rematch', '')
        if not os.path.isdir(rematch_dir):
            os.makedirs(rematch_dir)

        if args.trueCoverage:
            if trueCoverage_file is not None:
                trueCoverage_dir = os.path.join(rematch_dir, 'trueCoverage', '')
                if not os.path.isdir(trueCoverage_dir):
                    os.makedirs(trueCoverage_dir)

                print('\n')
                run_successfully, trueCoverage_bam = split_bam(bam_file, trueCoverage_headers, trueCoverage_dir,
                                                               args.threads)
                if run_successfully:
                    run_successfully = indexAlignment(trueCoverage_bam)
                    if run_successfully:
                        reference_file = os.path.join(trueCoverage_dir, 'reference.fasta')
                        write_sequeces(reference_file, trueCoverage_sequences)
                        index_fasta_samtools(reference_file, None, None, True)
                        config = parse_config(trueCoverage_config)
                        runtime, run_successfully, sample_data_general, data_by_gene = \
                            run_rematch.run_rematch(rematch, trueCoverage_dir, reference_file, trueCoverage_bam,
                                                    args.threads, config['length_extra_seq'],
                                                    config['minimum_depth_presence'], config['minimum_depth_call'],
                                                    config['minimum_depth_frequency_dominant_allele'],
                                                    config['minimum_gene_coverage'], config['minimum_gene_identity'],
                                                    args.debug, args.doNotRemoveConsensus)

                        if run_successfully and sample_data_general['mean_sample_coverage'] is not None and \
                                sample_data_general['number_absent_genes'] is not None and \
                                sample_data_general['number_genes_multiple_alleles'] is not None:
                            if args.minGeneDepth is None:
                                args.minGeneDepth = sample_data_general['mean_sample_coverage'] / 3 if \
                                                    sample_data_general['mean_sample_coverage'] / 3 > 15 else \
                                                    15

                            exit_info = []
                            if sample_data_general['mean_sample_coverage'] < config['minimum_read_coverage']:
                                exit_info.append('Sample coverage ({mean}) lower than the minimum'
                                                 ' required ({minimum})'
                                                 ''.format(mean=sample_data_general['mean_sample_coverage'],
                                                           minimum=config['minimum_read_coverage']))
                            if sample_data_general['number_absent_genes'] > config['maximum_number_absent_genes']:
                                exit_info.append('Number of absent genes ({number}) higher than the'
                                                 ' maximum allowed ({maximum})'
                                                 ''.format(number=sample_data_general['number_absent_genes'],
                                                           maximum=config['maximum_number_absent_genes']))
                            if sample_data_general['number_genes_multiple_alleles'] > \
                                    config['maximum_number_genes_multiple_alleles']:
                                exit_info.append('Number of genes with multiple alleles'
                                                 ' ({number}) higher than the maximum'
                                                 ' allowed ({maximum})'
                                                 ''.format(number=sample_data_general['number_genes_multiple_alleles'],
                                                           maximum=config['maximum_number_genes_multiple_alleles']))

                            if len(exit_info) > 0:
                                print('\n' + '\n'.join(exit_info) + '\n')
                                e = 'TrueCoverage requirements not fulfilled'
                                print('\n' + e + '\n')
                                if not args.noCheckPoint:
                                    clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
                                    _ = utils.runTime(start_time)
                                    sys.exit(e)
                        else:
                            e = 'TrueCoverage module did not run successfully'
                            print('\n' + e + '\n')
                            if not args.noCheckPoint:
                                clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
                                _ = utils.runTime(start_time)
                                sys.exit(e)

                        print('\n')
                        typing_dir = os.path.join(rematch_dir, 'typing', '')
                        if not os.path.isdir(typing_dir):
                            os.makedirs(typing_dir)
                        run_successfully, bam_file = split_bam(bam_file, typing_headers, typing_dir, args.threads)
                        if run_successfully:
                            run_successfully = indexAlignment(bam_file)
                            if run_successfully:
                                reference_file = os.path.join(typing_dir, 'reference.fasta')
                                write_sequeces(reference_file, typing_sequences)
                                index_fasta_samtools(reference_file, None, None, True)
                                rematch_dir = str(typing_dir)
                if not run_successfully:
                    if args.noCheckPoint:
                        clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
                        _ = utils.runTime(start_time)
                        sys.exit('Something in the required TrueCoverage analysis went wrong')
            else:
                print('\n'
                      'WARNING: trueCoverage target files were not found. trueCoverage will not run.'
                      '\n')

        if run_successfully:
            config = parse_config(typing_config)
            if args.minGeneCoverage is not None:
                config['minimum_gene_coverage'] = args.minGeneCoverage
            if args.minGeneIdentity is not None:
                config['minimum_gene_identity'] = args.minGeneIdentity

            runtime, run_successfully, sample_data_general, data_by_gene = \
                run_rematch.run_rematch(rematch, rematch_dir, reference_file, bam_file, args.threads,
                                        config['length_extra_seq'], config['minimum_depth_presence'],
                                        config['minimum_depth_call'], config['minimum_depth_frequency_dominant_allele'],
                                        config['minimum_gene_coverage'], config['minimum_gene_identity'],
                                        args.debug, args.doNotRemoveConsensus)
            if run_successfully and data_by_gene is not None:
                if args.minGeneDepth is None:
                    args.minGeneDepth = sample_data_general['mean_sample_coverage'] / 3 if \
                                        sample_data_general['mean_sample_coverage'] / 3 > 15 else \
                                        15

                _, _, _ = typing.typing(data_by_gene, typing_rules, config['minimum_gene_coverage'],
                                        config['minimum_gene_identity'], args.minGeneDepth, args.outdir)
            else:
                clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
                _ = utils.runTime(start_time)
                sys.exit('ReMatCh run for pathotyping did not run successfully')
        else:
            clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
            _ = utils.runTime(start_time)
            sys.exit('Something did not run successfully')

    clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)

    print('\n')
    _ = utils.runTime(start_time)
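
The default for --minGeneDepth appears twice in Example #5 as a conditional expression: one third of the mean sample coverage, but never below 15x. That rule collapses to a single max() call, as this small sketch shows (default_min_gene_depth is an illustrative name, not patho_typing's API):

def default_min_gene_depth(mean_sample_coverage):
    """Default depth cut-off: one third of the mean coverage, floored at 15x."""
    return max(mean_sample_coverage / 3, 15)


assert default_min_gene_depth(90) == 30  # high coverage: one third wins
assert default_min_gene_depth(30) == 15  # low coverage: the 15x floor wins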
Example #6
def main():
    parser = argparse.ArgumentParser(
        prog='rematch.py',
        description=
        'Reads mapping against target sequences, checking mapping and consensus sequences production',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version',
                        help='Version information',
                        action='version',
                        version=str('%(prog)s v' + version))

    parser_required = parser.add_argument_group('Required options')
    parser_required.add_argument(
        '-r',
        '--reference',
        type=argparse.FileType('r'),
        metavar='/path/to/reference_sequence.fasta',
        help='Fasta file containing reference sequences',
        required=True)

    parser_optional_general = parser.add_argument_group(
        'General facultative options')
    parser_optional_general.add_argument(
        '-w',
        '--workdir',
        type=str,
        metavar='/path/to/workdir/directory/',
        help=
        'Path to the directory where ReMatCh will run and produce the outputs with reads (ended with fastq.gz/fq.gz and, in case of PE data, pair-end direction coded as _R1_001 / _R2_001 or _1 / _2) already present (organized in sample folders) or to be downloaded',
        required=False,
        default='.')
    parser_optional_general.add_argument('-j',
                                         '--threads',
                                         type=int,
                                         metavar='N',
                                         help='Number of threads to use',
                                         required=False,
                                         default=1)
    parser_optional_general.add_argument(
        '--doNotUseProvidedSoftware',
        action='store_true',
        help=
        'Tells ReMatCh to not use Bowtie2, Samtools and Bcftools that are provided with it'
    )

    parser_optional_rematch = parser.add_argument_group(
        'ReMatCh module facultative options')
    parser_optional_rematch.add_argument(
        '--conservedSeq',
        action='store_true',
        help=
        'This option can be used with conserved sequences like MLST genes to speed up the analysis by aligning reads using the Bowtie2 sensitive algorithm'
    )
    parser_optional_rematch.add_argument(
        '--extraSeq',
        type=int,
        metavar='N',
        help=
        'Sequence length added to both ends of target sequences (useful to improve reads mapping to the target one) that will be trimmed in ReMatCh outputs',
        required=False,
        default=0)
    parser_optional_rematch.add_argument(
        '--minCovPresence',
        type=int,
        metavar='N',
        help=
        'Reference position minimum coverage depth to consider the position to be present in the sample',
        required=False,
        default=5)
    parser_optional_rematch.add_argument(
        '--minCovCall',
        type=int,
        metavar='N',
        help=
        'Reference position minimum coverage depth to perform a base call. Lower coverage will be coded as N',
        required=False,
        default=10)
    parser_optional_rematch.add_argument(
        '--minFrequencyDominantAllele',
        type=float,
        metavar='0.6',
        help=
        'Minimum relative frequency of the dominant allele coverage depth (value between [0, 1]). Positions with lower values will be considered as having multiple alleles (and will be coded as N)',
        required=False,
        default=0.6)
    parser_optional_rematch.add_argument(
        '--minGeneCoverage',
        type=int,
        metavar='N',
        help=
        'Minimum percentage of target reference gene sequence covered by --minCovPresence to consider a gene to be present (value between [0, 100])',
        required=False,
        default=80)
    parser_optional_rematch.add_argument(
        '--minGeneIdentity',
        type=int,
        metavar='N',
        help=
        'Minimum percentage of identity of reference gene sequence covered by --minCovCall to consider a gene to be present (value between [0, 100]). One INDEL will be considered as one difference',
        required=False,
        default=70)
    parser_optional_rematch.add_argument(
        '--numMapLoc',
        type=int,
        metavar='N',
        help=
        'Maximum number of locations to which a read can map (sometimes useful when mapping against similar sequences)',
        required=False,
        default=1)
    parser_optional_rematch.add_argument(
        '--doubleRun',
        action='store_true',
        help=
        'Tells ReMatCh to run a second time using as reference the noMatter consensus sequence produced in the first run. This will improve consensus sequence determination for sequences with high percentage of target reference gene sequence covered'
    )
    parser_optional_rematch.add_argument(
        '--debug',
        action='store_true',
        help='DeBug Mode: do not remove temporary files')

    parser_optional_download = parser.add_argument_group(
        'Download facultative options')
    parser_optional_download.add_argument(
        '-a',
        '--asperaKey',
        type=argparse.FileType('r'),
        metavar='/path/to/asperaweb_id_dsa.openssh',
        help=
        'Tells ReMatCh to download fastq files from ENA using Aspera Connect. With this option, the path to the private-key file asperaweb_id_dsa.openssh must be provided (normally found in ~/.aspera/connect/etc/asperaweb_id_dsa.openssh).',
        required=False)
    parser_optional_download.add_argument(
        '-k',
        '--keepDownloadedFastq',
        action='store_true',
        help='Tells ReMatCh to keep the fastq files downloaded')
    parser_optional_download.add_argument(
        '--downloadLibrariesType',
        type=str,
        metavar='PAIRED',
        help='Tells ReMatCh to download files with a specific library layout',
        choices=['PAIRED', 'SINGLE', 'BOTH'],
        required=False,
        default='BOTH')
    parser_optional_download.add_argument(
        '--downloadInstrumentPlatform',
        type=str,
        metavar='ILLUMINA',
        help='Tells ReMatCh to download files produced by a specific instrument platform',
        choices=['ILLUMINA', 'ALL'],
        required=False,
        default='ILLUMINA')
    parser_optional_download.add_argument(
        '--downloadCramBam',
        action='store_true',
        help=
        'Tells ReMatCh to also download cram/bam files and convert them to fastq files'
    )

    parser_optional_download_exclusive = parser.add_mutually_exclusive_group()
    parser_optional_download_exclusive.add_argument(
        '-l',
        '--listIDs',
        type=argparse.FileType('r'),
        metavar='/path/to/list_IDs.txt',
        help='Path to list containing the IDs to be downloaded (one per line)',
        required=False)
    parser_optional_download_exclusive.add_argument(
        '-t',
        '--taxon',
        type=str,
        metavar='"Streptococcus agalactiae"',
        help='Taxon name for which ReMatCh will download fastq files',
        required=False)

    args = parser.parse_args()

    if args.minFrequencyDominantAllele < 0 or args.minFrequencyDominantAllele > 1:
        parser.error(
            '--minFrequencyDominantAllele should be a value between [0, 1]')

    if args.minGeneCoverage < 0 or args.minGeneCoverage > 100:
        parser.error('--minGeneCoverage should be a value between [0, 100]')

    start_time = time.time()

    number_samples_successfully, samples_total_number = runRematch(args)

    print '\n' + 'END ReMatCh'
    print '\n' + str(number_samples_successfully) + ' samples out of ' + str(
        samples_total_number) + ' run successfully'
    time_taken = utils.runTime(start_time)
    del time_taken

    if number_samples_successfully == 0:
        sys.exit('No samples run successfully!')
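
Examples #5, #6 and #8 all validate percentage options after parsing, with one parser.error call per bound. An alternative is a custom argparse type that rejects out-of-range values at parse time; the percentage helper below is a sketch of that approach, not part of ReMatCh:

import argparse


def percentage(value):
    """argparse type: float constrained to the closed interval [0, 100]."""
    number = float(value)
    if not 0 <= number <= 100:
        raise argparse.ArgumentTypeError(
            '{0} is not a value between [0, 100]'.format(value))
    return number


parser = argparse.ArgumentParser()
parser.add_argument('--minGeneCoverage', type=percentage, metavar='N', default=80)
args = parser.parse_args(['--minGeneCoverage', '70'])  # '101' would be rejected here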
Example #7
def runRematch(args):
    workdir = os.path.abspath(args.workdir)
    if not os.path.isdir(workdir):
        os.makedirs(workdir)

    asperaKey = os.path.abspath(
        args.asperaKey.name) if args.asperaKey is not None else None

    # Start logger
    logfile, time_str = utils.start_logger(workdir)

    # Get general information
    utils.general_information(logfile, version, workdir, time_str,
                              args.doNotUseProvidedSoftware, asperaKey,
                              args.downloadCramBam)

    # Set listIDs
    listIDs, searched_fastq_files = getListIDs(
        workdir, args.listIDs.name if args.listIDs is not None else None,
        args.taxon)

    # Run ReMatCh for each sample
    print '\n' + 'STARTING ReMatCh' + '\n'

    # Clean sequences headers
    reference_file, gene_list_reference = clean_headers_reference_file(
        os.path.abspath(args.reference.name), workdir, args.extraSeq)

    if len(gene_list_reference) == 0:
        sys.exit('No sequences left')

    # To use in combined report

    number_samples_successfully = 0
    for sample in listIDs:
        sample_start_time = time.time()
        print '\n\n' + 'Sample ID: ' + sample

        # Create sample outdir
        sample_outdir = os.path.join(workdir, sample, '')
        if not os.path.isdir(sample_outdir):
            os.mkdir(sample_outdir)

        run_successfully_fastq = None
        time_taken_fastq = 0
        sequencingInformation = {
            'run_accession': None,
            'instrument_platform': None,
            'instrument_model': None,
            'library_layout': None,
            'library_source': None,
            'extra_run_accession': None,
            'date_download': None
        }
        if not searched_fastq_files:
            # Download Files
            time_taken_fastq, run_successfully_fastq, fastq_files, sequencingInformation = download.runDownload(
                sample, args.downloadLibrariesType, asperaKey, sample_outdir,
                args.downloadCramBam, args.threads,
                args.downloadInstrumentPlatform)
        else:
            fastq_files = listIDs[sample]

        fileSize = None

        run_successfully_rematch_first = None
        run_successfully_rematch_second = None
        time_taken_rematch_first = 0
        time_taken_rematch_second = 0
        if run_successfully_fastq is not False:
            fileSize = sum(os.path.getsize(fastq) for fastq in fastq_files)
            # Run ReMatCh
            time_taken_rematch_first, run_successfully_rematch_first, data_by_gene, sample_data_general_first, consensus_files = rematch_module.runRematchModule(
                sample, fastq_files, reference_file, args.threads,
                sample_outdir, args.extraSeq, args.minCovPresence,
                args.minCovCall, args.minFrequencyDominantAllele,
                args.minGeneCoverage, args.conservedSeq, args.debug,
                args.numMapLoc, args.minGeneIdentity)
            if run_successfully_rematch_first:
                write_data_by_gene(gene_list_reference, args.minGeneCoverage,
                                   sample, data_by_gene, workdir, time_str,
                                   'first_run', args.minGeneIdentity)
                if args.doubleRun:
                    rematch_second_outdir = os.path.join(
                        sample_outdir, 'rematch_second_run', '')
                    if not os.path.isdir(rematch_second_outdir):
                        os.mkdir(rematch_second_outdir)
                    consensus_concatenated_fasta, consensus_concatenated_gene_list = concatenate_extraSeq_2_consensus(
                        consensus_files['noMatter'], reference_file,
                        args.extraSeq, rematch_second_outdir)
                    if len(consensus_concatenated_gene_list) > 0:
                        time_taken_rematch_second, run_successfully_rematch_second, data_by_gene, sample_data_general_second, consensus_files = rematch_module.runRematchModule(
                            sample, fastq_files, consensus_concatenated_fasta,
                            args.threads, rematch_second_outdir, args.extraSeq,
                            args.minCovPresence, args.minCovCall,
                            args.minFrequencyDominantAllele,
                            args.minGeneCoverage, args.conservedSeq,
                            args.debug, args.numMapLoc, args.minGeneIdentity)
                        if not args.debug:
                            os.remove(consensus_concatenated_fasta)
                        if run_successfully_rematch_second:
                            write_data_by_gene(gene_list_reference,
                                               args.minGeneCoverage, sample,
                                               data_by_gene, workdir, time_str,
                                               'second_run',
                                               args.minGeneIdentity)
                    else:
                        print 'No sequences left after ReMatCh module first run. Second run will not be performed'

        if not searched_fastq_files and not args.keepDownloadedFastq and fastq_files is not None:
            for fastq in fastq_files:
                if os.path.isfile(fastq):
                    os.remove(fastq)

        time_taken = utils.runTime(sample_start_time)

        # Placeholder statistics used in the report when a ReMatCh run did not succeed
        no_rematch_data = {'number_absent_genes': None,
                           'number_genes_multiple_alleles': None,
                           'mean_sample_coverage': None}
        write_sample_report(
            sample, workdir, time_str, fileSize, run_successfully_fastq,
            run_successfully_rematch_first, run_successfully_rematch_second,
            time_taken_fastq, time_taken_rematch_first,
            time_taken_rematch_second, time_taken, sequencingInformation,
            sample_data_general_first
            if run_successfully_rematch_first else dict(no_rematch_data),
            sample_data_general_second
            if run_successfully_rematch_second else dict(no_rematch_data),
            fastq_files if fastq_files is not None else '')

        if all([
                run_successfully_fastq is not False,
                run_successfully_rematch_first is not False,
                run_successfully_rematch_second is not False
        ]):
            number_samples_successfully += 1

    return number_samples_successfully, len(listIDs)
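
The no_rematch_data placeholders above keep the per-sample report columns stable when a ReMatCh run fails. A minimal, runnable sketch of the same pattern in isolation (general_data_or_default is an illustrative name, not part of the original code):

def general_data_or_default(run_successfully, data):
    # Stand in None values for the statistics of a run that did not succeed,
    # so every report row has the same fields
    default = {'number_absent_genes': None,
               'number_genes_multiple_alleles': None,
               'mean_sample_coverage': None}
    return data if run_successfully else default

print(general_data_or_default(False, {'mean_sample_coverage': 42.0}))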
Example #8
def main():
    program_name = 'ecoli_stx_subtyping.py'

    if sys.version_info[0] < 3:
        sys.exit('Must be using Python 3. Try calling "python3 {}"'.format(
            program_name))

    parser, parser_reads, _, parser_assembly, _ = python_arguments(
        program_name=program_name, version=version)
    parser.description = 'Gets E. coli stx subtypes'

    # Add the stx2-specific options to both the reads and the assembly subparsers
    parser_reads.add_argument(
        '--stx2covered',
        type=float,
        metavar='N',
        help='Minimal percentage of sequence covered to consider extra stx2'
        ' subtypes (value between [0, 100]) (default: 100)',
        required=False,
        default=100)
    parser_reads.add_argument(
        '--stx2identity',
        type=float,
        metavar='N',
        help='Minimal sequence identity to consider extra stx2'
        ' subtypes (value between [0, 100]) (default: 99.5)',
        required=False,
        default=99.5)

    parser_assembly.add_argument(
        '--stx2covered',
        type=float,
        metavar='N',
        help='Minimal percentage of sequence covered to consider extra stx2'
        ' subtypes (value between [0, 100]) (default: 100)',
        required=False,
        default=100)
    parser_assembly.add_argument(
        '--stx2identity',
        type=float,
        metavar='N',
        help='Minimal sequence identity to consider extra stx2'
        ' subtypes (value between [0, 100]) (default: 99.5)',
        required=False,
        default=99.5)

    args = parser.parse_args()

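    # Validate the percentage-style options together so every problem is
    # reported at once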
    msg = []
    if args.minGeneCoverage < 0 or args.minGeneCoverage > 100:
        msg.append('--minGeneCoverage should be a value between [0, 100]')
    if args.minGeneIdentity < 0 or args.minGeneIdentity > 100:
        msg.append('--minGeneIdentity should be a value between [0, 100]')
    if args.stx2covered < 0 or args.stx2covered > 100:
        msg.append('--stx2covered should be a value between [0, 100]')
    if args.stx2identity < 0 or args.stx2identity > 100:
        msg.append('--stx2identity should be a value between [0, 100]')
    if args.org != ['stx', 'subtyping']:
        msg.append('Use "--org stx subtyping" with {}'.format(program_name))

    if len(msg) > 0:
        argparse.ArgumentParser(prog='{} options'.format(program_name)).error(
            '\n'.join(msg))

    start_time = time.time()

    args.outdir = os.path.abspath(args.outdir)
    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)

    # Start logger
    logfile, time_str = utils.start_logger(args.outdir)

    _ = utils.general_information(script_name=program_name,
                                  logfile=logfile,
                                  version=version,
                                  outdir=args.outdir,
                                  time_str=time_str)
    print('\n')

    folders_2_remove = []

    # Create modules pickles folder
    pickles_folder = os.path.join(args.outdir, 'pickles', '')
    if not os.path.isdir(pickles_folder):
        os.makedirs(pickles_folder)
    folders_2_remove.append(pickles_folder)

    # Run functions
    folders_2_remove_func, references_results, reference, references_headers = args.func(
        args)
    folders_2_remove.extend(folders_2_remove_func)

    # Parse results
    _, _, _, _, _ = parse_results.parse_results(
        references_results, reference, references_headers, args.outdir,
        args.minGeneCoverage, args.minDepthCoverage, args.typeSeparator)

    # Pick the stx1 and stx2 reference files out of the reference list
    stx1_reference = [ref_file for ref_file in reference
                      if 'stx1' in os.path.basename(ref_file).lower()][0]
    stx2_reference = [ref_file for ref_file in reference
                      if 'stx2' in os.path.basename(ref_file).lower()][0]
    stx1_result, stx2_result = stx_subtype_parser(
        os.path.join(args.outdir, 'seq_typing.report_types.tab'),
        stx1_reference, stx2_reference, args.stx2covered, args.stx2identity)

    # Rename the report so it carries the ecoli_stx_subtyping stamp
    if os.path.isfile(os.path.join(args.outdir,
                                   'seq_typing.report_types.tab')):
        os.rename(
            os.path.join(args.outdir, 'seq_typing.report_types.tab'),
            os.path.join(args.outdir,
                         'seq_typing.ecoli_stx_subtyping.report_types.tab'))

    # Remove the generic report so only the ecoli_stx_subtyping one is kept
    if os.path.isfile(os.path.join(args.outdir, 'seq_typing.report.txt')):
        os.remove(os.path.join(args.outdir, 'seq_typing.report.txt'))

    print('\n'
          'E. coli stx_subtyping - {stx1_result}:{stx2_result}\n'
          '\n'.format(stx1_result=stx1_result, stx2_result=stx2_result))
    with open(os.path.join(args.outdir, 'seq_typing.ecoli_stx_subtyping.txt'),
              'wt') as writer:
        writer.write(':'.join([stx1_result, stx2_result]))

    if not args.debug:
        for folder in folders_2_remove:
            utils.removeDirectory(folder)

    _ = utils.runTime(start_time)
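
stx_subtype_parser is not shown in this example; the call above only fixes its signature. It reads seq_typing.report_types.tab, returns one result per stx gene, and uses the --stx2covered and --stx2identity thresholds to decide whether extra stx2 subtypes are reported. A rough sketch under those assumptions (the tab-separated column names below are guesses, not the real seq_typing report layout):

import csv

def stx_subtype_parser(report_path, stx1_reference, stx2_reference,
                       stx2covered, stx2identity):
    stx1_result = 'NT'  # assumed convention for "not typeable"
    stx2_results = []
    with open(report_path) as report:
        for row in csv.DictReader(report, delimiter='\t'):
            if row['reference_file'] == stx1_reference:
                stx1_result = row['type']
            elif row['reference_file'] == stx2_reference:
                # Keep extra stx2 subtypes only when both thresholds are met
                if (float(row['sequence_covered']) >= stx2covered
                        and float(row['sequence_identity']) >= stx2identity):
                    stx2_results.append(row['type'])
    return stx1_result, ':'.join(stx2_results) if stx2_results else 'NT'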
Example #9
def main():
    if sys.version_info[0] < 3:
        sys.exit('Must be using Python 3. Try calling "python3 get_stx_db.py"')

    parser = argparse.ArgumentParser(
        prog='get_stx_db.py',
        description=
        'Gets STX sequences from virulencefinder_db to produce a STX subtyping'
        ' DB',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version',
                        help='Version information',
                        action='version',
                        version=str('%(prog)s v' + version))

    parser_optional_general = parser.add_argument_group(
        'General facultative options')
    parser_optional_general.add_argument(
        '-o',
        '--outdir',
        type=str,
        metavar='/path/to/output/directory/',
        help='Path to the directory where the sequences will be stored',
        required=False,
        default='.')

    args = parser.parse_args()

    start_time = time.time()

    args.outdir = os.path.abspath(args.outdir)
    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)

    # Get virulencefinder_db
    url = 'https://bitbucket.org/genomicepidemiology/virulencefinder_db.git'
    virulencefinder_db = os.path.join(args.outdir, 'virulence_db', '')
    run_successfully, _, _ = utils.runCommandPopenCommunicate(
        ['git', 'clone', url, virulencefinder_db], False, None, True)
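    # Record the short commit hash so the output files can be stamped with the
    # database version they were built from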
    _, commit, _ = utils.runCommandPopenCommunicate([
        'git', '-C', virulencefinder_db, 'log', '--pretty=format:"%h"', '-n',
        '1'
    ], True, 15, True)

    # Get STX sequences
    stx_seq = {}
    allowed_chars = set(Seq.IUPAC.IUPACData.unambiguous_dna_letters)
    with open(os.path.join(
            args.outdir,
            'virulence_db.virulence_ecoli.commit_{commit}.problematic_sequences.tab'
            .format(commit=commit)),
              'wt',
              newline='\n') as writer:
        for seq in SeqIO.parse(
                os.path.join(virulencefinder_db, 'virulence_ecoli.fsa'),
                'fasta'):
            if seq.id.lower().startswith('stx'):
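                # Headers are expected to contain four ':'-separated fields,
                # the last one being the subtype letter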
                subtype = seq.id.split(':')
                if len(subtype) == 4:
                    if seq.id[:4] not in stx_seq:
                        stx_seq[seq.id[:4]] = []
                    '''
                    Jani

                    After spending what seemed an endless amount of hours trying to solve the STEC stx subtype
                    mystery, I've come to the following conclusion: for the platform we need to combine stx2a,
                    stx2c and stx2d in the target DB as one subtype called stx2acd. These subtypes are the most
                    potent ones in causing HUS and cannot be separated from each other by the methods currently
                    in use.
                    '''
                    if subtype[0][:4] == 'stx2' and subtype[3] in [
                            'a', 'c', 'd'
                    ]:
                        subtype[3] = 'acd'

                    subtype = subtype[0][:4] + subtype[3]  # e.g. 'stx2' + 'acd' -> 'stx2acd'
                    seq.description = ''  # Avoid the description being printed in the output file

                    # For sequences containing ambiguous IUPAC codes, keep a single
                    # concrete sequence derived from the ambiguous one
                    if not set(seq.seq.upper()).issubset(allowed_chars):
                        all_possible_sequences = extend_ambiguous_dna(
                            seq.seq.upper())
                        if all_possible_sequences is not None:
                            seq = SeqRecord(
                                Seq.Seq(all_possible_sequences[0],
                                        generic_dna),
                                id='{seq_name}:IUPAC_codes_removed'.format(
                                    seq_name=seq.id),
                                description='')  # Change the sequence
                        else:
                            writer.write('\t'.join([
                                seq.id, 'Memory Error (too many IUPAC codes)'
                            ]) + '\n')
                            continue

                    seq.id = '{seq_name}:seqTyping_{subtype}'.format(
                        seq_name=seq.id, subtype=subtype)
                    stx_seq[seq.id[:4]].append(seq)

    # Write files
    for gene, seqs in stx_seq.items():
        with open(os.path.join(
                args.outdir,
                'virulence_db.virulence_ecoli.commit_{commit}.{gene}_subtyping.seq_typing.fasta'
                .format(commit=commit, gene=gene)),
                  'wt',
                  newline='\n') as writer:
            _ = SeqIO.write(seqs, writer, "fasta")

    _ = utils.runTime(start_time)
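
extend_ambiguous_dna is not shown in this example, but the MemoryError branch above hints at what it does: enumerate every concrete sequence an ambiguous one can stand for, which can explode combinatorially. A minimal sketch, assuming Biopython's IUPAC ambiguity table (illustrative, not the original implementation):

from itertools import product

from Bio.Data.IUPACData import ambiguous_dna_values


def extend_ambiguous_dna(seq):
    try:
        # ambiguous_dna_values maps e.g. 'R' -> 'AG' and 'N' -> 'GATC';
        # the Cartesian product enumerates every unambiguous sequence
        return [''.join(bases)
                for bases in product(*(ambiguous_dna_values[base]
                                       for base in seq))]
    except MemoryError:
        # Too many combinations to hold in memory
        return None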