Exemple #1
0
def runTrimmomatic(jar_path_trimmomatic, sampleName, outdir, threads, adaptersFasta, script_path, doNotSearchAdapters, fastq_files, maxReadsLength, doNotTrimCrops, crop, headCrop, leading, trailing, slidingWindow, minLength, nts2clip_based_ntsContent, jarMaxMemory, fastq_encoding):
    """Run Trimmomatic in a fresh ``<outdir>/trimmomatic/`` folder and report the outcome.

    Every argument except ``outdir`` is forwarded unchanged to ``trimmomatic()``;
    ``outdir`` only decides where the working folder is (re)created.

    Returns a 7-tuple:
        run_successfully   -- value returned by ``trimmomatic()``
        not_empty_fastq    -- value returned by ``controlForZeroReads()``
                              (``False`` when Trimmomatic did not run)
        failing            -- ``{'sample': False}`` or ``{'sample': 'Did not run'}``
        paired_reads       -- value returned by ``getTrimmomaticPairedReads()``
                              (``None`` when Trimmomatic did not run)
        trimmomatic_folder -- path of the working folder
        fileSize           -- summed on-disk size of ``paired_reads``
                              (``'NA'`` when Trimmomatic did not run)
        warnings           -- ``{}`` or ``{'sample': 'Zero reads after Trimmomatic'}``
    """
    failing = {'sample': False}
    warnings = {}
    not_empty_fastq = False
    paired_reads = None
    fileSize = 'NA'

    # Start from an empty Trimmomatic output directory
    trimmomatic_folder = os.path.join(outdir, 'trimmomatic', '')
    utils.removeDirectory(trimmomatic_folder)
    os.mkdir(trimmomatic_folder)

    run_successfully = trimmomatic(jar_path_trimmomatic, sampleName, trimmomatic_folder, threads, adaptersFasta, script_path, doNotSearchAdapters, fastq_files, maxReadsLength, doNotTrimCrops, crop, headCrop, leading, trailing, slidingWindow, minLength, nts2clip_based_ntsContent, jarMaxMemory, fastq_encoding)

    if run_successfully:
        paired_reads = getTrimmomaticPairedReads(trimmomatic_folder)
        not_empty_fastq = controlForZeroReads(paired_reads)

        # Size of the files produced by Trimmomatic (not of the raw input reads)
        fileSize = sum(os.path.getsize(fastq) for fastq in paired_reads)

        if not not_empty_fastq:
            # Zero reads is only a warning in this version, not a failure
            warnings['sample'] = 'Zero reads after Trimmomatic'
            print(warnings['sample'])
    else:
        failing['sample'] = 'Did not run'
        print(failing['sample'])

    return run_successfully, not_empty_fastq, failing, paired_reads, trimmomatic_folder, fileSize, warnings
Exemple #2
0
def runTrimmomatic(jar_path_trimmomatic, sampleName, outdir, threads, adaptersFasta, script_path, doNotSearchAdapters, fastq_files, maxReadsLength, doNotTrimCrops, crop, headCrop, leading, trailing, slidingWindow, minLength, nts2clip_based_ntsContent, jarMaxMemory):
    """Run Trimmomatic in a fresh ``<outdir>/trimmomatic/`` folder and report the outcome.

    Every argument except ``outdir`` is forwarded unchanged to ``trimmomatic()``.

    Returns a 6-tuple:
        run_successfully   -- value returned by ``trimmomatic()``
        not_empty_fastq    -- value returned by ``controlForZeroReads()``
                              (``False`` when Trimmomatic did not run)
        failing            -- ``{'sample': False}``, or ``'sample'`` mapped to
                              ``'Zero reads after Trimmomatic'`` / ``'Did not run'``
        paired_reads       -- value returned by ``getTrimmomaticPairedReads()``
                              (``None`` when Trimmomatic did not run)
        trimmomatic_folder -- path of the working folder
        fileSize           -- summed on-disk size of ``paired_reads``
                              (``'NA'`` when Trimmomatic did not run)
    """
    failing = {'sample': False}
    not_empty_fastq = False
    paired_reads = None
    fileSize = 'NA'

    # Start from an empty Trimmomatic output directory
    trimmomatic_folder = os.path.join(outdir, 'trimmomatic', '')
    utils.removeDirectory(trimmomatic_folder)
    os.mkdir(trimmomatic_folder)

    run_successfully = trimmomatic(jar_path_trimmomatic, sampleName, trimmomatic_folder, threads, adaptersFasta, script_path, doNotSearchAdapters, fastq_files, maxReadsLength, doNotTrimCrops, crop, headCrop, leading, trailing, slidingWindow, minLength, nts2clip_based_ntsContent, jarMaxMemory)

    if run_successfully:
        paired_reads = getTrimmomaticPairedReads(trimmomatic_folder)
        not_empty_fastq = controlForZeroReads(paired_reads)

        # Size of the files produced by Trimmomatic (not of the raw input reads)
        fileSize = sum(os.path.getsize(fastq) for fastq in paired_reads)

        if not not_empty_fastq:
            # In this version zero reads is recorded as a failure
            failing['sample'] = 'Zero reads after Trimmomatic'
            print(failing['sample'])
    else:
        failing['sample'] = 'Did not run'
        print(failing['sample'])

    return run_successfully, not_empty_fastq, failing, paired_reads, trimmomatic_folder, fileSize
Exemple #3
0
def runSpades(sampleName, outdir, threads, fastq_files, notUseCareful, maxMemory, minCoverageAssembly, minContigsLength, estimatedGenomeSizeMb, kmers, maximumReadsLength, defaultKmers, minCoverageContigs, assembled_se_reads, saveExcludedContigs, maxNumberContigs):
    """Assemble reads with SPAdes and write the (optionally filtered) contigs into ``outdir``.

    The SPAdes working folder ``<outdir>/spades/`` is recreated at the start and
    removed again before returning. On success the raw assembly is copied to
    ``<outdir>/SPAdes_original_assembly.contigs.fasta`` and a second FASTA is
    written by ``write_filtered_sequences_and_stats()``; its name carries either
    the suffix chosen by ``decide_filter_parameters()`` or ``original`` when that
    function returns no suffix.

    Returns ``(run_successfully, pass_qc, failing, contigs, warnings)``.
    ``pass_qc`` is ``True`` whenever SPAdes ran successfully and ``False``
    otherwise; ``contigs`` is the written FASTA path, or ``None`` when SPAdes
    did not run; ``warnings`` is whatever ``decide_filter_parameters()``
    returned (``{}`` when SPAdes did not run).
    """
    pass_qc = True
    failing = {'sample': False}
    warnings = {}

    # Start from an empty SPAdes output directory
    spades_folder = os.path.join(outdir, 'spades', '')
    utils.removeDirectory(spades_folder)
    os.mkdir(spades_folder)

    # Determine k-mers to run (an empty list is passed on when defaults are requested)
    if defaultKmers:
        kmers = []
    else:
        kmers = define_kmers(kmers, maximumReadsLength)
        if len(kmers) == 0:
            print('SPAdes will use its default k-mers')
        else:
            print('SPAdes will use the following k-mers: ' + str(kmers))

    run_successfully, contigs = spades(spades_folder, threads, fastq_files, notUseCareful, maxMemory, minCoverageAssembly, kmers, assembled_se_reads)

    if run_successfully:
        # Keep an untouched copy of the assembly next to the sample outputs
        shutil.copyfile(contigs, os.path.join(outdir, 'SPAdes_original_assembly.contigs.fasta'))

        # Temporary sample-named link; the output file names below derive from it
        contigs_link = os.path.join(outdir, str(sampleName + '.contigs.fasta'))
        os.symlink(contigs, contigs_link)
        contigs = contigs_link

        minContigsLength = define_minContigsLength(maximumReadsLength, minContigsLength)
        sequence_dict = get_SPAdes_sequence_information(contigs)

        warnings, sequence_dict, filtered_sequences_sufix, spades_report_general = decide_filter_parameters(sequence_dict, minContigsLength, minCoverageContigs, estimatedGenomeSizeMb, maxNumberContigs)

        if filtered_sequences_sufix is not None:
            output_suffix = filtered_sequences_sufix
            write_flags = (False, saveExcludedContigs)
        else:
            # No filtering selected: the sequences are written under an 'original' name
            output_suffix = 'original'
            write_flags = (True, False)

        filtered_sequence_file = os.path.splitext(contigs)[0] + '.' + output_suffix + '.fasta'
        write_filtered_sequences_and_stats(sequence_dict, spades_report_general, contigs, filtered_sequence_file, sampleName, write_flags[0], write_flags[1])
        contigs = filtered_sequence_file

        os.remove(contigs_link)
    else:
        failing['sample'] = 'Did not run'
        print(failing['sample'])
        contigs = None
        pass_qc = False

    utils.removeDirectory(spades_folder)

    return run_successfully, pass_qc, failing, contigs, warnings
def runTrueCoverage(sample, fastq, reference, threads, outdir, extra_seq, min_cov_presence, min_cov_call,
                    min_frequency_dominant_allele, min_gene_coverage, debug, min_gene_identity,
                    true_coverage_config, rematch_script, conserved_true=True, num_map_loc=1):
    """Run ReMatCh against ``reference`` and assess the sample with ``rematch_report_assess_failing()``.

    ``rematch_module`` is imported at call time from the ``modules`` folder that
    sits next to ``rematch_script``; that folder is appended to ``sys.path``.
    Work is done in ``<outdir>/trueCoverage/``, which is recreated at the start
    and removed at the end unless ``debug`` is truthy.

    Returns ``(run_successfully, pass_qc, failing)``. ``pass_qc`` is ``True``
    only when the assessment produced no failing entries, in which case
    ``failing`` is ``{'sample': False}``. When ReMatCh did not run, ``failing``
    is ``{'sample': 'Did not run'}``.
    """
    true_coverage_folder = os.path.join(outdir, 'trueCoverage', '')
    utils.removeDirectory(true_coverage_folder)
    os.mkdir(true_coverage_folder)

    # rematch_module is located relative to the ReMatCh script, so it can only be imported here
    sys.path.append(os.path.join(os.path.dirname(rematch_script), 'modules', ''))
    import rematch_module

    # Run ReMatCh
    reference_file, gene_list_reference, reference_dict = clean_headers_reference_file(reference, true_coverage_folder,
                                                                                       extra_seq, rematch_module)
    time_taken, run_successfully, data_by_gene, sample_data_general, consensus_files, consensus_sequences = rematch_module.runRematchModule(sample, fastq, reference_file, threads, true_coverage_folder, extra_seq, min_cov_presence, min_cov_call, min_frequency_dominant_allele, min_gene_coverage, conserved_true, debug, num_map_loc, min_gene_identity, 'first', 7, 'none', reference_dict, 'X', None, gene_list_reference, True)

    if run_successfully:
        failing = rematch_report_assess_failing(outdir, None, true_coverage_folder, sample_data_general, true_coverage_config)
    else:
        failing = {'sample': 'Did not run'}

    # An empty failing report means the sample passed
    pass_qc = len(failing) == 0
    if pass_qc:
        failing['sample'] = False
    else:
        print(failing)

    if not debug:
        utils.removeDirectory(true_coverage_folder)

    return run_successfully, pass_qc, failing
Exemple #5
0
def sequence_data(reference_file, bam_file, outdir, threads, length_extra_seq,
                  minimum_depth_presence, minimum_depth_call,
                  minimum_depth_frequency_dominant_allele):
    """Analyse every reference sequence in a worker pool and gather the results.

    ``<outdir>/sequence_data/`` is recreated, then one sub-folder per entry of
    ``get_sequence_information(reference_file)`` is created and an
    ``analyse_sequence_data`` job is submitted for it to a pool of ``threads``
    processes. Once all jobs have finished, the results are collected with
    ``gather_gene_data_together()``.

    Returns ``(run_successfully, sample_data)`` as given by
    ``gather_gene_data_together()``.
    """
    sequence_data_outdir = os.path.join(outdir, 'sequence_data', '')
    utils.removeDirectory(sequence_data_outdir)
    os.makedirs(sequence_data_outdir)

    sequences = get_sequence_information(reference_file)

    # Arguments that are identical for every job
    common_tail = (
        reference_file,
        length_extra_seq,
        minimum_depth_presence,
        minimum_depth_call,
        minimum_depth_frequency_dominant_allele,
    )

    pool = multiprocessing.Pool(processes=threads)
    for sequence_counter in sequences:
        # Each sequence gets its own clean working folder
        sequence_dir = os.path.join(sequence_data_outdir, str(sequence_counter), '')
        utils.removeDirectory(sequence_dir)
        os.makedirs(sequence_dir)

        job_args = (bam_file, sequences[sequence_counter], sequence_dir, sequence_counter) + common_tail
        pool.apply_async(analyse_sequence_data, args=job_args)
    pool.close()
    pool.join()

    gathered_ok, gathered_data = gather_gene_data_together(sequence_data_outdir, sequences)
    return gathered_ok, gathered_data
def sample_coverage(referenceFile, alignment_file, outdir, threads):
    """Compute per-sequence mean coverage using a pool of ``get_sequence_coverage`` jobs.

    ``<outdir>/samtools_depth/`` is recreated and one job is submitted per entry
    of ``sequenceHeaders(referenceFile)``, each receiving a running index.
    Afterwards every ``coverage.sequence_*.pkl`` file found in that folder is
    read with ``utils.extractVariableFromPickle()`` and then deleted.

    Returns ``(sample_coverage_no_problems, mean_coverage_data)``.
    ``mean_coverage_data`` maps a sequence name to a dict with ``'position'``,
    ``'coverage'`` and ``'mean_coverage'`` (coverage / position, rounded to two
    decimals). After the first problematic sequence the flag becomes ``False``
    and the remaining pickles are deleted without being read.
    """
    coverage_outdir = os.path.join(outdir, 'samtools_depth', '')
    utils.removeDirectory(coverage_outdir)
    os.makedirs(coverage_outdir)

    sequences = sequenceHeaders(referenceFile)

    pool = multiprocessing.Pool(processes=threads)
    for counter, sequence in enumerate(sequences):
        pool.apply_async(get_sequence_coverage, args=(alignment_file, sequence, coverage_outdir, counter,))
    pool.close()
    pool.join()

    sample_coverage_no_problems = True
    mean_coverage_data = {}
    files = [f for f in os.listdir(coverage_outdir) if not f.startswith('.') and os.path.isfile(os.path.join(coverage_outdir, f))]
    for file_found in files:
        if not (file_found.startswith('coverage.sequence_') and file_found.endswith('.pkl')):
            continue

        file_path = os.path.join(coverage_outdir, file_found)

        if sample_coverage_no_problems:
            sequence_to_analyse, run_successfully, problems_found, position, coverage = utils.extractVariableFromPickle(file_path)
            if run_successfully and not problems_found:
                mean_coverage_data[sequence_to_analyse] = {'position': position, 'coverage': coverage, 'mean_coverage': round((float(coverage) / float(position)), 2)}
            else:
                print('WARNING: it was not possible to compute coverage information for sequence ' + sequence_to_analyse)
                sample_coverage_no_problems = False

        # The pickle is removed whether or not it was read
        os.remove(file_path)

    return sample_coverage_no_problems, mean_coverage_data
Exemple #7
0
def runDownload(ena_id, download_paired_type, asperaKey, outdir, download_cram_bam_True, threads, instrument_platform):
    """Download the reads of ``ena_id`` when its platform and library layout match the request.

    ``<outdir>/download/`` is used as scratch space: it is recreated at the
    start and removed before returning. Nothing is downloaded unless
    ``getReadRunInfo()`` returns data, the run's instrument platform equals
    ``instrument_platform`` (or that is ``'all'``) and its library layout
    equals ``download_paired_type`` (or that is ``'both'``); comparisons are
    case-insensitive.

    Returns ``(run_successfully, downloaded_files, sequencingInformation)``.
    Without run information, ``sequencingInformation`` is a dict of ``None``
    values and ``downloaded_files`` is ``None``.
    """
    download_dir = os.path.join(outdir, 'download', '')
    utils.removeDirectory(download_dir)
    os.mkdir(download_dir)

    run_successfully = False
    downloaded_files = None
    sequencingInformation = dict.fromkeys((
        'run_accession',
        'instrument_platform',
        'instrument_model',
        'library_layout',
        'library_source',
        'extra_run_accession',
        'date_download',
    ))

    readRunInfo = getReadRunInfo(ena_id)
    if readRunInfo is not None:
        downloadInformation = check_correct_links(getDownloadInformation(readRunInfo))
        sequencingInformation = getSequencingInformation(readRunInfo)
        sequencingInformation['date_download'] = time.strftime("%Y-%m-%d")

        # Short-circuit order matters: the layout is only inspected once the platform is accepted
        if (instrument_platform.lower() == 'all' or sequencingInformation['instrument_platform'].lower() == instrument_platform.lower()) and \
                (download_paired_type.lower() == 'both' or sequencingInformation['library_layout'].lower() == download_paired_type.lower()):
            layout = sequencingInformation['library_layout']
            run_successfully, cram_index_run_successfully = downloadFiles(downloadInformation, sequencingInformation, download_paired_type, asperaKey, download_dir, download_cram_bam_True)
            if run_successfully:
                run_successfully, downloaded_files = get_fastq_files(download_dir, cram_index_run_successfully, threads, layout)
            if run_successfully and downloaded_files is not None:
                run_successfully, downloaded_files = rename_move_files(downloaded_files, sequencingInformation['run_accession'], outdir, layout)

    utils.removeDirectory(download_dir)

    return run_successfully, downloaded_files, sequencingInformation
Exemple #8
0
def runFastQCanalysis(outdir, threads, adaptersFasta, fastq_files, keepFiles,
                      fastQC_run_name):
    """Run FastQC on ``fastq_files`` and summarise which files pass.

    Results are produced in ``<outdir>/fastqc_<fastQC_run_name>/``, which is
    recreated at the start and removed at the end unless ``keepFiles`` is
    truthy.

    Returns a 6-tuple ``(run_successfully, pass_qc, failing, warnings,
    maximumReadsLength, nts2clip_based_ntsContent)``. ``pass_qc`` is ``True``
    only when ``parseFastQC()`` reports no bad reads files. When FastQC did not
    run, ``failing`` is ``{'sample': 'Did not run'}`` and the last two values
    are ``None``.
    """
    pass_qc = False
    failing = {'sample': False}
    warnings = {}

    maximumReadsLength = None
    nts2clip_based_ntsContent = None

    # Start from an empty FastQC output directory
    fastqc_folder = os.path.join(outdir, str('fastqc_' + fastQC_run_name), '')
    utils.removeDirectory(fastqc_folder)
    os.mkdir(fastqc_folder)

    # Run FastQC
    run_successfully = fastQC(fastqc_folder, threads, adaptersFasta,
                              fastq_files)
    if run_successfully:
        # Check whether FastQC really ran successfully
        run_successfully = check_FastQC_runSuccessfully(
            fastqc_folder, fastq_files)
        if not run_successfully:
            failing['sample'] = 'Did not run'
            # NOTE(review): this early return skips the keepFiles cleanup at the
            # end of the function, so fastqc_folder stays on disk - confirm intended.
            return run_successfully, pass_qc, failing, warnings, maximumReadsLength, nts2clip_based_ntsContent

        # Check which reads pass FastQC (replaces the initial failing/warnings dicts)
        goodReads, badReads, failing, warnings = parseFastQC(
            fastqc_folder, fastq_files)
        # Get reads information
        maximumReadsLength, moreFrequentReadsLength, numberReads, ntsContent_biasStatus = getReadsInformation(
            fastqc_folder, fastq_files)
        # Get number of nucleotides to clip based on nucleotide content bias
        nts2clip_based_ntsContent = nts2clip(ntsContent_biasStatus)

        print("Number of reads found: " + str(numberReads))
        print("Maximum reads length found for both fastq files: " + str(
            maximumReadsLength) + " nts")
        print("Reads length class more frequently found in fastq files: " + str(
            moreFrequentReadsLength))
        if len(badReads) == 0:
            pass_qc = True
        else:
            print("Reads files FAILING FastQC control: " + str(badReads))
        if len(goodReads) > 0:
            print("Reads files passing FastQC control: " + str(goodReads))
        print('To improve reads quality, consider clipping the next number of nucleotides in the fastq files at 5 end and 3 end, respectively: ' + str(
            nts2clip_based_ntsContent))
    else:
        failing['sample'] = 'Did not run'
        print(failing['sample'])

    if not keepFiles:
        utils.removeDirectory(fastqc_folder)

    return run_successfully, pass_qc, failing, warnings, maximumReadsLength, nts2clip_based_ntsContent
Exemple #9
0
def runPilon(jar_path_pilon, assembly, fastq_files, threads, outdir,
             jarMaxMemory, alignment_file):
    """Polish ``assembly`` with Pilon, building the alignment first when none is given.

    Work is done in ``<outdir>/pilon/`` (recreated at the start) on a symbolic
    link to ``assembly``. When ``alignment_file`` is ``None`` the assembly is
    indexed and the reads are mapped with Bowtie2, then the SAM output is
    sorted into a BAM file and indexed; each step only runs if the previous one
    succeeded. On success the polished assembly is copied into ``outdir``.

    The alignment file is deleted before returning if it exists on disk. This
    also applies to an ``alignment_file`` supplied by the caller.

    Returns ``(run_successfully, None, failing, assembly_polished,
    pilon_folder)``. The second element is always ``None``;
    ``assembly_polished`` is the path of the copy in ``outdir`` on success.
    ``failing`` is ``{'sample': False}`` or ``{'sample': 'Did not run'}``.
    """
    failing = {'sample': False}

    pilon_folder = os.path.join(outdir, 'pilon', '')
    utils.removeDirectory(pilon_folder)
    os.mkdir(pilon_folder)

    # Create a symbolic link to the assembly
    assembly_link = os.path.join(pilon_folder, os.path.basename(assembly))
    os.symlink(assembly, assembly_link)

    run_successfully = True

    if alignment_file is None:
        # Index assembly using Bowtie2
        run_successfully = indexSequenceBowtie2(assembly_link, threads)

        if run_successfully:
            run_successfully, sam_file = mappingBowtie2(
                fastq_files, assembly_link, threads, pilon_folder)

            if run_successfully:
                alignment_file = os.path.splitext(sam_file)[0] + '.bam'
                run_successfully, alignment_file = sortAlignment(
                    sam_file, alignment_file, False, threads)

                if run_successfully:
                    os.remove(sam_file)
                    run_successfully = indexAlignment(alignment_file)

    assembly_polished = None

    if run_successfully:
        run_successfully, assembly_polished = pilon(jar_path_pilon,
                                                    assembly_link,
                                                    alignment_file,
                                                    pilon_folder, jarMaxMemory)

        if run_successfully:
            parsePilonResult(assembly_polished, outdir)
            polished_copy = os.path.join(
                outdir, os.path.basename(assembly_polished))
            shutil.copyfile(assembly_polished, polished_copy)
            assembly_polished = polished_copy

    # alignment_file is still None when none was supplied and indexing/mapping
    # failed; os.path.isfile(None) would raise TypeError, so guard against it.
    if alignment_file is not None and os.path.isfile(alignment_file):
        os.remove(alignment_file)

    if not run_successfully:
        failing['sample'] = 'Did not run'
        print(failing['sample'])

    return run_successfully, None, failing, assembly_polished, pilon_folder
def run_true_coverage(sample,
                      fastq,
                      reference,
                      threads,
                      outdir,
                      extra_seq,
                      min_cov_presence,
                      min_cov_call,
                      min_frequency_dominant_allele,
                      min_gene_coverage,
                      debug,
                      min_gene_identity,
                      true_coverage_config,
                      rematch_script,
                      num_map_loc=1,
                      bowtie_algorithm='--very-sensitive-local',
                      clean_run_rematch=True):
    """Run ReMatCh against ``reference`` and assess the sample from its general report.

    ``rematch_module`` is imported at call time from the ``modules`` folder
    next to ``rematch_script`` (appended to ``sys.path``). The working folder
    ``<outdir>/trueCoverage/`` is recreated at the start and removed at the end
    unless ``debug`` is truthy.

    Returns ``(run_successfully, pass_qc, failing, sample_data_general)``.
    ``pass_qc`` is ``True`` only when ``rematch_report_assess_failing()``
    produced no entries, in which case ``failing`` is ``{'sample': False}``.
    When ReMatCh did not run, ``failing`` is ``{'sample': 'Did not run'}``.
    """
    true_coverage_folder = os.path.join(outdir, 'trueCoverage', '')
    utils.removeDirectory(true_coverage_folder)
    os.mkdir(true_coverage_folder)

    # The module lives beside the ReMatCh script, so it is only importable from here
    rematch_modules_dir = os.path.join(os.path.dirname(rematch_script), 'modules')
    sys.path.append(rematch_modules_dir)
    import rematch_module

    # Run ReMatCh
    reference_file, gene_list_reference, reference_dict = clean_headers_reference_file(
        reference, true_coverage_folder, extra_seq, rematch_module)
    _time_taken, run_successfully, _data_by_gene, sample_data_general, _consensus_files, _consensus_sequences = \
        rematch_module.run_rematch_module(
            sample, fastq, reference_file, threads, true_coverage_folder, extra_seq,
            min_cov_presence, min_cov_call, min_frequency_dominant_allele,
            min_gene_coverage, debug, num_map_loc, min_gene_identity, 'first', 7, 'none',
            reference_dict, 'X', bowtie_algorithm, None, gene_list_reference, True,
            clean_run=clean_run_rematch)

    if run_successfully:
        failing = rematch_report_assess_failing(
            outdir, None, true_coverage_folder, sample_data_general, true_coverage_config)
    else:
        failing = {'sample': 'Did not run'}

    # An empty failing report means the sample passed
    pass_qc = len(failing) == 0
    if pass_qc:
        failing['sample'] = False
    else:
        print(failing)

    if not debug:
        utils.removeDirectory(true_coverage_folder)

    return run_successfully, pass_qc, failing, sample_data_general
Exemple #11
0
def runFastQintegrity(fastq_files, threads, outdir):
    """Check every FASTQ file for corruption and determine the quality encoding.

    One ``fastQintegrity`` job per file is run in a pool of ``threads``
    processes inside ``<outdir>/fastQintegrity/`` (recreated at the start,
    removed at the end). The ``.pkl`` files found there afterwards are read
    with ``utils.extractVariableFromPickle()``; all regular non-hidden files in
    that folder are deleted once inspected.

    Returns a 6-tuple ``(not_corruption_found, pass_qc, failing, encoding,
    min_reads_length_found, max_reads_length_found)``:
        * ``failing`` is ``{'sample': False}`` when no file failed; otherwise
          it maps each failing pickle's base name to
          ``['The file is possibly corrupt']`` and the first two values are
          ``False``.
        * ``encoding`` is the single encoding shared by all readable files, or
          ``None`` when none or more than one was found.
        * the two read lengths are ``None`` when no pickle could be read.
    """
    pass_qc = True
    failing = {'sample': False}
    not_corruption_found = True

    # Defaults for the case where no encoding information is collected at all.
    # They must be set unconditionally, otherwise the return statement raises
    # UnboundLocalError when nothing failed and no pickle was readable.
    min_reads_length_found, max_reads_length_found = None, None

    fastQintegrity_folder = os.path.join(outdir, 'fastQintegrity', '')
    utils.removeDirectory(fastQintegrity_folder)
    os.mkdir(fastQintegrity_folder)

    pool = multiprocessing.Pool(processes=threads)
    for fastq in fastq_files:
        pool.apply_async(fastQintegrity, args=(fastq, fastQintegrity_folder,))
    pool.close()
    pool.join()

    encoding = {}
    files = [f for f in os.listdir(fastQintegrity_folder) if not f.startswith('.') and os.path.isfile(os.path.join(fastQintegrity_folder, f))]
    for file_found in files:
        file_path = os.path.join(fastQintegrity_folder, file_found)
        if file_found.endswith('.pkl'):
            file_run_successfully, file_encoding, min_reads_length, max_reads_length = utils.extractVariableFromPickle(file_path)
            if file_run_successfully:
                encoding[file_found] = {'file_encoding': file_encoding, 'min_reads_length': min_reads_length, 'max_reads_length': max_reads_length}
            else:
                sample_name = os.path.splitext(file_found)[0]
                failing[sample_name] = ['The file is possibly corrupt']
                print(sample_name + ': the file is possibly corrupt')
        os.remove(file_path)

    # More than the initial 'sample' key means at least one file failed
    if len(failing) > 1:
        failing.pop('sample')
        not_corruption_found = False
        pass_qc = False

    if len(encoding) == 0:
        encoding = None
        print('It was no possible to determine the FASTQ encodings')
    else:
        min_reads_length_found, max_reads_length_found, min_reads_length_each_fastq, max_reads_length_each_fastq = \
            guess_encoding.determine_min_max_reads_length(encoding)
        report_reads_length(min_reads_length_each_fastq, max_reads_length_each_fastq, outdir)

        encodings_found = [x['file_encoding'][0] for x in encoding.values() if x['file_encoding'] is not None]
        if len(set(encodings_found)) == 1:
            encoding = encodings_found[0]
            print('Fastq quality encoding: {0}'.format(str(encoding)))
        else:
            print('It was no possible to determine the FASTQ encodings')
            print('This was what has been found: {0}'.format(str(encoding)))
            encoding = None

    utils.removeDirectory(fastQintegrity_folder)

    return not_corruption_found, pass_qc, failing, encoding, min_reads_length_found, max_reads_length_found
Exemple #12
0
def sample_coverage(referenceFile, alignment_file, outdir, threads):
    """Compute per-sequence mean coverage using a pool of ``get_sequence_coverage`` jobs.

    ``<outdir>/samtools_depth/`` is recreated and one job per entry of
    ``sequenceHeaders(referenceFile)`` is submitted, each with a running index.
    Every ``coverage.sequence_*.pkl`` file found afterwards is read with
    ``utils.extractVariableFromPickle()`` and deleted.

    Returns ``(sample_coverage_no_problems, mean_coverage_data)``, where
    ``mean_coverage_data`` maps a sequence name to its ``'position'``,
    ``'coverage'`` and ``'mean_coverage'`` (coverage / position, rounded to two
    decimals). After the first problematic sequence the remaining pickles are
    deleted without being read.
    """
    coverage_outdir = os.path.join(outdir, 'samtools_depth', '')
    utils.removeDirectory(coverage_outdir)
    os.makedirs(coverage_outdir)

    sequences = sequenceHeaders(referenceFile)

    pool = multiprocessing.Pool(processes=threads)
    for job_index, sequence in enumerate(sequences):
        pool.apply_async(get_sequence_coverage,
                         args=(alignment_file, sequence, coverage_outdir, job_index))
    pool.close()
    pool.join()

    # Names starting with 'coverage.sequence_' can never be hidden ('.'-prefixed) files
    coverage_pickles = [
        name for name in os.listdir(coverage_outdir)
        if name.startswith('coverage.sequence_') and name.endswith('.pkl')
        and os.path.isfile(os.path.join(coverage_outdir, name))
    ]

    sample_coverage_no_problems = True
    mean_coverage_data = {}
    for pickle_name in coverage_pickles:
        file_path = os.path.join(coverage_outdir, pickle_name)

        if sample_coverage_no_problems:
            sequence_to_analyse, run_successfully, problems_found, position, coverage = \
                utils.extractVariableFromPickle(file_path)
            if not run_successfully or problems_found:
                print('WARNING: it was not possible to compute coverage information for'
                      ' sequence ' + sequence_to_analyse)
                sample_coverage_no_problems = False
            else:
                mean_coverage = round((float(coverage) / float(position)), 2)
                mean_coverage_data[sequence_to_analyse] = {
                    'position': position,
                    'coverage': coverage,
                    'mean_coverage': mean_coverage,
                }

        # The pickle is removed whether or not it was read
        os.remove(file_path)

    return sample_coverage_no_problems, mean_coverage_data
Exemple #13
0
def runGetSeqENA(args):
    """Download every requested ENA run and write ``getSeqENA.report.txt`` in the output folder.

    ``args`` is read for: ``listENAids`` and ``asperaKey`` (objects with a
    ``.name`` path; ``asperaKey`` may be ``None``), ``outdir``,
    ``maximumSamples`` (``None`` for no limit), ``downloadLibrariesType``,
    ``downloadCramBam``, ``threads``, ``downloadInstrumentPlatform``, ``SRA``
    and ``SRAopt``.

    Each ID is downloaded into ``<outdir>/<ena_id>/`` by
    ``download.run_download()``; the folder is removed again when the download
    fails. One report line is written per attempted ID. The loop stops once
    ``maximumSamples`` successful downloads have been reached.

    Exits the interpreter via ``sys.exit()`` when no ID was downloaded
    successfully.
    """
    start_time = time.time()

    listENA_IDs = utils.getListIDs(os.path.abspath(args.listENAids.name))
    outdir = os.path.abspath(args.outdir)
    utils.check_create_directory(outdir)
    asperaKey = args.asperaKey
    if asperaKey is not None:
        asperaKey = os.path.abspath(asperaKey.name)

    # Start logger
    logfile = utils.start_logger(outdir)

    # Get general information
    utils.general_information(logfile, version)

    # Check programs
    requiredPrograms(args)

    header_sequencing = ['run_accession', 'instrument_platform', 'instrument_model', 'library_layout', 'library_source', 'extra_run_accession', 'nominal_length', 'read_count', 'base_count', 'date_download']

    runs_successfully = 0
    with open(os.path.join(outdir, 'getSeqENA.report.txt'), 'wt') as writer:
        writer.write('#sample' + '\t' + '\t'.join(header_sequencing) + '\n')
        for ena_id in listENA_IDs:
            # With no limit configured, every ID is attempted
            if args.maximumSamples is not None and runs_successfully >= args.maximumSamples:
                break

            print('\n' + 'Download ENA_ID ' + ena_id)

            ena_id_folder = os.path.join(outdir, ena_id)
            utils.check_create_directory(ena_id_folder)

            time_taken, run_successfully, fastq_files, sequencingInformation = download.run_download(ena_id, args.downloadLibrariesType, asperaKey, ena_id_folder, args.downloadCramBam, args.threads, args.downloadInstrumentPlatform, args.SRA, args.SRAopt)

            if run_successfully:
                runs_successfully += 1
            else:
                utils.removeDirectory(ena_id_folder)
                print(ena_id + ' was not downloaded')

            writer.write(ena_id + '\t' + '\t'.join([str(sequencingInformation[i]) for i in header_sequencing]) + '\n')

    # The returned elapsed time is not used here
    utils.runTime(start_time)

    if runs_successfully == 0:
        sys.exit('No ENA_IDs were successfully downloaded!')
Exemple #14
0
def runFastQintegrity(fastq_files, threads, outdir):
    """Check every FASTQ file for corruption and determine the quality encoding.

    One ``fastQintegrity`` job per file is run in a pool of ``threads``
    processes inside ``<outdir>/fastQintegrity/`` (recreated at the start,
    removed at the end). The ``.pkl`` files found there afterwards are read
    with ``utils.extractVariableFromPickle()``; all regular non-hidden files in
    that folder are deleted once inspected.

    Returns a 6-tuple ``(not_corruption_found, pass_qc, failing, encoding,
    min_reads_length, max_reads_length)``:
        * ``failing`` is ``{'sample': False}`` when no file failed; otherwise
          it maps each failing pickle's base name to
          ``['The file is possibly corrupt']`` and the first two values are
          ``False``.
        * ``encoding`` is the single encoding shared by all readable files, or
          ``None`` when none or more than one was found.
        * the two read lengths are ``None`` when no pickle could be read.
    """
    pass_qc = True
    failing = {'sample': False}
    not_corruption_found = True

    fastQintegrity_folder = os.path.join(outdir, 'fastQintegrity', '')
    utils.removeDirectory(fastQintegrity_folder)
    os.mkdir(fastQintegrity_folder)

    pool = multiprocessing.Pool(processes=threads)
    for fastq in fastq_files:
        pool.apply_async(fastQintegrity, args=(fastq, fastQintegrity_folder,))
    pool.close()
    pool.join()

    encoding = {}
    files = [f for f in os.listdir(fastQintegrity_folder) if not f.startswith('.') and os.path.isfile(os.path.join(fastQintegrity_folder, f))]
    for file_found in files:
        file_path = os.path.join(fastQintegrity_folder, file_found)
        if file_found.endswith('.pkl'):
            file_run_successfully, file_encoding, min_reads_length, max_reads_length = utils.extractVariableFromPickle(file_path)
            if file_run_successfully:
                encoding[file_found] = {'file_encoding': file_encoding, 'min_reads_length': min_reads_length, 'max_reads_length': max_reads_length}
            else:
                sample_name = os.path.splitext(file_found)[0]
                failing[sample_name] = ['The file is possibly corrupt']
                print(sample_name + ': the file is possibly corrupt')
        os.remove(file_path)

    # More than the initial 'sample' key means at least one file failed
    if len(failing) > 1:
        failing.pop('sample')
        not_corruption_found = False
        pass_qc = False

    # Discard the per-file values left over from the loop above
    min_reads_length, max_reads_length = None, None

    if len(encoding) == 0:
        encoding = None
        print('It was no possible to determine the FASTQ encodings')
    else:
        min_reads_length, max_reads_length = guess_encoding.determine_min_max_reads_length(encoding)

        encodings_found = [x['file_encoding'][0] for x in encoding.values() if x['file_encoding'] is not None]
        if len(set(encodings_found)) == 1:
            encoding = encodings_found[0]
            print('Fastq quality encoding: ' + str(encoding))
        else:
            print('It was no possible to determine the FASTQ encodings')
            print('This was what has been found: ' + str(encoding))
            encoding = None

    utils.removeDirectory(fastQintegrity_folder)

    return not_corruption_found, pass_qc, failing, encoding, min_reads_length, max_reads_length
Exemple #15
0
def run_guess_encoding_single_thread(fastq_file, number_reads_access_None_all, outdir):
    """Guess the quality encoding and read-length range of one FASTQ file.

    A scratch folder named after the FASTQ file (base name without its last
    extension) is recreated inside ``outdir``, handed to the ``guess_encoding``
    helpers and removed again before returning.

    Returns ``(final_encoding, min_reads_length, max_reads_length)``.
    """
    fastq_stem = os.path.splitext(os.path.basename(fastq_file))[0]
    scratch_dir = os.path.join(outdir, fastq_stem)
    utils.removeDirectory(scratch_dir)
    os.mkdir(scratch_dir)

    guess_encoding.guess_encoding(fastq_file, number_reads_access_None_all, scratch_dir)
    gathered = guess_encoding.gather_data_together(scratch_dir)
    encoding_found = guess_encoding.get_final_encoding(gathered)

    # Only the overall min/max are needed; the last two returned values are ignored
    shortest, longest, _, _ = guess_encoding.determine_min_max_reads_length(gathered)

    utils.removeDirectory(scratch_dir)
    return encoding_found, shortest, longest
Exemple #16
0
def run_guess_encoding_single_thread(fastq_file, number_reads_access_None_all, outdir):
    """Guess the quality encoding and read-length range of one FASTQ file.

    A scratch folder named after the FASTQ file (base name without its last
    extension) is recreated inside ``outdir``, handed to the ``guess_encoding``
    helpers and removed again before returning.

    Returns ``(final_encoding, min_reads_length, max_reads_length)``.
    """
    fastq_basename = os.path.basename(fastq_file)
    working_folder = os.path.join(outdir, os.path.splitext(fastq_basename)[0])
    utils.removeDirectory(working_folder)
    os.mkdir(working_folder)

    guess_encoding.guess_encoding(fastq_file, number_reads_access_None_all, working_folder)
    collected = guess_encoding.gather_data_together(working_folder)
    detected_encoding = guess_encoding.get_final_encoding(collected)

    length_min, length_max = guess_encoding.determine_min_max_reads_length(collected)

    utils.removeDirectory(working_folder)
    return detected_encoding, length_min, length_max
Exemple #17
0
def downloadAndINNUca(outdir, run_ID, asperaKey, threads):
    """Download one run with ``getSeqENA.py`` and, if that succeeds, analyse it with ``INNUca.py``.

    A temporary one-line ID list is written to ``outdir`` for ``getSeqENA.py``
    and removed afterwards. After the INNUca command, regular non-hidden files
    in ``<outdir>/<run_ID>/<run_ID>/`` are moved one level up and that folder
    is removed. Files matched by ``removeFiles()`` for ``.gz``, ``.log`` and
    ``.cpu.txt`` are then cleaned from the sample folder.

    Nothing is returned. The INNUca success flag is always pickled into the
    sample folder as ``<run_ID>_run_successfully``, and the elapsed time as
    ``<run_ID>_downloadAndINNUca_time`` only when INNUca succeeded.
    """
    start_time = time.time()

    # getSeqENA.py takes its IDs from a file, so write a single-entry list
    temp_file = os.path.join(outdir, run_ID + '.temp.runID_fileList.txt')
    with open(temp_file, 'wt') as writer:
        writer.write(run_ID + '\n')

    download_command = ['getSeqENA.py', '-l', temp_file, '-o', outdir, '-a', asperaKey]
    download_command += ['--downloadLibrariesType', 'PE']
    getSeqENA_run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(
        download_command, False, None)

    os.remove(temp_file)

    sample_directory = os.path.join(outdir, run_ID, '')

    innuca_run_successfully = False
    if getSeqENA_run_successfully:
        innuca_command = ['INNUca.py', '-i', sample_directory]
        innuca_command += ['-s', '"Campylobacter jejuni"', '-g', '1.6']
        innuca_command += ['-o', sample_directory, '-j', str(threads)]
        innuca_command += ['--jarMaxMemory', 'auto']
        innuca_run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(
            innuca_command, False, None)

        # Flatten the nested output folder into the sample folder
        innuca_dir = os.path.join(sample_directory, run_ID, '')
        for entry in os.listdir(innuca_dir):
            source_path = os.path.join(innuca_dir, entry)
            if entry.startswith('.') or not os.path.isfile(source_path):
                continue
            shutil.move(source_path, os.path.join(sample_directory, entry))
        utils.removeDirectory(innuca_dir)

    for extension in ('.gz', '.log', '.cpu.txt'):
        removeFiles(sample_directory, extension)

    if innuca_run_successfully:
        time_taken = utils.runTime(start_time)
        utils.saveVariableToPickle(time_taken, sample_directory,
                                   run_ID + '_downloadAndINNUca_time')

    utils.saveVariableToPickle(innuca_run_successfully, sample_directory,
                               run_ID + '_run_successfully')
Exemple #18
0
def runFastQCanalysis(outdir, threads, adaptersFasta, fastq_files, keepFiles, fastQC_run_name):
    """
    Run FastQC on the fastq files and summarise the outcome.

    Parameters
    ----------
    outdir : str
        Directory where the ``fastqc_<fastQC_run_name>`` working folder is created
    threads, adaptersFasta, fastq_files
        Forwarded to ``fastQC``
    keepFiles : bool
        When false, the FastQC folder is removed before the normal return
    fastQC_run_name : str
        Suffix of the FastQC folder name

    Returns
    -------
    tuple
        ``(run_successfully, pass_qc, failing, warnings, maximumReadsLength,
        nts2clip_based_ntsContent)``. ``pass_qc`` is True only when ``parseFastQC``
        reports no bad reads files.
    """
    warnings = {}
    failing = {'sample': False}
    pass_qc = False

    nts2clip_based_ntsContent = None
    maximumReadsLength = None

    # Always start from an empty FastQC output directory
    fastqc_folder = os.path.join(outdir, str('fastqc_' + fastQC_run_name), '')
    utils.removeDirectory(fastqc_folder)
    os.mkdir(fastqc_folder)

    run_successfully = fastQC(fastqc_folder, threads, adaptersFasta, fastq_files)
    if not run_successfully:
        failing['sample'] = 'Did not run'
        print(failing['sample'])
    else:
        # The command can exit cleanly without producing what is expected, so double check
        run_successfully = check_FastQC_runSuccessfully(fastqc_folder, fastq_files)
        if not run_successfully:
            failing['sample'] = 'Did not run'
            # NOTE: this exit path leaves the FastQC folder in place regardless of keepFiles
            return run_successfully, pass_qc, failing, warnings, maximumReadsLength, nts2clip_based_ntsContent

        # Which reads files pass / fail FastQC
        goodReads, badReads, failing, warnings = parseFastQC(fastqc_folder, fastq_files)
        # General information about the reads
        maximumReadsLength, moreFrequentReadsLength, numberReads, ntsContent_biasStatus = getReadsInformation(fastqc_folder, fastq_files)
        # Number of nucleotides to clip given the nucleotide content bias
        nts2clip_based_ntsContent = nts2clip(ntsContent_biasStatus)

        print("Number of reads found: " + str(numberReads))
        print("Maximum reads length found for both fastq files: " + str(maximumReadsLength) + " nts")
        print("Reads length class more frequently found in fastq files: " + str(moreFrequentReadsLength))

        number_bad = len(badReads)
        if number_bad == 0:
            pass_qc = True
        else:
            print("Reads files FAILING FastQC control: " + str(badReads))
        if len(goodReads) > 0:
            print("Reads files passing FastQC control: " + str(goodReads))
        print('To improve reads quality, consider clipping the next number of nucleotides in the fastq files at 5 end and 3 end, respectively: ' + str(nts2clip_based_ntsContent))

    if not keepFiles:
        utils.removeDirectory(fastqc_folder)

    return run_successfully, pass_qc, failing, warnings, maximumReadsLength, nts2clip_based_ntsContent
Exemple #19
0
def runPilon(jar_path_pilon, assembly, fastq_files, threads, outdir, jarMaxMemory, alignment_file):
    """
    Polish an assembly with Pilon, mapping the reads first when no alignment is supplied.

    Parameters
    ----------
    jar_path_pilon, jarMaxMemory
        Forwarded to ``pilon``
    assembly : str
        Path to the assembly; it is symlinked into the ``pilon`` working folder
    fastq_files : list
        Reads used for mapping when ``alignment_file`` is None
    threads : int
        Number of threads given to the indexing, mapping and sorting helpers
    outdir : str
        Directory where the ``pilon`` folder is created and the polished assembly is copied
    alignment_file : str or None
        Existing alignment to use. When None, the assembly is indexed with Bowtie2, reads
        are mapped, and the alignment is sorted and indexed here. The alignment file is
        deleted before returning, including one supplied by the caller.

    Returns
    -------
    tuple
        ``(run_successfully, None, failing, assembly_polished, pilon_folder)``.
        ``assembly_polished`` is the copy placed in ``outdir``, or None on failure.
        The second element is always None.
    """
    failing = {'sample': False}

    pilon_folder = os.path.join(outdir, 'pilon', '')
    utils.removeDirectory(pilon_folder)
    os.mkdir(pilon_folder)

    # Create a symbolic link to the assembly
    assembly_link = os.path.join(pilon_folder, os.path.basename(assembly))
    os.symlink(assembly, assembly_link)

    run_successfully = True

    if alignment_file is None:
        # Index assembly using Bowtie2
        run_successfully = indexSequenceBowtie2(assembly_link, threads)

        if run_successfully:
            run_successfully, sam_file = mappingBowtie2(fastq_files, assembly_link, threads, pilon_folder)

            if run_successfully:
                alignment_file = os.path.splitext(sam_file)[0] + '.bam'
                run_successfully, alignment_file = sortAlignment(sam_file, alignment_file, False, threads)

                if run_successfully:
                    os.remove(sam_file)
                    run_successfully = indexAlignment(alignment_file)

    assembly_polished = None

    if run_successfully:
        run_successfully, assembly_polished = pilon(jar_path_pilon, assembly_link, alignment_file, pilon_folder, jarMaxMemory)

        if run_successfully:
            parsePilonResult(assembly_polished, outdir)
            polished_copy = os.path.join(outdir, os.path.basename(assembly_polished))
            shutil.copyfile(assembly_polished, polished_copy)
            assembly_polished = polished_copy

    # alignment_file is still None when indexing/mapping failed before any BAM
    # existed. os.path.isfile(None) raises TypeError, so guard it explicitly.
    if alignment_file is not None and os.path.isfile(alignment_file):
        os.remove(alignment_file)

    if not run_successfully:
        failing['sample'] = 'Did not run'
        print(failing['sample'])

    return run_successfully, None, failing, assembly_polished, pilon_folder
Exemple #20
0
def sequence_data(sample, reference_file, bam_file, outdir, threads,
                  length_extra_seq, minimum_depth_presence, minimum_depth_call,
                  minimum_depth_frequency_dominant_allele, debug_mode_true,
                  rematch):
    """
    Analyse every reference sequence in parallel and gather the per-sequence results.

    One sub-folder per sequence is created under ``<outdir>/sequence_data/`` and
    ``rematch.analyse_sequence_data`` is scheduled for each of them in a process pool
    of ``threads`` workers. Results are collected by ``rematch.gather_data_together``.

    Parameters
    ----------
    sample : str
        Sample name forwarded to ``rematch.gather_data_together``
    reference_file, bam_file : str
        Reference sequences file and alignment file given to each analysis job
    outdir : str
        Working directory; its path with the last two '/'-separated components removed
        is the output directory given to ``rematch.gather_data_together``
    threads : int
        Size of the process pool
    length_extra_seq, minimum_depth_presence, minimum_depth_call,
    minimum_depth_frequency_dominant_allele
        Forwarded unchanged to each analysis job
    debug_mode_true : bool
        Forwarded to ``rematch.gather_data_together``
    rematch : module
        Object providing ``determine_threads_2_use``, ``analyse_sequence_data`` and
        ``gather_data_together``

    Returns
    -------
    tuple
        ``(run_successfully, sample_data, consensus_files, consensus_sequences)`` as
        returned by ``rematch.gather_data_together``
    """
    import multiprocessing

    sequence_data_outdir = os.path.join(outdir, 'sequence_data', '')
    utils.removeDirectory(sequence_data_outdir)
    os.mkdir(sequence_data_outdir)

    sequences, headers = utils.get_sequence_information(
        reference_file, length_extra_seq)

    # Threads handed to each individual job (the pool itself uses ``threads``)
    threads_2_use = rematch.determine_threads_2_use(len(sequences), threads)

    # Arguments that are the same for every sequence
    common_args = (
        reference_file,
        length_extra_seq,
        minimum_depth_presence,
        minimum_depth_call,
        minimum_depth_frequency_dominant_allele,
        threads_2_use,
    )

    workers = multiprocessing.Pool(processes=threads)
    for sequence_counter in sequences:
        sequence_dir = os.path.join(sequence_data_outdir,
                                    str(sequence_counter), '')
        utils.removeDirectory(sequence_dir)
        os.makedirs(sequence_dir)

        job_args = (
            bam_file,
            sequences[sequence_counter],
            sequence_dir,
            sequence_counter,
        ) + common_args
        workers.apply_async(rematch.analyse_sequence_data, args=job_args)
    workers.close()
    workers.join()

    gather_outdir = outdir.rsplit('/', 2)[0]
    run_successfully, sample_data, consensus_files, consensus_sequences = rematch.gather_data_together(
        sample, sequence_data_outdir, sequences, gather_outdir,
        debug_mode_true, length_extra_seq, False)

    return run_successfully, sample_data, consensus_files, consensus_sequences
Exemple #21
0
def runFastQintegrity(fastq_files, threads, outdir):
    """
    Check the fastq files for corruption, one ``fastQintegrity`` job per file.

    Each job is expected to leave a ``.pkl`` file in the temporary ``fastQintegrity``
    folder; a pickle holding a false value marks its file as possibly corrupt.

    Parameters
    ----------
    fastq_files : list
        Fastq files to check
    threads : int
        Size of the process pool
    outdir : str
        Directory where the temporary ``fastQintegrity`` folder is created (and removed)

    Returns
    -------
    tuple
        ``(not_corruption_found, None, failing)``. ``failing`` is ``{'sample': False}``
        when nothing was flagged, otherwise it maps each flagged pickle base name to
        ``['The file is possibly corrupt']``. The None keeps the tuple shape
        consistent with other steps.
    """
    failing = {'sample': False}

    fastQintegrity_folder = os.path.join(outdir, 'fastQintegrity', '')
    utils.removeDirectory(fastQintegrity_folder)
    os.mkdir(fastQintegrity_folder)

    workers = multiprocessing.Pool(processes=threads)
    for fastq in fastq_files:
        workers.apply_async(fastQintegrity, args=(fastq, fastQintegrity_folder,))
    workers.close()
    workers.join()

    for entry in os.listdir(fastQintegrity_folder):
        entry_path = os.path.join(fastQintegrity_folder, entry)
        # Only visible regular files are of interest
        if entry.startswith('.') or not os.path.isfile(entry_path):
            continue

        if entry.endswith('.pkl'):
            file_run_successfully = utils.extractVariableFromPickle(entry_path)
            if not file_run_successfully:
                checked_name = os.path.splitext(entry)[0]
                failing[checked_name] = ['The file is possibly corrupt']
                print(checked_name + ': the file is possibly corrupt')
        os.remove(entry_path)

    # More than the initial 'sample' entry means at least one file was flagged
    not_corruption_found = not len(failing) > 1
    if not not_corruption_found:
        failing.pop('sample')

    utils.removeDirectory(fastQintegrity_folder)

    return not_corruption_found, None, failing  # None added for consistency with other steps
Exemple #22
0
def gather_gene_data_together(data_directory, sequences_information):
    """
    Collect the per-gene coverage pickles found under ``data_directory``.

    Every visible sub-directory is scanned for visible files named
    ``coverage_info.*.pkl``. Pickles are read while no previous one reported a failure,
    and each sub-directory is removed once scanned.

    Parameters
    ----------
    data_directory : str
        Directory holding one sub-directory per gene
    sequences_information : dict
        Maps the sequence counter stored in each pickle to a dict with at least 'header'

    Returns
    -------
    tuple
        ``(run_successfully, sample_data)``. ``run_successfully`` is False when a pickle
        reported a failure or when the number of pickles read differs from
        ``len(sequences_information)``.
    """
    sample_data = {}
    genes_read = 0
    run_successfully = True

    for gene_dir in os.listdir(data_directory):
        gene_dir_path = os.path.join(data_directory, gene_dir, '')
        if gene_dir.startswith('.') or not os.path.isdir(gene_dir_path):
            continue

        for file_found in os.listdir(gene_dir_path):
            if file_found.startswith('.'):
                continue
            file_path = os.path.join(gene_dir_path, file_found)
            if not os.path.isfile(file_path):
                continue
            is_coverage_pickle = file_found.startswith('coverage_info.') and file_found.endswith('.pkl')
            # After the first failure the remaining pickles are ignored
            if not is_coverage_pickle or not run_successfully:
                continue

            (run_successfully, sequence_counter, multiple_alleles_found,
             percentage_absent, percentage_lowCoverage,
             meanCoverage) = utils.extractVariableFromPickle(file_path)

            gene_info = {}
            gene_info['header'] = sequences_information[sequence_counter]['header']
            gene_info['gene_coverage'] = 100 - percentage_absent
            gene_info['gene_low_coverage'] = percentage_lowCoverage
            gene_info['gene_number_positions_multiple_alleles'] = multiple_alleles_found
            gene_info['gene_mean_read_coverage'] = meanCoverage
            sample_data[sequence_counter] = gene_info
            genes_read += 1

        utils.removeDirectory(gene_dir_path)

    # Every expected sequence must have produced exactly one pickle
    if genes_read != len(sequences_information):
        run_successfully = False

    return run_successfully, sample_data
Exemple #23
0
def runPear(fastq_files, threads, outdir, sampleName, fastq_encoding, trimmomatic_run_successfully, minimum_overlap_reads):
    """
    Run PEAR on a pair of fastq files and write ``pear_report.txt`` into ``outdir``.

    The reads are first processed with ``compress_decompress`` into temporary files in
    the ``pear`` folder (presumably decompressed - confirm against that helper). Those
    temporary files are deleted once PEAR was attempted.

    Parameters
    ----------
    fastq_files : list
        Reads files; exactly two processed files are required for PEAR to run
    threads : int
        Size of the process pool and value forwarded to ``run_pear``
    outdir : str
        Directory where the ``pear`` folder and the report are written
    sampleName, fastq_encoding, trimmomatic_run_successfully, minimum_overlap_reads
        Forwarded to ``run_pear``

    Returns
    -------
    tuple
        ``(run_successfully, True, failing, unassembled_pe_reads, assembled_se_reads,
        pear_folder, warnings)``. The second element is always True and ``failing``
        is always ``{'sample': False}``; problems are reported through ``warnings``.
    """
    warnings = {}
    failing = {'sample': False}
    unassembled_pe_reads = None
    assembled_se_reads = None

    pear_folder = os.path.join(outdir, 'pear', '')
    utils.removeDirectory(pear_folder)
    os.mkdir(pear_folder)

    workers = multiprocessing.Pool(processes=threads)
    for fastq in fastq_files:
        temp_name = str('temp.' + os.path.splitext(os.path.basename(fastq))[0])
        workers.apply_async(compress_decompress, args=(fastq, os.path.join(pear_folder, temp_name), False,))
    workers.close()
    workers.join()

    run_successfully, decompressed_reads = get_compressed_decompressed_reads(pear_folder)

    if run_successfully:
        if len(decompressed_reads) != 2:
            run_successfully = False
        else:
            (run_successfully, pass_qc, warnings, assembled_se_reads, unassembled_pe_reads,
             assembled_reads, unassembled_reads, discarded_reads) = run_pear(
                decompressed_reads, sampleName, threads, pear_folder, fastq_encoding,
                trimmomatic_run_successfully, minimum_overlap_reads)
            # run_pear signals "no warning" with a False 'sample' entry
            if warnings['sample'] is False:
                warnings = {}
            if run_successfully:
                report_entries = (
                    ('#assembled_reads', assembled_reads),
                    ('#unassembled_reads', unassembled_reads),
                    ('#discarded_reads', discarded_reads),
                )
                with open(os.path.join(outdir, str('pear_report.txt')), 'wt') as writer:
                    for title, value in report_entries:
                        writer.write(title + '\n' + str(value) + '\n')

        # The temporary copies are no longer needed
        for temporary_reads in decompressed_reads:
            os.remove(temporary_reads)

    if not run_successfully:
        warnings['sample'] = 'Did not run'
        print(warnings)

    return run_successfully, True, failing, unassembled_pe_reads, assembled_se_reads, pear_folder, warnings
def runTrueCoverage(sample, fastq, reference, threads, outdir, extraSeq, minCovPresence, minCovCall, minFrequencyDominantAllele, minGeneCoverage, conserved_True, debug, numMapLoc, minGeneIdentity, trueCoverage_config, rematch_script):
    """
    Run the ReMatCh module against the reference and check the sample against the trueCoverage limits.

    Parameters
    ----------
    sample, fastq, threads, extraSeq, minCovPresence, minCovCall,
    minFrequencyDominantAllele, minGeneCoverage, minGeneIdentity
        Forwarded to ``rematch_module.runRematchModule``
    reference : str
        Reference file, cleaned with ``clean_headers_reference_file`` before use
    outdir : str
        Directory where the ``trueCoverage`` folder and ``trueCoverage_report.txt`` are written
    conserved_True, numMapLoc
        Not used by this body (fixed values ``True`` and ``1`` are passed to ReMatCh)
    debug : bool
        Forwarded to ReMatCh; when false the ``trueCoverage`` folder is removed at the end
    trueCoverage_config : dict
        Must provide 'maximum_number_absent_genes',
        'maximum_number_genes_multiple_alleles' and 'minimum_read_coverage'
    rematch_script : str
        Path to the ReMatCh script; its sibling ``modules`` folder is appended to ``sys.path``

    Returns
    -------
    tuple
        ``(run_successfully, pass_qc, failing)``. ``failing`` is ``{'sample': False}``
        when every check passed.
    """
    failing = {}

    trueCoverage_folder = os.path.join(outdir, 'trueCoverage', '')
    utils.removeDirectory(trueCoverage_folder)
    os.mkdir(trueCoverage_folder)

    # ReMatCh modules live next to the ReMatCh script
    sys.path.append(os.path.join(os.path.dirname(rematch_script), 'modules', ''))
    import rematch_module

    # Run ReMatCh
    reference_file, gene_list_reference, reference_dict = clean_headers_reference_file(reference, trueCoverage_folder, extraSeq, rematch_module)
    time_taken, run_successfully, data_by_gene, sample_data_general, consensus_files, consensus_sequences = rematch_module.runRematchModule(sample, fastq, reference_file, threads, trueCoverage_folder, extraSeq, minCovPresence, minCovCall, minFrequencyDominantAllele, minGeneCoverage, True, debug, 1, minGeneIdentity, 'first', 7, 'none', reference_dict, 'X', None, gene_list_reference, True)

    if not run_successfully:
        failing['sample'] = 'Did not run'
    else:
        print('Writing report file')
        os.rename(os.path.join(trueCoverage_folder, 'rematchModule_report.txt'), os.path.join(outdir, 'trueCoverage_report.txt'))

        absent_genes = sample_data_general['number_absent_genes']
        max_absent_genes = trueCoverage_config['maximum_number_absent_genes']
        if absent_genes > max_absent_genes:
            failing['absent_genes'] = 'The number of absent genes (' + str(absent_genes) + ') exceeds the maximum allowed (' + str(max_absent_genes) + ')'

        multiple_alleles = sample_data_general['number_genes_multiple_alleles']
        max_multiple_alleles = trueCoverage_config['maximum_number_genes_multiple_alleles']
        if multiple_alleles > max_multiple_alleles:
            failing['multiple_alleles'] = 'The number of genes with multiple alleles (' + str(multiple_alleles) + ') exceeds the maximum allowed (' + str(max_multiple_alleles) + ')'

        mean_coverage = sample_data_general['mean_sample_coverage']
        min_coverage = trueCoverage_config['minimum_read_coverage']
        if mean_coverage < min_coverage:
            failing['read_coverage'] = 'The mean read coverage for genes present (' + str(mean_coverage) + ') dit not meet the minimum required (' + str(min_coverage) + ')'

    pass_qc = len(failing) == 0
    if pass_qc:
        failing['sample'] = False
    else:
        print(failing)

    if not debug:
        utils.removeDirectory(trueCoverage_folder)

    return run_successfully, pass_qc, failing
Exemple #25
0
def run_rematch(rematch, outdir, reference_file, bam_file, threads,
                length_extra_seq, minimum_depth_presence, minimum_depth_call,
                minimum_depth_frequency_dominant_allele, minimum_gene_coverage,
                minimum_gene_identity, debug_mode_true, doNotRemoveConsensus):
    """
    Analyse an alignment with the ReMatCh module and write its report.

    Parameters
    ----------
    rematch : str
        Path to the ReMatCh script; its sibling ``modules`` folder is appended to
        ``sys.path`` so ``rematch_module`` can be imported
    outdir : str
        Directory where the ``rematch`` working folder and the report are written
    reference_file, bam_file, threads, length_extra_seq, minimum_depth_presence,
    minimum_depth_call, minimum_depth_frequency_dominant_allele
        Forwarded to ``sequence_data``
    minimum_gene_coverage, minimum_gene_identity
        Forwarded to ``write_report``
    debug_mode_true : bool
        When false the ``rematch`` working folder is removed
    doNotRemoveConsensus : bool
        Forwarded to ``clean_rematch_folder``

    Returns
    -------
    tuple
        ``(run_successfully, general_stats, sample_data)`` where ``general_stats`` has the
        keys 'number_absent_genes', 'number_genes_multiple_alleles' and
        'mean_sample_coverage' (rounded to 2 decimals). All three are None when the
        analysis did not run successfully.
    """
    module_dir = os.path.join(outdir, 'rematch', '')
    utils.removeDirectory(module_dir)
    os.makedirs(module_dir)

    sys.path.append(os.path.join(os.path.dirname(rematch), 'modules', ''))
    import rematch_module

    print('Analysing alignment data')
    run_successfully, sample_data, consensus_files, consensus_sequences = sequence_data(
        'sample', reference_file, bam_file, module_dir, threads,
        length_extra_seq, minimum_depth_presence, minimum_depth_call,
        minimum_depth_frequency_dominant_allele, debug_mode_true,
        rematch_module)

    # Report values only exist when the analysis succeeded
    number_absent_genes = None
    number_genes_multiple_alleles = None
    mean_sample_coverage = None
    if run_successfully:
        number_absent_genes, number_genes_multiple_alleles, mean_sample_coverage = write_report(
            outdir, sample_data, minimum_gene_coverage, minimum_gene_identity)

    if not debug_mode_true:
        utils.removeDirectory(module_dir)

    clean_rematch_folder(consensus_files, bam_file, reference_file, outdir,
                         doNotRemoveConsensus, debug_mode_true)

    general_stats = {
        'number_absent_genes': number_absent_genes,
        'number_genes_multiple_alleles': number_genes_multiple_alleles,
        'mean_sample_coverage': round(mean_sample_coverage, 2) if run_successfully else None
    }
    return run_successfully, general_stats, sample_data
def runAssemblyMapping(fastq_files, reference_file, threads, outdir, minCoverageAssembly, estimatedGenomeSizeMb, saveExcludedContigs, maxNumberContigs):
    """
    Map the reads back to the assembly, report coverage / mapping statistics and filter contigs.

    Parameters
    ----------
    fastq_files : list
        Reads mapped with ``mappingBowtie2``
    reference_file : str
        Path to the assembly; it is symlinked into the ``assemblyMapping`` folder
    threads : int
        Number of threads given to the helpers
    outdir : str
        Directory where the ``assemblyMapping`` folder and the reports are written
    minCoverageAssembly
        Forwarded to ``save_assembly_coverage_report``
    estimatedGenomeSizeMb, maxNumberContigs
        Forwarded to ``spades.qc_assembly``
    saveExcludedContigs
        Forwarded to ``write_filtered_sequences_and_stats``

    Returns
    -------
    tuple
        ``(run_successfully, pass_qc, failing, assembly_filtered, bam_file,
        assemblyMapping_folder, warnings)``. ``failing`` is ``{'sample': False}`` when
        nothing failed; ``run_successfully`` also requires the coverage step to have
        had no problems. Mapping statistics problems are only reported in ``warnings``.
    """
    pass_qc = True
    pass_qc_coverage = False
    pass_qc_mapping = False

    failing = {}
    warnings = {}

    # Start from an empty working folder
    assemblyMapping_folder = os.path.join(outdir, 'assemblyMapping', '')
    utils.removeDirectory(assemblyMapping_folder)
    os.mkdir(assemblyMapping_folder)

    assembly_filtered = None

    # Create a symbolic link to the assembly
    assembly_link = os.path.join(assemblyMapping_folder, os.path.basename(reference_file))
    os.symlink(reference_file, assembly_link)

    bam_file = None
    # Index assembly using Bowtie2
    run_successfully = indexSequenceBowtie2(assembly_link, threads)

    sample_coverage_no_problems = False
    sample_mapping_statistics_no_problems = False
    if run_successfully:
        run_successfully, sam_file = mappingBowtie2(fastq_files, assembly_link, threads, assemblyMapping_folder)

        if run_successfully:
            # The sorted alignment is stored next to the SAM file, with a .bam extension
            bam_file = os.path.splitext(sam_file)[0] + '.bam'
            run_successfully, bam_file = sortAlignment(sam_file, bam_file, False, threads)

            if run_successfully:
                # The SAM file is no longer needed once the sorted BAM exists
                os.remove(sam_file)
                run_successfully = indexAlignment(bam_file, True)

                if run_successfully:
                    sequences_2_keep = []
                    # Get assembly coverage
                    sample_coverage_no_problems, mean_coverage_data = sample_coverage(reference_file, bam_file, assemblyMapping_folder, threads)
                    if sample_coverage_no_problems:
                        pass_qc_coverage, failing_reason, sequences_2_keep = save_assembly_coverage_report(mean_coverage_data, outdir, minCoverageAssembly)
                        if not pass_qc_coverage:
                            failing['Coverage'] = [failing_reason]

                        # Filtered assembly is written next to the original assembly
                        assembly_filtered = os.path.splitext(reference_file)[0] + '.mappingCov.fasta'

                        sequence_dict, ignore = utils.get_sequence_information(reference_file, 0)
                        sequence_dict, sequence_report_general = determine_sequences_to_filter(sequence_dict, sequences_2_keep, False)
                        failing_sequences_filtered, minimumBP = spades.qc_assembly(sequence_report_general, estimatedGenomeSizeMb, maxNumberContigs)
                        if failing_sequences_filtered['sample'] is not False:
                            # QC problems on the filtered set are warnings, not failures
                            warnings['Sequences_filtered'] = [failing_sequences_filtered['sample']]
                            if not minimumBP:
                                # Keep the unfiltered assembly (presumably the filtered set is too small - confirm against spades.qc_assembly)
                                assembly_filtered = reference_file
                            else:
                                write_filtered_sequences_and_stats(sequence_dict, sequence_report_general, assembly_filtered, saveExcludedContigs)
                        else:
                            write_filtered_sequences_and_stats(sequence_dict, sequence_report_general, assembly_filtered, saveExcludedContigs)
                    else:
                        failing['Coverage'] = ['Did not run']

                    # Save mapping statistics
                    sample_mapping_statistics_no_problems, dict_mapping_statistics = getting_mapping_statistics(bam_file)
                    if sample_mapping_statistics_no_problems:
                        pass_qc_mapping, failing_reason = save_mapping_statistics(dict_mapping_statistics, outdir)

                        if not pass_qc_mapping:
                            warnings['Mapping'] = [failing_reason]
                    else:
                        warnings['Mapping'] = ['Did not run']

                    # Only subset the BAM when a filtered assembly (different from the original) was produced
                    if assembly_filtered is not None and assembly_filtered != reference_file and len(sequences_2_keep) > 0:
                        print 'Producing bam subset for sequences to keep'
                        run_successfully, bam_subset = get_bam_subset(bam_file, sequences_2_keep, threads)
                        if run_successfully:
                            # Replace the full BAM (and its index) by the subset
                            os.remove(bam_file)
                            os.remove(bam_file + '.bai')
                            bam_file = bam_subset
                            run_successfully = indexAlignment(bam_file, False)

    if not run_successfully:
        failing['sample'] = ['Did not run']

    # The step only counts as successful when the coverage analysis also had no problems
    run_successfully = all([run_successfully, sample_coverage_no_problems])

    if len(failing) == 0:
        failing = {'sample': False}
    else:
        print 'Failing:', failing
        pass_qc = False

    return run_successfully, pass_qc, failing, assembly_filtered, bam_file, assemblyMapping_folder, warnings
Exemple #27
0
def gather_data_together(sample, data_directory, sequences_information, outdir,
                         debug_mode_true):
    """
    Collect the per-gene coverage pickles and write the consensus sequences.

    Every visible sub-directory of ``data_directory`` is scanned for visible files named
    ``coverage_info.*.pkl``. Pickles are read while no previous one reported a failure.
    Before the first consensus is written, existing ``<sample>.correct.fasta``,
    ``<sample>.noMatter.fasta`` and ``<sample>.alignment.fasta`` files in ``outdir``
    are deleted.

    Parameters
    ----------
    sample : str
        Sample name used for the consensus file names
    data_directory : str
        Directory holding one sub-directory per gene
    sequences_information : dict
        Maps the sequence counter stored in each pickle to a dict with at least
        'header' and 'length'
    outdir : str
        Directory where the consensus files are written
    debug_mode_true : bool
        When false, each gene sub-directory is removed after being scanned

    Returns
    -------
    tuple
        ``(run_successfully, sample_data, consensus_files)``. ``run_successfully`` is
        False when a pickle reported a failure or when the number of pickles read differs
        from ``len(sequences_information)``. ``consensus_files`` is whatever the last
        ``write_consensus`` call returned, or None when no pickle was read.
        'gene_identity' in ``sample_data`` is a percentage (0-100).
    """
    run_successfully = True
    counter = 0
    sample_data = {}

    consensus_files = None

    write_consensus_first_time = True

    genes_directories = [
        d for d in os.listdir(data_directory) if not d.startswith('.')
        and os.path.isdir(os.path.join(data_directory, d, ''))
    ]
    for gene_dir in genes_directories:
        gene_dir_path = os.path.join(data_directory, gene_dir, '')

        files = [
            f for f in os.listdir(gene_dir_path) if not f.startswith('.')
            and os.path.isfile(os.path.join(gene_dir_path, f))
        ]
        for file_found in files:
            if file_found.startswith('coverage_info.') and file_found.endswith(
                    '.pkl'):
                file_path = os.path.join(gene_dir_path, file_found)

                # After the first failure the remaining pickles are ignored
                if run_successfully:
                    (run_successfully, sequence_counter, multiple_alleles_found,
                     percentage_absent, percentage_lowCoverage, meanCoverage,
                     consensus_sequence,
                     number_diferences) = utils.extractVariableFromPickle(
                        file_path)

                    if write_consensus_first_time:
                        # Remove consensus files left by a previous run before writing new ones
                        for consensus_type in ('correct', 'noMatter',
                                               'alignment'):
                            file_to_remove = os.path.join(
                                outdir,
                                str(sample + '.' + consensus_type + '.fasta'))
                            if os.path.isfile(file_to_remove):
                                os.remove(file_to_remove)
                        write_consensus_first_time = False
                    consensus_files = write_consensus(outdir, sample,
                                                      consensus_sequence)

                    sequence_length = sequences_information[sequence_counter]['length']
                    sample_data[sequence_counter] = {
                        'header':
                        sequences_information[sequence_counter]['header'],
                        'gene_coverage':
                        100 - percentage_absent,
                        'gene_low_coverage':
                        percentage_lowCoverage,
                        'gene_number_positions_multiple_alleles':
                        multiple_alleles_found,
                        'gene_mean_read_coverage':
                        meanCoverage,
                        # Differences are converted to a percentage before being
                        # subtracted from 100. Subtracting the raw fraction kept the
                        # identity above 99 whatever the number of differences.
                        'gene_identity':
                        100 - (float(number_diferences) / sequence_length) * 100
                    }
                    counter += 1

        if not debug_mode_true:
            utils.removeDirectory(gene_dir_path)

    # Every expected sequence must have produced exactly one pickle
    if counter != len(sequences_information):
        run_successfully = False

    return run_successfully, sample_data, consensus_files
Exemple #28
0
def run_assembly_mapping(fastq_files, reference_file, outdir, estimated_genome_size_mb, max_number_contigs=100,
                         save_excluded_contigs=False, min_coverage_assembly=None, keep_bam=False, threads=1):
    """
    Run the INNUca Assembly_Mapping step and QA/QC its results.

    Reads are mapped back to the assembly (Bowtie2 index, mapping, sorted and indexed BAM).
    Contig coverage and mapping statistics reports are saved in ``outdir`` and, when
    contigs are filtered out, a BAM restricted to the kept contigs is produced.

    Parameters
    ----------
    fastq_files : list
        List of fastq files
    reference_file : str
        Path to the reference file (the assembly)
    outdir : str
        Path to the output directory
    estimated_genome_size_mb : float
        Expected genome size in Mb (forwarded to ``spades.qc_assembly``)
    max_number_contigs : int, default 100
        Maximum number of contigs (forwarded to ``spades.qc_assembly``; per the original
        documentation, counted per 1.5 Mb of expected genome size)
    save_excluded_contigs : bool, default False
        Forwarded to ``write_filtered_sequences_and_stats``
    min_coverage_assembly : int or None, default None
        Minimum contigs average coverage, forwarded to ``save_assembly_coverage_report``
        (per the original documentation, None means 1/3 of the assembly mean coverage
        or 10x is used)
    keep_bam : bool, default False
        When True, the BAM produced with ``reference_file`` is moved to
        ``<outdir>/<reference file name>.bam`` instead of being deleted / left in the
        working folder
    threads : int, default 1
        Number of threads to be used

    Returns
    -------
    run_successfully : bool
        True when every step ran, including the coverage analysis
    pass_qc : bool
        False as soon as ``failing`` has any entry
    failing : dict
        ``{'sample': False}`` when nothing failed, otherwise failing level -> list of strings
    assembly_filtered : str or None
        Path to the filtered assembly (or the original one if nothing was written), None
        when the coverage step was not reached
    bam_file : str or None
        BAM file to be used in subsequent steps
    assembly_mapping_folder : str
        Path to the Assembly_Mapping working directory
    warnings : dict
        Warning level -> list of strings (empty when no warnings were raised)
    original_bam : str or None
        Path to the BAM produced with ``reference_file`` when it was kept and a subset BAM
        replaced it, else None

    Notes
    -----
    This body returns the eight values above. A ``time_taken`` value, if callers receive
    one, is presumably added by a wrapper outside this block.
    """
    warnings = {}
    failing = {}
    pass_qc = True

    assembly_mapping_folder = os.path.join(outdir, 'assemblyMapping', '')
    utils.removeDirectory(assembly_mapping_folder)
    os.mkdir(assembly_mapping_folder)

    # Work on a symbolic link to the assembly inside the working folder
    reference_basename = os.path.basename(reference_file)
    assembly_link = os.path.join(assembly_mapping_folder, reference_basename)
    os.symlink(reference_file, assembly_link)

    assembly_filtered = None
    bam_file = None
    original_bam = None
    sample_coverage_no_problems = False

    # Each stage only runs when all the previous ones succeeded
    run_successfully = indexSequenceBowtie2(assembly_link, threads)

    if run_successfully:
        run_successfully, sam_file = mappingBowtie2(fastq_files, assembly_link, threads, assembly_mapping_folder)

    if run_successfully:
        bam_file = os.path.splitext(sam_file)[0] + '.bam'
        run_successfully, bam_file = sortAlignment(sam_file, bam_file, False, threads)

    if run_successfully:
        os.remove(sam_file)
        run_successfully = indexAlignment(bam_file, True)

    if run_successfully:
        sequences_2_keep = []

        # Assembly coverage
        sample_coverage_no_problems, mean_coverage_data = sample_coverage(reference_file, bam_file,
                                                                          assembly_mapping_folder, threads)
        if not sample_coverage_no_problems:
            failing['Coverage'] = ['Did not run']
        else:
            pass_qc_coverage, failing_reason, sequences_2_keep = \
                save_assembly_coverage_report(mean_coverage_data, outdir, min_coverage_assembly)
            if not pass_qc_coverage:
                failing['Coverage'] = [failing_reason]

            assembly_filtered = os.path.splitext(reference_file)[0] + '.mappingCov.fasta'

            sequence_dict, ignore = utils.get_sequence_information(reference_file, 0)
            sequence_dict, sequence_report_general = determine_sequences_to_filter(sequence_dict,
                                                                                   sequences_2_keep, False)
            failing_sequences_filtered, minimumBP = spades.qc_assembly(sequence_report_general,
                                                                       estimated_genome_size_mb,
                                                                       max_number_contigs)
            filtering_problem = failing_sequences_filtered['sample']
            if filtering_problem is not False:
                warnings['Sequences_filtered'] = [filtering_problem]

            if filtering_problem is not False and not minimumBP:
                # Fall back to the unfiltered assembly
                assembly_filtered = reference_file
            else:
                write_filtered_sequences_and_stats(sequence_dict, sequence_report_general,
                                                   assembly_filtered, save_excluded_contigs)

        # Mapping statistics (problems here are only warnings)
        sample_mapping_statistics_no_problems, dict_mapping_statistics = getting_mapping_statistics(bam_file)
        if not sample_mapping_statistics_no_problems:
            warnings['Mapping'] = ['Did not run']
        else:
            pass_qc_mapping, failing_reason = save_mapping_statistics(dict_mapping_statistics, outdir)
            if not pass_qc_mapping:
                warnings['Mapping'] = [failing_reason]

        # Where the BAM produced with reference_file goes when it has to be kept
        kept_bam = os.path.join(outdir, '{}.bam'.format(reference_basename))

        subset_needed = assembly_filtered is not None and \
            assembly_filtered != reference_file and \
            len(sequences_2_keep) > 0
        if subset_needed:
            print('Producing bam subset for sequences to keep')
            run_successfully, bam_subset = get_bam_subset(bam_file, sequences_2_keep, threads)
            if run_successfully:
                if keep_bam:
                    original_bam = kept_bam
                    os.rename(bam_file, original_bam)
                else:
                    os.remove(bam_file)
                os.remove(bam_file + '.bai')
                bam_file = bam_subset
                run_successfully = indexAlignment(bam_file, False)
        elif keep_bam:
            os.rename(bam_file, kept_bam)
            os.rename(bam_file + '.bai', kept_bam + '.bai')
            bam_file = kept_bam

    if not run_successfully:
        failing['sample'] = ['Did not run']

    run_successfully = all([run_successfully, sample_coverage_no_problems])

    if len(failing) == 0:
        failing = {'sample': False}
    else:
        print('Failing:', failing)
        pass_qc = False

    return run_successfully, pass_qc, failing, assembly_filtered, bam_file, assembly_mapping_folder, warnings, \
           original_bam
Exemple #29
0
def runSpades(sampleName, outdir, threads, fastq_files, notUseCareful,
              maxMemory, minCoverageAssembly, minContigsLength,
              estimatedGenomeSizeMb, kmers, maximumReadsLength, defaultKmers,
              minCoverageContigs, assembled_se_reads, saveExcludedContigs,
              maxNumberContigs):
    """
    Run SPAdes for one sample and post-filter the contigs it produces.

    A scratch ``spades`` folder is (re)created inside ``outdir`` and the
    assembler is launched through ``spades()``. When a contigs file exists it
    is copied to ``outdir`` as ``SPAdes_original_assembly.contigs.fasta`` and
    a filtered (or ``.original``) fasta is written next to it. The scratch
    folder is always removed before returning.

    Parameters
    ----------
    sampleName : str
        Prefix used for the fasta files written to ``outdir``
    outdir : str
        Path to the output directory
    threads, fastq_files, notUseCareful, maxMemory, minCoverageAssembly, assembled_se_reads
        Forwarded unchanged to ``spades()``
    minContigsLength, maximumReadsLength
        Passed to ``define_minContigsLength()`` to obtain the length cut-off
    estimatedGenomeSizeMb, minCoverageContigs, maxNumberContigs
        Forwarded to ``decide_filter_parameters()``
    kmers
        K-mers requested; passed through ``define_kmers()`` unless
        ``defaultKmers`` is set
    defaultKmers : bool
        If true, an empty k-mer list is handed to ``spades()``
    saveExcludedContigs
        Forwarded to ``write_filtered_sequences_and_stats()`` when a filter
        was applied

    Returns
    -------
    run_successfully : bool
        False if ``spades()`` failed or the contigs file does not exist
    pass_qc : bool
        Same outcome as ``run_successfully``
    failing : dict
        ``{'sample': False}`` on success, otherwise ``{'sample': <reason>}``
    contigs
        Path to the fasta written in ``outdir`` on success; on failure,
        whatever ``spades()`` returned
    warnings : dict
        Warnings returned by ``decide_filter_parameters()`` (empty if the
        filtering step was not reached)
    """
    pass_qc = True
    failing = {'sample': False}
    warnings = {}

    # Create SPAdes output directory
    spades_folder = os.path.join(outdir, 'spades', '')
    utils.removeDirectory(spades_folder)
    os.mkdir(spades_folder)

    # Determine k-mers to run
    if defaultKmers:
        kmers = []
    else:
        kmers = define_kmers(kmers, maximumReadsLength)
        # Single-argument print(...) behaves the same on Python 2 and 3
        if len(kmers) == 0:
            print('SPAdes will use its default k-mers')
        else:
            print('SPAdes will use the following k-mers: ' + str(kmers))

    run_successfully, contigs = spades(spades_folder, threads, fastq_files,
                                       notUseCareful, maxMemory,
                                       minCoverageAssembly, kmers,
                                       assembled_se_reads)

    if not run_successfully:
        failing['sample'] = 'Did not run'
    elif not os.path.isfile(contigs):
        run_successfully = False
        failing['sample'] = 'Assembly was not produced'
    else:
        shutil.copyfile(contigs, os.path.join(outdir, 'SPAdes_original_assembly.contigs.fasta'))

        # Work through a link named after the sample so that every file
        # derived from it below carries the sample prefix and lands in outdir
        contigs_link = os.path.join(outdir, sampleName + '.contigs.fasta')
        os.symlink(contigs, contigs_link)
        contigs = contigs_link

        minContigsLength = define_minContigsLength(maximumReadsLength, minContigsLength)

        sequence_dict = get_SPAdes_sequence_information(contigs)

        warnings, sequence_dict, filtered_sequences_sufix, spades_report_general = \
            decide_filter_parameters(sequence_dict, minContigsLength, minCoverageContigs, estimatedGenomeSizeMb,
                                     maxNumberContigs)

        # With no filter to apply the assembly is written as ".original" and
        # excluded contigs are never saved
        nothing_filtered = filtered_sequences_sufix is None
        sufix = 'original' if nothing_filtered else filtered_sequences_sufix
        filtered_sequence_file = os.path.splitext(contigs)[0] + '.' + sufix + '.fasta'
        write_filtered_sequences_and_stats(sequence_dict, spades_report_general, contigs, filtered_sequence_file,
                                           sampleName, nothing_filtered,
                                           False if nothing_filtered else saveExcludedContigs)
        contigs = filtered_sequence_file

        os.remove(contigs_link)

    if not run_successfully:
        print(failing['sample'])
        pass_qc = False

    utils.removeDirectory(spades_folder)

    return run_successfully, pass_qc, failing, contigs, warnings
Exemple #30
0
def run_spades(sample_name, outdir, threads, fastq_files, not_use_careful, max_memory, min_coverage_assembly,
               min_contigs_length, estimated_genome_size_mb, kmers, maximum_reads_length, default_kmers,
               min_coverage_contigs, assembled_se_reads, save_excluded_contigs, max_number_contigs,
               keep_scaffolds=False, spades_version=None, estimated_coverage=None, spades_not_use_isolate=False):
    """
    Run SPAdes for one sample and post-filter the contigs it produces.

    A scratch ``spades`` folder is (re)created inside ``outdir`` and removed
    again before returning. When the contigs file exists it is copied to
    ``outdir`` as ``SPAdes_original_assembly.contigs.fasta`` and a filtered
    (or ``.original``) fasta is written next to it.

    Parameters
    ----------
    sample_name : str
        Prefix used for the fasta files written to ``outdir``
    outdir : str
        Path to the output directory
    threads, fastq_files, not_use_careful, max_memory, min_coverage_assembly, assembled_se_reads
        Forwarded unchanged to ``spades()``
    min_contigs_length, maximum_reads_length
        Passed to ``define_minContigsLength()`` to obtain the length cut-off
    estimated_genome_size_mb, min_coverage_contigs, max_number_contigs
        Forwarded to ``decide_filter_parameters()``
    kmers
        K-mers requested; passed through ``define_kmers()`` unless
        ``default_kmers`` is set
    default_kmers : bool
        If true, an empty k-mer list is handed to ``spades()``
    save_excluded_contigs
        Forwarded to ``write_filtered_sequences_and_stats()`` when a filter
        was applied
    keep_scaffolds : bool, default False
        If true, ``scaffolds.fasta`` from the scratch folder is copied to
        ``outdir`` as ``SPAdes_original_assembly.scaffolds.fasta`` (a message
        is printed when it is missing)
    spades_version, estimated_coverage, spades_not_use_isolate
        Forwarded as keyword arguments to ``spades()``

    Returns
    -------
    run_successfully : bool
        False if ``spades()`` failed or the contigs file does not exist
    pass_qc : bool
        Same outcome as ``run_successfully``
    failing : dict
        ``{'sample': False}`` on success, otherwise ``{'sample': <reason>}``
    contigs
        Path to the fasta written in ``outdir`` on success; on failure,
        whatever ``spades()`` returned
    warnings : dict
        Warnings returned by ``decide_filter_parameters()`` (empty if the
        filtering step was not reached)
    """
    failing = {'sample': False}
    warnings = {}

    # Fresh scratch directory for the assembler
    spades_folder = os.path.join(outdir, 'spades', '')
    utils.removeDirectory(spades_folder)
    os.mkdir(spades_folder)

    # Decide which k-mers SPAdes gets
    if default_kmers:
        kmers = []
    else:
        kmers = define_kmers(kmers, maximum_reads_length)
        if len(kmers) == 0:
            print('SPAdes will use its default k-mers')
        else:
            print('SPAdes will use the following k-mers: ' + str(kmers))

    run_successfully, contigs = spades(spades_folder, threads, fastq_files, not_use_careful, max_memory,
                                       min_coverage_assembly, kmers, assembled_se_reads,
                                       spades_version=spades_version, estimated_coverage=estimated_coverage,
                                       spades_not_use_isolate=spades_not_use_isolate)

    if not run_successfully:
        failing['sample'] = 'Did not run'
    else:
        if keep_scaffolds:
            scaffolds = os.path.join(spades_folder, 'scaffolds.fasta')
            if os.path.isfile(scaffolds):
                shutil.copyfile(scaffolds, os.path.join(outdir, 'SPAdes_original_assembly.scaffolds.fasta'))
            else:
                print('The scaffolds file was not found!')

        if not os.path.isfile(contigs):
            run_successfully = False
            failing['sample'] = 'Assembly was not produced'
        else:
            shutil.copyfile(contigs, os.path.join(outdir, 'SPAdes_original_assembly.contigs.fasta'))

            # A sample-named link in outdir makes every derived file name
            # below carry the sample prefix
            contigs_link = os.path.join(outdir, sample_name + '.contigs.fasta')
            os.symlink(contigs, contigs_link)
            contigs = contigs_link

            min_contigs_length = define_minContigsLength(maximum_reads_length, min_contigs_length)
            sequence_dict = get_SPAdes_sequence_information(contigs)
            warnings, sequence_dict, filtered_sequences_sufix, spades_report_general = decide_filter_parameters(
                sequence_dict, min_contigs_length, min_coverage_contigs, estimated_genome_size_mb,
                max_number_contigs)

            # No suffix means nothing to filter: write the ".original" copy
            # and never save excluded contigs
            keep_everything = filtered_sequences_sufix is None
            file_tag = 'original' if keep_everything else filtered_sequences_sufix
            output_fasta = os.path.splitext(contigs)[0] + '.' + file_tag + '.fasta'
            write_filtered_sequences_and_stats(sequence_dict, spades_report_general, contigs, output_fasta,
                                               sample_name, keep_everything,
                                               False if keep_everything else save_excluded_contigs)
            contigs = output_fasta

            os.remove(contigs_link)

    pass_qc = True
    if not run_successfully:
        print(failing['sample'])
        pass_qc = False

    utils.removeDirectory(spades_folder)

    return run_successfully, pass_qc, failing, contigs, warnings
Exemple #31
0
def runAssemblyMapping(alignment_file, reference_file, threads, outdir,
                       minCoverageAssembly, assembly_pilon,
                       estimatedGenomeSizeMb):
    """
    QA/QC an assembly from the reads mapped back to it.

    Coverage per sequence is computed from ``alignment_file``, sequences are
    filtered accordingly, and mapping statistics are saved. A scratch
    ``assemblyMapping`` folder is (re)created inside ``outdir`` and removed
    before returning.

    Parameters
    ----------
    alignment_file
        Alignment used for the coverage and mapping statistics
    reference_file
        Assembly the reads were mapped against
    threads
        Forwarded to ``sample_coverage()``
    outdir : str
        Path to the output directory
    minCoverageAssembly
        Forwarded to ``save_assembly_coverage_report()``
    assembly_pilon : str or None
        If not None, this assembly (instead of ``reference_file``) is the one
        that gets filtered
    estimatedGenomeSizeMb
        Forwarded to ``spades.qc_assembly()``

    Returns
    -------
    run_successfully : bool
        True only if both the coverage and the mapping statistics steps ran
    pass_qc : bool
        True only if the coverage, mapping and sequence-filtering checks all
        passed
    failing : dict
        Failing reasons keyed by 'Coverage', 'Sequences_filtered' and
        'Mapping' (values are lists of strings); empty if nothing failed
    assembly_filtered : str or None
        Path to the filtered assembly, the unfiltered assembly when the
        filtered set failed QC, or None if coverage could not be computed
    """
    pass_qc_coverage = False
    pass_qc_mapping = False
    pass_qc_sequences = False

    failing = {}

    assemblyMapping_folder = os.path.join(outdir, 'assemblyMapping', '')
    utils.removeDirectory(assemblyMapping_folder)
    os.mkdir(assemblyMapping_folder)

    assembly_filtered = None

    pilon_run_successfuly = assembly_pilon is not None

    # Get assembly coverage
    sample_coverage_no_problems, mean_coverage_data = sample_coverage(
        reference_file, alignment_file, assemblyMapping_folder, threads)
    if sample_coverage_no_problems:
        pass_qc_coverage, failing_reason, sequences_2_keep = save_assembly_coverage_report(
            mean_coverage_data, outdir, minCoverageAssembly)
        if not pass_qc_coverage:
            failing['Coverage'] = [failing_reason]

        assembly = reference_file if assembly_pilon is None else assembly_pilon
        assembly_filtered = os.path.splitext(assembly)[0] + '.mappingCov.fasta'

        sequence_dict = get_sequence_information(assembly)
        sequence_dict, sequence_report_general = determine_sequences_to_filter(
            sequence_dict, sequences_2_keep, pilon_run_successfuly)
        failing_sequences_filtered, minimumBP = spades.qc_assembly(
            sequence_report_general, estimatedGenomeSizeMb)
        if failing_sequences_filtered['sample'] is not False and not minimumBP:
            # Filtered set is unusable: report it and fall back to the
            # unfiltered assembly
            failing['Sequences_filtered'] = [failing_sequences_filtered['sample']]
            assembly_filtered = assembly
        else:
            write_filtered_sequences_and_stats(sequence_dict,
                                               sequence_report_general,
                                               assembly_filtered)
            pass_qc_sequences = True

        if failing_sequences_filtered['sample'] is not False:
            # Single-argument print(...) behaves the same on Python 2 and 3
            print(failing_sequences_filtered)
    else:
        failing['Coverage'] = ['Did not run']

    # Save mapping statistics
    sample_mapping_statistics_no_problems, dict_mapping_statistics = getting_mapping_statistics(
        alignment_file)
    if sample_mapping_statistics_no_problems:
        pass_qc_mapping, failing_reason = save_mapping_statistics(
            dict_mapping_statistics, outdir)

        if not pass_qc_mapping:
            failing['Mapping'] = [failing_reason]
    else:
        failing['Mapping'] = ['Did not run']

    run_successfully = sample_coverage_no_problems and sample_mapping_statistics_no_problems
    pass_qc = all([pass_qc_coverage, pass_qc_mapping, pass_qc_sequences])

    if not pass_qc:
        print('Sample FAILS Assembly Mapping check with: ' + str(failing))

    utils.removeDirectory(assemblyMapping_folder)

    return run_successfully, pass_qc, failing, assembly_filtered
def runTrueCoverage(sample, fastq, reference, threads, outdir, extraSeq,
                    minCovPresence, minCovCall, minFrequencyDominantAllele,
                    minGeneCoverage, conserved_True, debug, numMapLoc,
                    minGeneIdentity, trueCoverage_config, rematch_script):
    """
    Run ReMatCh against a reference and check the sample against thresholds.

    A scratch ``trueCoverage`` folder is (re)created inside ``outdir``. On
    success the ReMatCh report is moved to ``outdir`` as
    ``trueCoverage_report.txt``. The scratch folder is removed unless
    ``debug`` is true.

    Parameters
    ----------
    sample, fastq, threads, extraSeq, minCovPresence, minCovCall, minFrequencyDominantAllele, minGeneCoverage, minGeneIdentity
        Forwarded to ``rematch_module.runRematchModule()``
    reference
        Reference file; its headers are cleaned by
        ``clean_headers_reference_file()`` before use
    outdir : str
        Path to the output directory
    conserved_True, numMapLoc
        Accepted but not forwarded: the ReMatCh call below passes the
        constants ``True`` and ``1`` in these positions
    debug
        Forwarded to ReMatCh; when true the scratch folder is kept
    trueCoverage_config : dict
        Must provide 'maximum_number_absent_genes',
        'maximum_number_genes_multiple_alleles' and 'minimum_read_coverage'
    rematch_script : str
        Path to the ReMatCh script; its sibling ``modules`` directory is
        added to ``sys.path`` so ``rematch_module`` can be imported

    Returns
    -------
    run_successfully : bool
        Value reported by ``rematch_module.runRematchModule()``
    pass_qc : bool
        True if ReMatCh ran and no threshold was violated
    failing : dict
        ``{'sample': False}`` when passing; otherwise the failing reasons
        keyed by 'absent_genes', 'multiple_alleles', 'read_coverage' or
        'sample'
    """
    pass_qc = False
    failing = {}

    trueCoverage_folder = os.path.join(outdir, 'trueCoverage', '')
    utils.removeDirectory(trueCoverage_folder)
    os.mkdir(trueCoverage_folder)

    # Make rematch_module importable without growing sys.path on every call
    rematch_modules_dir = os.path.join(os.path.dirname(rematch_script), 'modules', '')
    if rematch_modules_dir not in sys.path:
        sys.path.append(rematch_modules_dir)
    import rematch_module

    # Run ReMatCh
    reference_file, gene_list_reference, reference_dict = clean_headers_reference_file(
        reference, trueCoverage_folder, extraSeq, rematch_module)
    time_taken, run_successfully, data_by_gene, sample_data_general, consensus_files, consensus_sequences = \
        rematch_module.runRematchModule(
            sample, fastq, reference_file, threads, trueCoverage_folder, extraSeq,
            minCovPresence, minCovCall, minFrequencyDominantAllele,
            minGeneCoverage, True, debug, 1, minGeneIdentity, 'first', 7, 'none',
            reference_dict, 'X', None, gene_list_reference, True)

    if run_successfully:
        # Single-argument print(...) behaves the same on Python 2 and 3
        print('Writing report file')
        os.rename(
            os.path.join(trueCoverage_folder, 'rematchModule_report.txt'),
            os.path.join(outdir, 'trueCoverage_report.txt'))

        absent_genes = sample_data_general['number_absent_genes']
        max_absent_genes = trueCoverage_config['maximum_number_absent_genes']
        if absent_genes > max_absent_genes:
            failing['absent_genes'] = 'The number of absent genes (' + str(absent_genes) + \
                                      ') exceeds the maximum allowed (' + str(max_absent_genes) + ')'

        multiple_alleles = sample_data_general['number_genes_multiple_alleles']
        max_multiple_alleles = trueCoverage_config['maximum_number_genes_multiple_alleles']
        if multiple_alleles > max_multiple_alleles:
            failing['multiple_alleles'] = 'The number of genes with multiple alleles (' + str(multiple_alleles) + \
                                          ') exceeds the maximum allowed (' + str(max_multiple_alleles) + ')'

        mean_coverage = sample_data_general['mean_sample_coverage']
        min_coverage = trueCoverage_config['minimum_read_coverage']
        if mean_coverage < min_coverage:
            failing['read_coverage'] = 'The mean read coverage for genes present (' + str(mean_coverage) + \
                                       ') dit not meet the minimum required (' + str(min_coverage) + ')'
    else:
        failing['sample'] = 'Did not run'

    if len(failing) == 0:
        pass_qc = True
        failing['sample'] = False
    else:
        print(failing)

    if not debug:
        utils.removeDirectory(trueCoverage_folder)

    return run_successfully, pass_qc, failing
Exemple #33
0
def run_assembly_mapping(fastq_files,
                         reference_file,
                         outdir,
                         estimated_genome_size_mb,
                         max_number_contigs=100,
                         save_excluded_contigs=False,
                         min_coverage_assembly=None,
                         keep_bam=False,
                         threads=1):
    """
    Map the reads back to the assembly and QA/QC the result for INNUca

    The assembly is indexed and the reads mapped with Bowtie2, the alignment
    is sorted and indexed, per-sequence coverage is used to filter the
    assembly, and mapping statistics are saved. When sequences were filtered
    out, a BAM restricted to the kept sequences replaces the full one.

    Parameters
    ----------
    fastq_files : list
        List of fastq files
    reference_file : str
        Path to the reference file (the assembly)
    outdir : str
        Path to the output directory
    estimated_genome_size_mb : float
        Expected genome size in Mb
    max_number_contigs : int, default 100
        Maximum number of contigs per 1.5 Mb of expected genome size
    save_excluded_contigs : bool, default False
        True to save the excluded contigs
    min_coverage_assembly : int or None, default None
        Minimum contigs average coverage. After mapping reads back to the contigs, only contigs with at least this
        average coverage are kept. If None is provided, 1/3 of the assembly mean coverage or 10x will be used
    keep_bam : bool, default False
        True to keep the BAM file produced (with mapped and unmapped reads)
    threads : int, default 1
        Number of threads to be used

    Returns
    -------
    run_successfully : bool
        True only if every step ran, including the coverage computation
    pass_qc : bool
        False as soon as anything was recorded in ``failing``
    failing : dict
        ``{'sample': False}`` if nothing failed; otherwise keys are the level of failing ('Coverage', 'sample')
        and values are lists of strings
    assembly_filtered : str or None
        Path to the filtered assembly, or ``reference_file`` when the filtered set was rejected. None if the coverage
        step was not reached or did not run
    bam_file : str or None
        Path to the BAM file to be used in subsequent steps; None if mapping did not succeed
    assembly_mapping_folder : str
        Path to the Assembly_Mapping working directory
    warnings : dict
        Empty if no warnings were raised; otherwise keys are the level of warning ('Sequences_filtered', 'Mapping')
        and values are lists of strings
    original_bam : str or None
        Path the full BAM was moved to when a subset BAM was produced and ``keep_bam`` is set, else None

    Notes
    -----
    NOTE(review): earlier documentation also listed a ``time_taken`` return value; the body here does not return
    one - presumably a wrapper adds it, to be confirmed.
    """

    failing = {}
    warnings = {}

    assembly_mapping_folder = os.path.join(outdir, 'assemblyMapping', '')
    utils.removeDirectory(assembly_mapping_folder)
    os.mkdir(assembly_mapping_folder)

    # The work is done on a link to the assembly placed inside the working folder
    assembly_link = os.path.join(assembly_mapping_folder, os.path.basename(reference_file))
    os.symlink(reference_file, assembly_link)

    assembly_filtered = None
    bam_file = None
    original_bam = None
    sam_file = None
    sample_coverage_no_problems = False

    # Each step below only runs if every previous one succeeded
    run_successfully = indexSequenceBowtie2(assembly_link, threads)

    if run_successfully:
        run_successfully, sam_file = mappingBowtie2(fastq_files, assembly_link, threads, assembly_mapping_folder)

    if run_successfully:
        bam_file = os.path.splitext(sam_file)[0] + '.bam'
        run_successfully, bam_file = sortAlignment(sam_file, bam_file, False, threads)

    if run_successfully:
        os.remove(sam_file)
        run_successfully = indexAlignment(bam_file, True)

    if run_successfully:
        # Destination of the full BAM (and of its index) when it has to be kept
        kept_bam = os.path.join(outdir, '{}.bam'.format(os.path.basename(reference_file)))

        sequences_2_keep = []

        # Assembly coverage and sequence filtering
        sample_coverage_no_problems, mean_coverage_data = sample_coverage(reference_file, bam_file,
                                                                          assembly_mapping_folder, threads)
        if not sample_coverage_no_problems:
            failing['Coverage'] = ['Did not run']
        else:
            pass_qc_coverage, failing_reason, sequences_2_keep = \
                save_assembly_coverage_report(mean_coverage_data, outdir, min_coverage_assembly)
            if not pass_qc_coverage:
                failing['Coverage'] = [failing_reason]

            assembly_filtered = os.path.splitext(reference_file)[0] + '.mappingCov.fasta'

            sequence_dict, _ = utils.get_sequence_information(reference_file, 0)
            sequence_dict, sequence_report_general = determine_sequences_to_filter(sequence_dict, sequences_2_keep,
                                                                                   False)
            failing_sequences_filtered, minimumBP = spades.qc_assembly(sequence_report_general,
                                                                       estimated_genome_size_mb, max_number_contigs)

            filtering_problem = failing_sequences_filtered['sample']
            if filtering_problem is not False:
                warnings['Sequences_filtered'] = [filtering_problem]

            if filtering_problem is not False and not minimumBP:
                # Filtered set rejected: fall back to the unfiltered assembly
                assembly_filtered = reference_file
            else:
                write_filtered_sequences_and_stats(sequence_dict, sequence_report_general, assembly_filtered,
                                                   save_excluded_contigs)

        # Mapping statistics (problems here are warnings, not failures)
        sample_mapping_statistics_no_problems, dict_mapping_statistics = getting_mapping_statistics(bam_file)
        if not sample_mapping_statistics_no_problems:
            warnings['Mapping'] = ['Did not run']
        else:
            pass_qc_mapping, failing_reason = save_mapping_statistics(dict_mapping_statistics, outdir)
            if not pass_qc_mapping:
                warnings['Mapping'] = [failing_reason]

        if assembly_filtered is not None and \
                assembly_filtered != reference_file and \
                len(sequences_2_keep) > 0:
            print('Producing bam subset for sequences to keep')
            run_successfully, bam_subset = get_bam_subset(bam_file, sequences_2_keep, threads)
            if run_successfully:
                # The full BAM is either moved aside or dropped; its index is always dropped
                if keep_bam:
                    original_bam = kept_bam
                    os.rename(bam_file, original_bam)
                else:
                    os.remove(bam_file)
                os.remove(bam_file + '.bai')
                bam_file = bam_subset
                run_successfully = indexAlignment(bam_file, False)
        elif keep_bam:
            os.rename(bam_file, kept_bam)
            os.rename(bam_file + '.bai', kept_bam + '.bai')
            bam_file = kept_bam

    if not run_successfully:
        failing['sample'] = ['Did not run']

    run_successfully = all([run_successfully, sample_coverage_no_problems])

    pass_qc = True
    if len(failing) == 0:
        failing = {'sample': False}
    else:
        print('Failing:', failing)
        pass_qc = False

    return run_successfully, pass_qc, failing, assembly_filtered, bam_file, assembly_mapping_folder, warnings, \
           original_bam
Exemple #34
0
def runTrueCoverage(fastq_files, reference_file, threads, outdir,
                    length_extra_seq, minimum_depth_presence,
                    minimum_depth_call,
                    minimum_depth_frequency_dominant_allele,
                    minimum_gene_coverage, maximum_number_absent_genes,
                    maximum_number_genes_multiple_alleles,
                    minimum_read_coverage):
    """
    Map reads against a reference and check per-gene coverage thresholds.

    A scratch ``trueCoverage`` folder is (re)created inside ``outdir`` and
    removed before returning. When every step runs, a tab-separated
    ``trueCoverage_report.txt`` is written to ``outdir`` with one line per
    gene followed by a ``#general`` summary.

    Parameters
    ----------
    fastq_files, threads
        Forwarded to ``mapping_reads()``
    reference_file
        Reference to map against; replaced by the file returned by
        ``mapping_reads()``
    outdir : str
        Path to the output directory
    length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele
        Forwarded to ``sequence_data()``
    minimum_gene_coverage
        Genes whose 'gene_coverage' is below this value count as absent
    maximum_number_absent_genes
        The sample fails if more genes than this are absent
    maximum_number_genes_multiple_alleles
        The sample fails if more present genes than this have positions
        with multiple alleles
    minimum_read_coverage
        The sample fails if the mean read coverage of the present genes is
        below this value

    Returns
    -------
    run_successfully : bool
        False if mapping, reference indexing or ``sequence_data()`` failed
    pass_qc : bool
        True if everything ran and no threshold was violated
    failing : dict
        ``{'sample': False}`` when passing; otherwise the failing reasons
        keyed by 'absent_genes', 'multiple_alleles', 'read_coverage' or
        'sample'
    """
    pass_qc = False
    failing = {}

    trueCoverage_folder = os.path.join(outdir, 'trueCoverage', '')
    utils.removeDirectory(trueCoverage_folder)
    os.mkdir(trueCoverage_folder)

    # Map reads
    run_successfully, bam_file, reference_file = mapping_reads(
        fastq_files, reference_file, threads, trueCoverage_folder)

    sample_data = None
    if run_successfully:
        # Index reference file
        run_successfully = index_fasta_samtools(reference_file)
    if run_successfully:
        run_successfully, sample_data = sequence_data(
            reference_file, bam_file, trueCoverage_folder, threads,
            length_extra_seq, minimum_depth_presence, minimum_depth_call,
            minimum_depth_frequency_dominant_allele)

    if run_successfully:
        number_absent_genes = 0
        number_genes_multiple_alleles = 0
        mean_sample_coverage = 0
        with open(os.path.join(outdir, 'trueCoverage_report.txt'), 'wt') as writer:
            writer.write('\t'.join([
                '#gene', 'percentage_gene_coverage',
                'gene_mean_read_coverage',
                'percentage_gene_low_coverage',
                'number_positions_multiple_alleles'
            ]) + '\n')
            # sample_data is indexed with consecutive integers starting at 1
            for i in range(1, len(sample_data) + 1):
                gene = sample_data[i]
                writer.write('\t'.join([
                    gene['header'],
                    str(round(gene['gene_coverage'], 2)),
                    str(round(gene['gene_mean_read_coverage'], 2)),
                    str(round(gene['gene_low_coverage'], 2)),
                    str(gene['gene_number_positions_multiple_alleles'])
                ]) + '\n')

                if gene['gene_coverage'] < minimum_gene_coverage:
                    number_absent_genes += 1
                else:
                    # Only present genes contribute to the mean coverage and
                    # to the multiple-alleles count
                    mean_sample_coverage += gene['gene_mean_read_coverage']
                    if gene['gene_number_positions_multiple_alleles'] > 0:
                        number_genes_multiple_alleles += 1

            number_present_genes = len(sample_data) - number_absent_genes
            if number_present_genes > 0:
                mean_sample_coverage = float(mean_sample_coverage) / float(number_present_genes)
            else:
                mean_sample_coverage = 0

            writer.write('\n'.join([
                '#general', '>number_absent_genes',
                str(number_absent_genes),
                '>number_genes_multiple_alleles',
                str(number_genes_multiple_alleles),
                '>mean_sample_coverage',
                str(round(mean_sample_coverage, 2))
            ]) + '\n')

            # Single-argument print(...) behaves the same on Python 2 and 3
            print('\n'.join([
                'number_absent_genes: ' + str(number_absent_genes),
                'number_genes_multiple_alleles: ' + str(number_genes_multiple_alleles),
                'mean_sample_coverage: ' + str(round(mean_sample_coverage, 2))
            ]))

        if number_absent_genes > maximum_number_absent_genes:
            failing['absent_genes'] = 'The number of absent genes (' + str(number_absent_genes) + \
                                      ') exceeds the maximum allowed (' + str(maximum_number_absent_genes) + ')'
        if number_genes_multiple_alleles > maximum_number_genes_multiple_alleles:
            failing['multiple_alleles'] = 'The number of genes with multiple alleles (' + \
                                          str(number_genes_multiple_alleles) + \
                                          ') exceeds the maximum allowed (' + \
                                          str(maximum_number_genes_multiple_alleles) + ')'
        if mean_sample_coverage < minimum_read_coverage:
            failing['read_coverage'] = 'The mean read coverage for genes present (' + str(mean_sample_coverage) + \
                                       ') dit not meet the minimum required (' + str(minimum_read_coverage) + ')'
    else:
        # Whichever step failed, the outcome reported is the same
        failing['sample'] = 'Did not run'

    if len(failing) == 0:
        pass_qc = True
        failing['sample'] = False
    else:
        print(failing)

    utils.removeDirectory(trueCoverage_folder)

    return run_successfully, pass_qc, failing
Exemple #35
0
def run_pilon(jar_path_pilon, assembly, fastq_files, outdir, jar_max_memory, alignment_file, keep_bam=False, threads=1):
    """
    Runs Pilon for INNUca to polish an assembly

    Parameters
    ----------
    jar_path_pilon
        Forwarded to ``pilon()``
    assembly : str
        Path to the assembly to correct
    fastq_files : list
        List of fastq files
    outdir : str
        Path to the output directory
    jar_max_memory : int or 'off'
        If not 'off' is provided, sets the maximum RAM Gb usage by jar files
    alignment_file : str or None
        Path to the BAM file to be used. If None is provided, a new reads alignment will be performed
    keep_bam : bool, default False
        True to keep the BAM file produced (with mapped and unmapped reads)
    threads : int, default 1
        Number of threads to be used

    Returns
    -------
    run_successfully : bool
        Boolean stating if the Pilon module ran successfully or not
    pass_qc : None
        QA/QC not performed
    failing : dict
        ``{'sample': False}`` if the module ran, otherwise ``{'sample': 'Did not run'}``
    assembly_polished : str or None
        Path to the polished assembly (moved to ``outdir``) if Pilon ran successfully. If mapping or indexing failed,
        None is returned
    pilon_folder : str
        Path to Pilon working directory
    new_bam : bool
        True if a new reads alignment was performed
    alignment_file : str or None
        Path to the BAM file used to correct the assembly; None if none was given and the new alignment could not
        be produced. Unless ``keep_bam`` is set, that file is deleted before returning, so the path may no longer
        exist

    Notes
    -----
    NOTE(review): earlier documentation also listed a ``time_taken`` return value; the body here does not return
    one - presumably a wrapper adds it, to be confirmed.
    """

    failing = {'sample': False}

    pilon_folder = os.path.join(outdir, 'pilon', '')
    utils.removeDirectory(pilon_folder)
    os.mkdir(pilon_folder)

    # Create a symbolic link to the assembly
    assembly_link = os.path.join(pilon_folder, os.path.basename(assembly))
    os.symlink(assembly, assembly_link)

    run_successfully = True

    new_bam = False
    if alignment_file is None:
        # Index assembly using Bowtie2
        run_successfully = indexSequenceBowtie2(assembly_link, threads)

        if run_successfully:
            run_successfully, sam_file = mapping_bowtie2(fastq_files=fastq_files, reference_file=assembly_link,
                                                         outdir=pilon_folder, keep_bam=keep_bam, threads=threads)

            if run_successfully:
                alignment_file = os.path.splitext(sam_file)[0] + '.bam'
                run_successfully, alignment_file = sortAlignment(sam_file, alignment_file, False, threads)

                if run_successfully:
                    os.remove(sam_file)
                    run_successfully = indexAlignment(alignment_file)
                    new_bam = True
                else:
                    alignment_file = None

    assembly_polished = None

    if run_successfully:
        run_successfully, assembly_polished = pilon(jar_path_pilon, assembly_link, alignment_file, pilon_folder,
                                                    jar_max_memory)

        if run_successfully:
            parsePilonResult(assembly_polished, outdir)
            # Move the polished assembly out of the working folder
            polished_in_outdir = os.path.join(outdir, os.path.basename(assembly_polished))
            os.rename(assembly_polished, polished_in_outdir)
            assembly_polished = polished_in_outdir
            if keep_bam and new_bam:
                # Only a BAM produced here is moved; one handed in by the caller stays where it is
                kept_bam = os.path.join(outdir, '{}.bam'.format(os.path.basename(assembly)))
                os.rename(alignment_file, kept_bam)
                alignment_file = kept_bam

    # Unless asked to keep it, the BAM is dropped whether or not it was created here
    if alignment_file is not None and os.path.isfile(str(alignment_file)) and not keep_bam:
        os.remove(alignment_file)

    if not run_successfully:
        failing['sample'] = 'Did not run'
        # Single-argument print(...) behaves the same on Python 2 and 3
        print(failing['sample'])

    return run_successfully, None, failing, assembly_polished, pilon_folder, new_bam, alignment_file
Exemple #36
0
def _write_rematch_report(report_path, sample_data, minimum_gene_coverage, minimum_gene_identity):
    """
    Write the ReMatCh module report and compute the sample-level summary

    One tab-separated line is written per gene, followed by a '#general' section with the summary values.

    Parameters
    ----------
    report_path : str
        Path to the report file to be written (overwritten if it exists)
    sample_data : dict
        Per-gene results indexed by consecutive integers starting at 1. Each entry must provide the keys 'header',
        'gene_coverage', 'gene_mean_read_coverage', 'gene_low_coverage', 'gene_number_positions_multiple_alleles'
        and 'gene_identity'
    minimum_gene_coverage : int or float
        Genes with 'gene_coverage' below this value are counted as absent
    minimum_gene_identity : int or float
        Genes with 'gene_identity' below this value are counted as absent

    Returns
    -------
    number_absent_genes : int
        Number of genes failing the coverage or identity thresholds
    number_genes_multiple_alleles : int
        Number of present genes with at least one position with multiple alleles
    mean_sample_coverage : float or int
        Mean of 'gene_mean_read_coverage' over the present genes (0 when no gene is present); not rounded
    """
    number_absent_genes = 0
    number_genes_multiple_alleles = 0
    mean_sample_coverage = 0

    column_names = [
        '#gene', 'percentage_gene_coverage', 'gene_mean_read_coverage', 'percentage_gene_low_coverage',
        'number_positions_multiple_alleles', 'percentage_gene_identity'
    ]

    with open(report_path, 'wt') as writer:
        writer.write('\t'.join(column_names) + '\n')

        # sample_data is keyed 1..N, so iterate the keys in that order
        for i in range(1, len(sample_data) + 1):
            gene = sample_data[i]
            writer.write('\t'.join([
                gene['header'],
                str(round(gene['gene_coverage'], 2)),
                str(round(gene['gene_mean_read_coverage'], 2)),
                str(round(gene['gene_low_coverage'], 2)),
                str(gene['gene_number_positions_multiple_alleles']),
                str(round(gene['gene_identity'], 2))
            ]) + '\n')

            if gene['gene_coverage'] < minimum_gene_coverage or gene['gene_identity'] < minimum_gene_identity:
                number_absent_genes += 1
            else:
                # Only present genes contribute to the sample mean coverage
                mean_sample_coverage += gene['gene_mean_read_coverage']
                if gene['gene_number_positions_multiple_alleles'] > 0:
                    number_genes_multiple_alleles += 1

        number_present_genes = len(sample_data) - number_absent_genes
        if number_present_genes > 0:
            mean_sample_coverage = float(mean_sample_coverage) / float(number_present_genes)
        else:
            mean_sample_coverage = 0

        writer.write('\n'.join([
            '#general',
            '>number_absent_genes', str(number_absent_genes),
            '>number_genes_multiple_alleles', str(number_genes_multiple_alleles),
            '>mean_sample_coverage', str(round(mean_sample_coverage, 2))
        ]) + '\n')

    return number_absent_genes, number_genes_multiple_alleles, mean_sample_coverage


def runRematchModule(sample, fastq_files, reference_file, threads, outdir,
                     length_extra_seq, minimum_depth_presence,
                     minimum_depth_call,
                     minimum_depth_frequency_dominant_allele,
                     minimum_gene_coverage, conserved_True, debug_mode_true,
                     numMapLoc, minimum_gene_identity):
    """
    Map reads against a reference, analyse the alignment and write the ReMatCh module report

    Work is done inside <outdir>/rematch_module/, which is recreated at the start and removed at the end unless
    debug_mode_true is set. The report is written to <outdir>/rematchModule_report.txt.

    Parameters
    ----------
    sample : str
        Sample name, forwarded to sequence_data
    fastq_files : list
        Fastq files, forwarded to mapping_reads
    reference_file : str
        Reference sequences file. The path returned by mapping_reads replaces it for the following steps
    threads : int
        Number of threads, forwarded to mapping_reads and sequence_data
    outdir : str
        Path to the output directory
    length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele
        Forwarded unchanged to sequence_data
    minimum_gene_coverage : int or float
        Genes with 'gene_coverage' below this value are counted as absent
    conserved_True : bool
        Forwarded to mapping_reads
    debug_mode_true : bool
        Forwarded to sequence_data. When true, the working directory is kept
    numMapLoc : int
        Forwarded to mapping_reads
    minimum_gene_identity : int or float
        Genes with 'gene_identity' below this value are counted as absent

    Returns
    -------
    run_successfully : bool
        Success state of the last step that was executed
    sample_data : dict or None
        Per-gene results as returned by sequence_data. None if sequence_data was never called
    general_results : dict or None
        Keys 'number_absent_genes', 'number_genes_multiple_alleles' and 'mean_sample_coverage' (rounded to two
        decimals). None if the report was not produced
    consensus_files : object or None
        Value returned by sequence_data. None if sequence_data was never called
    """
    rematch_folder = os.path.join(outdir, 'rematch_module', '')
    utils.removeDirectory(rematch_folder)
    os.mkdir(rematch_folder)

    # Explicit defaults for the values that are only produced when the earlier steps succeed
    sample_data = None
    consensus_files = None
    general_results = None

    # Map reads
    run_successfully, bam_file, reference_file = mapping_reads(
        fastq_files, reference_file, threads, rematch_folder, conserved_True,
        numMapLoc)

    if run_successfully:
        # Index reference file
        run_successfully, stdout = index_fasta_samtools(
            reference_file, None, None, True)

    if run_successfully:
        # Single-argument print(...) behaves the same under Python 2 and Python 3
        print('Analysing alignment data')
        run_successfully, sample_data, consensus_files = sequence_data(
            sample, reference_file, bam_file, rematch_folder, threads,
            length_extra_seq, minimum_depth_presence, minimum_depth_call,
            minimum_depth_frequency_dominant_allele, debug_mode_true)

        if run_successfully:
            print('Writing report file')
            number_absent_genes, number_genes_multiple_alleles, mean_sample_coverage = _write_rematch_report(
                os.path.join(outdir, 'rematchModule_report.txt'), sample_data,
                minimum_gene_coverage, minimum_gene_identity)

            print('\n'.join([
                'number_absent_genes: ' + str(number_absent_genes),
                'number_genes_multiple_alleles: ' + str(number_genes_multiple_alleles),
                'mean_sample_coverage: ' + str(round(mean_sample_coverage, 2))
            ]))

            general_results = {
                'number_absent_genes': number_absent_genes,
                'number_genes_multiple_alleles': number_genes_multiple_alleles,
                'mean_sample_coverage': round(mean_sample_coverage, 2)
            }

    if not debug_mode_true:
        utils.removeDirectory(rematch_folder)

    return run_successfully, sample_data, general_results, consensus_files
Exemple #37
0
def run_spades(sample_name, outdir, threads, fastq_files, not_use_careful, max_memory, min_coverage_assembly,
               min_contigs_length, estimated_genome_size_mb, kmers, maximum_reads_length, default_kmers,
               min_coverage_contigs, assembled_se_reads, save_excluded_contigs, max_number_contigs,
               keep_scaffolds=False):
    """
    Assemble reads with SPAdes and write the (optionally filtered) contigs to outdir

    SPAdes runs inside <outdir>/spades/, which is recreated at the start and always removed before returning.

    Parameters
    ----------
    sample_name : str
        Sample name, used to name the contigs files written to outdir
    outdir : str
        Path to the output directory
    threads, fastq_files, not_use_careful, max_memory, min_coverage_assembly, assembled_se_reads
        Forwarded unchanged to spades
    min_contigs_length
        Forwarded to define_minContigsLength together with maximum_reads_length
    estimated_genome_size_mb, min_coverage_contigs, max_number_contigs
        Forwarded to decide_filter_parameters
    kmers
        Forwarded to define_kmers together with maximum_reads_length. Ignored when default_kmers is true
    maximum_reads_length
        Forwarded to define_kmers and define_minContigsLength
    default_kmers : bool
        When true, an empty k-mers list is given to spades
    save_excluded_contigs : bool
        Forwarded to write_filtered_sequences_and_stats when a filter is applied
    keep_scaffolds : bool, default False
        When true, scaffolds.fasta (if found) is copied to outdir as SPAdes_original_assembly.scaffolds.fasta

    Returns
    -------
    run_successfully : bool
        False if spades failed or no contigs file was found
    pass_qc : bool
        False whenever run_successfully is false, True otherwise
    failing : dict
        {'sample': False} on success, otherwise 'sample' holds the failing reason
    contigs : str
        Path to the contigs file written by write_filtered_sequences_and_stats. If the processing did not get that
        far, the value returned by spades
    warnings : dict
        Warnings returned by decide_filter_parameters (empty if that step was not reached)
    """
    failing = {'sample': False}
    warnings = {}

    # Start from a clean SPAdes working directory
    spades_folder = os.path.join(outdir, 'spades', '')
    utils.removeDirectory(spades_folder)
    os.mkdir(spades_folder)

    # Settle which k-mers are handed to SPAdes
    if default_kmers:
        kmers = []
    else:
        kmers = define_kmers(kmers, maximum_reads_length)
        if len(kmers) > 0:
            print('SPAdes will use the following k-mers: ' + str(kmers))
        else:
            print('SPAdes will use its default k-mers')

    run_successfully, contigs = spades(spades_folder, threads, fastq_files, not_use_careful, max_memory,
                                       min_coverage_assembly, kmers, assembled_se_reads)

    if not run_successfully:
        failing['sample'] = 'Did not run'
    else:
        if keep_scaffolds:
            scaffolds = os.path.join(spades_folder, 'scaffolds.fasta')
            if os.path.isfile(scaffolds):
                shutil.copyfile(scaffolds, os.path.join(outdir, 'SPAdes_original_assembly.scaffolds.fasta'))
            else:
                print('The scaffolds file was not found!')

        if not os.path.isfile(contigs):
            run_successfully = False
            failing['sample'] = 'Assembly was not produced'
        else:
            # Keep an untouched copy of the SPAdes contigs
            shutil.copyfile(contigs, os.path.join(outdir, 'SPAdes_original_assembly.contigs.fasta'))

            # Work through a temporary symlink named after the sample, so output names derive from it
            contigs_link = os.path.join(outdir, str(sample_name + '.contigs.fasta'))
            os.symlink(contigs, contigs_link)
            contigs = contigs_link

            min_contigs_length = define_minContigsLength(maximum_reads_length, min_contigs_length)
            sequence_dict = get_SPAdes_sequence_information(contigs)

            warnings, sequence_dict, filtered_sequences_sufix, spades_report_general = \
                decide_filter_parameters(sequence_dict, min_contigs_length, min_coverage_contigs,
                                         estimated_genome_size_mb, max_number_contigs)

            # Without a filter suffix the sequences are written to a '.original.fasta' file and
            # excluded contigs are never saved
            unfiltered = filtered_sequences_sufix is None
            name_tag = 'original' if unfiltered else filtered_sequences_sufix
            filtered_sequence_file = os.path.splitext(contigs)[0] + '.' + name_tag + '.fasta'
            write_filtered_sequences_and_stats(sequence_dict, spades_report_general, contigs,
                                               filtered_sequence_file, sample_name, unfiltered,
                                               False if unfiltered else save_excluded_contigs)
            contigs = filtered_sequence_file

            os.remove(contigs_link)

    pass_qc = True
    if not run_successfully:
        print(failing['sample'])
        pass_qc = False

    utils.removeDirectory(spades_folder)

    return run_successfully, pass_qc, failing, contigs, warnings
Exemple #38
0
def run_download(ena_id, download_paired_type, aspera_key, outdir, download_cram_bam_true, threads, instrument_platform,
                 sra, sra_opt):
    """
    Download the sequencing data of a run and move the resulting fastq files to outdir

    Files are downloaded into <outdir>/download/, which is recreated at the start and always removed before
    returning. When no read run information is found for ena_id, a download is only attempted if sra or sra_opt
    is set; in that case the library layout is unknown, so 'paired' is tried first and 'single' afterwards.

    Parameters
    ----------
    ena_id : str
        Run identifier, used to get the read run information and to name files in the SRA fallback
    download_paired_type : str
        'both', or the library layout the run must have (compared case-insensitively) for it to be downloaded
    aspera_key, download_cram_bam_true, sra, sra_opt
        Forwarded unchanged to download_files. sra and sra_opt also enable the fallback described above
    outdir : str
        Path to the output directory
    threads : int
        Number of threads, forwarded to get_fastq_files
    instrument_platform : str
        'all', or the instrument platform the run must have (compared case-insensitively) for it to be downloaded

    Returns
    -------
    run_successfully : bool
        Success state of the last step that was executed. False if nothing was downloaded
    downloaded_files : object or None
        Value returned by the last get_fastq_files/rename_move_files call. None if no files were obtained
    sequencing_information : dict
        Result of get_sequencing_information when the read run information is available. Otherwise a dictionary
        with every field set to None except 'date_download' (current date, YYYY-MM-DD)
    """
    download_dir = os.path.join(outdir, 'download', '')
    utils.removeDirectory(download_dir)
    os.mkdir(download_dir)

    run_successfully = False
    downloaded_files = None
    sequencing_information = {'run_accession': None, 'instrument_platform': None, 'instrument_model': None,
                              'library_layout': None, 'library_source': None, 'extra_run_accession': None,
                              'nominal_length': None, 'read_count': None, 'base_count': None,
                              'date_download': time.strftime("%Y-%m-%d")}

    read_run_info = get_read_run_info(ena_id)
    if read_run_info is not None:
        download_information = get_download_information(read_run_info)
        download_information = check_correct_links(download_information)
        sequencing_information = get_sequencing_information(read_run_info)

        # Only download runs matching the requested instrument platform and library layout
        if instrument_platform.lower() == 'all' or \
                (sequencing_information['instrument_platform'] is not None and
                 sequencing_information['instrument_platform'].lower() == instrument_platform.lower()):
            if download_paired_type.lower() == 'both' or \
                    (sequencing_information['library_layout'] is not None and
                     sequencing_information['library_layout'].lower() == download_paired_type.lower()):
                run_successfully, cram_index_run_successfully, download_sra = download_files(download_information,
                                                                                             aspera_key, download_dir,
                                                                                             download_cram_bam_true,
                                                                                             sra, sra_opt, ena_id)
                if download_sra:
                    run_successfully = sra_2_fastq(download_dir, ena_id)
                if run_successfully:
                    run_successfully, downloaded_files = get_fastq_files(download_dir, cram_index_run_successfully,
                                                                         threads,
                                                                         sequencing_information['library_layout'])
                if run_successfully and downloaded_files is not None:
                    run_successfully, downloaded_files = rename_move_files(downloaded_files,
                                                                           sequencing_information['run_accession'],
                                                                           outdir,
                                                                           sequencing_information['library_layout'])
    else:
        if sra or sra_opt:
            # No run information: call download_files without any known links
            run_successfully, cram_index_run_successfully, download_sra = download_files({'fastq': None,
                                                                                          'submitted': None,
                                                                                          'cram_index': None},
                                                                                         aspera_key, download_dir,
                                                                                         download_cram_bam_true, sra,
                                                                                         sra_opt, ena_id)
            if download_sra:
                run_successfully = sra_2_fastq(download_dir, ena_id)
            if run_successfully:
                # Library layout is unknown here: try paired-end first, then single-end
                run_successfully, downloaded_files = get_fastq_files(download_dir, cram_index_run_successfully, threads,
                                                                     'paired')
                if not run_successfully:
                    run_successfully, downloaded_files = get_fastq_files(download_dir, cram_index_run_successfully,
                                                                         threads, 'single')
            if run_successfully and downloaded_files is not None:
                # Keep the list found by get_fastq_files: the 'paired' attempt rebinds downloaded_files, so the
                # 'single' retry would otherwise receive the failed attempt's result instead of the files to
                # rename (presumably None on failure - confirm against rename_move_files)
                files_to_rename = downloaded_files
                run_successfully, downloaded_files = rename_move_files(files_to_rename, ena_id, outdir, 'paired')
                if not run_successfully:
                    run_successfully, downloaded_files = rename_move_files(files_to_rename, ena_id, outdir, 'single')

    utils.removeDirectory(download_dir)

    return run_successfully, downloaded_files, sequencing_information
Exemple #39
0
def run_pilon(jar_path_pilon, assembly, fastq_files, outdir, jar_max_memory, alignment_file, keep_bam=False, threads=1):
    """
    Polish an assembly with Pilon, mapping the reads first when no BAM file is given

    Parameters
    ----------
    jar_path_pilon : str
        Path to the Pilon jar file that will be executed
    assembly : str
        Path to the assembly to correct
    fastq_files : list
        List of fastq files
    outdir : str
        Path to the output directory
    jar_max_memory : int or 'off'
        If not 'off' is provided, sets the maximum RAM Gb usage by jar files
    alignment_file : str or None
        Path to the BAM file to be used. If None is provided, the reads are mapped against the assembly
    keep_bam : bool, default False
        True if want to keep the BAM file. When False, the BAM file used is removed before returning
    threads : int, default 1
        Number of threads to be used

    Returns
    -------
    run_successfully : bool
        Boolean stating if the Pilon module ran successfully or not
    pass_qc : None
        QA/QC not performed
    failing : dict
        {'sample': False} if the sample did not fail, otherwise {'sample': 'Did not run'}
    assembly_polished : str or None
        Path to the polished assembly (moved to outdir on success), or None if Pilon was not run
    pilon_folder : str
        Path to Pilon working directory
    new_bam : bool
        True if a new alignment was produced by this function
    alignment_file : str or None
        Path to the BAM file used to correct the assembly. None if the alignment could not be sorted or was never
        produced

    Notes
    -----
    NOTE(review): earlier documentation listed a time_taken value among the returns; this body does not return
    it - presumably it is added by a timing wrapper, confirm against the caller.
    """

    failing = {'sample': False}

    # Fresh Pilon working directory
    pilon_folder = os.path.join(outdir, 'pilon', '')
    utils.removeDirectory(pilon_folder)
    os.mkdir(pilon_folder)

    # Work on a symbolic link to the assembly inside the Pilon folder
    assembly_name = os.path.basename(assembly)
    assembly_link = os.path.join(pilon_folder, assembly_name)
    os.symlink(assembly, assembly_link)

    run_successfully = True
    new_bam = False

    if alignment_file is None:
        # No BAM provided: index the assembly, map the reads, then sort and index the alignment.
        # Each step only runs if the previous one succeeded.
        run_successfully = indexSequenceBowtie2(assembly_link, threads)

        if run_successfully:
            run_successfully, sam_file = mapping_bowtie2(fastq_files=fastq_files, reference_file=assembly_link,
                                                         outdir=pilon_folder, keep_bam=keep_bam, threads=threads)

        if run_successfully:
            bam_path = os.path.splitext(sam_file)[0] + '.bam'
            run_successfully, alignment_file = sortAlignment(sam_file, bam_path, False, threads)

            if not run_successfully:
                alignment_file = None
            else:
                os.remove(sam_file)
                run_successfully = indexAlignment(alignment_file)
                new_bam = True

    assembly_polished = None

    if run_successfully:
        run_successfully, assembly_polished = pilon(jar_path_pilon, assembly_link, alignment_file, pilon_folder,
                                                    jar_max_memory)

        if run_successfully:
            parsePilonResult(assembly_polished, outdir)

            # Move the polished assembly out of the working directory
            polished_in_outdir = os.path.join(outdir, os.path.basename(assembly_polished))
            os.rename(assembly_polished, polished_in_outdir)
            assembly_polished = polished_in_outdir
            write_assembly_statistics(assembly=assembly_polished, outdir=outdir)

            if keep_bam and new_bam:
                # A BAM produced here is kept next to the results, named after the assembly
                kept_bam = os.path.join(outdir, '{}.bam'.format(assembly_name))
                os.rename(alignment_file, kept_bam)
                alignment_file = kept_bam

    if not keep_bam and alignment_file is not None and os.path.isfile(str(alignment_file)):
        os.remove(alignment_file)

    if not run_successfully:
        failing['sample'] = 'Did not run'
        print(failing['sample'])

    return run_successfully, None, failing, assembly_polished, pilon_folder, new_bam, alignment_file