def getEstimatedCoverage(fastq_files, estimatedGenomeSizeMb, outdir, threads, estimatedMinimumCoverage):
    """Estimate the depth of coverage of a sample and compare it with a minimum.

    One ``countSequencedBases`` job per FASTQ file is submitted to a process
    pool.  Afterwards every ``estimate_coverage.*.pkl`` file found directly in
    ``outdir`` is read and deleted (presumably these are written by the jobs;
    confirm against ``countSequencedBases``).  The base counts are summed and
    divided by the genome size to obtain the coverage, which is appended to
    ``coverage_report.txt`` inside ``outdir``.

    Parameters
    ----------
    fastq_files : iterable
        FASTQ files handed one by one to ``countSequencedBases``.
    estimatedGenomeSizeMb : number
        Genome size; it is multiplied by 1000000 to obtain the number of bases.
    outdir : str
        Directory scanned for the pickles and used for the report file.
    threads : int
        Number of processes of the pool.
    estimatedMinimumCoverage : number
        Threshold the rounded coverage must reach for ``pass_qc`` to be True.

    Returns
    -------
    tuple
        ``(run_successfully, pass_qc, failing, estimatedCoverage)``.
        ``failing['sample']`` is False on success, otherwise a message.
        ``estimatedCoverage`` is None when the estimate could not be computed.
    """
    run_successfully = False
    pass_qc = False
    failing = {'sample': False}
    estimatedCoverage = None

    # Get number bases for each fastq file
    pool = multiprocessing.Pool(processes=threads)
    for fastq in fastq_files:
        pool.apply_async(countSequencedBases, args=(fastq, outdir,))
    pool.close()
    pool.join()

    numberBases = 0
    file_problems = False
    files = [f for f in os.listdir(outdir)
             if not f.startswith('.') and os.path.isfile(os.path.join(outdir, f))]
    for file_found in files:
        if file_found.startswith('estimate_coverage.') and file_found.endswith('.pkl'):
            file_path = os.path.join(outdir, file_found)

            # After the first failed pickle the remaining ones are no longer
            # read, but they are still cleaned up below.
            if not file_problems:
                run_successfully, bases = utils.extractVariableFromPickle(file_path)
                if run_successfully:
                    numberBases += bases
                else:
                    file_problems = True

            os.remove(file_path)

    if not run_successfully:
        # Either no pickle was found or one of them reported a failure.
        failing['sample'] = 'Did not run'
        print(failing['sample'])
        return run_successfully, pass_qc, failing, estimatedCoverage

    estimatedCoverage = numberBases / float(estimatedGenomeSizeMb * 1000000)
    estimatedCoverage = round(estimatedCoverage, 1)

    # Append mode creates the file when it is missing, so a single open() call
    # covers both the "new report" and the "existing report" situations, and
    # the context manager closes the handle even if the write fails.
    report_file = os.path.join(outdir, 'coverage_report.txt')
    with open(report_file, 'at') as writer:
        writer.write(str(estimatedCoverage) + 'x' + '\n')

    report = 'Estimated depth coverage: ' + str(estimatedCoverage) + 'x'
    if estimatedCoverage >= estimatedMinimumCoverage:
        pass_qc = True
        print(report)
    else:
        failing['sample'] = report + ' (lower than ' + str(estimatedMinimumCoverage) + 'x)'
        print(failing['sample'])

    return run_successfully, pass_qc, failing, estimatedCoverage
def sample_coverage(referenceFile, alignment_file, outdir, threads):
    """Collect per-sequence coverage of an alignment using a process pool.

    A fresh ``samtools_depth`` directory is created inside ``outdir``.  One
    ``get_sequence_coverage`` job is submitted for every item returned by
    ``sequenceHeaders(referenceFile)``; each job receives the alignment file,
    the sequence, the working directory and a running index.  Once the pool
    has finished, the ``coverage.sequence_*.pkl`` files found in the working
    directory are read and removed.

    Returns
    -------
    tuple
        ``(sample_coverage_no_problems, mean_coverage_data)`` where
        ``mean_coverage_data`` maps each sequence to a dict with the keys
        ``'position'``, ``'coverage'`` and ``'mean_coverage'``
        (``coverage / position`` rounded to two decimals).
    """
    depth_dir = os.path.join(outdir, 'samtools_depth', '')
    utils.removeDirectory(depth_dir)
    os.makedirs(depth_dir)

    headers = sequenceHeaders(referenceFile)

    workers = multiprocessing.Pool(processes=threads)
    for index, header in enumerate(headers):
        workers.apply_async(get_sequence_coverage, args=(alignment_file, header, depth_dir, index,))
    workers.close()
    workers.join()

    all_ok = True
    coverage_by_sequence = {}

    # Snapshot the visible regular files before anything gets deleted.
    entries = []
    for name in os.listdir(depth_dir):
        if name.startswith('.'):
            continue
        if os.path.isfile(os.path.join(depth_dir, name)):
            entries.append(name)

    for name in entries:
        if not (name.startswith('coverage.sequence_') and name.endswith('.pkl')):
            continue

        pickle_path = os.path.join(depth_dir, name)

        # Stop reading after the first problem; later pickles are only deleted.
        if all_ok:
            sequence_name, job_ok, job_problems, position, coverage = \
                utils.extractVariableFromPickle(pickle_path)
            if job_ok and not job_problems:
                mean_value = round((float(coverage) / float(position)), 2)
                coverage_by_sequence[sequence_name] = {
                    'position': position,
                    'coverage': coverage,
                    'mean_coverage': mean_value
                }
            else:
                print('WARNING: it was not possible to compute coverage information for sequence ' +
                      sequence_name)
                all_ok = False

        os.remove(pickle_path)

    return all_ok, coverage_by_sequence
def runFastQintegrity(fastq_files, threads, outdir):
    """Check FASTQ files for corruption and summarise their quality encoding.

    A ``fastQintegrity`` job is submitted per FASTQ file to a process pool,
    working inside a freshly created ``fastQintegrity`` folder under
    ``outdir``.  Every ``.pkl`` file found there afterwards is unpacked into
    ``(run_successfully, encoding, min_reads_length, max_reads_length)``.
    Files whose job failed are reported as possibly corrupt.  The working
    folder is removed before returning.

    Returns
    -------
    tuple
        ``(not_corruption_found, pass_qc, failing, encoding,
        min_reads_length, max_reads_length)``.  ``encoding`` is the single
        encoding shared by all files, or None when it could not be determined.
    """
    work_dir = os.path.join(outdir, 'fastQintegrity', '')
    utils.removeDirectory(work_dir)
    os.mkdir(work_dir)

    workers = multiprocessing.Pool(processes=threads)
    for fastq in fastq_files:
        workers.apply_async(fastQintegrity, args=(fastq, work_dir,))
    workers.close()
    workers.join()

    failing = {'sample': False}
    encoding = {}

    entries = [name for name in os.listdir(work_dir)
               if not name.startswith('.') and os.path.isfile(os.path.join(work_dir, name))]
    for name in entries:
        entry_path = os.path.join(work_dir, name)
        if name.endswith('.pkl'):
            job_ok, file_encoding, shortest, longest = utils.extractVariableFromPickle(entry_path)
            if job_ok:
                encoding[name] = {'file_encoding': file_encoding,
                                  'min_reads_length': shortest,
                                  'max_reads_length': longest}
            else:
                label = os.path.splitext(name)[0]
                failing[label] = ['The file is possibly corrupt']
                print(label + ': the file is possibly corrupt')
        os.remove(entry_path)

    # Any key besides the 'sample' placeholder means a file was flagged.
    corruption_found = len(failing) > 1
    if corruption_found:
        failing.pop('sample')
    not_corruption_found = not corruption_found
    pass_qc = not corruption_found

    min_reads_length, max_reads_length = None, None

    if len(encoding) == 0:
        encoding = None
        print('It was no possible to determine the FASTQ encodings')
    else:
        min_reads_length, max_reads_length = guess_encoding.determine_min_max_reads_length(encoding)

        first_choices = [info['file_encoding'][0] for info in encoding.values()
                         if info['file_encoding'] is not None]
        if len(set(first_choices)) == 1:
            encoding = first_choices[0]
            print('Fastq quality encoding: ' + str(encoding))
        else:
            print('It was no possible to determine the FASTQ encodings')
            print('This was what has been found: ' + str(encoding))
            encoding = None

    utils.removeDirectory(work_dir)

    return not_corruption_found, pass_qc, failing, encoding, min_reads_length, max_reads_length
def gather_data_together(data_directory):
    """Merge the ``encoding.*.pkl`` files of a directory into one dictionary.

    Each matching pickle is unpacked into ``(fastq, valid_encodings,
    min_reads_length, max_reads_length)`` and deleted once read.  Hidden
    entries, sub-directories and files with other names are left untouched.

    Returns
    -------
    dict
        Maps every ``fastq`` to a dict with the keys ``'valid_encodings'``,
        ``'min_reads_length'`` and ``'max_reads_length'``.
    """
    gathered = {}

    # Snapshot the visible regular files before any of them is deleted.
    visible_files = []
    for name in os.listdir(data_directory):
        if name.startswith('.'):
            continue
        if os.path.isfile(os.path.join(data_directory, name)):
            visible_files.append(name)

    for name in visible_files:
        if not name.startswith('encoding.'):
            continue
        if not name.endswith('.pkl'):
            continue

        pickle_path = os.path.join(data_directory, name)
        fastq, valid_encodings, shortest, longest = utils.extractVariableFromPickle(pickle_path)
        gathered[fastq] = {
            'valid_encodings': valid_encodings,
            'min_reads_length': shortest,
            'max_reads_length': longest,
        }
        os.remove(pickle_path)

    return gathered
def getPickleRunSuccessfully(directory, pickle_prefix):
    """Tell whether all the status pickles of a directory report success.

    The pickles are located with ``findFiles(directory, pickle_prefix,
    '.pkl')``.  They are read one by one until one of them holds a false
    value; every pickle returned by ``findFiles`` is deleted, read or not.

    Returns
    -------
    The value of the last pickle read, or False when no pickle was read at
    all (``findFiles`` returned None or nothing to iterate over).
    """
    pickles = findFiles(directory, pickle_prefix, '.pkl')
    if pickles is None:
        return False

    outcome = True
    something_read = False
    for pickle_path in pickles:
        # Stop reading after the first failure, but keep cleaning up.
        if outcome:
            outcome = utils.extractVariableFromPickle(pickle_path)
            something_read = True
        os.remove(pickle_path)

    if something_read:
        return outcome
    return False
def get_compressed_decompressed_reads(outdir):
    """Collect the decompressed read files reported by the pickles in ``outdir``.

    Every visible ``.pkl`` file found directly in ``outdir`` is unpacked into
    ``(run_successfully, decompressed_file, length_sequence)`` and then
    deleted.  Reading stops after the first pickle reporting a failure; the
    remaining pickles are only removed.

    Returns
    -------
    tuple
        ``(run_successfully, decompressed_files)`` where the list holds the
        decompressed file of every successfully read pickle.
    """
    all_ok = True
    decompressed_files = []

    visible_files = [name for name in os.listdir(outdir)
                     if not name.startswith('.') and os.path.isfile(os.path.join(outdir, name))]
    for name in visible_files:
        if not name.endswith('.pkl'):
            continue

        pickle_path = os.path.join(outdir, name)
        if all_ok:
            # length_sequence is unpacked but not part of the returned value.
            all_ok, decompressed_file, length_sequence = utils.extractVariableFromPickle(pickle_path)
            if all_ok:
                decompressed_files.append(decompressed_file)
        os.remove(pickle_path)

    return all_ok, decompressed_files
def runFastQintegrity(fastq_files, threads, outdir):
    """Check FASTQ files for corruption using a process pool.

    A ``fastQintegrity`` job is submitted per FASTQ file, working inside a
    freshly created ``fastQintegrity`` folder under ``outdir``.  Each ``.pkl``
    file found there afterwards holds the success flag of one job; a false
    flag marks the corresponding file as possibly corrupt.  The working folder
    is removed before returning.

    Returns
    -------
    tuple
        ``(not_corruption_found, None, failing)``.  The middle None is kept
        for consistency with other steps.
    """
    work_dir = os.path.join(outdir, 'fastQintegrity', '')
    utils.removeDirectory(work_dir)
    os.mkdir(work_dir)

    workers = multiprocessing.Pool(processes=threads)
    for fastq in fastq_files:
        workers.apply_async(fastQintegrity, args=(fastq, work_dir,))
    workers.close()
    workers.join()

    failing = {'sample': False}

    entries = [name for name in os.listdir(work_dir)
               if not name.startswith('.') and os.path.isfile(os.path.join(work_dir, name))]
    for name in entries:
        entry_path = os.path.join(work_dir, name)
        if name.endswith('.pkl'):
            job_ok = utils.extractVariableFromPickle(entry_path)
            if not job_ok:
                label = os.path.splitext(name)[0]
                failing[label] = ['The file is possibly corrupt']
                print(label + ': the file is possibly corrupt')
        os.remove(entry_path)

    # Any key besides the 'sample' placeholder means a file was flagged.
    not_corruption_found = True
    if len(failing) > 1:
        failing.pop('sample')
        not_corruption_found = False

    utils.removeDirectory(work_dir)

    return not_corruption_found, None, failing  # None added for consistency with other steps
def sample_coverage(referenceFile, alignment_file, outdir, threads):
    """Compute the mean coverage of every reference sequence in parallel.

    The ``samtools_depth`` directory inside ``outdir`` is recreated, one
    ``get_sequence_coverage`` job per item of ``sequenceHeaders(referenceFile)``
    is run through a process pool, and the resulting
    ``coverage.sequence_*.pkl`` files are read and deleted.

    Returns
    -------
    tuple
        ``(sample_coverage_no_problems, mean_coverage_data)``.  The dict maps
        a sequence to its ``'position'``, ``'coverage'`` and
        ``'mean_coverage'`` (``coverage / position`` rounded to 2 decimals).
        The flag turns False as soon as one pickle reports a failure or a
        problem; pickles after that one are deleted without being read.
    """
    coverage_dir = os.path.join(outdir, 'samtools_depth', '')
    utils.removeDirectory(coverage_dir)
    os.makedirs(coverage_dir)

    reference_sequences = sequenceHeaders(referenceFile)

    pool = multiprocessing.Pool(processes=threads)
    job_number = 0
    for reference_sequence in reference_sequences:
        job_arguments = (alignment_file, reference_sequence, coverage_dir, job_number,)
        pool.apply_async(get_sequence_coverage, args=job_arguments)
        job_number += 1
    pool.close()
    pool.join()

    no_problems = True
    results = {}

    listing = [item for item in os.listdir(coverage_dir)
               if not item.startswith('.') and os.path.isfile(os.path.join(coverage_dir, item))]
    pickles = [item for item in listing
               if item.startswith('coverage.sequence_') and item.endswith('.pkl')]

    for item in pickles:
        pickle_path = os.path.join(coverage_dir, item)

        if no_problems:
            sequence_id, succeeded, had_problems, position, coverage = \
                utils.extractVariableFromPickle(pickle_path)
            if not succeeded or had_problems:
                print('WARNING: it was not possible to compute coverage information for sequence ' +
                      sequence_id)
                no_problems = False
            else:
                results[sequence_id] = {
                    'position': position,
                    'coverage': coverage,
                    'mean_coverage': round((float(coverage) / float(position)), 2)
                }

        os.remove(pickle_path)

    return no_problems, results
def gather_data_together(sample, data_directory, sequences_information, outdir, debug_mode_true):
    """Gather the per-gene coverage pickles of a sample and write its consensus.

    Every visible sub-directory of ``data_directory`` is scanned for
    ``coverage_info.*.pkl`` files.  Each pickle is unpacked into
    ``(run_successfully, sequence_counter, multiple_alleles_found,
    percentage_absent, percentage_lowCoverage, meanCoverage,
    consensus_sequence, number_diferences)``.  Before the first consensus is
    written, stale ``<sample>.<type>.fasta`` files in ``outdir`` are removed;
    then ``write_consensus`` is called for every pickle read.

    Parameters
    ----------
    sample : str
        Sample name, used as prefix of the consensus fasta files.
    data_directory : str
        Directory holding one sub-directory per gene.
    sequences_information : dict
        Indexed by ``sequence_counter``; each entry must provide ``'header'``
        and ``'length'``.
    outdir : str
        Directory where the consensus files live.
    debug_mode_true : bool
        When true, the gene sub-directories are kept instead of removed.

    Returns
    -------
    tuple
        ``(run_successfully, sample_data, consensus_files)``.
        ``run_successfully`` is forced to False when the number of pickles
        read differs from ``len(sequences_information)``.
    """
    run_successfully = True
    counter = 0
    sample_data = {}
    consensus_files = None
    write_consensus_first_time = True

    genes_directories = [d for d in os.listdir(data_directory)
                         if not d.startswith('.') and os.path.isdir(os.path.join(data_directory, d, ''))]
    for gene_dir in genes_directories:
        gene_dir_path = os.path.join(data_directory, gene_dir, '')

        files = [f for f in os.listdir(gene_dir_path)
                 if not f.startswith('.') and os.path.isfile(os.path.join(gene_dir_path, f))]
        for file_found in files:
            if not (file_found.startswith('coverage_info.') and file_found.endswith('.pkl')):
                continue
            # Once a pickle reported a failure the remaining ones are skipped.
            if not run_successfully:
                continue

            file_path = os.path.join(gene_dir_path, file_found)
            (run_successfully, sequence_counter, multiple_alleles_found, percentage_absent,
             percentage_lowCoverage, meanCoverage, consensus_sequence,
             number_diferences) = utils.extractVariableFromPickle(file_path)

            if write_consensus_first_time:
                # Remove consensus files possibly left from a previous run.
                for consensus_type in ('correct', 'noMatter', 'alignment'):
                    file_to_remove = os.path.join(outdir, str(sample + '.' + consensus_type + '.fasta'))
                    if os.path.isfile(file_to_remove):
                        os.remove(file_to_remove)
                write_consensus_first_time = False
            consensus_files = write_consensus(outdir, sample, consensus_sequence)

            sequence_information = sequences_information[sequence_counter]
            # differences / length is a 0-1 fraction; it has to be scaled by
            # 100 to sit on the same percentage scale as gene_coverage.
            # NOTE(review): assumes number_diferences counts positions and
            # 'length' is the sequence length in positions -- confirm.
            differences_percentage = (float(number_diferences) / sequence_information['length']) * 100
            sample_data[sequence_counter] = {
                'header': sequence_information['header'],
                'gene_coverage': 100 - percentage_absent,
                'gene_low_coverage': percentage_lowCoverage,
                'gene_number_positions_multiple_alleles': multiple_alleles_found,
                'gene_mean_read_coverage': meanCoverage,
                'gene_identity': 100 - differences_percentage
            }
            counter += 1

        if not debug_mode_true:
            utils.removeDirectory(gene_dir_path)

    # Every sequence must have produced exactly one pickle that was read.
    if counter != len(sequences_information):
        run_successfully = False

    return run_successfully, sample_data, consensus_files
def runCampyGenomes(args):
    """Run ``downloadAndINNUca`` for every Run ID, batched by thread count.

    ``args`` must provide ``listRunIDs`` and ``asperaKey`` (objects with a
    ``name`` attribute), ``outdir`` and ``threads``.  The Run IDs are
    shuffled, split into batches with ``determineBatchSamples`` and each batch
    is processed by its own process pool inside ``<outdir>/<threads>_threads``.
    Afterwards the per-sample ``*_run_successfully.pkl`` and
    ``*_downloadAndINNUca_time.pkl`` files are read, summarised into two
    ``.tab`` files in ``outdir`` and deleted.

    Exits through ``sys.exit`` when no sample ran successfully.
    """
    start_time = time.time()

    run_ids = utils.getListIDs(os.path.abspath(args.listRunIDs.name))
    outdir = os.path.abspath(args.outdir)
    utils.check_create_directory(outdir)
    aspera_key = args.asperaKey.name
    usable_threads = [j for j in general_threads_to_use if j <= args.threads]

    # Start logger
    logfile, time_str = utils.start_logger(outdir)

    # Get general information
    utils.general_information(logfile, version, outdir, time_str)

    # Check programms
    requiredPrograms()

    # Randomize the list with Run IDs
    random.shuffle(run_ids)

    processes_by_threads = determineNumberProcess(usable_threads)
    batches = determineBatchSamples(run_ids, usable_threads)

    problems_path = os.path.join(outdir, 'samples_with_problems.' + time_str + '.tab')
    times_path = os.path.join(outdir, 'running_times.' + time_str + '.tab')

    successful_samples = 0
    with open(problems_path, 'wt') as problems_writer:
        with open(times_path, 'wt') as times_writer:
            for threads in batches:
                print('\n' + 'Running for ' + str(threads) + ' threads' + '\n')
                threads_dir = os.path.join(outdir, str(threads) + '_threads', '')
                utils.check_create_directory(threads_dir)

                pool = multiprocessing.Pool(processes=processes_by_threads[threads])
                for sample in batches[threads]:
                    pool.apply_async(downloadAndINNUca, args=(threads_dir, sample, aspera_key, threads,))
                pool.close()
                pool.join()

                for leftover in ('.log', 'getSeqENA.samples_with_problems.txt', '.cpu.txt'):
                    removeFiles(threads_dir, leftover)

                sample_names = [d for d in os.listdir(threads_dir)
                                if not d.startswith('.') and os.path.isdir(os.path.join(threads_dir, d, ''))]
                for sample_name in sample_names:
                    sample_path = os.path.join(threads_dir, sample_name, '')
                    status_pickle = sample_name + '_run_successfully.pkl'
                    time_pickle = sample_name + '_downloadAndINNUca_time.pkl'

                    entries = [f for f in os.listdir(sample_path)
                               if not f.startswith('.') and os.path.isfile(os.path.join(sample_path, f))]
                    for entry in entries:
                        entry_path = os.path.join(sample_path, entry)
                        if entry == status_pickle:
                            if utils.extractVariableFromPickle(entry_path):
                                successful_samples += 1
                            else:
                                problems_writer.write(sample_name + '\t' + threads_dir + '\n')
                        elif entry == time_pickle:
                            elapsed = utils.extractVariableFromPickle(entry_path)
                            times_writer.write(sample_name + '\t' + threads_dir + '\t' + str(elapsed) + '\n')
                        else:
                            continue
                        # Both kinds of pickle are consumed once read.
                        os.remove(entry_path)

    # The returned value is not needed; the call is kept for its own effects.
    utils.runTime(start_time)

    if successful_samples == 0:
        sys.exit('No RunIDs were successfully run!')
    else:
        print(str(successful_samples) + ' samples out of ' + str(len(run_ids)) + ' run successfully')
def gather_gene_data_together(data_directory, sequences_information):
    """Gather the per-gene coverage pickles found under ``data_directory``.

    Every visible sub-directory is scanned for ``coverage_info.*.pkl`` files,
    each unpacked into ``(run_successfully, sequence_counter,
    multiple_alleles_found, percentage_absent, percentage_lowCoverage,
    meanCoverage)``.  Reading stops after the first pickle reporting a
    failure.  Each sub-directory is removed once it has been scanned.

    Returns
    -------
    tuple
        ``(run_successfully, sample_data)``.  ``sample_data`` is keyed by
        ``sequence_counter``.  ``run_successfully`` is forced to False when
        the number of pickles read differs from ``len(sequences_information)``.
    """
    all_ok = True
    pickles_read = 0
    gathered = {}

    gene_names = [entry for entry in os.listdir(data_directory)
                  if not entry.startswith('.') and os.path.isdir(os.path.join(data_directory, entry, ''))]
    for gene_name in gene_names:
        gene_path = os.path.join(data_directory, gene_name, '')

        gene_files = [entry for entry in os.listdir(gene_path)
                      if not entry.startswith('.') and os.path.isfile(os.path.join(gene_path, entry))]
        for entry in gene_files:
            is_coverage_pickle = entry.startswith('coverage_info.') and entry.endswith('.pkl')
            if not is_coverage_pickle or not all_ok:
                continue

            pickle_path = os.path.join(gene_path, entry)
            (all_ok, sequence_counter, multiple_alleles_found, percentage_absent,
             percentage_lowCoverage, meanCoverage) = utils.extractVariableFromPickle(pickle_path)

            gathered[sequence_counter] = {
                'header': sequences_information[sequence_counter]['header'],
                'gene_coverage': 100 - percentage_absent,
                'gene_low_coverage': percentage_lowCoverage,
                'gene_number_positions_multiple_alleles': multiple_alleles_found,
                'gene_mean_read_coverage': meanCoverage
            }
            pickles_read += 1

        utils.removeDirectory(gene_path)

    # Every sequence must have produced exactly one pickle that was read.
    if pickles_read != len(sequences_information):
        all_ok = False

    return all_ok, gathered