def compress_decompress(compressed_file, decompressed_file, compressed_True):
    """
    Compress or decompress a fastq file through a shell redirection command and
    pickle the outcome.

    When compressed_True is True, gzip decompressed_file into compressed_file.
    Otherwise decompress compressed_file into decompressed_file (or, if the file
    turns out not to be compressed, validate it in place). The pickle stores
    [run_successfully, resulting file, length_sequence].
    """
    run_successfully = False
    malformated_fastq = False
    length_sequence = None
    compression_type = None if compressed_True else utils.compressionType(compressed_file)

    if compressed_True:
        # Compression path: gzip the plain file into the compressed target.
        command = ['gzip', '', '--stdout', '--keep', decompressed_file, '>', compressed_file]
        run_successfully, _, _ = utils.runCommandPopenCommunicate(command, True, None, True)
    elif compression_type is not None:
        # Decompression path: use the tool reported by utils.compressionType.
        command = [compression_type[0], '--decompress', '--stdout', '--keep',
                   compressed_file, '>', decompressed_file]
        run_successfully, _, _ = utils.runCommandPopenCommunicate(command, True, None, True)
        if run_successfully:
            malformated_fastq, length_sequence = check_uncompression_fastq(decompressed_file)
    else:
        # Not compressed after all: validate the original file in place.
        run_successfully = True
        malformated_fastq, length_sequence = check_uncompression_fastq(compressed_file)
        decompressed_file = compressed_file

    # A malformed fastq counts as a failed run regardless of the command result.
    if malformated_fastq:
        run_successfully = False

    utils.saveVariableToPickle(
        [run_successfully, compressed_file if compressed_True else decompressed_file, length_sequence],
        os.path.dirname(decompressed_file),
        os.path.splitext(os.path.basename(decompressed_file))[0])
def countSequencedBases(fastq_file, outdir):
    """
    Estimate the number of sequenced bases in a compressed fastq file and
    pickle [run_successfully, bases] into outdir.

    Decompresses to stdout and pipes through grep to isolate the sequence
    lines (the line following each '@' header, minus separators and headers),
    then uses `wc` to count characters and lines; bases = chars - lines
    (removing the newline character counted per sequence line).
    """
    run_successfully = False
    bases = None

    # Determine compression type
    compression_type = utils.compressionType(fastq_file)
    if compression_type is not None:
        command = [compression_type[1], '--keep', '--stdout', fastq_file, '|',
                   'grep', '--after-context=1', '"@"', '|',
                   'grep', '--invert-match', '"^--$"', '|',
                   'grep', '--invert-match', '"@"', '|',
                   'wc', '']

        # Number of characters
        command[18] = '--chars'
        run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, True, None, False)
        if run_successfully:
            bases = int(stdout.splitlines()[0])

            # Number of lines. Bug fix: only run this when the character count
            # succeeded — the original ran it unconditionally, so a failed
            # first run followed by a successful second one computed
            # `None - lines` and raised a TypeError before pickling.
            command[18] = '--lines'
            run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, True, None, False)
            if run_successfully:
                lines = int(stdout.splitlines()[0])
                bases = bases - lines

    utils.saveVariableToPickle([run_successfully, bases], outdir,
                               str('estimate_coverage.' + os.path.basename(fastq_file)))
def fastQintegrity(fastq, outdir):
    """
    Check a compressed fastq file by fully decompressing it and guessing its
    quality encoding, then pickle
    [run_successfully, encoding, min_reads_length, max_reads_length].

    The temporary decompressed copy is removed afterwards.
    """
    run_successfully = False
    temporary_output_file = os.path.join(
        outdir, os.path.splitext(os.path.basename(fastq))[0])

    compression_type = utils.compressionType(fastq)
    # Bug fix: initialise all three results. The original only initialised
    # `encoding`, so the pickle call below raised a NameError on
    # min_reads_length/max_reads_length whenever the file was not recognised
    # as compressed or the decompression command failed (the sibling version
    # of this function already initialises them).
    encoding, min_reads_length, max_reads_length = None, None, None
    if compression_type is not None:
        command = [
            compression_type[1], '--stdout', '--keep', fastq, '>',
            temporary_output_file
        ]
        run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(
            command, True, None, False)
        if run_successfully:
            encoding, min_reads_length, max_reads_length = run_guess_encoding_single_thread(
                temporary_output_file, None, outdir)

    if os.path.isfile(temporary_output_file):
        os.remove(temporary_output_file)

    utils.saveVariableToPickle(
        [run_successfully, encoding, min_reads_length, max_reads_length],
        outdir, os.path.basename(fastq))
def get_sequence_coverage(alignment_file, sequence_to_analyse, outdir, counter):
    """
    Compute coverage for one sequence from an alignment file and pickle
    [sequence_to_analyse, run_successfully, problems_found, position, coverage]
    as 'coverage.sequence_<counter>'.
    """
    # Bug fix / consistency: initialise problems_found so it is always defined
    # before the pickle call, matching the sibling version of this function.
    problems_found = False
    position = 0
    coverage = 0
    run_successfully, genome_coverage_data_file = compute_genome_coverage_data(
        alignment_file, sequence_to_analyse, outdir, counter)
    if run_successfully:
        problems_found, position, coverage = calculate_genome_coverage(
            genome_coverage_data_file)
        if not problems_found and position == 0:
            # Assuming SPAdes headers renamed (with sample name at the beginning)
            position = int(sequence_to_analyse.rsplit('_', 6)[4])
    else:
        problems_found = True

    # Best-effort cleanup: the data file may not exist when the command failed.
    try:
        os.remove(genome_coverage_data_file)
    except Exception as e:
        print(e)

    utils.saveVariableToPickle([
        sequence_to_analyse, run_successfully, problems_found, position,
        coverage
    ], outdir, str('coverage.sequence_' + str(counter)))
def analyse_sequence_data(bam_file, sequence_information, outdir, counter, reference_file, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele):
    """
    Analyse one mapped sequence: build its VCF and coverage files, search for
    multiple alleles and summarise coverage, then pickle
    [run_successfully, counter, multiple_alleles_found, percentage_absent,
    percentage_lowCoverage, meanCoverage] as 'coverage_info.<counter>'.
    """
    multiple_alleles_found = None
    percentage_absent = None
    percentage_lowCoverage = None
    meanCoverage = None

    # Create vcf file (for multiple alleles check)
    run_successfully, gene_vcf = create_vcf(
        bam_file, sequence_information['header'], outdir, counter, reference_file)
    if run_successfully:
        # Create coverage tab file
        run_successfully, gene_coverage = assembly_mapping.compute_genome_coverage_data(
            bam_file, sequence_information['header'], outdir, counter)
        if run_successfully:
            variants = get_variants(gene_vcf)
            coverage = get_coverage(gene_coverage)
            multiple_alleles_found = find_multiple_alleles(
                variants, sequence_information['length'],
                minimum_depth_presence, minimum_depth_call,
                minimum_depth_frequency_dominant_allele, length_extra_seq)
            percentage_absent, percentage_lowCoverage, meanCoverage = get_coverage_report(
                coverage, sequence_information['length'],
                minimum_depth_presence, minimum_depth_call, length_extra_seq)

    utils.saveVariableToPickle(
        [run_successfully, counter, multiple_alleles_found, percentage_absent,
         percentage_lowCoverage, meanCoverage],
        outdir, str('coverage_info.' + str(counter)))
def download_with_sra_prefetch(aspera_key, outdir, pickle_prefix, ena_id):
    """
    Download an SRA run with `prefetch`, optionally transferring via Aspera,
    move the .sra file from the prefetch cache into outdir, and pickle the
    boolean outcome as '<pickle_prefix>.<ena_id>'.
    """
    command = ['prefetch', '', ena_id]
    if aspera_key is not None:
        # Locate ascp and hand prefetch the "ascp|key" pair.
        _, ascp, _ = utils.runCommandPopenCommunicate(['which', 'ascp'], False, None, False)
        command[1] = '-a {ascp}|{aspera_key}'.format(ascp=ascp.splitlines()[0], aspera_key=aspera_key)

    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, 3600, True)

    if run_successfully:
        # Let the shell expand $HOME to find the prefetch cache directory.
        _, prefetch_outdir, _ = utils.runCommandPopenCommunicate(['echo', '$HOME/ncbi/public/sra'], True, None, False)
        sra_file = os.path.join(prefetch_outdir.splitlines()[0], ena_id + '.sra')
        destination = os.path.join(outdir, ena_id + '.sra')
        try:
            os.rename(sra_file, destination)
        except OSError as e:
            # rename cannot cross filesystems; fall back to copy + remove.
            print('Found the following error:' '{}'.format(e))
            from shutil import copy as shutil_copy
            shutil_copy(sra_file, destination)
            os.remove(sra_file)

    utils.saveVariableToPickle(run_successfully, outdir, pickle_prefix + '.' + ena_id)
def gzip_files(file_2_compress, pickle_prefix, outdir):
    """
    Gzip a file (via shell redirection), remove the original on success, and
    pickle the boolean outcome as '<pickle_prefix>.<basename>'.

    A trailing '.temp' extension is stripped from the output name before the
    '.gz' suffix is appended.
    """
    out_file = (os.path.splitext(file_2_compress)[0]
                if file_2_compress.endswith('.temp') else file_2_compress)

    gzip_command = ['gzip', '--stdout', '--best', file_2_compress, '>', str(out_file + '.gz')]
    run_successfully, _, _ = utils.runCommandPopenCommunicate(gzip_command, True, None, True)
    if run_successfully:
        os.remove(file_2_compress)

    utils.saveVariableToPickle(run_successfully, outdir,
                               str(pickle_prefix + '.' + os.path.basename(file_2_compress)))
def download_with_aspera(aspera_file_path, aspera_key, outdir, pickle_prefix, sra, ena_id):
    """
    Download a file with Aspera's ascp into outdir and pickle the boolean
    outcome.

    With sra set, fetch the .sra file for ena_id from NCBI's sra-instant tree;
    otherwise fetch aspera_file_path from ENA (era-fasp) on port 33001.
    """
    command = ['ascp', '-QT', '-l', '300m', '', '-i', aspera_key, '', outdir]
    if sra:
        command[7] = '[email protected]:/sra/sra-instant/reads/ByRun/sra/{a}/{b}/{c}/{c}.sra'.format(
            a=ena_id[:3], b=ena_id[:6], c=ena_id)
        pickle = pickle_prefix + '.' + ena_id
    else:
        command[4] = '-P33001'
        command[7] = str('era-fasp@' + aspera_file_path)
        pickle = pickle_prefix + '.' + aspera_file_path.rsplit('/', 1)[1]

    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, 3600, True)
    utils.saveVariableToPickle(run_successfully, outdir, pickle)
def downloadAndINNUca(outdir, run_ID, asperaKey, threads):
    """
    Download one ENA run with getSeqENA.py and, if that succeeds, run INNUca
    on it, flatten the INNUca output into the sample directory, clean up
    intermediate files, and pickle the elapsed time and the boolean outcome.
    """
    start_time = time.time()

    # getSeqENA expects a file listing one accession per line.
    temp_file = os.path.join(outdir, run_ID + '.temp.runID_fileList.txt')
    with open(temp_file, 'wt') as writer:
        writer.write(run_ID + '\n')

    command = [
        'getSeqENA.py', '-l', temp_file, '-o', outdir, '-a', asperaKey,
        '--downloadLibrariesType', 'PE'
    ]
    getSeqENA_run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(
        command, False, None)
    os.remove(temp_file)

    sample_directory = os.path.join(outdir, run_ID, '')

    innuca_run_successfully = False
    if getSeqENA_run_successfully:
        command = [
            'INNUca.py', '-i', sample_directory, '-s',
            '"Campylobacter jejuni"', '-g', '1.6', '-o', sample_directory,
            '-j', str(threads), '--jarMaxMemory', 'auto'
        ]
        innuca_run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(
            command, False, None)

        # Move INNUca's per-run output one level up into the sample directory.
        innuca_dir = os.path.join(sample_directory, run_ID, '')
        files = [
            f for f in os.listdir(innuca_dir)
            if not f.startswith('.') and os.path.isfile(os.path.join(innuca_dir, f))
        ]
        for file_innuca in files:
            shutil.move(os.path.join(innuca_dir, file_innuca),
                        os.path.join(sample_directory, file_innuca))
        utils.removeDirectory(innuca_dir)

    # Drop bulky intermediates regardless of outcome.
    removeFiles(sample_directory, '.gz')
    removeFiles(sample_directory, '.log')
    removeFiles(sample_directory, '.cpu.txt')

    if innuca_run_successfully:
        time_taken = utils.runTime(start_time)
        utils.saveVariableToPickle(time_taken, sample_directory,
                                   run_ID + '_downloadAndINNUca_time')

    utils.saveVariableToPickle(innuca_run_successfully, sample_directory,
                               run_ID + '_run_successfully')
def download_with_wget(ftp_file_path, outdir, pickle_prefix, sra, ena_id):
    """
    Download a file with wget (single attempt) into outdir and pickle the
    boolean outcome.

    With sra set, fetch the .sra file for ena_id from NCBI's sra-instant FTP
    tree; otherwise fetch ftp_file_path directly.
    """
    command = ['wget', '--tries=1', '', '-O', '']
    if sra:
        command[2] = 'ftp://ftp-trace.ncbi.nih.gov/sra/sra-instant/reads/ByRun/sra/{a}/{b}/{c}/{c}.sra'.format(
            a=ena_id[:3], b=ena_id[:6], c=ena_id)
        command[4] = os.path.join(outdir, ena_id + '.sra')
        pickle = pickle_prefix + '.' + ena_id
    else:
        command[2] = ftp_file_path
        file_download = ftp_file_path.rsplit('/', 1)[1]
        command[4] = os.path.join(outdir, file_download)
        pickle = pickle_prefix + '.' + file_download

    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, 3600, True)
    utils.saveVariableToPickle(run_successfully, outdir, pickle)
def guess_encoding(fastq, number_reads_access_None_all, outdir):
    """
    Guess the quality encoding of a (decompressed) fastq file from the quality
    range of its reads, and pickle
    [fastq, valid_encodings, min read length, max read length].

    number_reads_access_None_all limits how many reads are inspected; None
    means the whole file.
    """
    gmin, gmax = 99, 0
    valid_encodings = None
    reads_length = []
    # Fix: 'rt' replaces the legacy 'rtU' mode — the 'U' flag has been
    # deprecated since Python 3.3 and was removed in 3.11; text mode already
    # performs universal-newline translation.
    with open(fastq, 'rt') as reader:
        for i, line in enumerate(reader):
            if number_reads_access_None_all is not None and (i + 1) / 4 > number_reads_access_None_all:
                # Past the requested number of reads: the condition can never
                # become true again, so stop instead of scanning the rest of
                # the file (the original kept iterating to the end).
                break
            if (i + 1) % 4 == 0 and len(line) > 0:
                # Every 4th line is the quality string; its length equals the
                # read length.
                qual = line.splitlines()[0]
                reads_length.append(len(qual))
                lmin, lmax = get_qual_range(qual)
                if lmin < gmin or lmax > gmax:
                    gmin, gmax = min(lmin, gmin), max(lmax, gmax)
                    valid_encodings = get_encodings_in_range(gmin, gmax)
    utils.saveVariableToPickle(
        [fastq, valid_encodings,
         min(reads_length) if len(reads_length) > 0 else None,
         max(reads_length) if len(reads_length) > 0 else None],
        outdir, 'encoding' + '.' + os.path.splitext(os.path.basename(fastq))[0])
def downloadWithSRAprefetch(asperaKey, outdir, pickle_prefix, ena_id):
    """
    Download an SRA run with `prefetch`, optionally transferring via Aspera,
    move the .sra file from the prefetch cache into outdir, and pickle the
    boolean outcome as '<pickle_prefix>.<ena_id>'.
    """
    command = ['prefetch', '', ena_id]
    if asperaKey is not None:
        ignore, ascp, ignore = utils.runCommandPopenCommunicate(
            ['which', 'ascp'], False, None, False)
        command[1] = '-a {ascp}|{asperaKey}'.format(ascp=ascp.splitlines()[0],
                                                    asperaKey=asperaKey)
    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(
        command, False, 3600, True)
    if run_successfully:
        # Let the shell expand $HOME to find the prefetch cache directory.
        ignore, prefetch_outdir, ignore = utils.runCommandPopenCommunicate(
            ['echo', '$HOME/ncbi/public/sra'], True, None, False)
        sra_file = os.path.join(prefetch_outdir.splitlines()[0], ena_id + '.sra')
        destination = os.path.join(outdir, ena_id + '.sra')
        try:
            os.rename(sra_file, destination)
        except OSError as e:
            # Fix (consistency with the sibling download_with_sra_prefetch):
            # os.rename fails when outdir is on a different filesystem from
            # the prefetch cache; fall back to copy + remove.
            print('Found the following error:' '{}'.format(e))
            from shutil import copy as shutil_copy
            shutil_copy(sra_file, destination)
            os.remove(sra_file)
    utils.saveVariableToPickle(run_successfully, outdir,
                               pickle_prefix + '.' + ena_id)
def fastQintegrity(fastq, outdir):
    """
    Check a compressed fastq file by fully decompressing it and guessing its
    quality encoding, then pickle
    [run_successfully, encoding, min_reads_length, max_reads_length].

    The temporary decompressed copy is removed afterwards.
    """
    run_successfully = False
    encoding, min_reads_length, max_reads_length = None, None, None

    temporary_output_file = os.path.join(outdir, os.path.splitext(os.path.basename(fastq))[0])

    compression_type = utils.compressionType(fastq)
    if compression_type is not None:
        decompress_command = [compression_type[1], '--stdout', '--keep', fastq, '>', temporary_output_file]
        run_successfully, _, _ = utils.runCommandPopenCommunicate(decompress_command, True, None, False)
        if run_successfully:
            encoding, min_reads_length, max_reads_length = run_guess_encoding_single_thread(
                temporary_output_file, None, outdir)

    if os.path.isfile(temporary_output_file):
        os.remove(temporary_output_file)

    utils.saveVariableToPickle([run_successfully, encoding, min_reads_length, max_reads_length],
                               outdir, os.path.basename(fastq))
def get_sequence_coverage(alignment_file, sequence_to_analyse, outdir, counter):
    """
    Compute coverage for one sequence from an alignment file and pickle
    [sequence_to_analyse, run_successfully, problems_found, position, coverage]
    as 'coverage.sequence_<counter>'.
    """
    problems_found = False
    position = 0
    coverage = 0
    run_successfully, genome_coverage_data_file = compute_genome_coverage_data(
        alignment_file, sequence_to_analyse, outdir, counter)
    if run_successfully:
        problems_found, position, coverage = calculate_genome_coverage(
            genome_coverage_data_file)
        if not problems_found and position == 0:
            # Assuming SPAdes headers renamed (with sample name at the beginning)
            position = int(sequence_to_analyse.rsplit('_', 6)[4])
    else:
        problems_found = True

    # Best-effort cleanup: the data file may not exist when the command failed.
    try:
        os.remove(genome_coverage_data_file)
    except Exception as e:
        # Fix: `print e` is Python 2-only syntax (SyntaxError on Python 3);
        # the call form works on both and matches the sibling version.
        print(e)

    utils.saveVariableToPickle(
        [sequence_to_analyse, run_successfully, problems_found, position, coverage],
        outdir, str('coverage.sequence_' + str(counter)))
def analyse_sequence_data(bam_file, sequence_information, outdir, counter, reference_file, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele):
    """
    Analyse one mapped sequence: build its VCF and coverage files, derive the
    sample consensus sequence and coverage summary, then pickle
    [run_successfully, counter, number_multi_alleles, percentage_absent,
    percentage_lowCoverage, meanCoverage, consensus_sequence,
    number_diferences] as 'coverage_info.<counter>'.
    """
    percentage_absent = None
    percentage_lowCoverage = None
    meanCoverage = None
    number_diferences = 0
    # Bug fix: initialise these too — they are pickled below and the original
    # raised a NameError whenever create_vcf (or the coverage step) failed
    # before create_sample_consensus_sequence could assign them.
    number_multi_alleles = None
    consensus_sequence = None

    # Create vcf file (for multiple alleles check)
    run_successfully, gene_vcf = create_vcf(bam_file, sequence_information['header'],
                                            outdir, counter, reference_file)
    if run_successfully:
        # Create coverage tab file
        run_successfully, gene_coverage = compute_genome_coverage_data(
            bam_file, sequence_information['header'], outdir, counter)
        if run_successfully:
            variants = get_variants(gene_vcf)
            coverage = get_coverage(gene_coverage)
            run_successfully, number_multi_alleles, consensus_sequence, number_diferences = create_sample_consensus_sequence(
                outdir, sequence_information['header'], reference_file, variants,
                minimum_depth_presence, minimum_depth_call,
                minimum_depth_frequency_dominant_allele,
                sequence_information['sequence'], length_extra_seq)
            percentage_absent, percentage_lowCoverage, meanCoverage = get_coverage_report(
                coverage, sequence_information['length'], minimum_depth_presence,
                minimum_depth_call, length_extra_seq)

    utils.saveVariableToPickle([
        run_successfully, counter, number_multi_alleles, percentage_absent,
        percentage_lowCoverage, meanCoverage, consensus_sequence,
        number_diferences
    ], outdir, str('coverage_info.' + str(counter)))
def countSequencedBases(fastq_file, outdir):
    """
    Estimate the number of sequenced bases in a compressed fastq file and
    pickle [run_successfully, bases] into outdir.

    Decompresses to stdout and pipes through grep to isolate the sequence
    lines (the line following each '@' header, minus separators and headers),
    then uses `wc` to count characters and lines; bases = chars - lines
    (removing the newline character counted per sequence line).
    """
    run_successfully = False
    bases = None

    # Determine compression type
    compression_type = utils.compressionType(fastq_file)
    if compression_type is not None:
        command = [compression_type[1], '--keep', '--stdout', fastq_file, '|',
                   'grep', '--after-context=1', '"@"', '|',
                   'grep', '--invert-match', '"^--$"', '|',
                   'grep', '--invert-match', '"@"', '|',
                   'wc', '']

        # Number of characters
        command[18] = '--chars'
        run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, True, None, False)
        if run_successfully:
            bases = int(stdout.splitlines()[0])

            # Number of lines. Bug fix: only run this when the character count
            # succeeded — the original ran it unconditionally, so a failed
            # first run followed by a successful second one computed
            # `None - lines` and raised a TypeError before pickling.
            command[18] = '--lines'
            run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, True, None, False)
            if run_successfully:
                lines = int(stdout.splitlines()[0])
                bases = bases - lines

    utils.saveVariableToPickle([run_successfully, bases], outdir,
                               str('estimate_coverage.' + os.path.basename(fastq_file)))
def fastQintegrity(fastq, outdir):
    """
    Check that a gz/bz2-compressed fastq file decompresses cleanly, then
    pickle the boolean outcome under the fastq's basename.

    The temporary decompressed copy is removed afterwards.
    """
    run_successfully = False
    temporary_output_file = os.path.join(
        outdir, os.path.splitext(os.path.basename(fastq))[0])

    # NOTE(review): this variant compares utils.compressionType's result
    # against plain strings ('gz'/'bz2'), while sibling functions in this file
    # index into it (compression_type[0]/[1]) — confirm which contract the
    # local utils module actually provides.
    filetype = utils.compressionType(fastq)
    decompressor = ''
    if filetype == 'gz':
        decompressor = 'gunzip'
    elif filetype == 'bz2':
        decompressor = 'bunzip2'

    if decompressor != '':
        command = [decompressor, '--stdout', '--keep', fastq, '>', temporary_output_file]
        run_successfully, _, _ = utils.runCommandPopenCommunicate(command, True, None, False)

    if os.path.isfile(temporary_output_file):
        os.remove(temporary_output_file)

    utils.saveVariableToPickle(run_successfully, outdir, os.path.basename(fastq))
def downloadWithFtp(ftp_file_path, outdir, pickle_prefix):
    """
    Download a file from an FTP URL with wget into outdir and pickle the
    boolean outcome as '<pickle_prefix>.<filename>'.
    """
    file_download = ftp_file_path.rsplit('/', 1)[1]
    destination = os.path.join(outdir, file_download)
    wget_command = ['wget', ftp_file_path, '-O', destination]
    run_successfully, _, _ = utils.runCommandPopenCommunicate(wget_command, False, 3600, True)
    utils.saveVariableToPickle(run_successfully, outdir,
                               str(pickle_prefix + '.' + file_download))
def downloadWithAspera(aspera_file_path, asperaKey, outdir, pickle_prefix):
    """
    Download a file from ENA with Aspera's ascp (era-fasp account) into outdir
    and pickle the boolean outcome as '<pickle_prefix>.<filename>'.
    """
    source = str('era-fasp@' + aspera_file_path)
    ascp_command = ['ascp', '-QT', '-l', '300m', '-i', asperaKey, source, outdir]
    run_successfully, _, _ = utils.runCommandPopenCommunicate(ascp_command, False, 3600, True)

    file_downloaded = aspera_file_path.rsplit('/', 1)[1]
    utils.saveVariableToPickle(run_successfully, outdir,
                               str(pickle_prefix + '.' + file_downloaded))