def merge_snps_files(directory):
    """Merge the phased VCF files found below ``directory`` into a single VCF.

    Every ``*_dir/*_phased.vcf`` under ``directory`` is collected. The
    non-header lines of all files are concatenated, the ``#`` header lines of
    the first file are put on top, and the result is written to
    ``<N>_phased_snps_files.vcf`` in ``directory`` (``N`` is the number of
    input files). The intermediate header and body files are removed last.

    Returns 0 on success. Returns 1 when there is nothing to merge or the
    concatenation failed, otherwise the status of the first failing command.
    """
    all_vcf_files = glob(os.path.join(directory, '*_dir', '*_phased.vcf'))
    if not all_vcf_files:
        # Nothing to merge: all_vcf_files[0] below would raise IndexError.
        return 1

    nb_files = len(all_vcf_files)
    output_file_header = os.path.join(directory, '%s_snps_files.vcf.header' % nb_files)
    output_file = os.path.join(directory, '%s_phased_snps_files.vcf' % nb_files)
    output_file_body = os.path.join(directory, '%s_snps_files.vcf.body' % nb_files)

    # Body: every line of every input file that is not a header line.
    output_file_body = concatenate_file(all_vcf_files, output_file_body, filter="^#")
    if not output_file_body:
        # A falsy result means the concatenation did not succeed; going on
        # would build the final file from a missing body.
        return 1

    commands = [
        'grep "^#" %s > %s ' % (all_vcf_files[0], output_file_header),
        'cat %s %s > %s ' % (output_file_header, output_file_body, output_file),
        'rm %s %s' % (output_file_header, output_file_body),
    ]
    return_code = 0
    for command in commands:
        return_code = command_runner.run_command(command)
        if return_code != 0:
            # Stop at the first failure and report its status.
            break
    return return_code
def prepare_genome(genome_file, color_space=False):
    """Build the BWA index of ``genome_file``.

    The sequences are read to measure the genome length (reading stops as
    soon as the running total exceeds 1,000,000,000 bases) so that the
    indexing algorithm can be chosen: ``bwtsw`` above that size, ``is``
    otherwise. ``color_space`` adds the ``-c`` option to ``bwa index``.

    Returns True when ``bwa index`` exits with status 0, False otherwise.
    """
    large_genome_threshold = 1000000000
    pipeline_param = utils_param.get_pipeline_parameters()
    BWA_bin = os.path.join(pipeline_param.get_bwa_dir(), 'bwa')

    length = 0
    genome_loader = GenomeLoader(genome_file=genome_file)
    try:
        for _header, sequence in genome_loader:
            length += len(sequence)
            if length > large_genome_threshold:
                # Enough is known to pick the algorithm; stop reading.
                break
    finally:
        # Release the loader even if reading the genome fails.
        genome_loader.close()

    # Use bwtsw for genomes above the threshold and is for smaller ones.
    if length > large_genome_threshold:
        a_option = 'bwtsw'
    else:
        a_option = 'is'

    # Create the indexes
    if color_space:
        command = '%s index -c -a %s %s' % (BWA_bin, a_option, genome_file)
    else:
        command = '%s index -a %s %s' % (BWA_bin, a_option, genome_file)
    return_code = command_runner.run_command(command)
    # Report the indexing outcome instead of always claiming success.
    return return_code == 0
def run_assembly(assembly_function, fastq_file, output_dir=None, estimated_size=600,
                 subsample_nb_read=None, rg_ids=[], name=None, adapter_file=None):
    """Clean ``fastq_file`` and assemble it with ``assembly_function``.

    When ``output_dir`` exists the work is done from inside it, and the
    previous working directory is restored afterwards, even if cleaning or
    assembling raises. ``name`` defaults to the fastq file's base name without
    its extension. A ``merged_consensus.fa`` found next to the contig file is
    removed.

    Returns the tuple ``(corrected_contig_file, nb_seq, max_len)`` produced by
    ``correct_contig_file``, or ``(None, 0, 0)`` when no contig file was
    produced.
    """
    if name is None:
        name = os.path.splitext(os.path.basename(fastq_file))[0]

    current_dir = None
    if output_dir and os.path.exists(output_dir):
        logging.debug('change directory to %s' % output_dir)
        current_dir = os.getcwd()
        os.chdir(output_dir)

    try:
        fastq_file = clean_fastq(fastq_file, adapter_file=adapter_file, rg_ids=rg_ids,
                                 subsample_nb_read=subsample_nb_read)
        contig_file = assembly_function(fastq_file, estimated_size=estimated_size)
        if contig_file:
            # Resolve the path while still in the assembly directory.
            contig_file = os.path.abspath(contig_file)
            merged_consensus = os.path.join(os.path.dirname(contig_file),
                                            'merged_consensus.fa')
            if os.path.exists(merged_consensus):
                logging.debug(
                    'remove the merged_consensus.fa that already exists before assembling')
                command = 'rm -f %s' % (merged_consensus)
                command_runner.run_command(command)
    finally:
        # Always go back, otherwise a failure leaves the process in output_dir.
        if current_dir:
            logging.debug('change directory back to %s' % current_dir)
            os.chdir(current_dir)

    nb_seq = max_len = 0
    corrected_contig_file = None
    if contig_file:
        corrected_contig_file, nb_seq, max_len = correct_contig_file(contig_file, name)
    return (corrected_contig_file, nb_seq, max_len)
def align_short_reads_se(fastq_file1, genome_file, output_dir, sample_name, thread, BWA_bin,
                         samtools_bin, picard_dir, read_group_command, files_and_dir, illumina,
                         fifo):
    """Align single-end reads with ``bwa aln`` / ``bwa samse`` and sort them.

    The ``.sai`` file is written in ``output_dir`` and appended to
    ``files_and_dir``. ``illumina`` adds the ``-I`` option to ``bwa aln``.
    ``picard_dir`` and ``fifo`` are accepted but not used here.

    Returns the output name given to ``samtools sort``
    (``<output_dir>/<sample_name>_sorted``). The name is returned even when a
    command fails; failures are reported through ``logging.error``.
    """
    import logging

    fastq_name = os.path.splitext(os.path.basename(fastq_file1))[0]
    sai_file1 = '%s.sai' % os.path.join(output_dir, fastq_name)

    illumina_str = ""
    if illumina:
        illumina_str = " -I "
    command = '%s aln %s -t %s %s %s > %s' % (
        BWA_bin, illumina_str, thread, genome_file, fastq_file1, sai_file1)
    return_code = command_runner.run_command(command)
    # Compare by value: "is not 0" tests object identity, not equality.
    if return_code != 0:
        logging.error('bwa aln failed with status %s', return_code)
    files_and_dir.append(sai_file1)

    # only one end so just run get the sorted bam file
    bam_file = os.path.join(output_dir, sample_name + "_sorted")
    command = """%s samse %s %s %s %s | %s view -bS - | %s sort - %s""" % (
        BWA_bin, read_group_command, genome_file, sai_file1, fastq_file1,
        samtools_bin, samtools_bin, bam_file)
    return_code = command_runner.run_command(command)
    if return_code != 0:
        logging.error('bwa samse / samtools sort failed with status %s', return_code)
    return bam_file
def run_smalt_paired(consensus_file, read1_fastq, read2_fastq, **kwarg):
    """Index ``consensus_file`` with smalt and map a read pair onto it.

    Existing ``.sma``, ``.smi`` and ``.fai`` files of the consensus are
    removed first. Read 2 goes through ``reverse_complement`` before mapping.
    The SAM file is named after the common prefix of the two fastq paths
    (trailing underscores stripped) and its path is returned.
    """
    # Remove index files left over from a previous run.
    for suffix in ('sma', 'smi', 'fai'):
        stale_index = '%s.%s' % (consensus_file, suffix)
        if os.path.exists(stale_index):
            command_runner.run_command('rm -rf %s' % stale_index)

    command_runner.run_command("smalt index %s %s" % (consensus_file, consensus_file))

    prefix = longest_common_substr_from_start(read1_fastq, read2_fastq).rstrip('_')
    read2_fastq_rev_comp = reverse_complement(read2_fastq)
    sam_file = prefix + '.sam'
    map_command = "smalt map -f samsoft -o %s %s %s %s" % (
        sam_file, consensus_file, read1_fastq, read2_fastq_rev_comp)
    command_runner.run_command(map_command)
    return sam_file
def prepare_genome(genome_file, color_space=False):
    """Create the BWA index files for ``genome_file``.

    The genome is scanned to get its total length; scanning stops once more
    than 1,000,000,000 bases have been counted. Genomes above that size are
    indexed with the ``bwtsw`` algorithm, the others with ``is``. When
    ``color_space`` is true ``-c`` is passed to ``bwa index``.

    Returns True if ``bwa index`` exited with status 0, False otherwise.
    """
    size_limit = 1000000000
    pipeline_param = utils_param.get_pipeline_parameters()
    BWA_dir = pipeline_param.get_bwa_dir()
    BWA_bin = os.path.join(BWA_dir, 'bwa')

    genome_loader = GenomeLoader(genome_file=genome_file)
    length = 0
    try:
        for fasta_rec in genome_loader:
            _header, sequence = fasta_rec
            length += len(sequence)
            if length > size_limit:
                break
    finally:
        # Close the loader whether or not the scan completed.
        genome_loader.close()

    # bwtsw for genomes larger than size_limit, is for the smaller ones.
    a_option = 'bwtsw' if length > size_limit else 'is'

    # Create the indexes
    if color_space:
        command = '%s index -c -a %s %s' % (BWA_bin, a_option, genome_file)
    else:
        command = '%s index -a %s %s' % (BWA_bin, a_option, genome_file)

    # The exit status decides the result instead of being thrown away.
    return command_runner.run_command(command) == 0
def align_bwa_long(fastq_file1, fastq_file2, genome_file, sample_name, read_group,
                   analysis_type, output_dir, BWA_bin, samtools_bin, picard_dir, thread, sort,
                   illumina, files_and_dir):
    """Align long reads with BWA and post-process the BAM with Picard.

    ``bwa mem`` is used when ``fastq_file2`` is given, ``bwa bwasw``
    otherwise. The alignment is passed through Picard FixMateInformation and,
    when ``read_group`` is given, through AddOrReplaceReadGroups. Intermediate
    files are appended to ``files_and_dir``. As soon as one command fails the
    remaining commands are skipped.

    Returns the final BAM path, or False when the input options are not
    supported (``illumina`` set, or ``analysis_type`` not None) or when a
    command failed.
    """
    if illumina:
        logging.error("long read alignment do not support illumina format")
        return False
    if analysis_type is not None:
        logging.error("long read alignment do not support %s analsyis" % (analysis_type))
        return False

    run_fine = True
    tmp_bam_file = os.path.join(output_dir, sample_name + '_tmp.bam')
    files_and_dir.append(tmp_bam_file)
    if fastq_file2:
        command = '%s mem -t %s %s %s %s | %s view -bS - > %s' % (
            BWA_bin, thread, genome_file, fastq_file1, fastq_file2, samtools_bin, tmp_bam_file)
    else:
        command = '%s bwasw -t %s %s %s | %s view -bS - > %s' % (
            BWA_bin, thread, genome_file, fastq_file1, samtools_bin, tmp_bam_file)
    return_code = command_runner.run_command(command)
    # Exit statuses are compared by value ("!="), not by identity ("is not").
    if return_code != 0:
        run_fine = False

    if sort:
        bam_file = os.path.join(output_dir, sample_name + '_sorted.bam')
        sort_order = 'coordinate'
    else:
        bam_file = os.path.join(output_dir, sample_name + '.bam')
        sort_order = 'queryname'

    # bwa screw up the mate information
    fixmate_jar = os.path.join(picard_dir, 'FixMateInformation.jar')
    fixed_bam_file = os.path.join(output_dir, sample_name + '_fixed.bam')
    command = 'java -jar -Xmx2G %s I=%s O=%s SO=%s VALIDATION_STRINGENCY=LENIENT' % (
        fixmate_jar, tmp_bam_file, fixed_bam_file, sort_order)
    if run_fine:
        return_code = command_runner.run_command(command)
        if return_code != 0:
            run_fine = False

    if read_group:
        files_and_dir.append(fixed_bam_file)
        read_group_elements = extract_read_group(read_group)
        replace_readgroup_jar = os.path.join(picard_dir, 'AddOrReplaceReadGroups.jar')
        read_group_param = []
        for key in ['ID', 'LB', 'PL', 'PU', 'SM', 'CN']:
            # "in" replaces dict.has_key, which no longer exists in Python 3.
            if key in read_group_elements:
                read_group_param.append('%s="%s"' % (key, read_group_elements.get(key)))
            else:
                # Missing fields are filled with 0.
                read_group_param.append('%s=0' % (key))
        command = 'java -jar -Xmx2G %s I=%s O=%s SO=%s %s VALIDATION_STRINGENCY=LENIENT' % (
            replace_readgroup_jar, fixed_bam_file, bam_file, sort_order,
            ' '.join(read_group_param))
    else:
        # NOTE(review): this resets the name to <sample>.bam even when sort is
        # set, so the "_sorted" suffix is lost here - confirm this is intended.
        bam_file = os.path.join(output_dir, sample_name + '.bam')
        command = 'mv %s %s' % (fixed_bam_file, bam_file)
    if run_fine:
        return_code = command_runner.run_command(command)
        if return_code != 0:
            run_fine = False

    if run_fine:
        return bam_file
    return False
def run_velvet(fastq_file_name, kmer_length=29, output_dir= 'velvet', **kwarg):
    """Assemble ``fastq_file_name`` with velveth followed by velvetg.

    The standard output of both programs is collected in
    ``<output_dir>.log``.

    Returns the path of ``<output_dir>/contigs.fa`` when it exists after the
    run, None otherwise. The exit statuses of the two programs are not
    checked.
    """
    log_file = '%s.log' % (output_dir)
    command = "%s %s %s -fastq -short %s 2>&1 >%s" % (
        velveth_bin, output_dir, kmer_length, fastq_file_name, log_file)
    command_runner.run_command(command)
    # Append (>>) so the velveth output written just above is not truncated.
    command = "%s %s 2>&1 >>%s" % (velvetg_bin, output_dir, log_file)
    command_runner.run_command(command)

    contig_files = glob('%s/contigs.fa' % output_dir)
    contig_file_name = None
    if len(contig_files) == 1:
        contig_file_name = contig_files[0]
    return contig_file_name
def SNP_call_with_samtools(samtools_dir, name, bam_file, ref_file):
    """Call variants with samtools mpileup / bcftools, then filter them.

    The raw calls go to ``<name>_sorted_mrk_dup_fixed_samtools.vcf`` and the
    filtered ones (``vcfutils.pl varFilter -d 20`` plus an awk filter keeping
    header lines and lines whose 6th column is above 60) to
    ``<name>_sorted_mrk_dup_fixed_samtools_filterd20q60.vcf``. Nothing is
    returned and exit statuses are not checked.
    """
    samtools_bin = os.path.join(samtools_dir, "samtools")
    # Look for bcftools in its own sub-directory first, then beside samtools.
    bcftools_bin = os.path.join(samtools_dir, "bcftools/bcftools")
    if not os.path.exists(bcftools_bin):
        bcftools_bin = os.path.join(samtools_dir, "bcftools")

    raw_vcf = name + '_sorted_mrk_dup_fixed_samtools.vcf'
    call_template = "%s mpileup -d 50000 -ADESuf %s %s | %s view -gv - > %s"
    command_runner.run_command(
        call_template % (samtools_bin, ref_file, bam_file, bcftools_bin, raw_vcf))

    filtered_vcf = name + '_sorted_mrk_dup_fixed_samtools_filterd20q60.vcf'
    filter_command = "vcfutils.pl varFilter -d 20 %s | awk '{if (/^#/ || $6>60){print}}' > %s" % (
        raw_vcf, filtered_vcf)
    command_runner.run_command(filter_command)
def run_velvet(fastq_file_name, kmer_length=29, output_dir='velvet', **kwarg):
    """Run velveth then velvetg on ``fastq_file_name``.

    Both programs write their standard output to ``<output_dir>.log``.

    Returns ``<output_dir>/contigs.fa`` if that file exists once both commands
    have run, None otherwise. Exit statuses are not inspected.
    """
    log_file = '%s.log' % (output_dir)

    velveth_command = "%s %s %s -fastq -short %s 2>&1 >%s" % (
        velveth_bin, output_dir, kmer_length, fastq_file_name, log_file)
    command_runner.run_command(velveth_command)

    # velvetg appends to the log; a plain ">" would erase the velveth output.
    velvetg_command = "%s %s 2>&1 >>%s" % (velvetg_bin, output_dir, log_file)
    command_runner.run_command(velvetg_command)

    contig_files = glob('%s/contigs.fa' % output_dir)
    if len(contig_files) == 1:
        return contig_files[0]
    return None
def run_smalt_single(consensus_file, read1_fastq, **kwarg):
    """Map single-end reads onto ``consensus_file`` with ``smalt map``.

    The SAM file is written next to the fastq file, its extension replaced
    by ``_single.sam``. The SAM path is returned; the exit status of smalt is
    not checked.
    """
    fastq_root = os.path.splitext(read1_fastq)[0]
    sam_file = fastq_root + '_single.sam'
    command_runner.run_command(
        "smalt map -f samsoft -o %s %s %s" % (sam_file, consensus_file, read1_fastq))
    return sam_file
def build_jerry(repo, coverage):
    """Run ``tools/build.py`` from ``repo`` with a fixed debug configuration.

    The build is a clean 32-bit debug build with AddressSanitizer enabled.
    When ``coverage`` is truthy the gcov instrumentation options are appended.

    Raises Exception when ``run_command`` returns a truthy value.
    """
    if coverage:
        coverage_specs = [
            '--compile-flag=-fprofile-arcs',
            '--compile-flag=-ftest-coverage',
            '--link-lib',
            'gcov',
        ]
    else:
        coverage_specs = []

    build_command = [
        'tools/build.py',
        '--clean',
        '--debug',
        '--compile-flag=-fsanitize=address',
        '--compile-flag=-m32',
        '--compile-flag=-fno-omit-frame-pointer',
        '--compile-flag=-fno-common',
        '--compile-flag=-g',
        '--strip=off',
        '--system-allocator=on',
        '--logging=on',
        '--linker-flag=-fuse-ld=gold',
        '--error-messages=on',
        '--profile=es2015-subset',
    ] + coverage_specs

    failed = run_command(build_command, cwd=repo, debug=True)
    if failed:
        raise Exception(f'{build_command} failed!')
def run_velvetOptimiser(fastq_file_name, low_k=59, high_k=99, outputdir='velvetopt', **kwarg):
    """Assemble ``fastq_file_name`` with VelvetOptimiser.

    An existing ``outputdir`` is deleted before the run. Standard output goes
    to ``<outputdir>.log``.

    Returns ``<outputdir>/contigs.fa`` when that file exists after the run,
    None otherwise.
    """
    if os.path.exists(outputdir):
        command_runner.run_command('rm -rf %s' % outputdir)

    log_file = '%s.log' % outputdir
    template = "%s -f '-fastq -short %s' --s %s --e %s --k max --c max --d %s 2>&1 >%s"
    command_runner.run_command(
        template % (velvetOptimiser_bin, fastq_file_name, low_k, high_k, outputdir, log_file))

    # A single contig file is what a successful VelvetOptimiser run leaves.
    found = glob('%s/contigs.fa' % outputdir)
    return found[0] if len(found) == 1 else None
def run_cap3(fastq_file_name, output_dir="cap3", **kwarg):
    """Assemble ``fastq_file_name`` with cap3.

    The fastq file is first converted to fasta with ``seqtk seq -A`` inside
    ``output_dir`` (created when missing). cap3's standard output goes to
    ``<output_dir>.log``.

    Returns ``<fasta>.cap.contigs`` when that file exists after the run,
    None otherwise.
    """
    if not os.path.exists(output_dir):
        command_runner.run_command('mkdir %s' % output_dir)

    fasta_file = os.path.join(output_dir, os.path.basename(fastq_file_name) + '.fa')
    command_runner.run_command("seqtk seq -A %s > %s" % (fastq_file_name, fasta_file))

    log_file = '%s.log' % output_dir
    command_runner.run_command("%s %s 2>&1 >%s" % (cap3_bin, fasta_file, log_file))

    found = glob('%s.cap.contigs' % (fasta_file))
    if len(found) != 1:
        return None
    return found[0]
def run_all_fastq_files(directory):
    """Compute allele frequencies in every ``*_dir`` and merge them per sample.

    ``calculate_base_frequency_for_snps`` is run on each ``*_dir``
    sub-directory of ``directory``; the sample names it returns are collected.
    For every sample, the ``*_<sample>.allelefreq`` files of all
    sub-directories are then concatenated into
    ``samtools_snps_<sample>.allelefreq`` in ``directory``.
    """
    directory = os.path.abspath(directory)
    all_dirs = glob(os.path.join(directory, '*_dir'))
    all_samples = set()
    for sub_dir in all_dirs:
        # Function-call form: valid on Python 3 and prints the same text on
        # Python 2.
        print(sub_dir)
        name = os.path.basename(sub_dir)[:-len("_dir")]
        samples = calculate_base_frequency_for_snps(sub_dir, name)
        all_samples.update(samples)

    for sample in all_samples:
        # concatenate the allele frequency file per samples
        merged_file = os.path.join(directory, 'samtools_snps_%s.allelefreq' % sample)
        command = 'cat %s/*_dir/*_%s.allelefreq > %s' % (directory, sample, merged_file)
        command_runner.run_command(command)
    return
def concatenate_file(list_of_file, output_file=None, **kwargs):
    """Concatenate text files into ``output_file`` with ``cat``.

    Keyword arguments:
      output_dir -- directory for the generated output name, used only when
                    ``output_file`` is not given (defaults to the current
                    working directory).
      filter     -- pattern given to ``egrep -v``; matching lines are left
                    out of the output.

    When ``output_file`` is not given, the first ``tmp_concatenate_<i>`` name
    (i = 1, 2, ...) that does not exist yet is used.

    Returns the output file path when the command exits with status 0,
    None otherwise.
    """
    if not output_file:
        # Create a generic name and put it in the current working directory
        # "in" replaces dict.has_key, which no longer exists in Python 3.
        if 'output_dir' in kwargs:
            working_directory = kwargs.get('output_dir')
        else:
            working_directory = os.getcwd()
        i = 1
        output_file_template = os.path.join(working_directory, 'tmp_concatenate_%s')
        output_file = output_file_template % i
        while os.path.exists(output_file):
            i += 1
            output_file = output_file_template % i

    if 'filter' in kwargs:
        filter_on = kwargs.get('filter')
        command = 'cat %s | egrep -v %s > %s ' % (' '.join(list_of_file), filter_on, output_file)
    else:
        command = 'cat %s > %s ' % (' '.join(list_of_file), output_file)

    return_code = command_runner.run_command(command)
    if return_code == 0:
        return output_file
    return None
def run_blast(contig_file, genome_file):
    """Blast ``contig_file`` against the ``genome_file`` database with blastn.

    The tabular (``-outfmt 6``) result is written to
    ``<contig_file>.blast6out``.

    Returns the output path, or None when blastn exits with a non-zero status.
    """
    blastn_plus_bin = '/ifs/software/linux_x86_64/blast+/current/bin/blastn'
    output_file = '%s.blast6out' % contig_file
    command = '%s -query %s -db %s -max_target_seqs 1 -outfmt 6 -out %s' % (
        blastn_plus_bin, contig_file, genome_file, output_file)
    if command_runner.run_command(command) != 0:
        return None
    return output_file
def clone_jerry(repo):
    """Clone the jerryscript GitHub repository into ``repo``.

    ``git clone`` is run from ``ROOT_DIR``. Raises Exception when
    ``run_command`` returns a truthy value.
    """
    url = 'https://github.com/jerryscript-project/jerryscript.git'
    command = ['git', 'clone', url, repo]
    failed = run_command(command, cwd=ROOT_DIR, debug=True)
    if failed:
        raise Exception(f'{command} failed!')
def run_clc_assemble(fastq_file_name, word_size=None, output_dir='clc_bio', **kwarg):
    """Assemble ``fastq_file_name`` with the CLC de novo assembler.

    ``output_dir`` is created when missing and the contigs are written to
    ``<output_dir>/contigs.fa``. ``word_size``, when given, is passed with
    ``-w``. Standard output goes to ``<output_dir>.log``.

    Returns the contig file path when it exists after the run, None
    otherwise.
    """
    log_file = '%s.log' % output_dir
    if not os.path.exists(output_dir):
        command_runner.run_command('mkdir %s' % output_dir)

    # Write where the result is looked up below: the output path follows
    # output_dir instead of being hard-coded to clc_bio/.
    contig_path = '%s/contigs.fa' % output_dir
    if word_size:
        command = "%s -v -w %s -q %s -o %s -b 200 -m 100 2>&1 >%s " % (
            clc_novo_bin, word_size, fastq_file_name, contig_path, log_file)
    else:
        command = "%s -v -q %s -o %s -b 200 -m 100 2>&1 >%s " % (
            clc_novo_bin, fastq_file_name, contig_path, log_file)
    command_runner.run_command(command)

    contig_files = glob(contig_path)
    contig_file_name = None
    if len(contig_files) == 1:
        contig_file_name = contig_files[0]
    return contig_file_name
def SNP_call_with_samtools(samtools_dir, name, bam_file, ref_file):
    """Run the samtools/bcftools SNP calling and filtering for one BAM file.

    Two files named after ``name`` are produced: the raw calls
    (``_sorted_mrk_dup_fixed_samtools.vcf``) and the filtered calls
    (``_sorted_mrk_dup_fixed_samtools_filterd20q60.vcf``). The function
    returns nothing and does not check exit statuses.
    """
    samtools_bin = os.path.join(samtools_dir, "samtools")

    # bcftools is either in a bcftools/ sub-directory or directly in
    # samtools_dir.
    nested_bcftools = os.path.join(samtools_dir, "bcftools/bcftools")
    if os.path.exists(nested_bcftools):
        bcftools_bin = nested_bcftools
    else:
        bcftools_bin = os.path.join(samtools_dir, "bcftools")

    samtools_raw_vcf = name + '_sorted_mrk_dup_fixed_samtools.vcf'
    samtools_raw_filtered = name + '_sorted_mrk_dup_fixed_samtools_filterd20q60.vcf'

    pileup_command = "%s mpileup -d 50000 -ADESuf %s %s | %s view -gv - > %s" % (
        samtools_bin, ref_file, bam_file, bcftools_bin, samtools_raw_vcf)
    command_runner.run_command(pileup_command)

    # Keep header lines and the records whose 6th column is above 60.
    filter_command = "vcfutils.pl varFilter -d 20 %s | awk '{if (/^#/ || $6>60){print}}' > %s" % (
        samtools_raw_vcf, samtools_raw_filtered)
    command_runner.run_command(filter_command)
def run_blast(contig_file, genome_file):
    """Run blastn with ``contig_file`` as query and ``genome_file`` as database.

    At most one target sequence is reported per query, in tabular format 6,
    in ``<contig_file>.blast6out``.

    Returns that path when blastn exits with status 0, None otherwise.
    """
    blastn_plus_bin = '/ifs/software/linux_x86_64/blast+/current/bin/blastn'
    output_file = '%s.blast6out' % contig_file
    arguments = (blastn_plus_bin, contig_file, genome_file, output_file)
    return_code = command_runner.run_command(
        '%s -query %s -db %s -max_target_seqs 1 -outfmt 6 -out %s' % arguments)
    return output_file if return_code == 0 else None
def apply_patch(repo, patch_file):
    """Apply ``patch_file`` to the git checkout in ``repo`` with ``git apply``.

    The patch path is made absolute because git runs with ``repo`` as its
    working directory.

    Raises Exception when ``patch_file`` is not an existing file or when
    ``run_command`` returns a truthy value.
    """
    if not isfile(patch_file):
        # The message names the patch file (it used to mention a hash file).
        raise Exception(f'Cannot find patch file: {patch_file}')

    patch_file = abspath(patch_file)
    command = ['git', 'apply', patch_file]
    if run_command(command, cwd=repo, debug=True):
        raise Exception(f'{command} failed!')
def trim_fastq_to_length(fastq_file, output_file, length):
    """Trim every read of ``fastq_file`` to its first ``length`` characters.

    The sequence and quality lines (2nd and 4th line of each 4-line record)
    are cut with awk; the other lines are copied unchanged. Files ending in
    ``.gz`` are read with ``zcat``. The result is written to ``output_file``.

    Returns the value returned by ``command_runner.run_command``.
    """
    opener = 'cat'
    if fastq_file.endswith('.gz'):
        opener = 'zcat'
    # awk's modulo operator is written "%%" so that Python's %-formatting
    # emits a literal "%"; a bare "%4==" is an invalid format specifier and
    # raises ValueError.
    command = ('''%s %s | awk '{if (NR%%4==2 || NR%%4==0){print substr($0, 1,%s)}'''
               '''else{print $0}}' > %s''' % (opener, fastq_file, length, output_file))
    return_code = command_runner.run_command(command)
    return return_code
def run_all_fastq_files(directory):
    """Run the allele frequency calculation on every ``*_dir`` of ``directory``.

    Each sub-directory is processed with
    ``calculate_base_frequency_for_snps`` using the directory name without its
    ``_dir`` suffix. Once all of them are done, one
    ``samtools_snps_<sample>.allelefreq`` file per sample is created in
    ``directory`` by concatenating the matching per-directory files.
    """
    directory = os.path.abspath(directory)
    all_samples = set()
    for sub_dir in glob(os.path.join(directory, '*_dir')):
        # print() with one argument behaves the same on Python 2 and 3; the
        # print statement is a SyntaxError on Python 3.
        print(sub_dir)
        name = os.path.basename(sub_dir)[:-len("_dir")]
        all_samples.update(calculate_base_frequency_for_snps(sub_dir, name))

    for sample in all_samples:
        # concatenate the allele frequency file per samples
        merged_file = os.path.join(directory, 'samtools_snps_%s.allelefreq' % sample)
        command = 'cat %s/*_dir/*_%s.allelefreq > %s' % (directory, sample, merged_file)
        command_runner.run_command(command)
    return
def run_soapdenovo(fastq_file_name, max_read_len=101, **kwarg):
    """Assemble ``fastq_file_name`` with SOAPdenovo (pregraph then contig).

    Everything is written to a ``soapdenovo`` directory (created when
    missing) relative to the current working directory. Standard output of
    both steps goes to ``soapdenovo.log``.

    Returns ``soapdenovo/graph.contig`` when it exists after the run, None
    otherwise.
    """
    log_file = 'soapdenovo.log'
    if not os.path.exists('soapdenovo'):
        command_runner.run_command('mkdir soapdenovo')

    config_file = 'soapdenovo/config_file'
    # "with" closes the file even if the write fails.
    with open(config_file, 'w') as open_file:
        open_file.write("max_rd_len=%s\n[LIB]\nq=%s\n" % (max_read_len, fastq_file_name))

    command = '%s pregraph -K 29 -s %s -o soapdenovo/graph -p 1 2>&1 >%s' % (
        SOAPdenovo_bin, config_file, log_file)
    command_runner.run_command(command)
    # The second step appends to the log started by the first one.
    command = '%s contig -g soapdenovo/graph 2>&1 >>%s' % (SOAPdenovo_bin, log_file)
    command_runner.run_command(command)

    contig_files = glob('soapdenovo/graph.contig')
    contig_file_name = None
    if len(contig_files) == 1:
        contig_file_name = contig_files[0]
    return contig_file_name
def run_idba(fastq_file_name, max_read_len=200, **kwarg):
    """Assemble ``fastq_file_name`` with idba_ud.

    The output directory is ``idba_ud`` and standard output goes to
    ``idba_ud.log``, both relative to the current working directory.
    ``max_read_len`` is the value given to ``--min_contig``.

    Returns ``idba_ud/contig.fa`` when it exists after the run, None
    otherwise.
    """
    template = ("%s -r %s -o idba_ud --min_contig %s --num_threads 1 --mink 40 "
                "--min_count 8 --min_support 4 2>&1 >%s")
    command_runner.run_command(
        template % (idba_ud_bin, fastq_file_name, max_read_len, 'idba_ud.log'))

    found = glob('idba_ud/contig.fa')
    return found[0] if len(found) == 1 else None
def fastq_2_bam(fastq_file, rgid, qual, files_and_dir, fifo):
    """Convert a fastq file with Picard FastqToSam and prepare a read-name fix.

    NOTE(review): this function looks unfinished. ``fastqToSam_jar`` and
    ``sam_file`` are empty strings, ``output_dir``, ``fastq_name`` and
    ``sam_file2`` are not defined in this function (they must exist at module
    level for it to run), and the last command is built but never executed.
    Only the exit-status comparison is corrected here.
    """
    import logging

    fastqToSam_jar = ''
    sam_file = ''
    command = "java -Xmx2G -jar %s F1=%s O=%s RG=%s LB=%s SM=%s, QUALITY_FORMAT=%s" % (
        fastqToSam_jar, fastq_file, sam_file, rgid, rgid, rgid, qual)
    if fifo:
        # Run in the background when the output is a fifo.
        command += " &"
    return_code = command_runner.run_command(command)
    # Compare by value: "is not 0" tests object identity, not equality.
    if return_code != 0:
        logging.error('FastqToSam failed with status %s', return_code)
    files_and_dir.append(sam_file)

    # NOTE(review): output_dir and fastq_name are not defined in this scope.
    sam_file2_tmp = '%s.sam.tmp' % os.path.join(output_dir, fastq_name)
    if fifo:
        if os.path.exists(sam_file2_tmp):
            os.remove(sam_file2_tmp)
        command = "mkfifo %s" % sam_file2_tmp
        return_code = command_runner.run_command(command)

    command = fix_read_name_in_sam_command(129, header=False)
    # NOTE(review): sam_file2 is not defined in this scope, and the command
    # built here is never run.
    command += """ %s > %s""" % (sam_file2, sam_file2_tmp)
def run_idba(fastq_file_name, max_read_len=200, **kwarg):
    """Run idba_ud on ``fastq_file_name`` and return the contig file.

    idba_ud writes to the ``idba_ud`` directory and its standard output is
    redirected to ``idba_ud.log``. ``--min_contig`` receives
    ``max_read_len``.

    Returns ``idba_ud/contig.fa`` if present after the run, else None.
    """
    log_file = 'idba_ud.log'
    arguments = (idba_ud_bin, fastq_file_name, max_read_len, log_file)
    command = ("%s -r %s -o idba_ud --min_contig %s --num_threads 1 --mink 40 "
               "--min_count 8 --min_support 4 2>&1 >%s") % arguments
    command_runner.run_command(command)

    contig_files = glob('idba_ud/contig.fa')
    if len(contig_files) != 1:
        return None
    return contig_files[0]
def run_flash(output_dir, fastq_1, fastq_2, overlap):
    """Merge a read pair with flash.

    Returns the path of ``out.extendedFrags.fastq`` in ``output_dir``, or
    None when flash exits with a non-zero status or the merged file is empty.
    """
    flash_command = "flash -m %s -d %s %s %s" % (overlap, output_dir, fastq_1, fastq_2)
    if command_runner.run_command(flash_command) != 0:
        return None

    out_extended = os.path.join(output_dir, "out.extendedFrags.fastq")
    # flash can finish successfully without having merged anything
    nothing_merged = os.path.getsize(out_extended) == 0
    return None if nothing_merged else out_extended
def trim_fastq_to_length(fastq_file, output_file, length):
    """Write ``fastq_file`` to ``output_file`` with reads cut to ``length``.

    Lines 2 and 4 of every 4-line fastq record (sequence and qualities) are
    reduced to their first ``length`` characters with awk; header and
    separator lines are kept as they are. ``zcat`` is used instead of ``cat``
    for files ending in ``.gz``.

    Returns the value returned by ``command_runner.run_command``.
    """
    opener = 'zcat' if fastq_file.endswith('.gz') else 'cat'
    # The awk program needs literal "%" characters (NR%4). They are doubled
    # here because the string goes through Python's %-formatting, where a
    # single "%4==" raises ValueError (unsupported format character).
    awk_program = "{if (NR%%4==2 || NR%%4==0){print substr($0, 1,%s)}else{print $0}}"
    command = ("%s %s | awk '" + awk_program + "' > %s") % (
        opener, fastq_file, length, output_file)
    return command_runner.run_command(command)
def run_one_fastq_file(fastq_file, output_dir, assembly_function_list, estimated_size=600,
                       subsample_nb_read=None, rg_ids=[], read1_fasta=None, name=None,
                       force_merge=False, adapter_file=None):
    """Assemble one fastq file with each assembler and keep the best result.

    ``output_dir`` is created when missing. Every function of
    ``assembly_function_list`` is run through ``run_assembly``; when it
    yields a contig file, the read 1 and read 2 contigs are merged. The
    assembly chosen by ``get_best_assembly_merged`` is then copied to
    ``<output_dir>/best_assembly.fa``, whose path is returned.
    """
    fastq_file = os.path.abspath(fastq_file)
    if not os.path.exists(output_dir):
        command_runner.run_command('mkdir %s' % (output_dir))

    for assembler in assembly_function_list:
        # Assemble with provided assembler
        contig_file, _nb_seq, _max_len = run_assembly(
            assembler, fastq_file, output_dir, estimated_size=estimated_size,
            subsample_nb_read=subsample_nb_read, rg_ids=rg_ids, name=name,
            adapter_file=adapter_file)
        if not contig_file:
            continue
        # Merge read one and read2 contig
        # TODO: This function gets run twice need to change that as the second run is not useful
        merge_read1_and_read2_contigs(name, read1_contig=read1_fasta,
                                      read2_contigs=contig_file,
                                      output_dir=os.path.dirname(contig_file))

    _best_assembler_name, best_assembly_file = get_best_assembly_merged(
        output_dir, read1_fasta, name, force_merge)
    best_copy = os.path.join(output_dir, "best_assembly.fa")
    command_runner.run_command("cp %s %s" % (best_assembly_file, best_copy))
    return best_copy
def create_sequence_dictionary(picard_dir, genome_file):
    """Create the Picard sequence dictionary of ``genome_file`` when needed.

    The dictionary is ``genome_file`` with its extension replaced by
    ``.dict``. It is (re)built when it does not exist or is older than the
    genome file.

    Returns the dictionary path, or None when Picard exits with a non-zero
    status.
    """
    genome_dict = os.path.splitext(genome_file)[0] + '.dict'

    up_to_date = (os.path.exists(genome_dict)
                  and os.path.getmtime(genome_file) <= os.path.getmtime(genome_dict))
    if up_to_date:
        return genome_dict

    jar_file = os.path.join(picard_dir, 'CreateSequenceDictionary.jar')
    command = 'java -jar %s REFERENCE=%s O=%s' % (jar_file, genome_file, genome_dict)
    if command_runner.run_command(command) != 0:
        return None
    return genome_dict
def checkout_to_hash(repo, hash_file):
    """Check out, in ``repo``, the git revision stored in ``hash_file``.

    Raises Exception when ``hash_file`` is not an existing file or when
    ``run_command`` returns a truthy value.
    """
    if not isfile(hash_file):
        raise Exception('Cannot find hash file in the given directory.')

    with open(hash_file, 'r') as hash_f:
        # Drop the trailing newline (and any surrounding blanks): the command
        # is an argument list, so the raw file content would reach git as a
        # revision name ending in "\n".
        git_hash = hash_f.read().strip()

    command = ['git', 'checkout', git_hash]
    if run_command(command, cwd=repo, debug=True):
        raise Exception(f'{command} failed!')
def merge_snps_files(directory):
    """Merge the ``*_dir/*_phased.vcf`` files of ``directory`` into one VCF.

    The merged file, ``<N>_phased_snps_files.vcf`` (``N`` is the number of
    input files), starts with the ``#`` header lines of the first input file,
    followed by the non-header lines of all input files. The temporary
    ``.header`` and ``.body`` files are deleted at the end.

    Returns 0 on success, 1 when no input file is found or the concatenation
    fails, otherwise the status of the first command that failed.
    """
    all_vcf_files = glob(os.path.join(directory, '*_dir', '*_phased.vcf'))
    if not all_vcf_files:
        # Without input files the header step would fail on all_vcf_files[0].
        return 1

    prefix = os.path.join(directory, '%s' % len(all_vcf_files))
    output_file_body = prefix + '_snps_files.vcf.body'
    output_file_header = prefix + '_snps_files.vcf.header'
    output_file = prefix + '_phased_snps_files.vcf'

    output_file_body = concatenate_file(all_vcf_files, output_file_body, filter="^#")
    if not output_file_body:
        # The body could not be built: report it instead of carrying on with
        # a missing file.
        return 1

    return_code = command_runner.run_command(
        'grep "^#" %s > %s ' % (all_vcf_files[0], output_file_header))
    if return_code == 0:
        return_code = command_runner.run_command(
            'cat %s %s > %s ' % (output_file_header, output_file_body, output_file))
    if return_code == 0:
        return_code = command_runner.run_command(
            'rm %s %s' % (output_file_header, output_file_body))
    return return_code
def fastq_2_bam(fastq_file, rgid, qual, files_and_dir, fifo):
    """Run Picard FastqToSam on ``fastq_file`` and build a read-name fix command.

    NOTE(review): the function appears to be a work in progress. The jar path
    and the SAM output name are empty strings, the names ``output_dir``,
    ``fastq_name`` and ``sam_file2`` are not defined inside the function, and
    the final command is assembled but not run. Only the exit-status
    comparison is corrected in this version.
    """
    import logging

    fastqToSam_jar = ''
    sam_file = ''
    command = "java -Xmx2G -jar %s F1=%s O=%s RG=%s LB=%s SM=%s, QUALITY_FORMAT=%s" % (
        fastqToSam_jar, fastq_file, sam_file, rgid, rgid, rgid, qual)
    if fifo:
        command += " &"
    return_code = command_runner.run_command(command)
    # "!=" compares the value; "is not 0" compared object identity.
    if return_code != 0:
        logging.error('FastqToSam failed with status %s', return_code)
    files_and_dir.append(sam_file)

    # NOTE(review): output_dir and fastq_name must come from module scope.
    sam_file2_tmp = '%s.sam.tmp' % os.path.join(output_dir, fastq_name)
    if fifo:
        # Recreate the fifo from scratch.
        if os.path.exists(sam_file2_tmp):
            os.remove(sam_file2_tmp)
        return_code = command_runner.run_command("mkfifo %s" % sam_file2_tmp)

    # NOTE(review): sam_file2 is undefined here and this command is never run.
    command = fix_read_name_in_sam_command(129, header=False)
    command += """ %s > %s""" % (sam_file2, sam_file2_tmp)
def run_clc_assemble(fastq_file_name, word_size=None, output_dir='clc_bio', **kwarg):
    """Run the CLC de novo assembler on ``fastq_file_name``.

    The contigs are written to ``<output_dir>/contigs.fa`` (the directory is
    created when missing) and standard output to ``<output_dir>.log``. A
    truthy ``word_size`` is passed with ``-w``.

    Returns the contig file path if it exists after the run, else None.
    """
    log_file = '%s.log' % output_dir
    command = 'mkdir %s' % output_dir
    if not os.path.exists(output_dir):
        command_runner.run_command(command)

    # The -o target follows output_dir; it used to be fixed to
    # clc_bio/contigs.fa, so any other output_dir never received the contigs.
    contig_target = '%s/contigs.fa' % output_dir
    word_size_option = '-w %s ' % word_size if word_size else ''
    command = "%s -v %s-q %s -o %s -b 200 -m 100 2>&1 >%s " % (
        clc_novo_bin, word_size_option, fastq_file_name, contig_target, log_file)
    command_runner.run_command(command)

    contig_files = glob(contig_target)
    if len(contig_files) == 1:
        return contig_files[0]
    return None
def merge_all_summary_files_from_directories(directory):
    """Merge the ``*_dir/*summary_stat.txt`` files of ``directory``.

    ``all_summary_stat.txt`` is created in ``directory``: the first line of
    the first summary file is used as header, followed by the lines of all
    summary files that do not match ``^name``. The temporary ``.header`` and
    ``.body`` files are removed at the end.

    Returns 0 on success, 1 when there is no summary file or the merge of the
    bodies returns a falsy result, otherwise the status of the first failing
    command.
    """
    all_summary_files = glob(os.path.join(directory, '*_dir', '*summary_stat.txt'))
    if not all_summary_files:
        # Nothing to merge: all_summary_files[0] below would raise IndexError.
        return 1

    output_file_header = os.path.join(directory, 'all_summary_stat.txt.header')
    output_file = os.path.join(directory, 'all_summary_stat.txt')
    output_file_body = os.path.join(directory, 'all_summary_stat.txt.body')

    output_file_body = merge_by_chunck(all_summary_files, concatenate_file, output_file_body,
                                       filter="^name")
    if not output_file_body:
        # No body file was produced; do not build the final file from it.
        return 1

    commands = [
        'head -n 1 %s > %s ' % (all_summary_files[0], output_file_header),
        'cat %s %s > %s ' % (output_file_header, output_file_body, output_file),
        'rm %s %s' % (output_file_header, output_file_body),
    ]
    return_code = 0
    for command in commands:
        return_code = command_runner.run_command(command)
        if return_code != 0:
            # Stop at the first failure and report its status.
            break
    return return_code
def merge_all_snps_files_from_directories(directory):
    """Merge the ``*_dir/*_snps_files.vcf`` files of ``directory``.

    ``all_consensus_snps_files.vcf`` is created in ``directory`` from the
    ``#`` header lines of the first VCF file followed by the non-header lines
    of all VCF files. The temporary ``.header`` and ``.body`` files are
    removed at the end.

    Returns 0 on success, 1 when there is no VCF file or the merge of the
    bodies returns a falsy result, otherwise the status of the first failing
    command.
    """
    all_vcf_files = glob(os.path.join(directory, '*_dir', '*_snps_files.vcf'))
    if not all_vcf_files:
        # Nothing to merge: all_vcf_files[0] below would raise IndexError.
        return 1

    output_file_header = os.path.join(directory, 'all_consensus_snps_files.vcf.header')
    output_file = os.path.join(directory, 'all_consensus_snps_files.vcf')
    output_file_body = os.path.join(directory, 'all_consensus_snps_files.vcf.body')

    output_file_body = merge_by_chunck(all_vcf_files, concatenate_file, output_file_body,
                                       filter="^#")
    if not output_file_body:
        # No body file was produced; do not build the final file from it.
        return 1

    commands = [
        'grep "^#" %s > %s ' % (all_vcf_files[0], output_file_header),
        'cat %s %s > %s ' % (output_file_header, output_file_body, output_file),
        'rm %s %s' % (output_file_header, output_file_body),
    ]
    return_code = 0
    for command in commands:
        return_code = command_runner.run_command(command)
        if return_code != 0:
            # Stop at the first failure and report its status.
            break
    return return_code
def run_flash(output_dir, fastq_1, fastq_2, overlap):
    """Run flash on a read pair and return the merged reads file.

    ``overlap`` is passed with ``-m`` and ``output_dir`` with ``-d``.

    Returns ``<output_dir>/out.extendedFrags.fastq``, or None when flash
    fails (non-zero status) or when that file is empty.
    """
    arguments = (overlap, output_dir, fastq_1, fastq_2)
    return_code = command_runner.run_command("flash -m %s -d %s %s %s" % arguments)
    if return_code != 0:
        return None

    out_extended = os.path.join(output_dir, "out.extendedFrags.fastq")
    # A successful run that merged nothing leaves an empty file.
    if os.path.getsize(out_extended) == 0:
        return None
    return out_extended
def run_velvetOptimiser(fastq_file_name, low_k=59, high_k=99, outputdir='velvetopt', **kwarg):
    """Run VelvetOptimiser on ``fastq_file_name`` for k between the two bounds.

    ``outputdir`` is wiped first if it exists; the standard output of the
    optimiser is written to ``<outputdir>.log``.

    Returns ``<outputdir>/contigs.fa`` if it exists after the run, else None.
    """
    if os.path.exists(outputdir):
        # Start from a clean directory.
        command_runner.run_command('rm -rf %s' % outputdir)

    arguments = (velvetOptimiser_bin, fastq_file_name, low_k, high_k, outputdir,
                 '%s.log' % outputdir)
    command = "%s -f '-fastq -short %s' --s %s --e %s --k max --c max --d %s 2>&1 >%s" % arguments
    command_runner.run_command(command)

    # Exactly one contig file is expected after a successful run.
    contig_files = glob('%s/contigs.fa' % outputdir)
    if len(contig_files) != 1:
        return None
    return contig_files[0]
def run_soapdenovo(fastq_file_name, max_read_len=101, **kwarg):
    """Run the SOAPdenovo pregraph and contig steps on ``fastq_file_name``.

    A ``soapdenovo`` directory is created in the current working directory
    when missing; it receives the configuration file and the graph files.
    The standard output of both steps is written to ``soapdenovo.log``.

    Returns ``soapdenovo/graph.contig`` if it exists after the run, else None.
    """
    log_file = 'soapdenovo.log'
    if not os.path.exists('soapdenovo'):
        command_runner.run_command('mkdir soapdenovo')

    config_file = 'soapdenovo/config_file'
    config_content = "max_rd_len=%s\n[LIB]\nq=%s\n" % (max_read_len, fastq_file_name)
    # The context manager guarantees the handle is closed if write() raises.
    with open(config_file, 'w') as config_handle:
        config_handle.write(config_content)

    pregraph_command = '%s pregraph -K 29 -s %s -o soapdenovo/graph -p 1 2>&1 >%s' % (
        SOAPdenovo_bin, config_file, log_file)
    command_runner.run_command(pregraph_command)

    contig_command = '%s contig -g soapdenovo/graph 2>&1 >>%s' % (SOAPdenovo_bin, log_file)
    command_runner.run_command(contig_command)

    contig_files = glob('soapdenovo/graph.contig')
    if len(contig_files) == 1:
        return contig_files[0]
    return None
def copy_file_across(source, destination, server_source=None, server_destination=None,
                     overwrite=False):
    """Copy ``source`` to ``destination`` with scp, locally or across servers.

    When ``source`` is a file and ``destination`` a directory, the existence
    check is made on ``<destination>/<basename of source>``. Nothing is copied
    when the destination file already exists and ``overwrite`` is false; a
    warning is logged instead. When both servers are given, the file goes
    through a temporary copy in ``/tmp`` that is deleted afterwards.
    """
    source_is_file = check_file_or_dir(source, server_source) == 'file'
    if source_is_file and check_file_or_dir(destination, server_destination) == 'dir':
        destination_file = os.path.join(destination, os.path.basename(source))
    else:
        destination_file = destination

    already_there = checkFile(destination_file, server_destination)
    if already_there and not overwrite:
        if server_destination:
            logging.warning('%s exist on %s use force to overwrite' % (
                destination_file, server_destination))
        else:
            logging.warning('%s exist use force to overwrite' % destination_file)
        return

    if server_source and server_destination:
        # Remote to remote: fetch locally, push, then drop the local copy.
        tmp_file = '/tmp/%s' % (os.path.basename(source))
        command_runner.run_command('scp %s:%s %s' % (server_source, source, tmp_file))
        command_runner.run_command('scp %s %s:%s' % (tmp_file, server_destination, destination))
        os.remove(tmp_file)
        return

    if server_source:
        command = 'scp %s:%s %s' % (server_source, source, destination)
    elif server_destination:
        command = 'scp %s %s:%s' % (source, server_destination, destination)
    else:
        command = 'scp %s %s' % (source, destination)
    command_runner.run_command(command)
def clean_fastq(fastq_file, adapter_file=None, rg_ids=[], subsample_nb_read=None):
    """Filter, trim and optionally subsample ``fastq_file``.

    Steps, each writing a new file named after its input:
      1. keep the reads of ``rg_ids`` (only when ``rg_ids`` is not empty);
      2. adapter trimming with scythe (only when ``adapter_file`` is given);
      3. quality trimming with sickle (always);
      4. subsampling with seqtk (only when ``subsample_nb_read`` is given).
    Steps 2 to 4 are skipped when their output file already exists.

    Returns the path of the last file produced.
    """
    if rg_ids:
        fastq_file = keep_read_from_samples(fastq_file, rg_ids)

    if adapter_file:
        adapter_trim = fastq_file + '.adapter_trimmed'
        if not os.path.exists(adapter_trim):
            command_runner.run_command(
                "scythe -q sanger -a %s -o %s %s" % (adapter_file, adapter_trim, fastq_file))
        fastq_file = adapter_trim

    qual_trim = fastq_file + ".qual_trimmed"
    if not os.path.exists(qual_trim):
        command_runner.run_command(
            "sickle se -f %s -t sanger -o %s" % (fastq_file, qual_trim))
    fastq_file = qual_trim

    if not subsample_nb_read:
        return fastq_file

    sub_sampled = qual_trim + ".%s" % subsample_nb_read
    if not os.path.exists(sub_sampled):
        command_runner.run_command(
            "seqtk sample %s %s > %s" % (fastq_file, subsample_nb_read, sub_sampled))
    return sub_sampled
def run_assembly(assembly_function, fastq_file, output_dir=None, estimated_size=600,
                 subsample_nb_read=None, rg_ids=[], name=None, adapter_file=None):
    """Run one assembler on a cleaned version of ``fastq_file``.

    If ``output_dir`` exists, the process changes into it for the duration of
    the cleaning and assembly, then changes back, including when one of those
    steps raises. ``name`` defaults to the base name of the fastq file without
    extension. Any ``merged_consensus.fa`` sitting next to the contig file is
    deleted.

    Returns ``(corrected_contig_file, nb_seq, max_len)`` from
    ``correct_contig_file``, or ``(None, 0, 0)`` when the assembler returned
    no contig file.
    """
    if name is None:
        name, _ext = os.path.splitext(os.path.basename(fastq_file))

    current_dir = None
    if output_dir and os.path.exists(output_dir):
        logging.debug('change directory to %s' % output_dir)
        current_dir = os.getcwd()
        os.chdir(output_dir)

    contig_file = None
    try:
        fastq_file = clean_fastq(fastq_file, adapter_file=adapter_file, rg_ids=rg_ids,
                                 subsample_nb_read=subsample_nb_read)
        contig_file = assembly_function(fastq_file, estimated_size=estimated_size)
        if contig_file:
            # Make the path absolute before leaving the assembly directory.
            contig_file = os.path.abspath(contig_file)
            merged_consensus = os.path.join(os.path.dirname(contig_file),
                                            'merged_consensus.fa')
            if os.path.exists(merged_consensus):
                logging.debug(
                    'remove the merged_consensus.fa that already exists before assembling')
                command_runner.run_command('rm -f %s' % (merged_consensus))
    finally:
        # Restore the working directory even when a step above raised.
        if current_dir:
            logging.debug('change directory back to %s' % current_dir)
            os.chdir(current_dir)

    if not contig_file:
        return (None, 0, 0)
    corrected_contig_file, nb_seq, max_len = correct_contig_file(contig_file, name)
    return (corrected_contig_file, nb_seq, max_len)
def convert_untrimmed_read(fastq_file, output_dir, rgid, libid, smid,
                           picard_dir, files_and_dir, illumina, fifo):
    """Convert a fastq file to SAM with Picard and rewrite its read names.

    Two commands are run:
    1. FastqToSam.jar (from picard_dir) writes '<output_dir>/<fastq name>.sam'
       with the given read group (rgid), library (libid) and sample (smid);
    2. the command built by fix_read_name_in_sam_command(65, header=False)
       reads that SAM file and writes '<sam file>.tmp'.

    illumina -- truthy selects QUALITY_FORMAT=Illumina, otherwise Standard.
    fifo -- truthy makes both outputs named pipes (created with mkfifo after
        removing any existing path) and starts both commands in the
        background with '&'.
    files_and_dir -- list; both output paths are appended to it.

    A non-zero return code is logged but does not stop the function.

    Returns the path of the '<sam file>.tmp' output.
    """
    fastq_name, _ext = os.path.splitext(os.path.basename(fastq_file))
    sam_file = '%s.sam' % os.path.join(output_dir, fastq_name)
    if fifo:
        if os.path.exists(sam_file):
            os.remove(sam_file)
        command_runner.run_command("mkfifo %s" % sam_file)

    fastqToSam_jar = os.path.join(picard_dir, "FastqToSam.jar")
    if illumina:
        qual = "Illumina"
    else:
        qual = "Standard"
    # NOTE(review): the ',' right after 'SM=%s' looks like a typo that would
    # end up in the sample name -- left unchanged here, confirm before fixing.
    command = "java -Xmx2G -jar %s F1=%s O=%s RG=%s LB=%s SM=%s, QUALITY_FORMAT=%s" % (
        fastqToSam_jar, fastq_file, sam_file, rgid, libid, smid, qual)
    if fifo:
        command += " &"
    return_code = command_runner.run_command(command)
    # Compare by value: 'is not 0' relies on int identity.
    if return_code != 0:
        logging.error('FastqToSam returned %s for %s' % (return_code, fastq_file))
    files_and_dir.append(sam_file)

    read_sam_tmp = '%s.tmp' % (sam_file)
    if fifo:
        if os.path.exists(read_sam_tmp):
            os.remove(read_sam_tmp)
        command_runner.run_command("mkfifo %s" % read_sam_tmp)

    command = fix_read_name_in_sam_command(65, header=False)
    command += """ %s > %s""" % (sam_file, read_sam_tmp)
    if fifo:
        command += " &"
    return_code = command_runner.run_command(command)
    if return_code != 0:
        logging.error('Fixing read names returned %s for %s' % (return_code, sam_file))
    files_and_dir.append(read_sam_tmp)
    return read_sam_tmp
def run_one_fastq_file(fastq_file, output_dir, assembly_function_list,
                       estimated_size=600, subsample_nb_read=None, rg_ids=[],
                       read1_fasta=None, name=None, force_merge=False,
                       adapter_file=None):
    """Assemble one fastq file with every assembler and keep the best result.

    output_dir is created with 'mkdir' when it does not exist. Each function
    of assembly_function_list is run through run_assembly(); when it yields a
    contig file, that file is merged with read1_fasta by
    merge_read1_and_read2_contigs() in the contig file's directory.
    Afterwards the assembly chosen by get_best_assembly_merged() is copied to
    '<output_dir>/best_assembly.fa'.

    Command return codes are not checked.

    Returns the path '<output_dir>/best_assembly.fa'.
    """
    fastq_file = os.path.abspath(fastq_file)
    best_assembly_copy = os.path.join(output_dir, "best_assembly.fa")

    if not os.path.exists(output_dir):
        command_runner.run_command('mkdir %s' % (output_dir))

    for assembly_function in assembly_function_list:
        # Assemble with the provided assembler
        contig_file, _nb_seq, _max_len = run_assembly(
            assembly_function, fastq_file, output_dir,
            estimated_size=estimated_size,
            subsample_nb_read=subsample_nb_read,
            rg_ids=rg_ids, name=name, adapter_file=adapter_file)
        if not contig_file:
            continue
        # Merge read one and read2 contig
        # TODO: This function gets run twice need to change that as the second run is not useful
        merge_read1_and_read2_contigs(
            name, read1_contig=read1_fasta, read2_contigs=contig_file,
            output_dir=os.path.dirname(contig_file))

    _best_assembler_name, best_assembly_file = get_best_assembly_merged(
        output_dir, read1_fasta, name, force_merge)
    command_runner.run_command(
        "cp %s %s" % (best_assembly_file, best_assembly_copy))
    return best_assembly_copy
def align_short_reads_se(fastq_file1, genome_file, output_dir, sample_name,
                         thread, BWA_bin, samtools_bin, picard_dir,
                         read_group_command, files_and_dir, illumina, fifo):
    """Align single-end reads with 'bwa aln' / 'bwa samse' and sort the result.

    fastq_file1 -- reads to align.
    genome_file -- reference passed to both bwa commands.
    output_dir -- receives '<fastq name>.sai' and the sorted output.
    sample_name -- the sort output name is '<output_dir>/<sample_name>_sorted'.
    thread -- value given to 'bwa aln -t'.
    BWA_bin, samtools_bin -- executables to call.
    read_group_command -- inserted verbatim after 'bwa samse'.
    files_and_dir -- list; the .sai path is appended to it.
    illumina -- truthy adds '-I' to 'bwa aln'.
    picard_dir, fifo -- accepted but not used by this function.

    A non-zero return code is logged but does not stop the function.

    Returns '<output_dir>/<sample_name>_sorted' exactly as it was passed to
    'samtools sort'; no extension is added here.
    """
    fastq_name, _ext = os.path.splitext(os.path.basename(fastq_file1))
    sai_file1 = '%s.sai' % os.path.join(output_dir, fastq_name)

    illumina_str = ""
    if illumina:
        illumina_str = " -I "
    command = '%s aln %s -t %s %s %s > %s' % (
        BWA_bin, illumina_str, thread, genome_file, fastq_file1, sai_file1)
    return_code = command_runner.run_command(command)
    # Compare by value: 'is not 0' relies on int identity.
    if return_code != 0:
        logging.error('bwa aln returned %s for %s' % (return_code, fastq_file1))
    files_and_dir.append(sai_file1)

    # Only one end, so go straight to the sorted bam file.
    bam_file = os.path.join(output_dir, sample_name + "_sorted")
    command = """%s samse %s %s %s %s | %s view -bS - | %s sort - %s""" % (
        BWA_bin, read_group_command, genome_file, sai_file1, fastq_file1,
        samtools_bin, samtools_bin, bam_file)
    return_code = command_runner.run_command(command)
    if return_code != 0:
        logging.error('bwa samse / samtools sort returned %s for %s' % (
            return_code, fastq_file1))
    return bam_file
def run_smalt_paired(consensus_file, read1_fastq, read2_fastq, **kwarg):
    """Index consensus_file with smalt and map both read files onto it.

    Existing '<consensus_file>.sma', '.smi' and '.fai' files are removed
    first, then 'smalt index' is run with the consensus file path as index
    name. read2_fastq is passed through reverse_complement() and
    'smalt map -f samsoft' is run on read1_fastq and that result.

    The SAM file is named after the common start of the two fastq paths, as
    returned by longest_common_substr_from_start(), with trailing '_'
    stripped and '.sam' appended. Extra keyword arguments are ignored and
    command return codes are not checked.

    Returns the SAM file path.
    """
    # Drop indexes left over from a previous run.
    for suffix in ('sma', 'smi', 'fai'):
        stale_index = '%s.%s' % (consensus_file, suffix)
        if os.path.exists(stale_index):
            command_runner.run_command('rm -rf %s' % stale_index)

    command_runner.run_command(
        "smalt index %s %s" % (consensus_file, consensus_file))

    common_prefix = longest_common_substr_from_start(read1_fastq, read2_fastq)
    sam_file = common_prefix.rstrip('_') + '.sam'
    read2_fastq_rev_comp = reverse_complement(read2_fastq)
    command_runner.run_command(
        "smalt map -f samsoft -o %s %s %s %s" % (
            sam_file, consensus_file, read1_fastq, read2_fastq_rev_comp))
    return sam_file
def create_sequence_dictionary(picard_dir, genome_file):
    """Create the Picard sequence dictionary of genome_file when needed.

    The dictionary is the genome path with its extension replaced by '.dict'.
    CreateSequenceDictionary.jar (from picard_dir) is only run when that file
    is missing or older than genome_file.

    Returns the dictionary path, or None when the command returned a
    non-zero code.
    """
    genome_dict = os.path.splitext(genome_file)[0] + '.dict'

    up_to_date = (
        os.path.exists(genome_dict)
        and os.path.getmtime(genome_file) <= os.path.getmtime(genome_dict))
    if up_to_date:
        return genome_dict

    jar_path = os.path.join(picard_dir, 'CreateSequenceDictionary.jar')
    return_code = command_runner.run_command(
        'java -jar %s REFERENCE=%s O=%s' % (jar_path, genome_file, genome_dict))
    if return_code != 0:
        return None
    return genome_dict
def convert_untrimmed_read(fastq_file, output_dir, rgid, libid, smid,
                           picard_dir, files_and_dir, illumina, fifo):
    """Convert a fastq file to SAM with Picard, then rewrite the read names.

    Picard's FastqToSam.jar (looked up in picard_dir) writes
    '<output_dir>/<fastq name>.sam' using rgid, libid and smid as read group,
    library and sample. The command returned by
    fix_read_name_in_sam_command(65, header=False) is then applied to that
    file and redirected to '<sam file>.tmp'.

    illumina -- truthy selects QUALITY_FORMAT=Illumina, otherwise Standard.
    fifo -- truthy turns both outputs into named pipes (any existing path is
        removed, then mkfifo is run) and runs both commands in the background
        with '&'.
    files_and_dir -- list that receives both output paths.

    A non-zero return code is logged; processing continues regardless.

    Returns the '<sam file>.tmp' path.
    """
    fastq_name, _ext = os.path.splitext(os.path.basename(fastq_file))
    sam_file = '%s.sam' % os.path.join(output_dir, fastq_name)
    if fifo:
        if os.path.exists(sam_file):
            os.remove(sam_file)
        command_runner.run_command("mkfifo %s" % sam_file)

    fastqToSam_jar = os.path.join(picard_dir, "FastqToSam.jar")
    if illumina:
        qual = "Illumina"
    else:
        qual = "Standard"
    # NOTE(review): the ',' right after 'SM=%s' looks like a typo that would
    # end up in the sample name -- left unchanged here, confirm before fixing.
    command = "java -Xmx2G -jar %s F1=%s O=%s RG=%s LB=%s SM=%s, QUALITY_FORMAT=%s" % (
        fastqToSam_jar, fastq_file, sam_file, rgid, libid, smid, qual)
    if fifo:
        command += " &"
    return_code = command_runner.run_command(command)
    # Value comparison instead of the identity test 'is not 0'.
    if return_code != 0:
        logging.error('FastqToSam returned %s for %s' % (return_code, fastq_file))
    files_and_dir.append(sam_file)

    read_sam_tmp = '%s.tmp' % (sam_file)
    if fifo:
        if os.path.exists(read_sam_tmp):
            os.remove(read_sam_tmp)
        command_runner.run_command("mkfifo %s" % read_sam_tmp)

    command = fix_read_name_in_sam_command(65, header=False)
    command += """ %s > %s""" % (sam_file, read_sam_tmp)
    if fifo:
        command += " &"
    return_code = command_runner.run_command(command)
    if return_code != 0:
        logging.error('Fixing read names returned %s for %s' % (return_code, sam_file))
    files_and_dir.append(read_sam_tmp)
    return read_sam_tmp
def sort_bam_file_per_coordinate(picard_dir, input_bam, output_bam,
                                 overwrite=False,
                                 validation_stringency="LENIENT", **kwargs):
    """Sort input_bam by coordinate into output_bam with Picard SortSam.

    picard_dir -- directory holding SortSam.jar; when falsy nothing is run.
    overwrite -- an existing output_bam is only replaced when this is truthy;
        otherwise a warning is logged and nothing is run.
    validation_stringency -- value of Picard's VALIDATION_STRINGENCY.
    kwargs -- every entry is appended to the command line as 'KEY=VALUE'.

    Returns the return code of the command, or 1 when no command was run.
    """
    if not picard_dir:
        return 1

    extra_options = ' '.join(
        ["%s=%s" % (key, value) for key, value in kwargs.items()])
    sort_jar = os.path.join(picard_dir, 'SortSam.jar')
    command = "java -Xmx4G -jar %s I=%s O=%s SO=coordinate VALIDATION_STRINGENCY=%s %s" % (
        sort_jar, input_bam, output_bam, validation_stringency, extra_options)

    if os.path.exists(output_bam) and not overwrite:
        logging.warning('The file %s exists, use overwrite option to overwrite if applicable.' % output_bam)
        return 1
    return command_runner.run_command(command)
def extend_read1_consensus(fastq_1, fastq_2, extended_sequence_name,
                           extended_sequence_file):
    """Build an extended read1 consensus from the output of run_flash().

    run_flash() is called on the two fastq files in the directory of fastq_1,
    first with 20 and then, when that gives nothing, with 10 as last argument
    (presumably an overlap setting -- confirm against run_flash).

    The resulting file is read as 4-line records. The second line of every
    record is counted, the counts are sorted in decreasing order and the awk
    filter keeps the sequence of the highest count. That sequence is written
    to extended_sequence_file, preceded by a line containing
    extended_sequence_name.

    Returns extended_sequence_file, or None when run_flash() produced nothing
    or the shell pipeline returned a non-zero code.
    """
    output_dir = os.path.dirname(fastq_1)
    out_extended = run_flash(output_dir, fastq_1, fastq_2, 20)
    if not out_extended:
        out_extended = run_flash(output_dir, fastq_1, fastq_2, 10)
    if not out_extended:
        return None

    command_array = [
        "cat %s | paste - - - - | cut -f 2 | sort | uniq -c | sort -nr |" % out_extended,
        " awk '{if($1>best){best=$1;if (length($2)>length(longest)){longest=$2}}} END{print longest}' |",
        """awk 'BEGIN{print "%s"} {print $0}'""" % extended_sequence_name,
        " > %s" % extended_sequence_file,
    ]
    return_code = command_runner.run_command(' '.join(command_array))
    # The original repeated this check twice; once is enough.
    if return_code != 0:
        return None
    return extended_sequence_file
def merge_bam_files_with_picard(list_of_file, output_file=None, **kwargs):
    """Merge bam files with the jar named by mergeSamFilesWithCat_jar.

    This is a generic merging function for bam files. It assumes that all
    the bam files come from mapping to independent contigs (the command is
    run with CAT_SEQUENCE_DICTIONARIES=True).

    list_of_file -- bam files, each passed as 'I=<file>'.
    output_file -- destination; when not given, the first unused
        'tmp_merge_bam_<n>.bam' (n starting at 1) in the current working
        directory is used.
    kwargs -- accepted but not used.

    Returns the output file path, or None when the command returned a
    non-zero code.
    """
    if not output_file:
        # Pick a generic name in the current working directory.
        template = os.path.join(os.getcwd(), 'tmp_merge_bam_%s.bam')
        index = 1
        while os.path.exists(template % index):
            index += 1
        output_file = template % index

    command = 'java -jar -Xmx2G %s VALIDATION_STRINGENCY=SILENT CAT_SEQUENCE_DICTIONARIES=True USE_THREADING=True O=%s ' % (
        mergeSamFilesWithCat_jar, output_file)
    command += ' '.join(['I=%s' % bam_path for bam_path in list_of_file])

    if command_runner.run_command(command) == 0:
        return output_file
    return None
def clean_fastq(fastq_file, adapter_file=None, rg_ids=[], subsample_nb_read=None):
    """Prepare a fastq file for assembly and return the prepared file's path.

    Processing chain (each stage reads the previous stage's output):
    - rg_ids non-empty: keep_read_from_samples(fastq_file, rg_ids);
    - adapter_file given: scythe adapter trimming -> '<input>.adapter_trimmed';
    - always: sickle quality trimming -> '<input>.qual_trimmed';
    - subsample_nb_read given: seqtk sampling ->
      '<qual_trimmed>.<subsample_nb_read>'.

    A stage whose output file already exists is not re-run. Return codes of
    the external commands are ignored.
    """
    source = fastq_file
    if rg_ids:
        source = keep_read_from_samples(source, rg_ids)

    if adapter_file:
        trimmed = source + '.adapter_trimmed'
        if not os.path.exists(trimmed):
            scythe_command = "scythe -q sanger -a %s -o %s %s" % (
                adapter_file, trimmed, source)
            command_runner.run_command(scythe_command)
        source = trimmed

    qual_trim = source + ".qual_trimmed"
    if not os.path.exists(qual_trim):
        sickle_command = "sickle se -f %s -t sanger -o %s" % (source, qual_trim)
        command_runner.run_command(sickle_command)

    result = qual_trim
    if subsample_nb_read:
        result = qual_trim + ".%s" % subsample_nb_read
        if not os.path.exists(result):
            seqtk_command = "seqtk sample %s %s > %s" % (
                qual_trim, subsample_nb_read, result)
            command_runner.run_command(seqtk_command)
    return result