import glob
import os
import shlex
import subprocess
import sys

import pandas as pd
import pysam

# local pipeline helper modules, assumed importable alongside this file
import df_util
import pipe_util
import time_util


def bam_validate(uuid, bam_path, engine, logger):
    step_dir = os.path.dirname(bam_path)
    bam_name = os.path.basename(bam_path)
    validate_file = bam_path + '.validate'
    if pipe_util.already_step(step_dir, bam_name + '_validate', logger):
        logger.info('already completed step `validate` of: %s' % bam_path)
    else:
        logger.info('running step `validate` of: %s' % bam_path)
        home_dir = os.path.expanduser('~')
        mo = int((2 ** 32) / 2) - 1  # MAX_OUTPUT: largest 32-bit signed int
        cmd = ['java', '-d64', '-Xmx16G', '-jar',
               os.path.join(home_dir, 'tools/picard-tools/picard.jar'),
               'ValidateSamFile', 'MO=' + str(mo), 'INPUT=' + bam_path,
               'OUTPUT=' + validate_file]
        output = pipe_util.do_command(cmd, logger, allow_fail=True)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = bam_path
        df['validate_file'] = validate_file
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path, 'validate_file': validate_file}
        table_name = 'time_mem_picard_ValidateSamFile'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, bam_name + '_validate', logger)
        logger.info('completed running step `picard validate` of: %s' % bam_path)
    if pipe_util.already_step(step_dir, bam_name + '_validate_db', logger):
        logger.info('already stored `picard validate` to db')
    else:
        logger.info('storing `picard validate` to db')
        store_validate_error(uuid, bam_path, validate_file, engine, logger)
        pipe_util.create_already_step(step_dir, bam_name + '_validate_db', logger)
        logger.info('completed storing `picard validate` to db')
def bam_validate(uuid, bam_path, engine, logger):
    step_dir = os.path.dirname(bam_path)
    validate_file = bam_path + '.validate'
    if pipe_util.already_step(step_dir, 'validate', logger):
        logger.info('already completed step `validate` of: %s' % bam_path)
    else:
        logger.info('running step `validate` of: %s' % bam_path)
        home_dir = os.path.expanduser('~')
        mo = int((2 ** 32) / 2) - 1  # MAX_OUTPUT: largest 32-bit signed int
        cmd = ['java', '-d64', '-jar',
               os.path.join(home_dir, 'tools/picard-tools/picard.jar'),
               'ValidateSamFile', 'MO=' + str(mo), 'INPUT=' + bam_path,
               'OUTPUT=' + validate_file]
        output = pipe_util.do_command(cmd, logger, allow_fail=True)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = bam_path
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path}
        table_name = 'time_mem_picard_ValidateSamFile'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info('completed running step `validate` of: %s' % bam_path)
        pipe_util.create_already_step(step_dir, 'validate', logger)
    if pipe_util.already_step(step_dir, 'validate_db', logger):
        logger.info('already stored `picard validate` to db')
    else:
        logger.info('storing `picard validate` to db')
        store_validate_error(uuid, bam_path, validate_file, engine, logger)
        pipe_util.create_already_step(step_dir, 'validate_db', logger)
        logger.info('completed storing `picard validate` to db')
def bwa_aln_single(uuid, bam_path, fastq_dir, read1, realn_dir, readkey,
                   reference_fasta_path, rg_str, thread_count, engine, logger):
    se_realn_dir = os.path.join(realn_dir, 'bwa_aln_' + readkey)
    logger.info('se_realn_dir=%s' % se_realn_dir)
    logger.info('read1=%s' % read1)
    fastqbasename = read1.replace('_' + readkey + '.fq', '')
    logger.info('fastqbasename=%s' % fastqbasename)
    outsai = os.path.basename(fastqbasename + '.sai')
    outbam = os.path.basename(fastqbasename + '.bam')
    outsai_path = os.path.join(se_realn_dir, outsai)
    outbam_path = os.path.join(se_realn_dir, outbam)
    f1 = os.path.join(fastq_dir, read1)
    os.makedirs(se_realn_dir, exist_ok=True)
    if pipe_util.already_step(se_realn_dir, readkey + '_sai_' + fastqbasename, logger):
        logger.info('already completed step `bwa aln` of: %s' % read1)
    else:
        aln_cmd = ['bwa', 'aln', reference_fasta_path, '-t ' + thread_count, f1, '>', outsai_path]
        shell_aln_cmd = ' '.join(aln_cmd)
        output = pipe_util.do_shell_command(shell_aln_cmd, logger)
        df = time_util.store_time(uuid, shell_aln_cmd, output, logger)
        df['bam_path'] = outbam_path
        df['reference_fasta_path'] = reference_fasta_path
        df['thread_count'] = thread_count
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path,
                           'reference_fasta_path': reference_fasta_path,
                           'thread_count': thread_count}
        table_name = 'time_mem_bwa_aln'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info('completed running step `bwa aln single` of: %s' % bam_path)
        pipe_util.create_already_step(se_realn_dir, readkey + '_sai_' + fastqbasename, logger)
    if pipe_util.already_step(se_realn_dir, readkey + '_samse_' + fastqbasename, logger):
        logger.info('already completed step `bwa samse` of: %s' % outbam_path)
    else:
        # the original samse command was missing the .sai and .fq arguments
        samse_cmd = ['bwa', 'samse', '-r ' + '"' + rg_str + '"',
                     reference_fasta_path, outsai_path, f1]
        samtools_cmd = 'samtools view -Shb -o ' + outbam_path + ' -'
        shell_samse_cmd = ' '.join(samse_cmd)
        shell_cmd = shell_samse_cmd + ' | ' + samtools_cmd
        output = pipe_util.do_shell_command(shell_cmd, logger)
        df = time_util.store_time(uuid, shell_cmd, output, logger)
        df['bam_path'] = outbam_path
        df['reference_fasta_path'] = reference_fasta_path
        df['thread_count'] = thread_count
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path,
                           'reference_fasta_path': reference_fasta_path,
                           'thread_count': thread_count}
        table_name = 'time_mem_bwa_samse'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info('completed running step `bwa samse` of: %s' % bam_path)
        pipe_util.create_already_step(se_realn_dir, readkey + '_samse_' + fastqbasename, logger)
    return outbam_path
def RTC(uuid, analysis_ready_bam_list_path, thread_count, reference_fasta_name,
        known_1k_genome_indel_sites, engine, logger):
    RTC_dir = os.path.dirname(analysis_ready_bam_list_path)
    bam_list_name = os.path.basename(analysis_ready_bam_list_path)
    bam_base, bam_ext = os.path.splitext(bam_list_name)
    logger.info('RTC_dir=%s' % RTC_dir)
    step_dir = RTC_dir
    outintervals = bam_base + '.intervals'
    intervals_path = os.path.join(RTC_dir, outintervals)
    logger.info('intervals_path=%s' % intervals_path)
    if pipe_util.already_step(step_dir, uuid + '_RealignerTargetCreator', logger):
        logger.info('already completed step `RealignerTargetCreator` of: %s' % analysis_ready_bam_list_path)
    else:
        logger.info('running step `RealignerTargetCreator` of: %s' % analysis_ready_bam_list_path)
        home_dir = os.path.expanduser('~')
        gatk_path = os.path.join(home_dir, 'tools/GenomeAnalysisTK.jar')
        cmd = ['java', '-d64', '-Xmx16G', '-jar', gatk_path,
               '-nt ' + thread_count, '-T RealignerTargetCreator',
               '-R ' + reference_fasta_name,
               '-I ' + analysis_ready_bam_list_path,
               '-known ' + known_1k_genome_indel_sites,
               '-o ' + intervals_path]
        shell_cmd = ' '.join(cmd)
        output = pipe_util.do_shell_command(shell_cmd, logger)
        df = time_util.store_time(uuid, shell_cmd, output, logger)
        df['intervals_path'] = intervals_path
        df['analysis_ready_bam_list_path'] = analysis_ready_bam_list_path
        df['thread_count'] = thread_count
        table_name = 'time_mem_GATK_RTC'
        unique_key_dict = {'uuid': uuid,
                           'analysis_ready_bam_list_path': analysis_ready_bam_list_path,
                           'thread_count': thread_count,
                           'intervals_path': intervals_path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, uuid + '_RealignerTargetCreator', logger)
        logger.info('completed running step `RealignerTargetCreator` of: %s' % analysis_ready_bam_list_path)
    return intervals_path
def get_file_md5(uuid, file_path, engine, logger):
    file_dir = os.path.dirname(file_path)
    file_name = os.path.basename(file_path)
    file_md5_name = file_name + '.md5'
    file_md5_path = os.path.join(file_dir, file_md5_name)
    if pipe_util.already_step(file_dir, file_name + '_md5sum', logger):
        logger.info('already completed step `md5sum` of: %s' % file_path)
        with open(file_md5_path, 'r') as file_md5_path_open:
            file_md5 = file_md5_path_open.readline().strip()
        return file_md5
    else:
        cmd = ['md5sum', file_path]
        output = pipe_util.do_command(cmd, logger)
        file_md5 = output.split()[0].decode()
        with open(file_md5_path, 'w') as file_md5_path_open:
            file_md5_path_open.write(file_md5)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['file_path'] = file_path
        logger.info('df=%s' % df)
        unique_key_dict = {'uuid': uuid, 'file_path': file_path}
        table_name = 'time_mem_md5'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(file_dir, file_name + '_md5sum', logger)
        return file_md5
def HC(uuid, analysis_ready_bam_list_path, intervals, thread_count,
       reference_fasta_name, dbsnp_known_snp_sites, engine, logger):
    HC_dir = os.path.dirname(analysis_ready_bam_list_path)
    logger.info('HC_dir=%s' % HC_dir)
    step_dir = HC_dir
    hc_output_gvcfs = []
    with open(analysis_ready_bam_list_path) as f:
        analysis_ready_bam_paths = f.read().splitlines()
    for bam in analysis_ready_bam_paths:
        bam_name = os.path.basename(bam)
        bam_base, bam_ext = os.path.splitext(bam_name)
        out_gvcf = bam_base + '.raw.indels.raw.snps.g.vcf'
        out_gvcf_path = os.path.join(HC_dir, out_gvcf)
        logger.info('out_gvcf_path=%s' % out_gvcf_path)
        hc_output_gvcfs.append(out_gvcf_path)
        if pipe_util.already_step(step_dir, uuid + '_' + bam_base + '_HaplotypeCaller', logger):
            logger.info('already completed step `HaplotypeCaller` of: %s' % bam)
        else:
            logger.info('running step `HaplotypeCaller` of: %s' % bam)
            home_dir = os.path.expanduser('~')
            gatk_path = os.path.join(home_dir, 'tools/GenomeAnalysisTK.jar')
            cmd = ['java', '-d64', '-Xmx16G', '-jar', gatk_path,
                   '-nct ' + thread_count, '-T HaplotypeCaller',
                   '-R ' + reference_fasta_name, '-I ' + bam,
                   '--emitRefConfidence GVCF',
                   '--variant_index_type LINEAR',
                   '--variant_index_parameter 128000',
                   '--dbsnp ' + dbsnp_known_snp_sites,
                   '-L ' + intervals,
                   '--max_alternate_alleles 50',
                   '-o ' + out_gvcf_path]
            shell_cmd = ' '.join(cmd)
            output = pipe_util.do_shell_command(shell_cmd, logger)
            df = time_util.store_time(uuid, shell_cmd, output, logger)
            df['out_gvcf_path'] = out_gvcf_path
            df['analysis_ready_bam_path'] = bam
            df['thread_count'] = thread_count
            table_name = 'time_mem_GATK_HaplotypeCaller'
            unique_key_dict = {'uuid': uuid, 'analysis_ready_bam_path': bam,
                               'thread_count': thread_count, 'out_gvcf': out_gvcf_path}
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, uuid + '_' + bam_base + '_HaplotypeCaller', logger)
            logger.info('completed running step `HaplotypeCaller` of: %s' % bam)
    return hc_output_gvcfs
def picard_CreateSequenceDictionary(uuid, reference_fasta_name, engine, logger):
    sd_dir = os.path.dirname(reference_fasta_name)
    ref_name = os.path.basename(reference_fasta_name)
    ref_base, ref_ext = os.path.splitext(ref_name)
    sd_file = ref_base + ".dict"
    sd_file_path = os.path.join(sd_dir, sd_file)
    if pipe_util.already_step(sd_dir, ref_name + "_dict", logger):
        logger.info("already completed step `Picard CreateSequenceDictionary` of %s" % reference_fasta_name)
    else:
        logger.info("running step `Picard CreateSequenceDictionary` of %s" % reference_fasta_name)
        home_dir = os.path.expanduser("~")
        picard_path = os.path.join(home_dir, "tools/picard-tools/picard.jar")
        cmd = ["java", "-d64", "-Xmx16G", "-jar", picard_path,
               "CreateSequenceDictionary", "R=" + reference_fasta_name,
               "O=" + sd_file_path]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df["reference_fasta"] = reference_fasta_name
        df["sequence_dictionary"] = sd_file_path
        unique_key_dict = {"uuid": uuid, "reference_fasta": reference_fasta_name,
                           "sequence_dictionary": sd_file_path}
        table_name = "time_mem_picard_CreateSequenceDictionary"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info("completed running step `Picard CreateSequenceDictionary` of %s" % reference_fasta_name)
        pipe_util.create_already_step(sd_dir, ref_name + "_dict", logger)
    return sd_file_path
def guess_enc_db(uuid, fq_path, engine, logger):
    fastq_dir = os.path.dirname(fq_path)
    fastq_name = os.path.basename(fq_path)
    fastq_base, fastq_ext = os.path.splitext(fastq_name)
    guess_enc_path = fq_path + '.format'
    guess_enc_value = str()
    with open(guess_enc_path, 'r') as guess_enc_open:
        guess_enc_value = guess_enc_open.readline().strip()
    data_dict = dict()
    if pipe_util.already_step(fastq_dir, 'fastq_encdb_' + fastq_base, logger):
        logger.info('already wrote `guess_enc_db`: %s' % fq_path)
    else:
        logger.info('writing `guess_enc_db`: %s' % fq_path)
        data_dict['uuid'] = [uuid]
        data_dict['fastq_name'] = fastq_name
        data_dict['guess'] = guess_enc_value
        df = pd.DataFrame(data_dict)
        table_name = 'guess_fastq_encoding'
        unique_key_dict = {'uuid': uuid, 'fastq_name': fastq_name}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(fastq_dir, 'fastq_encdb_' + fastq_base, logger)
        logger.info('completed writing `guess_enc_db`: %s' % fq_path)
    return
def sump_wxs(uuid, muse_call_output_path, dbsnp_known_snp_sites, engine, logger):
    sump_dir = os.path.dirname(muse_call_output_path)
    input_name = os.path.basename(muse_call_output_path)
    input_base, input_ext = os.path.splitext(input_name)
    sample_base, sample_ext = os.path.splitext(input_base)
    logger.info('MuSE_sump_dir=%s' % sump_dir)
    step_dir = sump_dir
    muse_sump_output = input_base + '.vcf'
    muse_sump_output_path = os.path.join(sump_dir, muse_sump_output)
    logger.info('muse_sump_output_path=%s' % muse_sump_output_path)
    if pipe_util.already_step(step_dir, sample_base + '_MuSE_sump', logger):
        logger.info('already completed step `MuSE sump` of: %s' % input_name)
    else:
        logger.info('running step `MuSE sump` of the tumor bam: %s' % input_name)
        home_dir = os.path.expanduser('~')
        muse_path = os.path.join(home_dir, 'tools', 'MuSEv1.0rc_submission_c039ffa')
        cmd = [muse_path, 'sump', '-I', muse_call_output_path, '-E',
               '-O', muse_sump_output_path, '-D', dbsnp_known_snp_sites]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['muse_call_output'] = muse_call_output_path
        df['muse_sump_output'] = muse_sump_output_path
        unique_key_dict = {'uuid': uuid, 'muse_call_output': muse_call_output_path,
                           'muse_sump_output': muse_sump_output_path}
        table_name = 'time_mem_MuSE_sump_wxs'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, sample_base + '_MuSE_sump', logger)
        logger.info('completed running `MuSE sump` of the tumor bam: %s' % input_name)
    return muse_sump_output_path
def bam_to_fastq(uuid, bam_path, engine, logger):
    step_dir = os.path.dirname(bam_path)
    uuid_dir = step_dir
    logger.info('uuid_dir is: %s' % uuid_dir)
    fastq_dir = os.path.join(uuid_dir, 'fastq')
    logger.info('fastq_dir is: %s' % fastq_dir)
    if pipe_util.already_step(fastq_dir, 'fastq', logger):
        logger.info('already completed step `bamtofastq` of: %s' % bam_path)
    else:
        logger.info('running step `bamtofastq` of: %s' % bam_path)
        os.makedirs(fastq_dir, exist_ok=True)
        tempfq = os.path.join(fastq_dir, 'tempfq')
        cmd = ['bamtofastq', 'S=%s' % uuid + '.fq', 'filename=' + bam_path,
               'outputdir=' + fastq_dir, 'tryoq=1', 'collate=1',
               'outputperreadgroup=1', 'T=' + tempfq,
               'exclude=QCFAIL,SECONDARY,SUPPLEMENTARY']
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = bam_path
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path}
        table_name = 'time_mem_bamtofastq'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(fastq_dir, 'fastq', logger)
        logger.info('completed running step `bamtofastq` of: %s' % bam_path)
    return
def bam_sort(uuid, preharmonized_bam_path, bam_path_list, reference_fasta_path,
             engine, logger, be_lenient):
    out_bam_path_list = list()
    for input_bam in bam_path_list:
        bam_name = os.path.basename(input_bam)
        bam_base, bam_ext = os.path.splitext(bam_name)
        input_dir = os.path.dirname(input_bam)
        outdir_path = os.path.join(input_dir, 'sorted')
        outbam_path = os.path.join(outdir_path, bam_name)
        logger.info('outbam_path=%s' % outbam_path)
        out_bam_path_list.append(outbam_path)
        if pipe_util.already_step(outdir_path, 'picard_sort_' + bam_base, logger):
            logger.info('already completed step `picard sort` of: %s' % bam_name)
        else:
            logger.info('running step `picard sort` of: %s' % bam_name)
            os.makedirs(outdir_path, exist_ok=True)
            home_dir = os.path.expanduser('~')
            cmd = ['java', '-d64', '-jar',
                   os.path.join(home_dir, 'tools/picard-tools/picard.jar'),
                   'SortSam', 'SORT_ORDER=coordinate', 'INPUT=' + input_bam,
                   'OUTPUT=' + outbam_path, 'TMP_DIR=' + outdir_path,
                   'CREATE_INDEX=true', 'REFERENCE_SEQUENCE=' + reference_fasta_path]
            if be_lenient:
                cmd.append('VALIDATION_STRINGENCY=LENIENT')
            output = pipe_util.do_command(cmd, logger)
            df = time_util.store_time(uuid, cmd, output, logger)
            df['bam_path'] = outbam_path
            df['reference_fasta_path'] = reference_fasta_path
            unique_key_dict = {'uuid': uuid, 'bam_path': outbam_path,
                               'reference_fasta_path': reference_fasta_path}
            table_name = 'time_mem_picard_bamsort'
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(outdir_path, 'picard_sort_' + bam_base, logger)
            logger.info('completed running step `picard sort` of: %s' % bam_name)
    return out_bam_path_list
def PR(uuid, harmonized_IR_bam_path, thread_count, reference_fasta_name,
       BQSR_table_path, engine, logger):
    PR_dir = os.path.dirname(harmonized_IR_bam_path)
    bam_name = os.path.basename(harmonized_IR_bam_path)
    bam_base, bam_ext = os.path.splitext(bam_name)
    logger.info('PR_dir=%s' % PR_dir)
    step_dir = PR_dir
    out_BQSR_bam = bam_base + '_BQSR' + bam_ext
    BQSR_bam_path = os.path.join(PR_dir, out_BQSR_bam)
    logger.info('BQSR_bam_path=%s' % BQSR_bam_path)
    if pipe_util.already_step(step_dir, bam_name + '_PrintReads', logger):
        logger.info('already completed step `PrintReads` of: %s' % harmonized_IR_bam_path)
    else:
        logger.info('running step `PrintReads` of: %s' % harmonized_IR_bam_path)
        home_dir = os.path.expanduser('~')
        gatk_path = os.path.join(home_dir, 'tools/GenomeAnalysisTK.jar')
        cmd = ['java', '-d64', '-Xmx16G', '-jar', gatk_path,
               '-nct ' + thread_count, '-T PrintReads',
               '-R ' + reference_fasta_name, '-I ' + harmonized_IR_bam_path,
               '-BQSR ' + BQSR_table_path, '-o ' + BQSR_bam_path]
        shell_cmd = ' '.join(cmd)
        output = pipe_util.do_shell_command(shell_cmd, logger)
        df = time_util.store_time(uuid, shell_cmd, output, logger)
        df['BQSR_bam_path'] = BQSR_bam_path
        df['harmonized_IR_bam_path'] = harmonized_IR_bam_path
        df['thread_count'] = thread_count
        table_name = 'time_mem_GATK_PR'
        unique_key_dict = {'uuid': uuid, 'harmonized_IR_bam_path': harmonized_IR_bam_path,
                           'thread_count': thread_count, 'BQSR_bam_path': BQSR_bam_path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, bam_name + '_PrintReads', logger)
        logger.info('completed running step `PrintReads` of: %s' % harmonized_IR_bam_path)
    return BQSR_bam_path
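# A minimal usage sketch (not part of the pipeline itself) showing how RTC()
# above and PR() are expected to chain during harmonization. The engine URL,
# logger name, uuid, and every file path here are illustrative assumptions;
# thread counts are passed as strings because the GATK commands above are
# built by string concatenation.
def _example_rtc_pr_chain():
    import logging
    import sqlalchemy
    logger = logging.getLogger('harmonize')
    engine = sqlalchemy.create_engine('sqlite:///harmonize.db')  # assumed results db
    uuid = '00000000-0000-0000-0000-000000000000'  # hypothetical analysis id
    bam_list_path = '/scratch/' + uuid + '/analysis_ready_bams.list'  # hypothetical
    reference_fasta = '/reference/GRCh38.d1.vd1.fa'  # hypothetical reference
    known_indels = '/reference/1000G_phase1.indels.vcf.gz'  # hypothetical known sites
    bqsr_table = '/scratch/' + uuid + '/bqsr.table'  # hypothetical BaseRecalibrator output
    realigned_bam = '/scratch/' + uuid + '/sample_IR.bam'  # hypothetical IndelRealigner output
    intervals = RTC(uuid, bam_list_path, '4', reference_fasta, known_indels, engine, logger)
    logger.info('realignment targets: %s' % intervals)
    bqsr_bam = PR(uuid, realigned_bam, '4', reference_fasta, bqsr_table, engine, logger)
    logger.info('recalibrated bam: %s' % bqsr_bam)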
def do_guess_encoding(uuid, fastq_path, engine, logger):
    fastq_name = os.path.basename(fastq_path)
    fastq_dir = os.path.dirname(fastq_path)
    fastq_base, fastq_ext = os.path.splitext(fastq_name)
    if pipe_util.already_step(fastq_dir, 'guess_' + fastq_base, logger):
        logger.info('already completed step `guess_encoding`: %s' % fastq_path)
    else:
        logger.info('running step `guess_encoding` of %s' % fastq_path)
        pipe_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
        guess_path = os.path.join(pipe_dir, 'guess-encoding.py')
        guess_cmd = 'python2 ' + guess_path
        time_cmd = '/usr/bin/time -v ' + guess_cmd + ' -f ' + fastq_path
        proc = subprocess.Popen(time_cmd, shell=True, stderr=subprocess.STDOUT,
                                stdout=subprocess.PIPE)
        output = proc.communicate()[0]
        logger.info('output=%s' % output)
        df = time_util.store_time(uuid, time_cmd, output, logger)
        df['fastq_path'] = fastq_path
        table_name = 'time_mem_guessencoding'
        unique_key_dict = {'uuid': uuid, 'fastq_path': fastq_path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info('do_guess_encoding output=%s' % output.decode())
        write_fastq_format(fastq_path, output, logger)
        pipe_util.create_already_step(fastq_dir, 'guess_' + fastq_base, logger)
    return
def bam_mark_duplicates(uuid, bam_path, thread_count, engine, logger):
    merge_dir = os.path.dirname(bam_path)
    merge_parent_dir = os.path.dirname(merge_dir)
    md_dir = os.path.join(merge_parent_dir, 'md')
    os.makedirs(md_dir, exist_ok=True)
    logger.info('md_dir=%s' % md_dir)
    step_dir = md_dir
    outbam = os.path.basename(bam_path)
    outbam_path = os.path.join(md_dir, outbam)
    logger.info('outbam_path=%s' % outbam_path)
    if pipe_util.already_step(step_dir, 'markduplicates', logger):
        logger.info('already completed step `markduplicates` of: %s' % bam_path)
    else:
        logger.info('running step `markduplicates` of: %s' % bam_path)
        tmpfile = os.path.join(md_dir, 'tmpfile_md')
        cmd = ['bammarkduplicates2', 'markthreads=' + thread_count, 'rmdup=0',
               'md5=1', 'index=1', 'level=-1', 'tmpfile=' + tmpfile,
               'I=' + bam_path, 'O=' + outbam_path]
        output = pipe_util.do_command(cmd, logger)
        # store time/mem to db
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = bam_path
        df['thread_count'] = thread_count
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path, 'thread_count': thread_count}
        table_name = 'time_mem_bammarkduplicates2'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, 'markduplicates', logger)
        logger.info('completed running step `markduplicates` of: %s' % bam_path)
    return outbam_path
def bam_merge(uuid, bam_path, bam_path_list, engine, logger):
    sort_dir = os.path.dirname(bam_path_list[0])
    sort_parent_dir = os.path.dirname(sort_dir)
    merge_dir = os.path.join(sort_parent_dir, 'merge')
    os.makedirs(merge_dir, exist_ok=True)
    step_dir = merge_dir
    outbam = os.path.basename(bam_path)
    outbam_path = os.path.join(merge_dir, outbam)
    logger.info('bam_path_list=%s' % bam_path_list)
    if pipe_util.already_step(step_dir, 'merge', logger):
        logger.info('already completed step `merge` of: %s' % bam_path)
    else:
        logger.info('running step `merge` of: %s' % bam_path)
        tmpfile = os.path.join(merge_dir, 'tmpfile')
        cmd = ['bammerge', 'SO=coordinate', 'level=-1', 'tmpfile=' + tmpfile, 'index=1']
        for input_bam in bam_path_list:
            cmd.append('I=' + input_bam)
        output = pipe_util.do_stdout_command(cmd, logger, stdout=outbam_path)
        # save time/mem to db
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = bam_path
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path}
        table_name = 'time_mem_bammerge'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, 'merge', logger)
        logger.info('completed running step `merge` of: %s' % bam_path)
    return outbam_path
def bwa_aln_paired(bam_path, fastq_dir, read1, read2, realn_dir, reference_fasta_path, logger):
    pe_realn_dir = os.path.join(realn_dir, 'bwa_aln_pe')
    logger.info('pe_realn_dir=%s' % pe_realn_dir)
    logger.info('read1=%s' % read1)
    logger.info('read2=%s' % read2)
    fastqbasename = read1.replace('_1.fq', '')
    logger.info('fastqbasename=%s' % fastqbasename)
    outbam = os.path.basename(fastqbasename + '.bam')
    outbam_path = os.path.join(pe_realn_dir, outbam)  # was se_realn_dir, which is undefined here
    if pipe_util.already_step(pe_realn_dir, 'pe_' + fastqbasename, logger):
        logger.info('already completed step `bwa aln paired` of: %s' % bam_path)
    else:
        os.makedirs(pe_realn_dir, exist_ok=True)
        f1_path = os.path.join(fastq_dir, read1)
        f2_path = os.path.join(fastq_dir, read2)
        sai1 = fastqbasename + '_1.sai'
        sai2 = fastqbasename + '_2.sai'
        sai1_path = os.path.join(pe_realn_dir, sai1)
        sai2_path = os.path.join(pe_realn_dir, sai2)
        bwa_aln_cmd1 = ['bwa', 'aln', '-t 24', reference_fasta_path, f1_path]
        bwa_aln_cmd2 = ['bwa', 'aln', '-t 24', reference_fasta_path, f2_path]
        with open(sai1_path, 'wb') as sai1_open:
            pipe_util.do_command(bwa_aln_cmd1, logger, stdout=sai1_open, stderr=subprocess.PIPE)
        with open(sai2_path, 'wb') as sai2_open:
            pipe_util.do_command(bwa_aln_cmd2, logger, stdout=sai2_open, stderr=subprocess.PIPE)
        bwa_aln_sampe_cmd = ['bwa', 'sampe', '-a 500', reference_fasta_path,
                             sai1_path, sai2_path, f1_path, f2_path]
        samtools_cmd = ['samtools', 'view', '-Shb', '-o', outbam_path, '-']
        cmdlist = [bwa_aln_sampe_cmd, samtools_cmd]
        pipe_util.do_piped_commands(cmdlist, logger)
        pipe_util.create_already_step(pe_realn_dir, 'pe_' + fastqbasename, logger)
    return outbam_path
def run_hc(uuid, bam_path, reference_fasta_path, scratch_dir, engine, thread_count, logger):
    vcf_dir = os.path.join(scratch_dir, uuid, 'hc')
    os.makedirs(vcf_dir, exist_ok=True)
    logger.info('hc vcf_dir=%s' % vcf_dir)
    bamname = os.path.basename(bam_path)
    bambase, bamext = os.path.splitext(bamname)
    outvcf = bambase + '.vcf'
    vcf_path = os.path.join(vcf_dir, outvcf)
    logger.info('vcf_path=%s' % vcf_path)
    home_dir = os.path.expanduser('~')
    if pipe_util.already_step(vcf_dir, 'hc_' + bambase, logger):
        logger.info('already completed step `HaplotypeCaller` of: %s' % bam_path)
    else:
        # do work
        gatk_path = os.path.join(home_dir, 'bin', 'GenomeAnalysisTK.jar')
        tmp_dir = os.path.join(scratch_dir, 'tmp')
        shellcmd = ('java -d64 -Djava.io.tmpdir=' + tmp_dir + ' -jar ' + gatk_path +
                    ' --analysis_type HaplotypeCaller --generate_md5 -nct ' + thread_count +
                    ' --output_mode EMIT_VARIANTS_ONLY --input_file ' + bam_path +
                    ' --reference_sequence ' + reference_fasta_path +
                    ' --out ' + vcf_path)  # optionally append a region, e.g. ' -L "1:500000-900000"'
        logger.info('shellcmd=%s' % shellcmd)
        cmd = shlex.split(shellcmd)
        logger.info('cmd=%s' % cmd)
        output = pipe_util.do_command(cmd, logger)
        # store timing/mem results in db; uuid+vcf_path are the unique key
        df = time_util.store_time(uuid, cmd, output, logger)
        df['vcf_path'] = vcf_path
        logger.info('df=%s' % df)
        table_name = 'time_mem_gatk_hc'  # variable, consider making a parameter
        unique_key_dict = {'uuid': uuid, 'vcf_path': vcf_path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        # done flag
        pipe_util.create_already_step(vcf_dir, 'hc_' + bambase, logger)
    return
def do_picard_collectwgsmetrics(uuid, bam_path, reference_fasta_path, engine, logger):
    step_dir = os.path.dirname(bam_path)
    bam_name = os.path.basename(bam_path)
    bam_base, bam_ext = os.path.splitext(bam_name)
    home_dir = os.path.expanduser('~')
    picard_dir = os.path.join(home_dir, 'tools', 'picard-tools')
    stats_outfile = 'picard_collectwgsmetrics_' + bam_base + '.txt'
    stats_path = os.path.join(step_dir, stats_outfile)
    if pipe_util.already_step(step_dir, 'picard_collectwgsmetrics', logger):
        logger.info('already completed step `picard_collectwgsmetrics` of: %s' % bam_path)
    else:
        logger.info('running step `picard_collectwgsmetrics` of: %s' % bam_path)
        cmd = ['java', '-d64', '-jar', os.path.join(picard_dir, 'picard.jar'),
               'CollectWgsMetrics', 'INPUT=' + bam_path, 'OUTPUT=' + stats_path,
               'REFERENCE_SEQUENCE=' + reference_fasta_path,
               'INCLUDE_BQ_HISTOGRAM=true', 'VALIDATION_STRINGENCY=LENIENT']
        picard_cwgsm_output = pipe_util.do_command(cmd, logger)
        # save time/mem to db
        df = time_util.store_time(uuid, cmd, picard_cwgsm_output, logger)
        df['bam_path'] = bam_path
        df['reference_fasta_path'] = reference_fasta_path
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path}
        table_name = 'time_mem_picard_cwgsm'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, 'picard_collectwgsmetrics', logger)
        logger.info('completed running step `picard_collectwgsmetrics` of: %s' % bam_path)
    # save stats to db
    if pipe_util.already_step(step_dir, 'picard_collectwgsmetrics_db', logger):
        logger.info('already stored `picard collectwgsmetrics` of %s to db' % bam_path)
    else:
        data_dict = picard_wgs_to_dict(uuid, bam_path, stats_path, logger)
        data_dict['uuid'] = [uuid]
        data_dict['bam_path'] = bam_path
        data_dict['reference_fasta_path'] = reference_fasta_path
        df = pd.DataFrame(data_dict)
        table_name = 'picard_collectwgsmetrics'
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path,
                           'reference_fasta_path': reference_fasta_path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, 'picard_collectwgsmetrics_db', logger)
        logger.info('completed storing `picard collectwgsmetrics` to db')
    return
def do_samtools_stats(uuid, bam_path, reference_fasta_path, engine, logger):
    step_dir = os.path.dirname(bam_path)
    bam_name = os.path.basename(bam_path)
    bam_base, bam_ext = os.path.splitext(bam_name)
    stats_outfile = 'stats_' + bam_base + '.txt'
    stats_path = os.path.join(step_dir, stats_outfile)
    if pipe_util.already_step(step_dir, 'samtools_stats', logger):
        logger.info('already completed step `samtools stats` of: %s' % bam_path)
    else:
        logger.info('running step `samtools stats` of: %s' % bam_path)
        cmd = ['samtools', 'stats', bam_path]
        stats_output = pipe_util.do_command(cmd, logger)
        with open(stats_path, 'w') as stats_path_open:
            stats_path_open.write(stats_output.decode())
        # save time/mem to db
        df = time_util.store_time(uuid, cmd, stats_output, logger)
        df['bam_path'] = bam_path
        df['reference_fasta_path'] = reference_fasta_path
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path}
        table_name = 'time_mem_samtools_stats'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, 'samtools_stats', logger)
        logger.info('completed running step `samtools stats` of: %s' % bam_path)
    # save stats to db
    if pipe_util.already_step(step_dir, 'samtools_stats_db', logger):
        logger.info('already stored `samtools stats` of %s to db' % bam_path)
    else:
        data_dict = samtools_stats_to_dict(uuid, bam_path, stats_path, logger)
        data_dict['uuid'] = [uuid]
        data_dict['bam_path'] = bam_path
        data_dict['reference_fasta_path'] = reference_fasta_path
        df = pd.DataFrame(data_dict)
        table_name = 'samtools_stats'
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path,
                           'reference_fasta_path': reference_fasta_path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, 'samtools_stats_db', logger)
        logger.info('completed storing `samtools stats` to db')
    return
def pull_cgquery_xml_to_file(uuid, outputxml, logger):
    file_dir = os.path.dirname(outputxml)
    if pipe_util.already_step(file_dir, 'cgquery_xml', logger):
        logger.info('already completed step `cgquery` of: %s' % uuid)
    else:
        logger.info('running command `cgquery` of: %s' % uuid)
        cmd = ['cgquery', '-a', 'analysis_id=' + uuid, '-o', outputxml]
        pipe_util.do_command(cmd, logger)
        pipe_util.create_already_step(file_dir, 'cgquery_xml', logger)
    return
def write_readgroups(uuid, bam_path, engine, logger):
    bam_dir = os.path.dirname(bam_path)
    samfile = pysam.AlignmentFile(bam_path, 'rb')
    readgroups = samfile.header['RG']
    readgroup_path_dict = dict()
    for readgroup in readgroups:
        rg_id = readgroup['ID']
        outfile = rg_id + '.RG'
        outfile_path = os.path.join(bam_dir, outfile)
        readgroup_path_dict[rg_id] = outfile_path
        if pipe_util.already_step(bam_dir, readgroup['ID'] + '_rg_file', logger):
            logger.info('already wrote @RG to: %s' % outfile_path)
        else:
            with open(outfile_path, 'w') as outfile_open:
                outstring = '@RG'
                for rg_key in sorted(readgroup.keys()):
                    # literal backslash-t on purpose: bwa parses '\t' escapes in -r/-R strings
                    outstring += '\\t' + rg_key + ':' + readgroup[rg_key]
                outfile_open.write(outstring)
            pipe_util.create_already_step(bam_dir, readgroup['ID'] + '_rg_file', logger)
    logger.info('readgroup_path_dict=%s' % readgroup_path_dict)
    # store @RG to db
    for readgroup in readgroups:
        if pipe_util.already_step(bam_dir, readgroup['ID'] + '_rg_db', logger):
            logger.info('already wrote %s to db' % readgroup['ID'])
        else:
            readgroup['uuid'] = [uuid]  # or 'ValueError: If using all scalar values, you must pass an index'
            table_name = 'readgroups'
            for rg_key in sorted(readgroup.keys()):
                rg_dict = dict()
                rg_dict['uuid'] = [uuid]
                rg_dict['ID'] = readgroup['ID']
                rg_dict['key'] = rg_key
                rg_dict['value'] = readgroup[rg_key]
                df = pd.DataFrame(rg_dict)
                unique_key_dict = {'uuid': uuid, 'ID': readgroup['ID'], 'key': rg_key}
                df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(bam_dir, readgroup['ID'] + '_rg_db', logger)
            logger.info('completed storing @RG %s to db' % readgroup['ID'])
    return readgroup_path_dict
def bwa_mem_single(uuid, bam_path, fastq_dir, read1, realn_dir, readkey,
                   reference_fasta_path, rg_str, thread_count, engine, logger):
    se_realn_dir = os.path.join(realn_dir, "bwa_mem_" + readkey)
    logger.info("se_realn_dir=%s" % se_realn_dir)
    logger.info("read1=%s" % read1)
    fastqbasename = read1.replace("_" + readkey + ".fq", "")
    logger.info("fastqbasename=%s" % fastqbasename)
    outbam = os.path.basename(fastqbasename + ".bam")
    outbam_path = os.path.join(se_realn_dir, outbam)
    if pipe_util.already_step(se_realn_dir, readkey + "_" + fastqbasename, logger):
        logger.info("already completed step `bwa mem single` of: %s" % bam_path)
    else:
        os.makedirs(se_realn_dir, exist_ok=True)
        f1 = os.path.join(fastq_dir, read1)
        bwa_cmd = ["bwa", "mem", "-t " + thread_count, "-p", "-T 0",
                   "-R " + '"' + rg_str + '"', reference_fasta_path, f1]
        shell_bwa_cmd = " ".join(bwa_cmd)
        # the samtools command is already a string; joining it would scatter its characters
        shell_samtools_cmd = "samtools view -Shb -o " + outbam_path + " -"
        shell_cmd = shell_bwa_cmd + " | " + shell_samtools_cmd
        output = pipe_util.do_shell_command(shell_cmd, logger)
        df = time_util.store_time(uuid, shell_cmd, output, logger)
        df["bam_path"] = outbam_path
        df["reference_fasta_path"] = reference_fasta_path
        df["thread_count"] = thread_count
        unique_key_dict = {"uuid": uuid, "bam_path": bam_path,
                           "reference_fasta_path": reference_fasta_path,
                           "thread_count": thread_count}
        table_name = "time_mem_bwa_mem_se"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info("completed running step `bwa mem single` of: %s" % bam_path)
        pipe_util.create_already_step(se_realn_dir, readkey + "_" + fastqbasename, logger)
    return outbam_path
def bam_to_fastq(uuid, bam_path, engine, logger):
    uuid_dir = os.path.dirname(bam_path)
    logger.info('uuid_dir is: %s' % uuid_dir)
    fastq_dir = os.path.join(uuid_dir, 'fastq')
    logger.info('fastq_dir is: %s' % fastq_dir)
    if pipe_util.already_step(fastq_dir, 'fastq', logger):
        logger.info('already completed step `bamtofastq` of: %s' % bam_path)
    else:
        logger.info('running step `bamtofastq` of: %s' % bam_path)
        os.makedirs(fastq_dir, exist_ok=True)
        tempfq = os.path.join(fastq_dir, 'tempfq')
        cmd = ['bamtofastq', 'filename=' + bam_path, 'outputdir=' + fastq_dir,
               'tryoq=1', 'collate=1', 'outputperreadgroup=1', 'T=' + tempfq]
        pipe_util.do_command(cmd, logger)
        pipe_util.create_already_step(fastq_dir, 'fastq', logger)
    return
def fastqc_to_db(uuid, fq_path, engine, logger):
    fastq_name = os.path.basename(fq_path)
    fastq_dir = os.path.dirname(fq_path)
    fastq_base, fastq_ext = os.path.splitext(fastq_name)
    if fastq_base.endswith('.fq'):  # rstrip('.fq') would also eat trailing f/q characters
        fastq_base = fastq_base[:-len('.fq')]
    qc_report_dir = os.path.join(fastq_dir, fastq_base + '_fastqc')
    fastqc_data_path = os.path.join(qc_report_dir, 'fastqc_data.txt')
    fastqc_summary_path = os.path.join(qc_report_dir, 'summary.txt')
    if pipe_util.already_step(fastq_dir, 'fastqc_db_' + fastq_base, logger):
        logger.info('already completed step `fastqc db`: %s' % fq_path)
    else:
        logger.info('writing `fastqc db`: %s' % fq_path)
        summary_dict = dict()
        summary_dict['uuid'] = [uuid]
        summary_dict['fastq_name'] = fastq_name
        summary_dict = fastqc_summary_to_dict(summary_dict, fastqc_summary_path, engine, logger)
        df = pd.DataFrame(summary_dict)
        table_name = 'fastq_summary'
        unique_key_dict = {'uuid': uuid, 'fastq_name': fastq_name}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        data_key_list = ['>>Basic Statistics', '>>Per base sequence quality',
                         '>>Per tile sequence quality', '>>Per sequence quality scores',
                         '>>Per base sequence content', '>>Per sequence GC content',
                         '>>Per base N content', '>>Sequence Length Distribution',
                         '>>Sequence Duplication Levels', '>>Overrepresented sequences',
                         '>>Adapter Content', '>>Kmer Content']
        for data_key in data_key_list:
            df = fastqc_detail_to_df(uuid, fq_path, fastqc_data_path, data_key, engine, logger)
            if df is None:
                continue
            table_name = 'fastqc_data_' + '_'.join(data_key.lstrip('>').strip().split(' '))
            logger.info('fastqc_to_db() table_name=%s' % table_name)
            unique_key_dict = {'uuid': uuid, 'fastq_path': fq_path}
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(fastq_dir, 'fastqc_db_' + fastq_base, logger)
        logger.info('completed writing `fastqc db`: %s' % fq_path)
    return
def bwa_mem_paired(uuid, bam_path, fastq_dir, read1, read2, realn_dir,
                   reference_fasta_path, rg_str, thread_count, engine, logger):
    pe_realn_dir = os.path.join(realn_dir, "bwa_mem_pe")
    logger.info("pe_realn_dir=%s" % pe_realn_dir)
    logger.info("read1=%s" % read1)
    logger.info("read2=%s" % read2)
    fastqbasename = read1.replace("_1.fq", "")
    logger.info("fastqbasename=%s" % fastqbasename)
    outbam = os.path.basename(fastqbasename + ".bam")
    outbam_path = os.path.join(pe_realn_dir, outbam)
    if pipe_util.already_step(pe_realn_dir, "pe_" + fastqbasename, logger):
        logger.info("already completed step `bwa mem paired` of: %s" % bam_path)
    else:
        os.makedirs(pe_realn_dir, exist_ok=True)
        f1 = os.path.join(fastq_dir, read1)
        f2 = os.path.join(fastq_dir, read2)
        bwa_cmd = ["bwa", "mem", "-t " + thread_count, "-T 0",
                   "-R " + '"' + rg_str + '"', reference_fasta_path, f1, f2]
        samtools_cmd = ["samtools", "view", "-Shb", "-o", outbam_path, "-"]
        shell_bwa_cmd = " ".join(bwa_cmd)
        shell_samtools_cmd = " ".join(samtools_cmd)
        shell_cmd = shell_bwa_cmd + " | " + shell_samtools_cmd
        output = pipe_util.do_shell_command(shell_cmd, logger)
        df = time_util.store_time(uuid, shell_cmd, output, logger)
        df["bam_path"] = outbam_path
        df["reference_fasta_path"] = reference_fasta_path
        df["thread_count"] = thread_count
        unique_key_dict = {"uuid": uuid, "bam_path": bam_path,
                           "reference_fasta_path": reference_fasta_path,
                           "thread_count": thread_count}
        table_name = "time_mem_bwa_mem_pe"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(pe_realn_dir, "pe_" + fastqbasename, logger)
    return outbam_path
def do_fastqc(uuid, fastq_path, engine, logger):
    fastq_name = os.path.basename(fastq_path)
    fastq_dir = os.path.dirname(fastq_path)
    fastq_base, fastq_ext = os.path.splitext(fastq_name)
    if pipe_util.already_step(fastq_dir, 'fastqc_' + fastq_base, logger):
        logger.info('already completed step `fastqc`: %s' % fastq_path)
    else:
        logger.info('running step `fastqc`: %s' % fastq_path)
        home_dir = os.path.expanduser('~')
        fastqc_path = os.path.join(home_dir, 'tools/FastQC/fastqc')  # home-relative, like the other tool paths
        cmd = [fastqc_path, '--extract', fastq_path]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['fastq_path'] = fastq_path
        table_name = 'time_mem_fastqc'
        unique_key_dict = {'uuid': uuid, 'fastq_path': fastq_path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(fastq_dir, 'fastqc_' + fastq_base, logger)
    return
def bam_merge(uuid, preharmonize_bam_path, bam_path_list, engine, logger, be_lenient):
    sorted_bam_dir = os.path.dirname(bam_path_list[0])
    bwa_alignment_dir = os.path.dirname(sorted_bam_dir)
    realn_dir = os.path.dirname(bwa_alignment_dir)
    out_dir = os.path.join(realn_dir, 'merge')
    os.makedirs(out_dir, exist_ok=True)
    step_dir = out_dir
    preharmbam = os.path.basename(preharmonize_bam_path)
    preharmbam_name, preharmbam_ext = os.path.splitext(preharmbam)
    outbam_name = preharmbam_name + '_gdc_realn.bam'
    outbam_path = os.path.join(out_dir, outbam_name)
    logger.info('bam_path_list=%s' % bam_path_list)
    if pipe_util.already_step(step_dir, 'picard_merge', logger):
        logger.info('already completed step `picard merge` of: %s' % outbam_path)
    else:
        logger.info('running step `picard merge` of: %s' % outbam_path)
        home_dir = os.path.expanduser('~')
        cmd = ['java', '-d64', '-jar',
               os.path.join(home_dir, 'tools/picard-tools/picard.jar'),
               'MergeSamFiles', 'USE_THREADING=true', 'ASSUME_SORTED=true',
               'SORT_ORDER=coordinate', 'OUTPUT=' + outbam_path, 'TMP_DIR=' + out_dir]
        for input_bam in bam_path_list:
            cmd.append('INPUT=' + input_bam)
        if be_lenient:
            cmd.append('VALIDATION_STRINGENCY=LENIENT')
        output = pipe_util.do_command(cmd, logger)
        # save time/mem to db
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = outbam_path
        # key on a column that is actually present in df (was 'bam_name')
        unique_key_dict = {'uuid': uuid, 'bam_path': outbam_path}
        table_name = 'time_mem_picard_bam_merge'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, 'picard_merge', logger)
        logger.info('completed running step `picard merge` of: %s' % outbam_path)
    return outbam_path
def get_fastq_size(fq_path, logger):
    fq_dir = os.path.dirname(fq_path)
    fq_name = os.path.basename(fq_path)
    size_file = os.path.join(fq_dir, fq_name + '.size')
    if pipe_util.already_step(fq_dir, fq_name + 'size', logger):
        with open(size_file, 'r') as size_open:
            size_str = size_open.readline()
        size_value = int(size_str)
        logger.info('already determined size of fq %s: %s' % (fq_name, str(size_value)))
        return size_value
    else:
        logger.info('determining size of fq %s' % fq_name)
        size_value = os.path.getsize(fq_path)
        with open(size_file, 'w') as size_open:
            size_open.write(str(size_value))
        pipe_util.create_already_step(fq_dir, fq_name + 'size', logger)
        logger.info('determined size of fq %s: %s' % (fq_name, str(size_value)))
        return size_value
def store_md5_size(uuid, file_path, engine, logger):
    file_dir = os.path.dirname(file_path)
    file_name = os.path.basename(file_path)
    if pipe_util.already_step(file_dir, file_name + '_store_md5_size', logger):
        logger.info('already completed step store md5_size of: %s' % file_path)
    else:
        file_md5 = get_file_md5(uuid, file_path, engine, logger)
        file_size = get_file_size(uuid, file_path, engine, logger)
        df = pd.DataFrame({'uuid': [uuid], 'file_path': file_path,
                           'file_size': file_size, 'file_md5': file_md5})
        logger.info('df=%s' % df)
        table_name = 'file_size_md5'
        unique_key_dict = {'uuid': uuid, 'file_path': file_path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(file_dir, file_name + '_store_md5_size', logger)
    return
def samtools_bam_index(uuid, bam_path, engine, logger):
    bam_file = os.path.basename(bam_path)
    bam_name, bam_ext = os.path.splitext(bam_file)
    out_dir = os.path.dirname(bam_path)
    bai_path = bam_path + '.bai'
    if pipe_util.already_step(out_dir, bam_name + '_index', logger):
        logger.info('already completed step `samtools index` of %s' % bam_path)
    else:
        logger.info('running step `samtools index` of %s' % bam_path)
        cmd = ['samtools', 'index', bam_path]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = bam_path
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path}
        table_name = 'time_mem_samtools_index'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        # was missing: without this, the already_step() check above could never pass
        pipe_util.create_already_step(out_dir, bam_name + '_index', logger)
        logger.info('completed running `samtools index` of %s' % bam_path)
    return bai_path
def samtools_bam_index(uuid, bam_path, engine, logger): bam_file = os.path.basename(bam_path) bam_name, bam_ext = os.path.splitext(bam_file) out_dir = os.path.dirname(bam_path) bai_path = bam_path + ".bai" if pipe_util.already_step(out_dir, bam_name + "_index", logger): logger.info("already completed step `samtools index` of %s" % bam_path) else: logger.info("running step `samtools index` of %s" % bam_path) cmd = ["samtools", "index", bam_path] output = pipe_util.do_command(cmd, logger) df = time_util.store_time(uuid, cmd, output, logger) df["bam_path"] = bam_path unique_key_dict = {"uuid": uuid, "bam_path": bam_path} table_name = "time_mem_samtools_index" df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger) logger.info("completed running `samtools index` of %s" % bam_path) return bai_path
def samtools_faidx(uuid, reference_fasta_name, engine, logger):
    ref_file = os.path.basename(reference_fasta_name)
    fai_path = reference_fasta_name + ".fai"
    out_dir = os.path.dirname(reference_fasta_name)
    if pipe_util.already_step(out_dir, ref_file + "_faidx", logger):
        logger.info("already completed step `samtools faidx` of %s" % reference_fasta_name)
    else:
        logger.info("running step `samtools faidx` of %s" % reference_fasta_name)
        cmd = ["samtools", "faidx", reference_fasta_name]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df["reference_fasta"] = reference_fasta_name
        df["fai_path"] = fai_path
        unique_key_dict = {"uuid": uuid, "reference_fasta": reference_fasta_name,
                           "fai_path": fai_path}
        table_name = "time_mem_samtools_faidx"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(out_dir, ref_file + "_faidx", logger)
        logger.info("completed running `samtools faidx` of %s" % reference_fasta_name)
    return fai_path
def picard_sortvcf(uuid, muse_vcf, reference_fasta_name, engine, logger):
    sd_dir = os.path.dirname(reference_fasta_name)
    ref_name = os.path.basename(reference_fasta_name)
    ref_base, ref_ext = os.path.splitext(ref_name)
    sd_file = ref_base + ".dict"
    sd_file_path = os.path.join(sd_dir, sd_file)
    if os.path.isfile(sd_file_path):
        logger.info("reference_dict_path=%s" % sd_file_path)
    else:
        sd_file_path = picard_CreateSequenceDictionary(uuid, reference_fasta_name, engine, logger)
        logger.info("reference_dict_path=%s" % sd_file_path)
    srt_dir = os.path.dirname(muse_vcf)
    vcf_name = os.path.basename(muse_vcf)
    vcf_base, vcf_ext = os.path.splitext(vcf_name)
    srt_vcf = vcf_base + ".srt" + vcf_ext
    srt_vcf_path = os.path.join(srt_dir, srt_vcf)
    if pipe_util.already_step(srt_dir, vcf_name + "_sorted", logger):
        logger.info("already completed step `Picard SortVcf` of %s" % muse_vcf)
    else:
        logger.info("running step `Picard SortVcf` of %s" % muse_vcf)
        home_dir = os.path.expanduser("~")
        picard_path = os.path.join(home_dir, "tools/picard-tools/picard.jar")
        cmd = ["java", "-d64", "-Xmx16G", "-jar", picard_path, "SortVcf",
               "I=" + muse_vcf, "O=" + srt_vcf_path, "SD=" + sd_file_path]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df["MuSE_VCF"] = muse_vcf
        df["MuSE_sorted_VCF"] = srt_vcf_path
        unique_key_dict = {"uuid": uuid, "MuSE_VCF": muse_vcf,
                           "MuSE_sorted_VCF": srt_vcf_path}
        table_name = "time_mem_picard_SortVcf"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info("completed running step `Picard SortVcf` of %s" % muse_vcf)
        pipe_util.create_already_step(srt_dir, vcf_name + "_sorted", logger)
    return srt_vcf_path
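# A hedged sketch (not part of the pipeline) of the MuSE tail of the somatic
# workflow: sump_wxs() above converts `MuSE call` output into a raw VCF, which
# picard_sortvcf() then sorts against the reference sequence dictionary. The
# engine URL and all paths here are illustrative assumptions.
def _example_muse_sump_sort_chain():
    import logging
    import sqlalchemy
    logger = logging.getLogger('muse')
    engine = sqlalchemy.create_engine('sqlite:///muse.db')  # assumed results db
    uuid = '00000000-0000-0000-0000-000000000000'  # hypothetical analysis id
    muse_call_output = '/scratch/' + uuid + '/muse/sample.MuSE.txt'  # hypothetical `MuSE call` output
    dbsnp_vcf = '/reference/dbsnp_144.vcf.gz'  # hypothetical dbSNP known-sites file
    reference_fasta = '/reference/GRCh38.d1.vd1.fa'  # hypothetical reference
    raw_vcf = sump_wxs(uuid, muse_call_output, dbsnp_vcf, engine, logger)
    sorted_vcf = picard_sortvcf(uuid, raw_vcf, reference_fasta, engine, logger)
    logger.info('sorted MuSE VCF: %s' % sorted_vcf)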
def write_readgroups(uuid, bam_path, engine, logger):
    step_dir = os.path.dirname(bam_path)
    if pipe_util.already_step(step_dir, 'readgroups', logger):
        logger.info('already extracted readgroups from %s' % bam_path)
        readgroup_path_list = glob.glob(os.path.join(step_dir, '*.RG'))
        readgroup_path_dict = dict()
        for readgroup_path in readgroup_path_list:
            readgroup_file = os.path.basename(readgroup_path)
            # rstrip('.RG') would also eat trailing R/G characters of the ID
            readgroup = readgroup_file[:-len('.RG')]
            readgroup_path_dict[readgroup] = readgroup_path
        return readgroup_path_dict
    else:
        logger.info('extracting readgroups from %s' % bam_path)
        bam_dir = os.path.dirname(bam_path)
        samfile = pysam.AlignmentFile(bam_path, 'rb')
        header = samfile.text
        header_list = header.split('\n')
        header_rg_list = [header_line for header_line in header_list
                          if header_line.startswith('@RG')]
        readgroups = header_rg_list_to_rg_dicts(header_rg_list)
        readgroup_path_dict = dict()
        for readgroup in readgroups:
            rg_id = readgroup['ID']
            outfile = rg_id + '.RG'
            outfile_path = os.path.join(bam_dir, outfile)
            readgroup_path_dict[rg_id] = outfile_path
            if pipe_util.already_step(bam_dir, readgroup['ID'] + '_rg_file', logger):
                logger.info('already wrote @RG to: %s' % outfile_path)
            else:
                with open(outfile_path, 'w') as outfile_open:
                    outstring = '@RG'
                    for rg_key in sorted(readgroup.keys()):
                        # literal backslash-t on purpose: bwa parses '\t' escapes in -r/-R strings
                        outstring += '\\t' + rg_key + ':' + readgroup[rg_key]
                    outfile_open.write(outstring)
                pipe_util.create_already_step(bam_dir, readgroup['ID'] + '_rg_file', logger)
        logger.info('readgroup_path_dict=%s' % readgroup_path_dict)
        pipe_util.create_already_step(step_dir, 'readgroups', logger)
        logger.info('completed extracting readgroups from %s' % bam_path)
    # Store @RG to db
    if pipe_util.already_step(step_dir, 'readgroups_db', logger):
        logger.info('already stored readgroups of %s to db' % bam_path)
    else:
        logger.info('storing readgroups of %s to db' % bam_path)
        for readgroup in readgroups:
            if pipe_util.already_step(bam_dir, readgroup['ID'] + '_rg_db', logger):
                logger.info('already wrote %s to db' % readgroup['ID'])
            else:
                readgroup['uuid'] = [uuid]
                table_name = 'readgroups'
                for rg_key in sorted(readgroup.keys()):
                    rg_dict = dict()
                    rg_dict['uuid'] = [uuid]
                    rg_dict['ID'] = readgroup['ID']
                    rg_dict['key'] = rg_key  # was missing; the unique key below references it
                    rg_dict['value'] = readgroup[rg_key]
                    df = pd.DataFrame(rg_dict)
                    unique_key_dict = {'uuid': uuid, 'ID': readgroup['ID'], 'key': rg_key}
                    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
                pipe_util.create_already_step(bam_dir, readgroup['ID'] + '_rg_db', logger)
                logger.info('completed storing @RG %s to db' % readgroup['ID'])
        # was missing; without it the readgroups_db check above could never pass
        pipe_util.create_already_step(step_dir, 'readgroups_db', logger)
    return readgroup_path_dict
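# Note on the '\\t' in write_readgroups(): the .RG files deliberately contain a
# literal backslash-t rather than a tab character, because `bwa mem -R` and
# `bwa samse -r` expect '\t' escapes inside the read-group string. A minimal
# sketch of reading one back for use as rg_str, with a hypothetical path:
def _example_read_rg_string():
    rg_path = '/scratch/some_uuid/READGROUP1.RG'  # hypothetical .RG file
    with open(rg_path, 'r') as rg_open:
        rg_str = rg_open.readline().strip()
    # rg_str now looks like '@RG\\tID:READGROUP1\\tSM:sample'
    return rg_str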
def do_samtools_flagstat(uuid, bam_path, reference_fasta_path, engine, logger):
    step_dir = os.path.dirname(bam_path)
    bam_name = os.path.basename(bam_path)
    bam_base, bam_ext = os.path.splitext(bam_name)
    flagstat_outfile = 'samtools_flagstat_' + bam_base + '.txt'
    flagstat_path = os.path.join(step_dir, flagstat_outfile)
    if pipe_util.already_step(step_dir, 'samtools_flagstat_' + bam_base, logger):
        logger.info('already completed step `samtools flagstat` of: %s' % bam_path)
    else:
        logger.info('running step `samtools flagstat` of: %s' % bam_path)
        cmd = ['samtools', 'flagstat', bam_path]
        flagstat_output = pipe_util.do_command(cmd, logger)
        with open(flagstat_path, 'w') as flagstat_path_open:
            flagstat_path_open.write(flagstat_output.decode())
        # save time/mem to db
        df = time_util.store_time(uuid, cmd, flagstat_output, logger)
        df['bam_path'] = bam_path
        df['reference_fasta_path'] = reference_fasta_path
        table_name = 'time_mem_samtools_flagstat'
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path,
                           'reference_fasta_path': reference_fasta_path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, 'samtools_flagstat_' + bam_base, logger)
        logger.info('completed running step `samtools flagstat` of: %s' % bam_path)
    # save stats to db
    if pipe_util.already_step(step_dir, 'samtools_flagstat_' + bam_base + '_db', logger):
        logger.info('already stored `samtools flagstat` of %s to db' % bam_path)
    else:
        data_dict = samtools_flagstat_to_dict(uuid, bam_path, flagstat_path, logger)
        data_dict['uuid'] = [uuid]
        data_dict['bam_path'] = bam_path
        data_dict['reference_fasta_path'] = reference_fasta_path
        df = pd.DataFrame(data_dict)
        table_name = 'samtools_flagstat'
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path,
                           'reference_fasta_path': reference_fasta_path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, 'samtools_flagstat_' + bam_base + '_db', logger)
        logger.info('completed storing `samtools flagstat` of %s to db' % bam_path)
    return
def bwa_aln_single(uuid, bam_path, fastq_dir, read1, realn_dir, readkey,
                   reference_fasta_path, rg_str, fastq_encoding, engine, logger):
    se_realn_dir = os.path.join(realn_dir, 'bwa_aln_' + readkey)
    logger.info('se_realn_dir=%s' % se_realn_dir)
    logger.info('read1=%s' % read1)
    fastqbasename = read1.replace('_' + readkey + '.fq', '')
    logger.info('fastqbasename=%s' % fastqbasename)
    outsai = os.path.basename(fastqbasename + '.sai')
    outbam = os.path.basename(fastqbasename + '.bam')
    outsai_path = os.path.join(se_realn_dir, outsai)
    outbam_path = os.path.join(se_realn_dir, outbam)
    f1 = os.path.join(fastq_dir, read1)
    os.makedirs(se_realn_dir, exist_ok=True)

    # BWA ALN command
    if pipe_util.already_step(se_realn_dir, readkey + '_sai_' + fastqbasename, logger):
        logger.info('already completed step `bwa aln` of: %s' % read1)
    else:
        aln_frontend = ['bwa', 'aln', reference_fasta_path, f1]
        if fastq_encoding == 'Illumina-1.8' or fastq_encoding == 'Sanger / Illumina 1.9':
            logger.info('%s is fastq_encoding, so use `bwa aln`' % fastq_encoding)
        elif fastq_encoding == 'Illumina-1.3' or fastq_encoding == 'Illumina-1.5' or fastq_encoding == 'Illumina-1.5-HMS':
            logger.info('%s is fastq_encoding, so use `bwa aln -I`' % fastq_encoding)
            aln_frontend.insert(3, '-I')
        else:
            logger.info('unhandled fastq_encoding: %s' % fastq_encoding)
            sys.exit(1)
        aln_backend = ['>', outsai_path]
        aln_cmd = aln_frontend + aln_backend
        shell_aln_cmd = ' '.join(aln_cmd)
        aln_output = pipe_util.do_shell_command(shell_aln_cmd, logger)
        df = time_util.store_time(uuid, shell_aln_cmd, aln_output, logger)
        df['sai_path'] = outsai_path
        df['reference_fasta_path'] = reference_fasta_path
        unique_key_dict = {'uuid': uuid, 'sai_path': outsai_path,
                           'reference_fasta_path': reference_fasta_path}
        table_name = 'time_mem_bwa_aln'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info('completed running step `bwa single aln` of: %s' % bam_path)
        pipe_util.create_already_step(se_realn_dir, readkey + '_sai_' + fastqbasename, logger)

    # BWA SAMSE command
    if pipe_util.already_step(se_realn_dir, readkey + '_samse_' + fastqbasename, logger):
        logger.info('already completed step `bwa samse` of: %s' % outbam_path)
    else:
        if rg_str is None:
            samse_cmd = ['bwa', 'samse', '-n 10', reference_fasta_path, outsai_path, f1]
        else:
            samse_cmd = ['bwa', 'samse', '-n 10', reference_fasta_path,
                         '-r ' + '"' + rg_str + '"', outsai_path, f1]
        samtools_cmd = 'samtools view -Shb -o ' + outbam_path + ' -'
        shell_samse_cmd = ' '.join(samse_cmd)
        shell_cmd = shell_samse_cmd + ' | ' + samtools_cmd
        logger.info('bwa_aln_single() shell_cmd=%s' % shell_cmd)
        samse_output = pipe_util.do_shell_command(shell_cmd, logger)
        df = time_util.store_time(uuid, shell_cmd, samse_output, logger)
        logger.info('bwa_aln_single() df=%s' % df)
        df['bam_path'] = bam_path
        df['reference_fasta_path'] = reference_fasta_path
        unique_key_dict = {'uuid': uuid, 'bam_path': outbam_path,
                           'reference_fasta_path': reference_fasta_path}
        table_name = 'time_mem_bwa_samse'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info('completed running step `bwa single samse` of: %s' % bam_path)
        pipe_util.create_already_step(se_realn_dir, readkey + '_samse_' + fastqbasename, logger)
    return outbam_path
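# A hedged end-to-end sketch (not part of the pipeline) of how the realignment
# helpers above are expected to compose for one BAM: bamtofastq, read-group
# extraction, per-read-group bwa mem, picard sort, picard merge, then duplicate
# marking and validation. The engine URL, uuid, paths, per-readgroup FASTQ
# naming, and thread counts are all illustrative assumptions.
def _example_realignment_flow():
    import logging
    import sqlalchemy
    logger = logging.getLogger('realign')
    engine = sqlalchemy.create_engine('sqlite:///realign.db')  # assumed results db
    uuid = '00000000-0000-0000-0000-000000000000'  # hypothetical analysis id
    bam_path = '/scratch/' + uuid + '/input.bam'  # hypothetical input BAM
    reference_fasta = '/reference/GRCh38.d1.vd1.fa'  # hypothetical reference
    uuid_dir = os.path.dirname(bam_path)
    fastq_dir = os.path.join(uuid_dir, 'fastq')
    realn_dir = os.path.join(uuid_dir, 'realn')
    bam_to_fastq(uuid, bam_path, engine, logger)
    readgroup_path_dict = write_readgroups(uuid, bam_path, engine, logger)
    realn_bams = list()
    for rg_id, rg_path in sorted(readgroup_path_dict.items()):
        with open(rg_path, 'r') as rg_open:
            rg_str = rg_open.readline().strip()
        read1 = rg_id + '_1.fq'  # assumed bamtofastq per-readgroup naming
        read2 = rg_id + '_2.fq'
        realn_bams.append(bwa_mem_paired(uuid, bam_path, fastq_dir, read1, read2,
                                         realn_dir, reference_fasta, rg_str,
                                         '4', engine, logger))
    sorted_bams = bam_sort(uuid, bam_path, realn_bams, reference_fasta,
                           engine, logger, be_lenient=False)
    merged_bam = bam_merge(uuid, bam_path, sorted_bams, engine, logger,
                           be_lenient=False)
    md_bam = bam_mark_duplicates(uuid, merged_bam, '4', engine, logger)
    bam_validate(uuid, md_bam, engine, logger)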