def bammarkduplicates(uuid, bam_path, input_state, cpu_count, engine, logger):
    """Mark duplicates in a BAM with biobambam ``bammarkduplicates``.

    The step runs in the current working directory and is skipped when the
    ``md`` marker is already present there (per ``pipe_util.already_step``).
    Otherwise the command is run through ``pipe_util.do_command``, its
    time/memory output is stored in the ``time_mem_biobambam_bammarkduplicates``
    table, and the ``md`` marker is created.

    Args:
        uuid: Identifier stored alongside the timing record.
        bam_path: Path of the input BAM; its basename is used as the output
            name (``O=``) and as the stem of the metrics file (``M=``).
        input_state: Value recorded in the ``input_state`` column.
        cpu_count: Passed to the tool as ``markthreads``.
        engine: Database engine handed to ``df_util.save_df_to_sqlalchemy``.
        logger: Logger used for progress messages and by the helpers.

    Returns:
        None.
    """
    step_dir = os.getcwd()
    bam_name = os.path.basename(bam_path)
    metrics_name = bam_name + ".metrics"
    # Named tmp_prefix so the stdlib ``tempfile`` module is not shadowed.
    tmp_prefix = "tempfile"

    # The original checked an undefined ``work_dir``; the marker is written to
    # step_dir below, so that is the directory that must be checked.
    if pipe_util.already_step(step_dir, "md", logger):
        logger.info("already completed step `bammarkduplicates` of: %s" % bam_name)
        return

    logger.info("running step `bammarkduplicates` of %s: " % bam_name)
    cmd = [
        "bammarkduplicates",
        "I=" + bam_path,
        "O=" + bam_name,
        "M=" + metrics_name,
        "verbose=0",
        "level=-1",
        "index=1",
        "tmpfile=" + tmp_prefix,
        "markthreads=" + str(cpu_count),
    ]
    output = pipe_util.do_command(cmd, logger)
    df = time_util.store_time(uuid, cmd, output, logger)
    df["bam_name"] = bam_name
    # The original read an undefined ``input_stage``; the parameter is
    # ``input_state``, which is also the column name used by bamindex and
    # bamtofastq in this file.
    df["input_state"] = input_state
    unique_key_dict = {"uuid": uuid, "bam_name": bam_name}
    table_name = "time_mem_biobambam_bammarkduplicates"
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    pipe_util.create_already_step(step_dir, "md", logger)
    logger.info("completed running step `bammarkduplicates` of: %s" % bam_name)
    return
def picard_sortvcf(case_id, vcf_path, output_vcf, reference_fasta_dict, engine, logger):
    """Sort one or more VCFs with Picard ``SortVcf``.

    The sorted file is written to ``<cwd>/<output_vcf>``. The step is skipped
    when its marker already exists in the working directory; otherwise the
    command is run through ``pipe_util.do_command``, the parsed time metrics
    are stored through ``postgres`` as an ``SRT`` record, and the marker is
    created.

    Args:
        case_id: Identifier stored in the metrics record and used to build the
            step marker name.
        vcf_path: Iterable of input VCF paths (each becomes an ``I=``
            argument). A single path given as a string is also accepted.
        output_vcf: File name of the sorted VCF, relative to the working
            directory.
        reference_fasta_dict: Path passed as ``SEQUENCE_DICTIONARY``.
        engine: Database engine handed to the ``postgres`` helpers.
        logger: Logger used for progress messages and by the helpers.

    Returns:
        The path of the sorted VCF.
    """
    step_dir = os.getcwd()
    srt_vcf_path = os.path.join(step_dir, output_vcf)
    logger.info("picard_sortvcf_output=%s" % srt_vcf_path)

    # The original checked "_sortvcf" but created "_SortVcf", so a finished
    # step was never recognised on a case-sensitive filesystem. The created
    # name is kept so markers left by earlier runs are honoured.
    step_name = case_id + "_SortVcf"
    if pipe_util.already_step(step_dir, step_name, logger):
        logger.info("already completed step `sortvcf` of: %s" % vcf_path)
        return srt_vcf_path

    logger.info("running step `picard SortVcf` of: %s" % vcf_path)

    # A bare string would otherwise be iterated character by character.
    vcf_paths = [vcf_path] if isinstance(vcf_path, str) else list(vcf_path)

    mem_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES")
    mem_gib = mem_bytes / (1024.0 ** 3)
    # Leave two cores free, but never pass a zero or negative value to the JVM
    # on small hosts.
    thread = max(1, int(multiprocessing.cpu_count()) - 2)
    java_heap = max(1, int(int(mem_gib) - thread / 2))

    home_dir = os.path.join("/home", getpass.getuser())
    cmd = [
        "java",
        "-Djava.io.tmpdir=/tmp/job_tmp",
        "-XX:ParallelGCThreads=" + str(thread),
        "-Xmx" + str(java_heap) + "G",
        "-d64",
        "-jar",
        os.path.join(home_dir, "tools/picard-tools/picard.jar"),
        "SortVcf",
        "OUTPUT=" + srt_vcf_path,
        "SEQUENCE_DICTIONARY=" + reference_fasta_dict,
    ]
    cmd.extend("I=" + path for path in vcf_paths)

    output = pipe_util.do_command(cmd, logger)
    metrics = time_util.parse_time(output)
    met = SRT(
        case_id=case_id,
        tool="picard_sortvcf",
        files=vcf_path,
        systime=metrics["system_time"],
        usertime=metrics["user_time"],
        elapsed=metrics["wall_clock"],
        cpu=metrics["percent_of_cpu"],
        max_resident_time=metrics["maximum_resident_set_size"],
    )
    postgres.create_table(engine, met)
    postgres.add_metrics(engine, met)
    pipe_util.create_already_step(step_dir, step_name, logger)
    logger.info("completed running step sortvcf of: %s" % vcf_path)
    return srt_vcf_path
def splitbam(uuid, bam_path, engine, logger):
    """Split a BAM with bamUtil ``bam splitBam``.

    Runs in the current working directory with output prefix ``split`` and
    log file ``listFile.log``. The step is skipped when the ``splitbam``
    marker already exists; otherwise the command's time/memory output is
    stored in the ``time_mem_bamutil_splitbam`` table and the marker is
    created.

    Args:
        uuid: Identifier stored alongside the timing record.
        bam_path: Path of the BAM passed to ``--in``.
        engine: Database engine handed to ``df_util.save_df_to_sqlalchemy``.
        logger: Logger used for progress messages and by the helpers.

    Returns:
        None.
    """
    step_dir = os.getcwd()
    if pipe_util.already_step(step_dir, 'splitbam', logger):
        logger.info('already completed step `splitbam` of: %s' % bam_path)
        return

    # The original message named `bamtofastq`, a copy-paste leftover.
    logger.info('running step `splitbam` of %s: ' % bam_path)
    log_path = 'listFile.log'
    out_path = 'split'
    cmd = ['bam', 'splitBam', '--in', bam_path, '--out', out_path, '--log', log_path]
    output = pipe_util.do_command(cmd, logger)
    df = time_util.store_time(uuid, cmd, output, logger)
    df['bam_path'] = bam_path
    unique_key_dict = {'uuid': uuid, 'bam_path': bam_path}
    table_name = 'time_mem_bamutil_splitbam'
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    pipe_util.create_already_step(step_dir, 'splitbam', logger)
    logger.info('completed running step `splitbam` of: %s' % bam_path)
    return
def fastqc(uuid, fastq_path, thread_count, engine, logger):
    """Run FastQC on a single fastq file and record its time/memory usage.

    Output and FastQC's temporary directory both point at the current working
    directory. Nothing is run when the ``fastqc_<basename-without-extension>``
    marker is already present there.

    Args:
        uuid: Identifier stored alongside the timing record.
        fastq_path: Path of the fastq file to analyse.
        thread_count: Value passed to ``--threads``.
        engine: Database engine handed to ``df_util.save_df_to_sqlalchemy``.
        logger: Logger used for progress messages and by the helpers.

    Returns:
        None.
    """
    fastq_name = os.path.basename(fastq_path)
    work_dir = os.getcwd()
    marker = 'fastqc_' + os.path.splitext(fastq_name)[0]

    if pipe_util.already_step(work_dir, marker, logger):
        logger.info('already completed step `fastqc`: %s' % fastq_path)
        return

    logger.info('running step `fastqc` of %s' % fastq_path)
    # cwltool sets HOME to /var/spool/cwl, so the home directory is built
    # explicitly from the user name.
    user_home = os.path.join('/home', getpass.getuser())
    executable = os.path.join(user_home, 'tools', 'FastQC', 'fastqc')
    cmd = [
        executable,
        '--threads', str(thread_count),
        '--noextract',
        fastq_path,
        '--outdir', work_dir,
        '--dir', work_dir,
    ]
    output = pipe_util.do_command(cmd, logger)
    timing = time_util.store_time(uuid, cmd, output, logger)
    timing['fastq_name'] = fastq_name
    df_util.save_df_to_sqlalchemy(
        timing,
        {'uuid': uuid, 'fastq_name': fastq_name},
        'time_mem_fastqc',
        engine,
        logger,
    )
    pipe_util.create_already_step(work_dir, marker, logger)
    return
def bamvalidate(uuid, bam_path, input_state, cpu_count, engine, logger):
    """Validate a BAM with biobambam ``bamvalidate``.

    The step is skipped when the ``<bam_name>_bamvalidate`` marker exists in
    the current working directory; otherwise the command's time/memory output
    is stored in the ``time_mem_biobambam_bamvalidate`` table and the marker
    is created.

    Args:
        uuid: Identifier stored alongside the timing record.
        bam_path: Path of the BAM to validate.
        input_state: Accepted for signature parity with the sibling steps;
            not used by this function.
        cpu_count: Number of CPUs; half of it (at least 1) is used for both
            ``inputthreads`` and ``outputthreads``.
        engine: Database engine handed to ``df_util.save_df_to_sqlalchemy``.
        logger: Logger used for progress messages and by the helpers.

    Returns:
        None.
    """
    step_dir = os.getcwd()
    bam_name = os.path.basename(bam_path)
    tmpfile = 'tmpfile'
    # True division produced strings such as '4.0' under Python 3; the thread
    # options need whole numbers, and at least one thread.
    io_threads = str(max(1, int(cpu_count) // 2))
    step_name = bam_name + '_bamvalidate'

    if pipe_util.already_step(step_dir, step_name, logger):
        logger.info('already completed step `bamvalidate` of: %s' % bam_name)
        return

    # The original message referred to a "picard BuildBamValidate" tool, which
    # is not what is executed here.
    logger.info('running step `bamvalidate` of: %s' % bam_name)
    cmd = [
        'bamvalidate',
        'verbose=1',
        'I=' + bam_path,
        'tmpfile=' + tmpfile,
        'inputthreads=' + io_threads,
        'outputthreads=' + io_threads,
    ]
    output = pipe_util.do_command(cmd, logger)
    df = time_util.store_time(uuid, cmd, output, logger)
    df['bam_name'] = bam_name
    unique_key_dict = {'uuid': uuid, 'bam_name': bam_name}
    table_name = 'time_mem_biobambam_bamvalidate'
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    pipe_util.create_already_step(step_dir, step_name, logger)
    logger.info('completed running step `bamvalidate` of: %s' % bam_name)
    return
def bamindex(uuid, bam_path, input_state, engine, logger):
    """Build a ``.bai`` index for a BAM with biobambam ``bamindex``.

    The index is written as ``<bam basename without extension>.bai`` relative
    to the current working directory. The step is skipped when the
    ``<bam_name>_bamindex`` marker exists there; otherwise the command's
    time/memory output is stored in the ``time_mem_biobambam_bamindex`` table
    and the marker is created.

    Args:
        uuid: Identifier stored alongside the timing record.
        bam_path: Path of the BAM to index.
        input_state: Value recorded in the ``input_state`` column.
        engine: Database engine handed to ``df_util.save_df_to_sqlalchemy``.
        logger: Logger used for progress messages and by the helpers.

    Returns:
        None.
    """
    step_dir = os.getcwd()
    bam_name = os.path.basename(bam_path)
    bam_base, _ = os.path.splitext(bam_name)
    bai_name = bam_base + '.bai'
    step_name = bam_name + '_bamindex'

    if pipe_util.already_step(step_dir, step_name, logger):
        logger.info('already completed step `bamindex` of: %s' % bam_name)
        return

    # The original message said "picard BuildBamIndex", but the command that
    # runs is biobambam's bamindex.
    logger.info('running step `bamindex` of: %s' % bam_name)
    cmd = ['bamindex', 'verbose=0', 'disablevalidation=1', 'I=' + bam_path, 'O=' + bai_name]
    output = pipe_util.do_command(cmd, logger)
    df = time_util.store_time(uuid, cmd, output, logger)
    df['bam_name'] = bam_name
    df['input_state'] = input_state
    unique_key_dict = {'uuid': uuid, 'bam_name': bam_name}
    table_name = 'time_mem_biobambam_bamindex'
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    pipe_util.create_already_step(step_dir, step_name, logger)
    logger.info('completed running step `bamindex` of: %s' % bam_name)
    return
def index(uuid, cram_path, reference_fasta_path, engine, logger):
    """Create a BAM-style index for a CRAM with cramtools ``index``.

    The step is skipped when its marker exists in the current working
    directory; otherwise the command's time/memory output is stored in the
    ``time_mem_cram_bam_style_index`` table and the marker is created.

    Args:
        uuid: Identifier stored alongside the timing record and used as the
            prefix of the step marker name.
        cram_path: Path of the CRAM passed to ``--input-file``.
        reference_fasta_path: Path passed to ``--reference-fasta-file``.
        engine: Database engine handed to ``df_util.save_df_to_sqlalchemy``.
        logger: Logger used for progress messages and by the helpers.

    Returns:
        None.
    """
    step_dir = os.getcwd()
    cram_name = os.path.basename(cram_path)
    # Recorded in the timing table only; it is not passed to cramtools.
    output_bai = os.path.join(step_dir, cram_name) + '.bai'

    # The original checked uuid + 'cram index' but created uuid + '_cram index',
    # so the step always re-ran. The created name is used for both.
    step_name = uuid + '_cram index'
    if pipe_util.already_step(step_dir, step_name, logger):
        logger.info('already completed step `cram index` of: %s' % cram_path)
        return

    logger.info('running step `cram index` of: %s' % cram_path)
    home_dir = os.path.expanduser('~')
    cramtools_path = os.path.join(home_dir, 'tools/cramtools-3.0.jar')
    cmd = [
        'java',
        '-Djava.io.tmpdir=/tmp/job_tmp',
        '-d64',
        '-jar', cramtools_path,
        'index',
        '--bam-style-index',
        '--input-file', cram_path,
        '--reference-fasta-file', reference_fasta_path,
    ]
    output = pipe_util.do_command(cmd, logger)
    df = time_util.store_time(uuid, cmd, output, logger)
    df['cram_path'] = cram_path
    df['output_bai'] = output_bai
    table_name = 'time_mem_cram_bam_style_index'
    unique_key_dict = {'uuid': uuid, 'cram_path': cram_path, 'output_bai': output_bai}
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    pipe_util.create_already_step(step_dir, step_name, logger)
    logger.info('completed running step `cram index` of: %s' % cram_path)
    return
def combinevcf(uuid, vcf_path_list, reference_fasta_path, thread_count, engine, logger):
    """Combine VCFs into ``<cwd>/<uuid>_PON.vcf`` with GATK ``CombineVariants``.

    The step is skipped when the ``<uuid>_CombineVariants`` marker exists in
    the current working directory; otherwise the command's time/memory output
    is stored in the ``time_mem_gatk_CombineVariants`` table and the marker is
    created.

    Args:
        uuid: Identifier used for the output name, the marker name and the
            timing record.
        vcf_path_list: Iterable of VCF paths; each is passed with ``-V``.
        reference_fasta_path: Path passed to ``-R``.
        thread_count: Value passed to ``-nt``.
        engine: Database engine handed to ``df_util.save_df_to_sqlalchemy``.
        logger: Logger used for progress messages and by the helpers.

    Returns:
        None.
    """
    step_dir = os.getcwd()
    output_pon_vcf = os.path.join(step_dir, uuid) + '_PON.vcf'
    step_name = uuid + '_CombineVariants'

    if pipe_util.already_step(step_dir, step_name, logger):
        logger.info('already completed step `CombineVariants` of: %s' % vcf_path_list)
        return

    logger.info('running step `CombineVariants` of: %s' % vcf_path_list)
    home_dir = os.path.expanduser('~')
    gatk_path = os.path.join(home_dir, 'tools/GenomeAnalysisTK.jar')
    # Each flag and its value are separate argv elements. The original fused
    # them (e.g. '-minN 2'), which reaches the program as one argument with an
    # embedded space when the list is executed without a shell. The shell
    # quotes around null are dropped for the same reason.
    cmd = [
        'java',
        '-Djava.io.tmpdir=/tmp/job_tmp',
        '-d64',
        '-jar', gatk_path,
        '-T', 'CombineVariants',
        '-nt', str(thread_count),
        '-R', reference_fasta_path,
        '-minN', '2',
        '--setKey', 'null',
        '--filteredAreUncalled',
        '--filteredrecordsmergetype', 'KEEP_IF_ANY_UNFILTERED',
        '-o', output_pon_vcf,
    ]
    for vcf_path in vcf_path_list:
        cmd.extend(['-V', vcf_path])

    output = pipe_util.do_command(cmd, logger)
    df = time_util.store_time(uuid, cmd, output, logger)
    df['output_pon_vcf'] = output_pon_vcf
    table_name = 'time_mem_gatk_CombineVariants'
    unique_key_dict = {'uuid': uuid, 'output_pon_vcf': output_pon_vcf}
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    pipe_util.create_already_step(step_dir, step_name, logger)
    logger.info('completed running step `CombineVariants` of: %s' % vcf_path_list)
    return
def bamtofastq(uuid, bam_path, input_state, engine, logger):
    """Convert a BAM to per-read-group gzipped fastq files with biobambam2.

    Output goes to the current working directory. Nothing is run when the
    ``fastq`` marker is already present there; otherwise the command's
    time/memory output is stored in the ``time_mem_biobambam_bamtofastq``
    table and the marker is created.

    Args:
        uuid: Identifier stored alongside the timing record.
        bam_path: Path of the BAM passed as ``filename=``.
        input_state: Value recorded in the ``input_state`` column.
        engine: Database engine handed to ``df_util.save_df_to_sqlalchemy``.
        logger: Logger used for progress messages and by the helpers.

    Returns:
        None.
    """
    work_dir = os.getcwd()
    bam_name = os.path.basename(bam_path)

    if pipe_util.already_step(work_dir, 'fastq', logger):
        logger.info('already completed step `bamtofastq` of: %s' % bam_name)
        return

    logger.info('running step `bamtofastq` of %s: ' % bam_name)
    # cwltool sets HOME to /var/spool/cwl, so the home directory is built
    # explicitly from the user name.
    user_home = os.path.join('/home', getpass.getuser())
    scratch_prefix = os.path.join(work_dir, 'tempfq')
    executable = os.path.join(user_home, 'tools', 'biobambam2', 'bin', 'bamtofastq')

    cmd = [
        executable,
        'filename=' + bam_path,
        'outputdir=' + work_dir,
        'tryoq=1',
        'collate=1',
        'outputperreadgroup=1',
        'T=' + scratch_prefix,
        'gz=1',
        'level=1',
        'outputperreadgroupsuffixF=_1.fq.gz',
        'outputperreadgroupsuffixF2=_2.fq.gz',
        'outputperreadgroupsuffixO=_o1.fq.gz',
        'outputperreadgroupsuffixO2=_o2.fq.gz',
        'outputperreadgroupsuffixS=_s.fq.gz',
        'exclude=QCFAIL,SECONDARY,SUPPLEMENTARY',
    ]
    output = pipe_util.do_command(cmd, logger)
    timing = time_util.store_time(uuid, cmd, output, logger)
    timing['bam_name'] = bam_name
    timing['input_state'] = input_state
    df_util.save_df_to_sqlalchemy(
        timing,
        {'uuid': uuid, 'bam_name': bam_name},
        'time_mem_biobambam_bamtofastq',
        engine,
        logger,
    )
    pipe_util.create_already_step(work_dir, 'fastq', logger)
    logger.info('completed running step `bamtofastq` of: %s' % bam_name)
    return
def bammarkduplicates2(uuid, bam_path, input_state, cpu_count, engine, logger):
    """Mark duplicates in a BAM with biobambam ``bammarkduplicates2``.

    The step runs in the current working directory and is skipped when the
    ``md2`` marker is already present there. Otherwise the command is run
    through ``pipe_util.do_command``, its time/memory output is stored in the
    ``time_mem_biobambam_bammarkduplicates2`` table, and the marker is created.

    Args:
        uuid: Identifier stored alongside the timing record.
        bam_path: Path of the input BAM; its basename is used as the output
            name (``O=``) and as the stem of the metrics file (``M=``).
        input_state: Value recorded in the ``input_state`` column.
        cpu_count: Passed to the tool as ``markthreads``.
        engine: Database engine handed to ``df_util.save_df_to_sqlalchemy``.
        logger: Logger used for progress messages and by the helpers.

    Returns:
        None.
    """
    step_dir = os.getcwd()
    bam_name = os.path.basename(bam_path)
    metrics_name = bam_name + '.metrics'
    # Named tmp_prefix so the stdlib ``tempfile`` module is not shadowed.
    tmp_prefix = 'tempfile'
    # The original logged an undefined ``work_dir`` and raised NameError before
    # doing anything; the directory in use is step_dir.
    logger.info('work_dir is: %s' % step_dir)

    if pipe_util.already_step(step_dir, 'md2', logger):
        logger.info('already completed step `bammarkduplicates2` of: %s' % bam_name)
        return

    logger.info('running step `bammarkduplicates2` of %s: ' % bam_name)
    cmd = [
        'bammarkduplicates2',
        'I=' + bam_path,
        'O=' + bam_name,
        'M=' + metrics_name,
        'verbose=0',
        'level=-1',
        'index=1',
        'tmpfile=' + tmp_prefix,
        'markthreads=' + str(cpu_count),
    ]
    output = pipe_util.do_command(cmd, logger)
    df = time_util.store_time(uuid, cmd, output, logger)
    df['bam_name'] = bam_name
    # The original read an undefined ``input_stage``; the parameter is
    # ``input_state``, which is also the column name used by bamindex and
    # bamtofastq in this file.
    df['input_state'] = input_state
    unique_key_dict = {'uuid': uuid, 'bam_name': bam_name}
    table_name = 'time_mem_biobambam_bammarkduplicates2'
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    pipe_util.create_already_step(step_dir, 'md2', logger)
    logger.info('completed running step `bammarkduplicates2` of: %s' % bam_name)
    return
def bammerge(uuid, outbam_name, bam_path_list, reference_fasta_path, cpu_count, engine, logger):
    """Merge several BAMs into one with biobambam ``bammerge``.

    Every path in ``bam_path_list`` is passed as its own ``I=`` argument to a
    single ``bammerge`` invocation. The step is skipped when the ``merge``
    marker exists in the current working directory; otherwise the command's
    time/memory output is stored in the ``time_mem_biobambam_bammerge`` table
    and the marker is created.

    Args:
        uuid: Identifier stored alongside the timing record.
        outbam_name: Name passed as ``O=``; also the stem of the ``M=``
            metrics file name.
        bam_path_list: Iterable of input BAM paths.
        reference_fasta_path: Accepted but not used by this function.
        cpu_count: Accepted but not used by this function.
        engine: Database engine handed to ``df_util.save_df_to_sqlalchemy``.
        logger: Logger used for progress messages and by the helpers.

    Returns:
        None.
    """
    step_dir = os.getcwd()
    metrics_name = outbam_name + '.metrics'
    # Named tmp_prefix so the stdlib ``tempfile`` module is not shadowed.
    tmp_prefix = 'tempfile'

    if pipe_util.already_step(step_dir, 'merge', logger):
        logger.info('already completed step `bammerge` of: %s' % outbam_name)
        return

    # The original referenced an undefined ``bam_path`` here (and when
    # computing an unused basename), raising NameError on every call.
    logger.info('running step `bammerge` of %s: ' % bam_path_list)
    cmd = [
        'bammerge',
        'O=' + outbam_name,
        'M=' + metrics_name,
        'verbose=0',
        'level=-1',
        'index=1',
        'tmpfile=' + tmp_prefix,
        'SO=coordinate',
    ]
    # The original overwrote a single input string on each loop iteration, so
    # only the last BAM reached the command; a merge needs all of them.
    cmd.extend('I=' + input_bam for input_bam in bam_path_list)

    output = pipe_util.do_command(cmd, logger)
    df = time_util.store_time(uuid, cmd, output, logger)
    df['outbam_name'] = outbam_name
    unique_key_dict = {'uuid': uuid, 'outbam_name': outbam_name}
    table_name = 'time_mem_biobambam_bammerge'
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    pipe_util.create_already_step(step_dir, 'merge', logger)
    logger.info('completed running step `bammerge` of: %s' % outbam_name)
    return
def _trim_single_end_reads(uuid, reads, marker_prefix, fastq_dir, trimmomatic_dir,
                           trimmomatic_path, thread_count, engine, logger):
    """Run trimmomatic in SE mode on each fastq in *reads* and store its stats.

    For every read two resumable steps are performed, each guarded by a marker
    in *trimmomatic_dir*: ``<marker_prefix><read_name>`` for the trim itself
    and ``<marker_prefix><read_name>_db`` for storing the trim log in the
    ``trimmomatic_log`` table.
    """
    step_dir = trimmomatic_dir
    for read in reads:
        read_name, _ = os.path.splitext(read)
        trim_marker = marker_prefix + read_name
        db_marker = trim_marker + '_db'

        if pipe_util.already_step(step_dir, trim_marker, logger):
            logger.info('already completed se trim on %s' % read)
        else:
            logger.info('running step SE `trimmomatic` of: %s' % read)
            fq_in_path = os.path.join(fastq_dir, read)
            fq_out_path = os.path.join(trimmomatic_dir, read)
            # NOTE(review): 'ILLUMINACLIP:' carries no adapter specification
            # here, unlike the PE command -- confirm what should follow it.
            cmd = ['java', '-jar', trimmomatic_path, 'SE',
                   '-threads', str(thread_count), '-phred33',
                   fq_in_path, fq_out_path, 'ILLUMINACLIP:']
            output = pipe_util.do_command(cmd, logger)
            # save time/mem to db
            df = time_util.store_time(uuid, cmd, output, logger)
            df['fastq'] = read
            unique_key_dict = {'uuid': uuid, 'fastq': read}
            table_name = 'time_mem_trimmomatic'
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, trim_marker, logger)
            logger.info('completed running step SE `trimmomatic` of: %s' % fq_in_path)

        # save stats to db
        if pipe_util.already_step(step_dir, db_marker, logger):
            logger.info('already stored SE `trimmomatic` of %s to db' % read)
        else:
            logger.info('storing `trimmomatic` of %s to db' % read)
            # NOTE(review): trimlog_path is not assigned anywhere in the visible
            # code and the SE command does not request a trim log; it must be
            # a module-level name for this call to work -- confirm.
            df = trimmomatic_log_to_df(uuid, read, trimlog_path, logger)
            df['uuid'] = uuid
            table_name = 'trimmomatic_log'
            unique_key_dict = {'uuid': uuid, 'fastq': read}
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, db_marker, logger)
            logger.info('completed storing SE `trimmomatic` of %s to db' % read)


def trimmomatic(uuid, fastq_dir, adapter_pickle_path, thread_count, engine, logger):
    """Trim every fastq found in *fastq_dir* with trimmomatic.

    Trimmed files, diffs and step markers are written to
    ``<fastq_dir>/trimmomatic``. Paired-end reads go through five resumable
    steps (trim, unified diff of trimmed vs. input, diff summary, storing the
    summary in the database, xz compression of the diffs). Single-end, ``o1``
    and ``o2`` reads go through two (trim, storing the trim log). Time/memory
    output of every external command is stored through
    ``df_util.save_df_to_sqlalchemy``.

    Args:
        uuid: Identifier stored alongside every record.
        fastq_dir: Directory containing the input fastq files.
        adapter_pickle_path: Not used by the visible code (see the
            NOTE(review) on the PE command).
        thread_count: Value passed to trimmomatic's ``-threads``.
        engine: Database engine handed to ``df_util.save_df_to_sqlalchemy``.
        logger: Logger used for progress messages and by the helpers.

    Returns:
        None.
    """
    fastq_list = buildfastqlist(fastq_dir)
    # The original used the ``logging`` module here instead of the logger
    # passed in, and began with an argument-less logger.info() call that
    # raises TypeError.
    logger.info('fastqlist=%s' % fastq_list)
    pefastqdict = fastq_util.buildpefastqdict(fastq_list)
    logger.info('pefastqdict=%s' % pefastqdict)
    sefastqlist = fastq_util.buildsefastqlist(fastq_list)
    logger.info('sefastqlist=%s' % sefastqlist)
    o1fastqlist = fastq_util.buildo1fastqlist(fastq_list)
    logger.info('o1fastqlist=%s' % o1fastqlist)
    o2fastqlist = fastq_util.buildo2fastqlist(fastq_list)
    logger.info('o2fastqlist=%s' % o2fastqlist)

    trimmomatic_dir = os.path.join(fastq_dir, 'trimmomatic')
    step_dir = trimmomatic_dir
    home_dir = os.path.expanduser('~')
    os.makedirs(trimmomatic_dir, exist_ok=True)
    # Defined up front: the original only set this inside the PE trim branch,
    # so the single-end loops failed whenever that branch had not just run.
    trimmomatic_path = os.path.join(home_dir, 'tools', 'trimmomatic', 'dist', 'jar',
                                    'trimmomatic.jar')
    fastq_type = 'PE'

    for read1 in sorted(pefastqdict):
        read2 = pefastqdict[read1]
        read1_name, _ = os.path.splitext(read1)
        fq1_in_path = os.path.join(fastq_dir, read1)
        fq2_in_path = os.path.join(fastq_dir, read2)
        fq1_out_path = os.path.join(trimmomatic_dir, read1)
        fq1_unpaired_path = fq1_out_path + 'UP'
        fq2_out_path = os.path.join(trimmomatic_dir, read2)
        fq2_unpaired_path = fq2_out_path + 'UP'
        diff1_path = fq1_out_path + '.diff'
        diff2_path = fq2_out_path + '.diff'
        diff1_name = os.path.basename(diff1_path)
        diff2_name = os.path.basename(diff2_path)

        # generate trim
        if pipe_util.already_step(step_dir, 'trim_pe_' + read1_name, logger):
            logger.info('already completed pe trim on %s' % read1)
        else:
            logger.info('running step PE `trimmomatic` of: %s' % read1)
            # NOTE(review): adapter_path is not assigned anywhere in the visible
            # code (the parameter is adapter_pickle_path, which is never read);
            # it must be a module-level name for this to work -- confirm how
            # the adapter file is meant to be derived.
            cmd = ['java', '-jar', trimmomatic_path, 'PE',
                   '-threads', str(thread_count), '-phred33',
                   fq1_in_path, fq2_in_path,
                   fq1_out_path, fq1_unpaired_path,
                   fq2_out_path, fq2_unpaired_path,
                   'ILLUMINACLIP:' + adapter_path]
            output = pipe_util.do_command(cmd, logger)
            # save time/mem to db
            df = time_util.store_time(uuid, cmd, output, logger)
            df['fastq'] = read1
            unique_key_dict = {'uuid': uuid, 'fastq': read1}
            table_name = 'time_mem_trimmomatic'
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'trim_pe_' + read1_name, logger)
            logger.info('completed step PE `trimmomatic` of: %s' % read1)

        # generate diff
        if pipe_util.already_step(step_dir, 'trim_pe_diff_' + read1_name, logger):
            logger.info('already generated diff of trimmomatic of %s' % read1_name)
        else:
            logger.info('generating PE diff of trimmomatic of %s' % read1_name)
            cmd1 = ['diff', '-u', fq1_out_path, fq1_in_path, '>', diff1_path]
            cmd2 = ['diff', '-u', fq2_out_path, fq2_in_path, '>', diff2_path]
            shell_cmd1 = ' '.join(cmd1)
            # The original assigned shell_cmd1 twice and left shell_cmd2 undefined.
            shell_cmd2 = ' '.join(cmd2)
            # NOTE(review): diff exits with status 1 when the files differ,
            # which is the expected case here -- confirm do_shell_command
            # tolerates that.
            output1 = pipe_util.do_shell_command(shell_cmd1, logger)
            output2 = pipe_util.do_shell_command(shell_cmd2, logger)
            # save time/mem to db
            df1 = time_util.store_time(uuid, cmd1, output1, logger)
            df2 = time_util.store_time(uuid, cmd2, output2, logger)
            df1['diff'] = diff1_name
            df2['diff'] = diff2_name
            df1['fastq_type'] = fastq_type
            df2['fastq_type'] = fastq_type
            unique_key_dict1 = {'uuid': uuid, 'diff': diff1_name, 'fastq_type': fastq_type}
            unique_key_dict2 = {'uuid': uuid, 'diff': diff2_name, 'fastq_type': fastq_type}
            table_name = 'time_mem_trimmomatic_diff'
            df_util.save_df_to_sqlalchemy(df1, unique_key_dict1, table_name, engine, logger)
            df_util.save_df_to_sqlalchemy(df2, unique_key_dict2, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'trim_pe_diff_' + read1_name, logger)
            logger.info('completed generating PE diff of trimmomatic of %s' % read1_name)

        # generate diff stats
        if pipe_util.already_step(step_dir, 'trim_pe_summary_diff_log_' + read1_name, logger):
            logger.info('already completed step `summary stats of diff` of %s' % read1_name)
        else:
            logger.info('running step PE `summary of diff` of %s' % read1_name)
            trimmomatic_summ_met_dir = os.path.dirname(os.path.realpath(__file__))
            trimmomatic_summ_met_path = os.path.join(
                trimmomatic_summ_met_dir, 'trimmomatic_summary_metrics_from_diff.py')
            cmd1 = [trimmomatic_summ_met_path, '-d', diff1_path]
            cmd2 = [trimmomatic_summ_met_path, '-d', diff2_path]
            output1 = pipe_util.do_command(cmd1, logger)
            output2 = pipe_util.do_command(cmd2, logger)
            # save time/mem to db
            df1 = time_util.store_time(uuid, cmd1, output1, logger)
            df2 = time_util.store_time(uuid, cmd2, output2, logger)
            df1['diff'] = diff1_name
            df2['diff'] = diff2_name
            unique_key_dict1 = {'uuid': uuid, 'diff': diff1_name, 'fastq_type': fastq_type}
            # The original assigned unique_key_dict1 twice, so the first frame
            # was saved under the second diff's key and unique_key_dict2 was
            # stale or unbound.
            unique_key_dict2 = {'uuid': uuid, 'diff': diff2_name, 'fastq_type': fastq_type}
            table_name = 'time_mem_trimmomatic_diff_summary'
            df_util.save_df_to_sqlalchemy(df1, unique_key_dict1, table_name, engine, logger)
            df_util.save_df_to_sqlalchemy(df2, unique_key_dict2, table_name, engine, logger)
            pipe_util.create_already_step(
                step_dir, 'trim_pe_summary_diff_log_' + read1_name, logger)

        # save stats to db
        if pipe_util.already_step(step_dir, 'trim_pe_summary_db_' + read1_name + '_db', logger):
            logger.info('already stored PE `trimmomatic` of %s to db' % read1)
        else:
            logger.info('storing `trimmomatic` of %s to db' % read1)
            # NOTE(review): trimlog_path is not assigned anywhere in the visible
            # code and the PE command does not request a trim log; it must be
            # a module-level name for this call to work -- confirm.
            df = trimmomatic_diff_summary_to_df(uuid, read1, trimlog_path, logger)
            df['uuid'] = uuid
            table_name = 'trimmomatic_diff_summary'
            # The original built two unused dicts (one from an undefined
            # read2_name) and then saved with unique_key_dict, which is unbound
            # unless the trim step ran in this same iteration. This is the
            # value it held in that case.
            unique_key_dict = {'uuid': uuid, 'fastq': read1}
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(
                step_dir, 'trim_pe_summary_db_' + read1_name + '_db', logger)
            logger.info('completed storing PE `trimmomatic` of %s to db' % read1)

        # compress diff
        if pipe_util.already_step(step_dir, 'xz_pe_diff_' + read1_name, logger):
            logger.info('already compressed PE diff: %s' % diff1_name)
        else:
            logger.info('compressing PE diff: %s' % diff1_name)
            cmd1 = ['xz', '-9', diff1_path]
            cmd2 = ['xz', '-9', diff2_path]
            output1 = pipe_util.do_command(cmd1, logger)
            output2 = pipe_util.do_command(cmd2, logger)
            # save time/mem to db
            df1 = time_util.store_time(uuid, cmd1, output1, logger)
            df2 = time_util.store_time(uuid, cmd2, output2, logger)
            df1['diff'] = diff1_name
            df2['diff'] = diff2_name
            unique_key_dict1 = {'uuid': uuid, 'diff': diff1_name, 'fastq_type': fastq_type}
            unique_key_dict2 = {'uuid': uuid, 'diff': diff2_name, 'fastq_type': fastq_type}
            table_name = 'time_mem_diff_xz'
            df_util.save_df_to_sqlalchemy(df1, unique_key_dict1, table_name, engine, logger)
            df_util.save_df_to_sqlalchemy(df2, unique_key_dict2, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'xz_pe_diff_' + read1_name, logger)
            logger.info('completed compressing PE diff: %s' % diff1_name)

    # The three single-end flavours differ only in their marker prefix. The
    # original o2 loop reused the 'trim_se_' prefix for its db marker; it now
    # gets 'trim_o2_' like its trim marker.
    for reads, marker_prefix in ((sefastqlist, 'trim_se_'),
                                 (o1fastqlist, 'trim_o1_'),
                                 (o2fastqlist, 'trim_o2_')):
        _trim_single_end_reads(uuid, reads, marker_prefix, fastq_dir, trimmomatic_dir,
                               trimmomatic_path, thread_count, engine, logger)