def bammarkduplicates(uuid, bam_path, input_state, cpu_count, engine, logger):
    """Run biobambam `bammarkduplicates` on a BAM and record time/memory usage.

    The step is resumable: it is skipped when `pipe_util.already_step` finds
    the `md` marker in the current working directory, and the marker is
    written after the timing row has been saved.

    Args:
        uuid: Identifier stored with the timing record.
        bam_path: Path of the input BAM.
        input_state: Label stored in the timing record's `input_state` column.
        cpu_count: Value passed to the tool as `markthreads=`.
        engine: Database handle forwarded to `df_util.save_df_to_sqlalchemy`.
        logger: Logger used for progress messages.

    Returns:
        None. The output BAM and `<bam name>.metrics` are written with
        relative names, i.e. into the current working directory.
    """
    step_dir = os.getcwd()
    bam_name = os.path.basename(bam_path)
    metrics_name = bam_name + ".metrics"
    tempfile = "tempfile"
    # The marker must be looked up in the same directory it is created in.
    if pipe_util.already_step(step_dir, "md", logger):
        logger.info("already completed step `bammarkduplicates` of: %s" % bam_name)
    else:
        logger.info("running step `bammarkduplicates` of %s: " % bam_name)
        cmd = [
            "bammarkduplicates",
            "I=" + bam_path,
            "O=" + bam_name,
            "M=" + metrics_name,
            "verbose=0",
            "level=-1",
            "index=1",
            "tmpfile=" + tempfile,
            "markthreads=" + str(cpu_count),
        ]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df["bam_name"] = bam_name
        # Same column name the other steps in this file use for this value.
        df["input_state"] = input_state
        unique_key_dict = {"uuid": uuid, "bam_name": bam_name}
        table_name = "time_mem_biobambam_bammarkduplicates"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, "md", logger)
        logger.info("completed running step `bammarkduplicates` of: %s" % bam_name)
    return
def picard_sortvcf(case_id, vcf_path, output_vcf, reference_fasta_dict, engine, logger):
    """Sort one or more VCFs with Picard `SortVcf` and store run metrics.

    The step is resumable through a `<case_id>_SortVcf` marker in the current
    working directory.

    Args:
        case_id: Identifier stored with the metrics and used in the marker name.
        vcf_path: Iterable of input VCF paths; each becomes an `I=` argument.
        output_vcf: File name of the sorted VCF, created in the working directory.
        reference_fasta_dict: Path passed as `SEQUENCE_DICTIONARY=`.
        engine: Database handle forwarded to the `postgres` helpers.
        logger: Logger used for progress messages.

    Returns:
        Path of the sorted VCF (returned even when the step was skipped).
    """
    files = vcf_path
    step_dir = os.getcwd()
    os.makedirs(step_dir, exist_ok=True)
    srt_vcf_path = os.path.join(step_dir, output_vcf)
    logger.info("picard_sortvcf_output=%s" % srt_vcf_path)
    mem_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES")
    mem_gib = mem_bytes / (1024.0 ** 3)
    # Leave two cores free, but never hand the JVM a zero or negative count.
    thread = max(1, int(multiprocessing.cpu_count()) - 2)
    # Heap is total memory minus half a GiB per GC thread, floored at 1G.
    java_heap = max(1, int(int(mem_gib) - thread / 2))
    # One marker name for both the check and the creation; the original
    # checked `_sortvcf` but wrote `_SortVcf`, so the step never resumed.
    step_name = case_id + "_SortVcf"
    if pipe_util.already_step(step_dir, step_name, logger):
        logger.info("already completed step `sortvcf` of: %s" % vcf_path)
    else:
        logger.info("running step `picard SortVcf` of: %s" % vcf_path)
        home_dir = os.path.join("/home", getpass.getuser())
        cmd = [
            "java",
            "-Djava.io.tmpdir=/tmp/job_tmp",
            "-XX:ParallelGCThreads=" + str(thread),
            "-Xmx" + str(java_heap) + "G",
            "-d64",
            "-jar",
            os.path.join(home_dir, "tools/picard-tools/picard.jar"),
            "SortVcf",
            "OUTPUT=" + srt_vcf_path,
            "SEQUENCE_DICTIONARY=" + reference_fasta_dict,
        ]
        cmd.extend("I=" + single_vcf for single_vcf in vcf_path)
        output = pipe_util.do_command(cmd, logger)
        metrics = time_util.parse_time(output)
        met = SRT(
            case_id=case_id,
            tool="picard_sortvcf",
            files=files,
            systime=metrics["system_time"],
            usertime=metrics["user_time"],
            elapsed=metrics["wall_clock"],
            cpu=metrics["percent_of_cpu"],
            max_resident_time=metrics["maximum_resident_set_size"],
        )
        postgres.create_table(engine, met)
        postgres.add_metrics(engine, met)
        pipe_util.create_already_step(step_dir, step_name, logger)
        logger.info("completed running step sortvcf of: %s" % vcf_path)
    return srt_vcf_path
def splitbam(uuid, bam_path, engine, logger):
    """Run bamUtil `bam splitBam` on a BAM and record time/memory usage.

    The step is resumable through a `splitbam` marker in the current working
    directory. Output uses the relative prefix `split` and the log file
    `listFile.log`.

    Args:
        uuid: Identifier stored with the timing record.
        bam_path: Path of the BAM to split; also stored in the timing record.
        engine: Database handle forwarded to `df_util.save_df_to_sqlalchemy`.
        logger: Logger used for progress messages.

    Returns:
        None.
    """
    step_dir = os.getcwd()
    if pipe_util.already_step(step_dir, 'splitbam', logger):
        logger.info('already completed step `splitbam` of: %s' % bam_path)
    else:
        # The message previously named `bamtofastq`, which is a different step.
        logger.info('running step `splitbam` of %s: ' % bam_path)
        log_path = 'listFile.log'
        out_path = 'split'
        cmd = ['bam', 'splitBam', '--in', bam_path, '--out', out_path, '--log', log_path]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = bam_path
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path}
        table_name = 'time_mem_bamutil_splitbam'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, 'splitbam', logger)
        logger.info('completed running step `splitbam` of: %s' % bam_path)
    return
# Example #4
# 0
def fastqc(uuid, fastq_path, thread_count, engine, logger):
    """Run FastQC on a single FASTQ file and record time/memory usage.

    Skipped when the `fastqc_<fastq stem>` marker already exists in the
    current working directory; otherwise FastQC is run with the working
    directory as both its output and temporary directory, the timing row is
    saved to `time_mem_fastqc`, and the marker is created.

    Args:
        uuid: Identifier stored with the timing record.
        fastq_path: Path of the FASTQ file to analyse.
        thread_count: Value passed to FastQC as `--threads`.
        engine: Database handle forwarded to `df_util.save_df_to_sqlalchemy`.
        logger: Logger used for progress messages.

    Returns:
        None.
    """
    work_dir = os.getcwd()
    fastq_file = os.path.basename(fastq_path)
    stem = os.path.splitext(fastq_file)[0]
    marker = 'fastqc_' + stem

    if pipe_util.already_step(work_dir, marker, logger):
        logger.info('already completed step `fastqc`: %s' % fastq_path)
        return

    logger.info('running step `fastqc` of %s' % fastq_path)
    # cwltool sets HOME to /var/spool/cwl, so the home directory is spelled out
    user_home = os.path.join('/home', getpass.getuser())
    fastqc_exe = os.path.join(user_home, 'tools', 'FastQC', 'fastqc')

    cmd = [fastqc_exe, '--threads', str(thread_count), '--noextract', fastq_path]
    cmd += ['--outdir', work_dir, '--dir', work_dir]
    run_output = pipe_util.do_command(cmd, logger)

    timing = time_util.store_time(uuid, cmd, run_output, logger)
    timing['fastq_name'] = fastq_file
    key_columns = {'uuid': uuid, 'fastq_name': fastq_file}
    df_util.save_df_to_sqlalchemy(timing, key_columns, 'time_mem_fastqc', engine, logger)
    pipe_util.create_already_step(work_dir, marker, logger)
    return
def bamvalidate(uuid, bam_path, input_state, cpu_count, engine, logger):
    """Run biobambam `bamvalidate` on a BAM and record time/memory usage.

    The step is resumable through a `<bam name>_bamvalidate` marker in the
    current working directory.

    Args:
        uuid: Identifier stored with the timing record.
        bam_path: Path of the BAM to validate.
        input_state: Accepted for signature parity with the other steps; not
            used by this function.
        cpu_count: Number of available CPUs; half (at least 1) is given to
            each of `inputthreads=` and `outputthreads=`.
        engine: Database handle forwarded to `df_util.save_df_to_sqlalchemy`.
        logger: Logger used for progress messages.

    Returns:
        None.
    """
    step_dir = os.getcwd()
    bam_name = os.path.basename(bam_path)
    tmpfile = 'tmpfile'
    # Floor division: `/` yields a float under Python 3, which would render
    # as e.g. `inputthreads=4.0`. Never go below one thread.
    inputthreads = str(max(1, int(cpu_count) // 2))
    outputthreads = inputthreads
    if pipe_util.already_step(step_dir, bam_name + '_bamvalidate', logger):
        logger.info('already completed step `bamvalidate` of: %s' % bam_name)
    else:
        # The tool invoked is `bamvalidate`, not a Picard command.
        logger.info('running step `bamvalidate` of: %s' % bam_name)
        cmd = ['bamvalidate', 'verbose=1', 'I=' + bam_path, 'tmpfile=' + tmpfile,
               'inputthreads=' + inputthreads, 'outputthreads=' + outputthreads]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_name'] = bam_name
        unique_key_dict = {'uuid': uuid, 'bam_name': bam_name}
        table_name = 'time_mem_biobambam_bamvalidate'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, bam_name + '_bamvalidate', logger)
        logger.info('completed running step `bamvalidate` of: %s' % bam_name)
    return
def bamindex(uuid, bam_path, input_state, engine, logger):
    """Build a `.bai` index with biobambam `bamindex` and record time/memory.

    The index is written as `<bam stem>.bai` using a relative name. The step
    is resumable through a `<bam name>_bamindex` marker in the current
    working directory.

    Args:
        uuid: Identifier stored with the timing record.
        bam_path: Path of the BAM to index.
        input_state: Label stored in the timing record's `input_state` column.
        engine: Database handle forwarded to `df_util.save_df_to_sqlalchemy`.
        logger: Logger used for progress messages.

    Returns:
        None.
    """
    step_dir = os.getcwd()
    bam_name = os.path.basename(bam_path)
    bam_base, _ = os.path.splitext(bam_name)
    bai_name = bam_base + '.bai'
    if pipe_util.already_step(step_dir, bam_name + '_bamindex', logger):
        logger.info('already completed step `bamindex` of: %s' % bam_name)
    else:
        # The tool invoked is `bamindex`, not Picard BuildBamIndex.
        logger.info('running step `bamindex` of: %s' % bam_name)
        cmd = ['bamindex', 'verbose=0', 'disablevalidation=1', 'I=' + bam_path, 'O=' + bai_name]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_name'] = bam_name
        df['input_state'] = input_state
        unique_key_dict = {'uuid': uuid, 'bam_name': bam_name}
        table_name = 'time_mem_biobambam_bamindex'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, bam_name + '_bamindex', logger)
        logger.info('completed running step `bamindex` of: %s' % bam_name)
    return
# Example #7
# 0
def index(uuid, cram_path, reference_fasta_path, engine, logger):
  """Build a BAM-style index for a CRAM with cramtools and record time/memory.

  The step is resumable through a `<uuid>_cram index` marker in the current
  working directory.

  Args:
    uuid: Identifier stored with the timing record and used in the marker name.
    cram_path: Path of the CRAM file to index.
    reference_fasta_path: Reference FASTA passed as `--reference-fasta-file`.
    engine: Database handle forwarded to `df_util.save_df_to_sqlalchemy`.
    logger: Logger used for progress messages.

  Returns:
    None.
  """
  step_dir = os.getcwd()
  cram_name = os.path.basename(cram_path)
  # Recorded in the timing row only; it is not passed to cramtools.
  output_bai = os.path.join(step_dir, cram_name) + '.bai'
  # One marker name for both the check and the creation; the original checked
  # `uuid + 'cram index'` but wrote `uuid + '_cram index'`, so it never resumed.
  step_name = uuid + '_cram index'
  if pipe_util.already_step(step_dir, step_name, logger):
    logger.info('already completed step `cram index` of: %s' % cram_path)
  else:
    logger.info('running step `cram index` of: %s' % cram_path)
    home_dir = os.path.expanduser('~')
    cramtools_path = os.path.join(home_dir, 'tools/cramtools-3.0.jar')
    cmd = ['java', '-Djava.io.tmpdir=/tmp/job_tmp', '-d64', '-jar', cramtools_path,
           'index', '--bam-style-index', '--input-file', cram_path,
           '--reference-fasta-file', reference_fasta_path]
    output = pipe_util.do_command(cmd, logger)
    df = time_util.store_time(uuid, cmd, output, logger)
    df['cram_path'] = cram_path
    df['output_bai'] = output_bai
    table_name = 'time_mem_cram_bam_style_index'
    unique_key_dict = {'uuid': uuid, 'cram_path': cram_path, 'output_bai': output_bai}
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    pipe_util.create_already_step(step_dir, step_name, logger)
    logger.info('completed running step `cram index` of: %s' % cram_path)
  return
def combinevcf(uuid, vcf_path_list, reference_fasta_path, thread_count, engine, logger):
  """Merge VCFs with GATK `CombineVariants` and record time/memory usage.

  The merged file is written to `<cwd>/<uuid>_PON.vcf`. The step is resumable
  through a `<uuid>_CombineVariants` marker in the current working directory.

  Args:
    uuid: Identifier used in the output name, marker name and timing record.
    vcf_path_list: Iterable of input VCF paths; each is passed with `-V`.
    reference_fasta_path: Reference FASTA passed as `-R`.
    thread_count: Value passed as `-nt`.
    engine: Database handle forwarded to `df_util.save_df_to_sqlalchemy`.
    logger: Logger used for progress messages.

  Returns:
    None.
  """
  step_dir = os.getcwd()
  output_pon_vcf = os.path.join(step_dir, uuid) + '_PON.vcf'
  if pipe_util.already_step(step_dir, uuid + '_CombineVariants', logger):
    logger.info('already completed step `CombineVariants` of: %s' % vcf_path_list)
  else:
    logger.info('running step `CombineVariants` of: %s' % vcf_path_list)
    home_dir = os.path.expanduser('~')
    gatk_path = os.path.join(home_dir, 'tools/GenomeAnalysisTK.jar')
    # Each option and its value is a separate argv element. A fused element
    # such as '-minN 2' reaches the JVM as one unknown token when the list is
    # executed without a shell, and the literal quotes in '--setKey "null"'
    # would become part of the value.
    cmd = [
      'java', '-Djava.io.tmpdir=/tmp/job_tmp', '-d64', '-jar', gatk_path,
      '-T', 'CombineVariants',
      '-nt', str(thread_count),
      '-R', reference_fasta_path,
      '-minN', '2',
      '--setKey', 'null',
      '--filteredAreUncalled',
      '--filteredrecordsmergetype', 'KEEP_IF_ANY_UNFILTERED',
      '-o', output_pon_vcf,
    ]
    for vcf_path in vcf_path_list:
      cmd.extend(['-V', vcf_path])
    output = pipe_util.do_command(cmd, logger)
    df = time_util.store_time(uuid, cmd, output, logger)
    df['output_pon_vcf'] = output_pon_vcf
    table_name = 'time_mem_gatk_CombineVariants'
    unique_key_dict = {'uuid': uuid, 'output_pon_vcf': output_pon_vcf}
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    pipe_util.create_already_step(step_dir, uuid + '_CombineVariants', logger)
    logger.info('completed running step `CombineVariants` of: %s' % vcf_path_list)
  return
def bamtofastq(uuid, bam_path, input_state, engine, logger):
    """Convert a BAM to per-read-group gzipped FASTQs with biobambam2.

    Skipped when the `fastq` marker already exists in the current working
    directory; otherwise `bamtofastq` is run with the working directory as
    its output directory, the timing row is saved to
    `time_mem_biobambam_bamtofastq`, and the marker is created.

    Args:
        uuid: Identifier stored with the timing record.
        bam_path: Path of the BAM to convert.
        input_state: Label stored in the timing record's `input_state` column.
        engine: Database handle forwarded to `df_util.save_df_to_sqlalchemy`.
        logger: Logger used for progress messages.

    Returns:
        None.
    """
    work_dir = os.getcwd()
    bam_file = os.path.basename(bam_path)

    if pipe_util.already_step(work_dir, 'fastq', logger):
        logger.info('already completed step `bamtofastq` of: %s' % bam_file)
        return

    logger.info('running step `bamtofastq` of %s: ' % bam_file)
    # cwltool sets HOME to /var/spool/cwl, so the home directory is spelled out
    user_home = os.path.join('/home', getpass.getuser())
    scratch_prefix = os.path.join(work_dir, 'tempfq')
    executable = os.path.join(user_home, 'tools', 'biobambam2', 'bin', 'bamtofastq')

    # Suffixes appended to each read group's output files.
    suffix_options = [
        'outputperreadgroupsuffixF=_1.fq.gz',
        'outputperreadgroupsuffixF2=_2.fq.gz',
        'outputperreadgroupsuffixO=_o1.fq.gz',
        'outputperreadgroupsuffixO2=_o2.fq.gz',
        'outputperreadgroupsuffixS=_s.fq.gz',
    ]
    cmd = [
        executable,
        'filename=' + bam_path,
        'outputdir=' + work_dir,
        'tryoq=1',
        'collate=1',
        'outputperreadgroup=1',
        'T=' + scratch_prefix,
        'gz=1',
        'level=1',
    ]
    cmd += suffix_options
    cmd.append('exclude=QCFAIL,SECONDARY,SUPPLEMENTARY')

    run_output = pipe_util.do_command(cmd, logger)
    timing = time_util.store_time(uuid, cmd, run_output, logger)
    timing['bam_name'] = bam_file
    timing['input_state'] = input_state
    key_columns = {'uuid': uuid, 'bam_name': bam_file}
    df_util.save_df_to_sqlalchemy(timing, key_columns, 'time_mem_biobambam_bamtofastq', engine, logger)
    pipe_util.create_already_step(work_dir, 'fastq', logger)
    logger.info('completed running step `bamtofastq` of: %s' % bam_file)
    return
def bammarkduplicates2(uuid, bam_path, input_state, cpu_count, engine, logger):
    """Run biobambam `bammarkduplicates2` on a BAM and record time/memory usage.

    The step is resumable through an `md2` marker in the current working
    directory.

    Args:
        uuid: Identifier stored with the timing record.
        bam_path: Path of the input BAM.
        input_state: Label stored in the timing record's `input_state` column.
        cpu_count: Value passed to the tool as `markthreads=`.
        engine: Database handle forwarded to `df_util.save_df_to_sqlalchemy`.
        logger: Logger used for progress messages.

    Returns:
        None. The output BAM and `<bam name>.metrics` are written with
        relative names, i.e. into the current working directory.
    """
    step_dir = os.getcwd()
    bam_name = os.path.basename(bam_path)
    metrics_name = bam_name + '.metrics'
    tempfile = 'tempfile'
    # `work_dir` was never defined in this function; the directory in use is step_dir.
    logger.info('work_dir is: %s' % step_dir)
    if pipe_util.already_step(step_dir, 'md2', logger):
        logger.info('already completed step `bammarkduplicates2` of: %s' % bam_name)
    else:
        logger.info('running step `bammarkduplicates2` of %s: ' % bam_name)
        cmd = ['bammarkduplicates2', 'I=' + bam_path, 'O=' + bam_name, 'M=' + metrics_name,
               'verbose=0', 'level=-1', 'index=1', 'tmpfile=' + tempfile,
               'markthreads=' + str(cpu_count)]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_name'] = bam_name
        # Same column name the other steps in this file use for this value.
        df['input_state'] = input_state
        unique_key_dict = {'uuid': uuid, 'bam_name': bam_name}
        table_name = 'time_mem_biobambam_bammarkduplicates2'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, 'md2', logger)
        logger.info('completed running step `bammarkduplicates2` of: %s' % bam_name)
    return
def bammerge(uuid, outbam_name, bam_path_list, reference_fasta_path, cpu_count, engine, logger):
    """Merge several BAMs with biobambam `bammerge` and record time/memory usage.

    The step is resumable through a `merge` marker in the current working
    directory.

    Args:
        uuid: Identifier stored with the timing record.
        outbam_name: Name of the merged BAM, passed as `O=`.
        bam_path_list: Iterable of input BAM paths; each becomes an `I=` argument.
        reference_fasta_path: Accepted for signature compatibility; not used here.
        cpu_count: Accepted for signature compatibility; not used here.
        engine: Database handle forwarded to `df_util.save_df_to_sqlalchemy`.
        logger: Logger used for progress messages.

    Returns:
        None.

    Raises:
        ValueError: If the step has to run and `bam_path_list` is empty.
    """
    step_dir = os.getcwd()
    metrics_name = outbam_name + '.metrics'
    tempfile = 'tempfile'
    if pipe_util.already_step(step_dir, 'merge', logger):
        logger.info('already completed step `bammerge` of: %s' % outbam_name)
    else:
        input_bams = list(bam_path_list)
        if not input_bams:
            raise ValueError('bammerge requires at least one input BAM')
        logger.info('running step `bammerge` of %s: ' % ', '.join(input_bams))
        cmd = ['bammerge', 'O=' + outbam_name, 'M=' + metrics_name, 'verbose=0', 'level=-1',
               'index=1', 'tmpfile=' + tempfile, 'SO=coordinate']
        # One `I=` argument per input. The original overwrote a single
        # variable in its loop, so only the last BAM was ever merged.
        cmd.extend('I=' + input_bam for input_bam in input_bams)
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['outbam_name'] = outbam_name
        unique_key_dict = {'uuid': uuid, 'outbam_name': outbam_name}
        table_name = 'time_mem_biobambam_bammerge'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, 'merge', logger)
        logger.info('completed running step `bammerge` of: %s' % outbam_name)
    return
def trimmomatic(uuid, fastq_dir, adapter_pickle_path, thread_count, engine, logger):
    """Trim every FASTQ found for `fastq_dir` with Trimmomatic and record metrics.

    Paired-end reads (keys of `fastq_util.buildpefastqdict`) go through five
    resumable sub-steps: trim, `diff -u` of trimmed vs. input, summary of the
    diff, storing that summary to the database, and xz compression of the
    diffs. Single-end, o1 and o2 reads go through two: trim, and storing the
    trim log to the database. Every sub-step is guarded by a
    `pipe_util.already_step` marker inside `<fastq_dir>/trimmomatic`.

    Args:
        uuid: Identifier stored with every record.
        fastq_dir: Directory holding the input FASTQs; output goes to its
            `trimmomatic` subdirectory.
        adapter_pickle_path: Not read by this function (see note below).
        thread_count: Value passed to Trimmomatic as `-threads`.
        engine: Database handle forwarded to `df_util.save_df_to_sqlalchemy`.
        logger: Logger used for progress messages.

    NOTE(review): `buildfastqlist`, `adapter_path`, `trimlog_path`,
    `trimmomatic_diff_summary_to_df` and `trimmomatic_log_to_df` are not
    defined in this function. They must resolve at module level or the
    function raises NameError. `adapter_path` was presumably meant to be
    derived from `adapter_pickle_path`, and a complete ILLUMINACLIP argument
    normally also carries clipping thresholds -- confirm before relying on it.
    """
    logger.info('running `trimmomatic` on fastq_dir=%s' % fastq_dir)
    # NOTE(review): the sibling builders live in `fastq_util`; confirm this
    # unqualified name is importable here.
    fastq_list = buildfastqlist(fastq_dir)
    logger.info('fastqlist=%s' % fastq_list)
    pefastqdict = fastq_util.buildpefastqdict(fastq_list)
    logger.info('pefastqdict=%s' % pefastqdict)
    sefastqlist = fastq_util.buildsefastqlist(fastq_list)
    logger.info('sefastqlist=%s' % sefastqlist)
    o1fastqlist = fastq_util.buildo1fastqlist(fastq_list)
    logger.info('o1fastqlist=%s' % o1fastqlist)
    o2fastqlist = fastq_util.buildo2fastqlist(fastq_list)
    logger.info('o2fastqlist=%s' % o2fastqlist)
    trimmomatic_dir = os.path.join(fastq_dir, 'trimmomatic')
    step_dir = trimmomatic_dir

    home_dir = os.path.expanduser('~')
    # Resolved once up front: the single-end loops need it even when there
    # are no paired-end reads or the paired-end trim was already completed.
    trimmomatic_path = os.path.join(home_dir, 'tools', 'trimmomatic', 'dist', 'jar', 'trimmomatic.jar')
    # Command lists must contain strings only.
    thread_arg = str(thread_count)
    os.makedirs(trimmomatic_dir, exist_ok=True)

    for read1 in sorted(pefastqdict.keys()):
        read1_name, read1_ext = os.path.splitext(read1)
        read2 = pefastqdict[read1]
        fq1_in_path = os.path.join(fastq_dir, read1)
        fq2_in_path = os.path.join(fastq_dir, read2)
        fq1_out_path = os.path.join(trimmomatic_dir, read1)
        fq1_unpaired_path = fq1_out_path + 'UP'
        fq2_out_path = os.path.join(trimmomatic_dir, read2)
        fq2_unpaired_path = fq2_out_path + 'UP'
        diff1_path = fq1_out_path + '.diff'
        diff2_path = fq2_out_path + '.diff'
        diff1_name = os.path.basename(diff1_path)
        diff2_name = os.path.basename(diff2_path)
        fastq_type = 'PE'

        # generate trim
        if pipe_util.already_step(step_dir, 'trim_pe_' + read1_name, logger):
            logger.info('already completed pe trim on %s' % read1)
        else:
            logger.info('running step PE `trimmomatic` of: %s' % read1)
            cmd = ['java', '-jar', trimmomatic_path, 'PE', '-threads', thread_arg, '-phred33',
                   fq1_in_path, fq2_in_path, fq1_out_path, fq1_unpaired_path,
                   fq2_out_path, fq2_unpaired_path, 'ILLUMINACLIP:' + adapter_path]
            output = pipe_util.do_command(cmd, logger)

            # save time/mem to db
            df = time_util.store_time(uuid, cmd, output, logger)
            df['fastq'] = read1
            unique_key_dict = {'uuid': uuid, 'fastq': read1}
            table_name = 'time_mem_trimmomatic'
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'trim_pe_' + read1_name, logger)
            logger.info('completed step PE `trimmomatic` of: %s' % read1)

        # generate diff
        if pipe_util.already_step(step_dir, 'trim_pe_diff_' + read1_name, logger):
            logger.info('already generated diff of trimmomatic of %s' % read1_name)
        else:
            logger.info('generating PE diff of trimmomatic of %s' % read1_name)
            cmd1 = ['diff', '-u', fq1_out_path, fq1_in_path, '>', diff1_path]
            cmd2 = ['diff', '-u', fq2_out_path, fq2_in_path, '>', diff2_path]
            # The redirection needs a shell, so each list is joined to a string.
            shell_cmd1 = ' '.join(cmd1)
            shell_cmd2 = ' '.join(cmd2)
            output1 = pipe_util.do_shell_command(shell_cmd1, logger)
            output2 = pipe_util.do_shell_command(shell_cmd2, logger)

            # save time/mem to db
            df1 = time_util.store_time(uuid, cmd1, output1, logger)
            df2 = time_util.store_time(uuid, cmd2, output2, logger)
            df1['diff'] = diff1_name
            df2['diff'] = diff2_name
            df1['fastq_type'] = fastq_type
            df2['fastq_type'] = fastq_type
            unique_key_dict1 = {'uuid': uuid, 'diff': diff1_name, 'fastq_type': fastq_type}
            unique_key_dict2 = {'uuid': uuid, 'diff': diff2_name, 'fastq_type': fastq_type}
            table_name = 'time_mem_trimmomatic_diff'
            df_util.save_df_to_sqlalchemy(df1, unique_key_dict1, table_name, engine, logger)
            df_util.save_df_to_sqlalchemy(df2, unique_key_dict2, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'trim_pe_diff_' + read1_name, logger)
            logger.info('completed generating PE diff of trimmomatic of %s' % read1_name)

        # generate diff stats
        if pipe_util.already_step(step_dir, 'trim_pe_summary_diff_log_' + read1_name, logger):
            logger.info('already completed step `summary stats of diff` of %s' % read1_name)
        else:
            logger.info('running step PE `summary of diff` of %s' % read1_name)
            # The summary script is expected next to this module.
            trimmomatic_summ_met_dir = os.path.dirname(os.path.realpath(__file__))
            trimmomatic_summ_met_path = os.path.join(trimmomatic_summ_met_dir, 'trimmomatic_summary_metrics_from_diff.py')
            cmd1 = [trimmomatic_summ_met_path, '-d', diff1_path]
            cmd2 = [trimmomatic_summ_met_path, '-d', diff2_path]
            output1 = pipe_util.do_command(cmd1, logger)
            output2 = pipe_util.do_command(cmd2, logger)

            # save time/mem to db
            df1 = time_util.store_time(uuid, cmd1, output1, logger)
            df2 = time_util.store_time(uuid, cmd2, output2, logger)
            df1['diff'] = diff1_name
            df2['diff'] = diff2_name
            # The unique key names `fastq_type`, so the frame carries it too,
            # matching the diff step above.
            df1['fastq_type'] = fastq_type
            df2['fastq_type'] = fastq_type
            unique_key_dict1 = {'uuid': uuid, 'diff': diff1_name, 'fastq_type': fastq_type}
            unique_key_dict2 = {'uuid': uuid, 'diff': diff2_name, 'fastq_type': fastq_type}
            table_name = 'time_mem_trimmomatic_diff_summary'
            df_util.save_df_to_sqlalchemy(df1, unique_key_dict1, table_name, engine, logger)
            df_util.save_df_to_sqlalchemy(df2, unique_key_dict2, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'trim_pe_summary_diff_log_' + read1_name, logger)

        # save stats to db
        if pipe_util.already_step(step_dir, 'trim_pe_summary_db_' + read1_name + '_db', logger):
            logger.info('already stored PE `trimmomatic` of %s to db' % read1)
        else:
            logger.info('storing `trimmomatic` of %s to db' % read1)
            # NOTE(review): `trimlog_path` is not defined in this function -- confirm its source.
            df = trimmomatic_diff_summary_to_df(uuid, read1, trimlog_path, logger)
            df['uuid'] = uuid
            table_name = 'trimmomatic_diff_summary'
            # Only one frame (for read1) is produced, so only its key is used.
            unique_key_dict1 = {'uuid': uuid, 'fastq': read1_name}
            df_util.save_df_to_sqlalchemy(df, unique_key_dict1, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'trim_pe_summary_db_' + read1_name + '_db', logger)
            logger.info('completed storing PE `trimmomatic` of %s to db' % read1)

        # compress diff
        if pipe_util.already_step(step_dir, 'xz_pe_diff_' + read1_name, logger):
            logger.info('already compressed PE diff: %s' % diff1_name)
        else:
            logger.info('compressing PE diff: %s' % diff1_name)
            cmd1 = ['xz', '-9', diff1_path]
            cmd2 = ['xz', '-9', diff2_path]
            output1 = pipe_util.do_command(cmd1, logger)
            output2 = pipe_util.do_command(cmd2, logger)

            # save time/mem to db
            df1 = time_util.store_time(uuid, cmd1, output1, logger)
            df2 = time_util.store_time(uuid, cmd2, output2, logger)
            df1['diff'] = diff1_name
            df2['diff'] = diff2_name
            df1['fastq_type'] = fastq_type
            df2['fastq_type'] = fastq_type
            unique_key_dict1 = {'uuid': uuid, 'diff': diff1_name, 'fastq_type': fastq_type}
            unique_key_dict2 = {'uuid': uuid, 'diff': diff2_name, 'fastq_type': fastq_type}
            table_name = 'time_mem_diff_xz'
            df_util.save_df_to_sqlalchemy(df1, unique_key_dict1, table_name, engine, logger)
            df_util.save_df_to_sqlalchemy(df2, unique_key_dict2, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'xz_pe_diff_' + read1_name, logger)
            logger.info('completed compressing PE diff: %s' % diff1_name)

    # The single-end, o1 and o2 reads are handled identically apart from the
    # marker prefix. Each group keeps its own prefix for both markers; the o2
    # database marker previously reused `trim_se_`.
    single_end_groups = (
        ('trim_se_', sefastqlist),
        ('trim_o1_', o1fastqlist),
        ('trim_o2_', o2fastqlist),
    )
    for step_prefix, read_list in single_end_groups:
        for seread in read_list:
            read_name, read_ext = os.path.splitext(seread)
            if pipe_util.already_step(step_dir, step_prefix + read_name, logger):
                logger.info('already completed se trim on %s' % seread)
            else:
                logger.info('running step SE `trimmomatic` of: %s' % seread)
                fq_in_path = os.path.join(fastq_dir, seread)
                fq_out_path = os.path.join(trimmomatic_dir, seread)
                # Same adapter argument as the paired-end command; the
                # original passed a bare 'ILLUMINACLIP:' here.
                cmd = ['java', '-jar', trimmomatic_path, 'SE', '-threads', thread_arg, '-phred33',
                       fq_in_path, fq_out_path, 'ILLUMINACLIP:' + adapter_path]
                output = pipe_util.do_command(cmd, logger)

                # save time/mem to db
                df = time_util.store_time(uuid, cmd, output, logger)
                df['fastq'] = seread
                unique_key_dict = {'uuid': uuid, 'fastq': seread}
                table_name = 'time_mem_trimmomatic'
                df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
                pipe_util.create_already_step(step_dir, step_prefix + read_name, logger)
                logger.info('completed running step SE `trimmomatic` of: %s' % fq_in_path)

            # save stats to db
            if pipe_util.already_step(step_dir, step_prefix + read_name + '_db', logger):
                logger.info('already stored SE `trimmomatic` of %s to db' % seread)
            else:
                logger.info('storing `trimmomatic` of %s to db' % seread)
                # NOTE(review): `trimlog_path` is not defined in this function -- confirm its source.
                df = trimmomatic_log_to_df(uuid, seread, trimlog_path, logger)
                df['uuid'] = uuid
                table_name = 'trimmomatic_log'
                unique_key_dict = {'uuid': uuid, 'fastq': seread}
                df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
                pipe_util.create_already_step(step_dir, step_prefix + read_name + '_db', logger)
                logger.info('completed storing SE `trimmomatic` of %s to db' % seread)