def bammarkduplicates(uuid, bam_path, input_state, cpu_count, engine, logger):
    """Run `bammarkduplicates` on a BAM and record the run's time/memory row.

    The step is skipped when `pipe_util.already_step` reports the ``md`` step
    as done for the current working directory. Otherwise the command is run,
    its output is handed to `time_util.store_time`, the resulting frame is
    saved to the ``time_mem_biobambam_bammarkduplicates`` table, and the
    ``md`` step marker is created.

    Args:
        uuid: identifier stored with the timing row.
        bam_path: path of the input BAM (`I=`); its basename is reused as the
            relative output name (`O=`) and as the metrics file prefix.
        input_state: value stored in the ``input_stage`` column.
        cpu_count: passed to the tool as `markthreads=`.
        engine: passed through to `df_util.save_df_to_sqlalchemy`.
        logger: logger used for progress messages and by the helpers.
    """
    step_dir = os.getcwd()
    bam_name = os.path.basename(bam_path)
    if pipe_util.already_step(step_dir, "md", logger):
        logger.info("already completed step `bammarkduplicates` of: %s" % bam_name)
        return

    logger.info("running step `bammarkduplicates` of %s: " % bam_name)
    metrics_name = bam_name + ".metrics"
    cmd = [
        "bammarkduplicates",
        "I=" + bam_path,
        "O=" + bam_name,
        "M=" + metrics_name,
        "verbose=0",
        "level=-1",
        "index=1",
        "tmpfile=tempfile",
        "markthreads=" + str(cpu_count),
    ]
    output = pipe_util.do_command(cmd, logger)
    df = time_util.store_time(uuid, cmd, output, logger)
    df["bam_name"] = bam_name
    # The column keeps its existing name; the value is the `input_state` argument.
    df["input_stage"] = input_state
    unique_key_dict = {"uuid": uuid, "bam_name": bam_name}
    table_name = "time_mem_biobambam_bammarkduplicates"
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    pipe_util.create_already_step(step_dir, "md", logger)
    logger.info("completed running step `bammarkduplicates` of: %s" % bam_name)
    return
def do_pool_commands(cmd, uuid, engine, logger, lock = Lock()):
    """Run one shell command and store its timing output while holding `lock`.

    The command is executed through the shell with stdout and stderr piped.
    Only the stderr stream is kept; it is logged, handed to
    `time_util.store_time`, and the resulting frame is saved to the
    ``time_mem_MuTect2_Panel_Of_Normal_chunk_call_processes`` table.

    Args:
        cmd: shell command string to execute.
        uuid: identifier stored with the timing row.
        engine: passed through to `df_util.save_df_to_sqlalchemy`.
        logger: logger used for progress messages and by the helpers.
        lock: context manager guarding the logging/database section. The
            default is created once, when the function is defined, and is
            therefore shared by every call that does not pass its own lock.

    Returns:
        The exit status of the command.
    """
    logger.info('running mutect2_pon chunk call: %s' % cmd)
    process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # communicate() returns (stdout, stderr); index 1 is the stderr stream.
    stderr_data = process.communicate()[1]
    with lock:
        # Log the text verbatim: calling str.format() on arbitrary tool output
        # raises as soon as the output contains a brace.
        logger.info('contents of output=%s' % stderr_data.decode(errors='replace'))
        df = time_util.store_time(uuid, cmd, stderr_data, logger)
        df['cmd'] = cmd
        unique_key_dict = {'uuid': uuid, 'cmd': cmd}
        table_name = 'time_mem_MuTect2_Panel_Of_Normal_chunk_call_processes'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info('completed mutect2_pon chunk call: %s' % str(cmd))
    # communicate() has already waited for the process, so the code is set.
    return process.returncode
def splitbam(uuid, bam_path, engine, logger):
    """Run `bam splitBam` on a BAM and record the run's time/memory row.

    The step is skipped when `pipe_util.already_step` reports the
    ``splitbam`` step as done for the current working directory. Otherwise
    the command is run with output prefix ``split`` and log file
    ``listFile.log`` (both relative paths), its timing frame is saved to the
    ``time_mem_bamutil_splitbam`` table, and the step marker is created.

    Args:
        uuid: identifier stored with the timing row.
        bam_path: path of the BAM passed as `--in`.
        engine: passed through to `df_util.save_df_to_sqlalchemy`.
        logger: logger used for progress messages and by the helpers.
    """
    step_dir = os.getcwd()
    if pipe_util.already_step(step_dir, 'splitbam', logger):
        logger.info('already completed step `splitbam` of: %s' % bam_path)
        return

    # The step name in this message matches the step that is actually run.
    logger.info('running step `splitbam` of %s: ' % bam_path)
    log_path = 'listFile.log'
    out_path = 'split'
    cmd = ['bam', 'splitBam', '--in', bam_path, '--out', out_path, '--log', log_path]
    output = pipe_util.do_command(cmd, logger)
    df = time_util.store_time(uuid, cmd, output, logger)
    df['bam_path'] = bam_path
    unique_key_dict = {'uuid': uuid, 'bam_path': bam_path}
    table_name = 'time_mem_bamutil_splitbam'
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    pipe_util.create_already_step(step_dir, 'splitbam', logger)
    logger.info('completed running step `splitbam` of: %s' % bam_path)
    return
# Example #4
# 0
def fastqc(uuid, fastq_path, thread_count, engine, logger):
    """Run FastQC on one fastq file and record the run's time/memory row.

    The step marker is ``fastqc_<fastq basename without extension>`` in the
    current working directory; when it already exists only a log line is
    written. Reports and FastQC's temporary files both go to the current
    working directory, and the timing frame is saved to ``time_mem_fastqc``.

    Args:
        uuid: identifier stored with the timing row.
        fastq_path: path of the fastq file to analyse.
        thread_count: passed to FastQC as `--threads`.
        engine: passed through to `df_util.save_df_to_sqlalchemy`.
        logger: logger used for progress messages and by the helpers.
    """
    work_dir = os.getcwd()
    fastq_file = os.path.basename(fastq_path)
    step_label = 'fastqc_' + os.path.splitext(fastq_file)[0]

    if pipe_util.already_step(work_dir, step_label, logger):
        logger.info('already completed step `fastqc`: %s' % fastq_path)
        return

    logger.info('running step `fastqc` of %s' % fastq_path)
    # cwltool sets HOME to /var/spool/cwl, so the home directory is spelled out
    fastqc_exe = os.path.join('/home', getpass.getuser(), 'tools', 'FastQC', 'fastqc')
    fastqc_cmd = [
        fastqc_exe,
        '--threads', str(thread_count),
        '--noextract', fastq_path,
        '--outdir', work_dir,
        '--dir', work_dir,
    ]
    run_output = pipe_util.do_command(fastqc_cmd, logger)
    timing_df = time_util.store_time(uuid, fastqc_cmd, run_output, logger)
    timing_df['fastq_name'] = fastq_file
    row_key = {'uuid': uuid, 'fastq_name': fastq_file}
    df_util.save_df_to_sqlalchemy(timing_df, row_key, 'time_mem_fastqc', engine, logger)
    pipe_util.create_already_step(work_dir, step_label, logger)
    return
def bamvalidate(uuid, bam_path, input_state, cpu_count, engine, logger):
    """Run `bamvalidate` on a BAM and record the run's time/memory row.

    The step marker is ``<bam basename>_bamvalidate`` in the current working
    directory; when it already exists only a log line is written. The timing
    frame is saved to the ``time_mem_biobambam_bamvalidate`` table.

    Args:
        uuid: identifier stored with the timing row.
        bam_path: path of the BAM passed as `I=`.
        input_state: accepted for signature parity with the sibling steps;
            not used by this function.
        cpu_count: number of CPUs; half of it (at least 1) is used for both
            `inputthreads=` and `outputthreads=`.
        engine: passed through to `df_util.save_df_to_sqlalchemy`.
        logger: logger used for progress messages and by the helpers.
    """
    step_dir = os.getcwd()
    bam_name = os.path.basename(bam_path)
    step_name = bam_name + '_bamvalidate'
    if pipe_util.already_step(step_dir, step_name, logger):
        logger.info('already completed step `bamvalidate` of: %s' % bam_name)
        return

    logger.info('running step `bamvalidate` of: %s' % bam_name)
    # Floor division keeps the thread count an integer ("4", not "4.0"),
    # and the floor of 1 avoids asking the tool for zero threads.
    half_threads = str(max(1, int(cpu_count) // 2))
    cmd = [
        'bamvalidate',
        'verbose=1',
        'I=' + bam_path,
        'tmpfile=tmpfile',
        'inputthreads=' + half_threads,
        'outputthreads=' + half_threads,
    ]
    output = pipe_util.do_command(cmd, logger)
    df = time_util.store_time(uuid, cmd, output, logger)
    df['bam_name'] = bam_name
    unique_key_dict = {'uuid': uuid, 'bam_name': bam_name}
    table_name = 'time_mem_biobambam_bamvalidate'
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    pipe_util.create_already_step(step_dir, step_name, logger)
    logger.info('completed running step `bamvalidate` of: %s' % bam_name)
    return
def bamindex(uuid, bam_path, input_state, engine, logger):
    """Run `bamindex` on a BAM and record the run's time/memory row.

    The index is written to ``<bam basename without extension>.bai`` as a
    relative path. The step marker is ``<bam basename>_bamindex`` in the
    current working directory; when it already exists only a log line is
    written. The timing frame is saved to ``time_mem_biobambam_bamindex``.

    Args:
        uuid: identifier stored with the timing row.
        bam_path: path of the BAM passed as `I=`.
        input_state: value stored in the ``input_state`` column.
        engine: passed through to `df_util.save_df_to_sqlalchemy`.
        logger: logger used for progress messages and by the helpers.
    """
    work_dir = os.getcwd()
    bam_file = os.path.basename(bam_path)
    step_label = bam_file + '_bamindex'

    if pipe_util.already_step(work_dir, step_label, logger):
        logger.info('already completed step `bamindex` of: %s' % bam_file)
        return

    logger.info('running step `picard BuildBamIndex` of: %s' % bam_file)
    index_file = os.path.splitext(bam_file)[0] + '.bai'
    index_cmd = [
        'bamindex',
        'verbose=0',
        'disablevalidation=1',
        'I=' + bam_path,
        'O=' + index_file,
    ]
    run_output = pipe_util.do_command(index_cmd, logger)
    timing_df = time_util.store_time(uuid, index_cmd, run_output, logger)
    timing_df['bam_name'] = bam_file
    timing_df['input_state'] = input_state
    row_key = {'uuid': uuid, 'bam_name': bam_file}
    df_util.save_df_to_sqlalchemy(timing_df, row_key, 'time_mem_biobambam_bamindex', engine, logger)
    pipe_util.create_already_step(work_dir, step_label, logger)
    logger.info('completed running step `bamindex` of: %s' % bam_file)
    return
# Example #7
# 0
def index(uuid, cram_path, reference_fasta_path, engine, logger):
    """Build a BAM-style index for a CRAM with cramtools and record the run.

    The step marker is ``<uuid>_cram index`` in the current working
    directory; the same name is used both to check for and to create the
    marker, so a finished step is recognised on the next call. The timing
    frame is saved to the ``time_mem_cram_bam_style_index`` table.

    Args:
        uuid: identifier stored with the timing row and used in the step name.
        cram_path: path of the CRAM passed as `--input-file`.
        reference_fasta_path: passed as `--reference-fasta-file`.
        engine: passed through to `df_util.save_df_to_sqlalchemy`.
        logger: logger used for progress messages and by the helpers.
    """
    step_dir = os.getcwd()
    step_name = uuid + '_cram index'
    if pipe_util.already_step(step_dir, step_name, logger):
        logger.info('already completed step `cram index` of: %s' % cram_path)
        return

    logger.info('running step `cram index` of: %s' % cram_path)
    cram_name = os.path.basename(cram_path)
    # Recorded alongside the timing row; it is not passed to cramtools.
    output_bai = os.path.join(step_dir, cram_name) + '.bai'
    home_dir = os.path.expanduser('~')
    cramtools_path = os.path.join(home_dir, 'tools/cramtools-3.0.jar')
    cmd = [
        'java', '-Djava.io.tmpdir=/tmp/job_tmp', '-d64', '-jar', cramtools_path,
        'index', '--bam-style-index',
        '--input-file', cram_path,
        '--reference-fasta-file', reference_fasta_path,
    ]
    output = pipe_util.do_command(cmd, logger)
    df = time_util.store_time(uuid, cmd, output, logger)
    df['cram_path'] = cram_path
    df['output_bai'] = output_bai
    table_name = 'time_mem_cram_bam_style_index'
    unique_key_dict = {'uuid': uuid, 'cram_path': cram_path, 'output_bai': output_bai}
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    pipe_util.create_already_step(step_dir, step_name, logger)
    logger.info('completed running step `cram index` of: %s' % cram_path)
    return
def combinevcf(uuid, vcf_path_list, reference_fasta_path, thread_count, engine, logger):
    """Merge VCFs with GATK `CombineVariants` and record the run.

    The merged file is written to ``<cwd>/<uuid>_PON.vcf``. The step marker
    is ``<uuid>_CombineVariants`` in the current working directory; when it
    already exists only a log line is written. The timing frame is saved to
    the ``time_mem_gatk_CombineVariants`` table.

    Args:
        uuid: identifier stored with the timing row and used in file names.
        vcf_path_list: VCF paths, each passed to GATK with its own `-V`.
        reference_fasta_path: passed as `-R`.
        thread_count: passed as `-nt`.
        engine: passed through to `df_util.save_df_to_sqlalchemy`.
        logger: logger used for progress messages and by the helpers.
    """
    step_dir = os.getcwd()
    step_name = uuid + '_CombineVariants'
    if pipe_util.already_step(step_dir, step_name, logger):
        logger.info('already completed step `CombineVariants` of: %s' % vcf_path_list)
        return

    logger.info('running step `CombineVariants` of: %s' % vcf_path_list)
    output_pon_vcf = os.path.join(step_dir, uuid) + '_PON.vcf'
    home_dir = os.path.expanduser('~')
    gatk_path = os.path.join(home_dir, 'tools/GenomeAnalysisTK.jar')
    # Each option and its value is a separate argv element. A fused token
    # such as '-minN 2' reaches the JVM as one argument when the list is
    # executed without a shell; split tokens are equivalent if the list is
    # later joined into a shell string.
    cmd = [
        'java', '-Djava.io.tmpdir=/tmp/job_tmp', '-d64', '-jar', gatk_path,
        '-T', 'CombineVariants',
        '-nt', str(thread_count),
        '-R', reference_fasta_path,
        '-minN', '2',
        '--setKey', 'null',
        '--filteredAreUncalled',
        '--filteredrecordsmergetype', 'KEEP_IF_ANY_UNFILTERED',
        '-o', output_pon_vcf,
    ]
    for vcf_path in vcf_path_list:
        cmd.extend(['-V', vcf_path])
    output = pipe_util.do_command(cmd, logger)
    df = time_util.store_time(uuid, cmd, output, logger)
    df['output_pon_vcf'] = output_pon_vcf
    table_name = 'time_mem_gatk_CombineVariants'
    unique_key_dict = {'uuid': uuid, 'output_pon_vcf': output_pon_vcf}
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    pipe_util.create_already_step(step_dir, step_name, logger)
    logger.info('completed running step `CombineVariants` of: %s' % vcf_path_list)
    return
def bamtofastq(uuid, bam_path, input_state, engine, logger):
    """Convert a BAM to per-read-group gzipped fastq files and record the run.

    The step marker is ``fastq`` in the current working directory; when it
    already exists only a log line is written. Output files and the
    ``tempfq`` scratch file go to the current working directory, and the
    timing frame is saved to ``time_mem_biobambam_bamtofastq``.

    Args:
        uuid: identifier stored with the timing row.
        bam_path: path of the BAM passed as `filename=`.
        input_state: value stored in the ``input_state`` column.
        engine: passed through to `df_util.save_df_to_sqlalchemy`.
        logger: logger used for progress messages and by the helpers.
    """
    work_dir = os.getcwd()
    bam_file = os.path.basename(bam_path)

    if pipe_util.already_step(work_dir, 'fastq', logger):
        logger.info('already completed step `bamtofastq` of: %s' % bam_file)
        return

    logger.info('running step `bamtofastq` of %s: ' % bam_file)
    # cwltool sets HOME to /var/spool/cwl, so the home directory is spelled out
    user_home = os.path.join('/home', getpass.getuser())
    scratch_path = os.path.join(work_dir, 'tempfq')
    tool_path = os.path.join(user_home, 'tools', 'biobambam2', 'bin', 'bamtofastq')
    convert_cmd = [
        tool_path,
        'filename=' + bam_path,
        'outputdir=' + work_dir,
        'tryoq=1',
        'collate=1',
        'outputperreadgroup=1',
        'T=' + scratch_path,
        'gz=1',
        'level=1',
        'outputperreadgroupsuffixF=_1.fq.gz',
        'outputperreadgroupsuffixF2=_2.fq.gz',
        'outputperreadgroupsuffixO=_o1.fq.gz',
        'outputperreadgroupsuffixO2=_o2.fq.gz',
        'outputperreadgroupsuffixS=_s.fq.gz',
        'exclude=QCFAIL,SECONDARY,SUPPLEMENTARY',
    ]
    run_output = pipe_util.do_command(convert_cmd, logger)
    timing_df = time_util.store_time(uuid, convert_cmd, run_output, logger)
    timing_df['bam_name'] = bam_file
    timing_df['input_state'] = input_state
    row_key = {'uuid': uuid, 'bam_name': bam_file}
    df_util.save_df_to_sqlalchemy(timing_df, row_key, 'time_mem_biobambam_bamtofastq', engine, logger)
    pipe_util.create_already_step(work_dir, 'fastq', logger)
    logger.info('completed running step `bamtofastq` of: %s' % bam_file)
    return
def bammarkduplicates2(uuid, bam_path, input_state, cpu_count, engine, logger):
    """Run `bammarkduplicates2` on a BAM and record the run's time/memory row.

    The step is skipped when `pipe_util.already_step` reports the ``md2``
    step as done for the current working directory. Otherwise the command is
    run, its timing frame is saved to the
    ``time_mem_biobambam_bammarkduplicates2`` table, and the ``md2`` step
    marker is created.

    Args:
        uuid: identifier stored with the timing row.
        bam_path: path of the input BAM (`I=`); its basename is reused as the
            relative output name (`O=`) and as the metrics file prefix.
        input_state: value stored in the ``input_stage`` column.
        cpu_count: passed to the tool as `markthreads=`.
        engine: passed through to `df_util.save_df_to_sqlalchemy`.
        logger: logger used for progress messages and by the helpers.
    """
    step_dir = os.getcwd()
    bam_name = os.path.basename(bam_path)
    # The working directory of this step is the current directory.
    logger.info('work_dir is: %s' % step_dir)
    if pipe_util.already_step(step_dir, 'md2', logger):
        logger.info('already completed step `bammarkduplicates2` of: %s' % bam_name)
        return

    logger.info('running step `bammarkduplicates2` of %s: ' % bam_name)
    metrics_name = bam_name + '.metrics'
    cmd = [
        'bammarkduplicates2',
        'I=' + bam_path,
        'O=' + bam_name,
        'M=' + metrics_name,
        'verbose=0',
        'level=-1',
        'index=1',
        'tmpfile=tempfile',
        'markthreads=' + str(cpu_count),
    ]
    output = pipe_util.do_command(cmd, logger)
    df = time_util.store_time(uuid, cmd, output, logger)
    df['bam_name'] = bam_name
    # The column keeps its existing name; the value is the `input_state` argument.
    df['input_stage'] = input_state
    unique_key_dict = {'uuid': uuid, 'bam_name': bam_name}
    table_name = 'time_mem_biobambam_bammarkduplicates2'
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    pipe_util.create_already_step(step_dir, 'md2', logger)
    logger.info('completed running step `bammarkduplicates2` of: %s' % bam_name)
    return
def bammerge(uuid, outbam_name, bam_path_list, reference_fasta_path, cpu_count, engine, logger):
    """Merge several BAMs with `bammerge` and record the run's time/memory row.

    The step is skipped when `pipe_util.already_step` reports the ``merge``
    step as done for the current working directory. Otherwise every path in
    `bam_path_list` is passed to the tool as its own `I=` argument, the
    timing frame is saved to the ``time_mem_biobambam_bammerge`` table, and
    the ``merge`` step marker is created.

    Args:
        uuid: identifier stored with the timing row.
        outbam_name: output name passed as `O=`; also the metrics prefix.
        bam_path_list: paths of the BAMs to merge.
        reference_fasta_path: accepted but not used by this function.
        cpu_count: accepted but not used by this function.
        engine: passed through to `df_util.save_df_to_sqlalchemy`.
        logger: logger used for progress messages and by the helpers.

    Raises:
        ValueError: if the step has to run and `bam_path_list` is empty.
    """
    step_dir = os.getcwd()
    if pipe_util.already_step(step_dir, 'merge', logger):
        logger.info('already completed step `bammerge` of: %s' % outbam_name)
        return

    if not bam_path_list:
        raise ValueError('bammerge requires at least one input BAM')

    logger.info('running step `bammerge` of %s: ' % bam_path_list)
    metrics_name = outbam_name + '.metrics'
    cmd = [
        'bammerge',
        'O=' + outbam_name,
        'M=' + metrics_name,
        'verbose=0',
        'level=-1',
        'index=1',
        'tmpfile=tempfile',
        'SO=coordinate',
    ]
    # One I= argument per input, so every BAM in the list is merged.
    cmd.extend('I=' + input_bam for input_bam in bam_path_list)
    output = pipe_util.do_command(cmd, logger)
    df = time_util.store_time(uuid, cmd, output, logger)
    df['outbam_name'] = outbam_name
    unique_key_dict = {'uuid': uuid, 'outbam_name': outbam_name}
    table_name = 'time_mem_biobambam_bammerge'
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    pipe_util.create_already_step(step_dir, 'merge', logger)
    logger.info('completed running step `bammerge` of: %s' % outbam_name)
    return
def remove_duplicate_reads(uuid, fastq_dir, engine, logger):
    """Strip duplicate reads from every fastq in `fastq_dir` and record the runs.

    For each name returned by `buildfastqlist`, two independently
    checkpointed steps are run, with markers kept in ``<fastq_dir>/rmdup``:

    1. ``<fastq>_rmdup``: pipe ``zcat <fastq>`` through
       ``remove_duplicate_mate_pair.py`` and ``gzip`` into
       ``<fastq_dir>/rmdup/<fastq>``; the timing frame goes to
       ``time_mem_rmdup_fastq``.
    2. ``<fastq>_rmdup_db``: load the rmdup log through
       `get_duplicate_qname_df` / `get_duplicate_summary` and save the frames
       to ``rmdup_record_id`` (only when non-empty) and ``rmdup_summary``.

    Args:
        uuid: identifier stored with every saved row.
        fastq_dir: directory holding the input fastq files.
        engine: passed through to `df_util.save_df_to_sqlalchemy`.
        logger: logger used for progress messages and by the helpers.
    """
    outdir = os.path.join(fastq_dir, 'rmdup')
    os.makedirs(outdir, exist_ok=True)
    fastq_list = buildfastqlist(fastq_dir, logger)
    for fastq_name in fastq_list:
        fastq_basename = os.path.splitext(fastq_name)[0]
        fastq_path = os.path.join(fastq_dir, fastq_name)
        outfile = os.path.join(outdir, fastq_name)
        log_path = os.path.join(outdir, 'rmdup_' + fastq_basename + '.log')
        logger.info('remove_duplicate_reads() fastq_path=%s' % fastq_path)
        logger.info('remove_duplicate_reads() outfile=%s' % outfile)
        logger.info('remove_duplicate_reads() log_path=%s' % log_path)

        # run the rmdup pipeline
        if pipe_util.already_step(outdir, fastq_name + '_rmdup', logger):
            logger.info('already completed rmdup of: %s' % fastq_name)
        else:
            logger.info('running rmdup of: %s' % fastq_name)
            home_dir = os.path.expanduser('~')
            python_cmd = os.path.join(home_dir, '.virtualenvs', 'p3', 'bin', 'python3')
            rmdup_cmd_path = os.path.join(home_dir, 'pipelines', 'dnaseq', 'other', 'remove_duplicate_mate_pair.py')
            # Paths are wrapped in double quotes because the three stages are
            # joined into a single shell pipeline string.
            decomp_cmd = ['zcat', '"' + fastq_path + '"']
            rmdup_cmd = [rmdup_cmd_path, '-l', '"' + log_path + '"', '-']
            comp_cmd = ['gzip', '-', '>', '"' + outfile + '"']
            shell_cmd = (' '.join(decomp_cmd) + ' | '
                         + python_cmd + ' ' + ' '.join(rmdup_cmd) + ' | '
                         + ' '.join(comp_cmd))
            logger.info('remove_duplicate_reads() shell_cmd=%s' % shell_cmd)
            output = pipe_util.do_shell_command(shell_cmd, logger)

            # save time/mem to db
            df = time_util.store_time(uuid, shell_cmd, output, logger)
            df['fastq'] = fastq_name
            unique_key_dict = {'uuid': uuid, 'fastq': fastq_name}
            table_name = 'time_mem_rmdup_fastq'
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(outdir, fastq_name + '_rmdup', logger)
            logger.info('completed running rmdup of: %s' % fastq_name)

        # save stats to db
        if pipe_util.already_step(outdir, fastq_name + '_rmdup_db', logger):
            logger.info('already stored rmdup run of %s to db' % fastq_name)
        else:
            logger.info('storing rmdup run of %s to db' % fastq_name)
            # per-read details; skipped when the frame has no rows
            df = get_duplicate_qname_df(uuid, fastq_name, log_path, logger)
            table_name = 'rmdup_record_id'
            unique_key_dict = {'uuid': uuid, 'fastq_name': fastq_name, 'id': 'impossible_match'}
            if len(df) > 0:
                df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)

            # summary
            df = get_duplicate_summary(uuid, fastq_name, log_path, logger)
            table_name = 'rmdup_summary'
            unique_key_dict = {'uuid': uuid, 'fastq_name': fastq_name}
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)

            pipe_util.create_already_step(outdir, fastq_name + '_rmdup_db', logger)
            logger.info('completed storing rmdup run of %s to db' % fastq_name)
    return
def run_curl(case_id, path, uuid, token, engine, logger):
    """Download one file from the GDC API with curl and record the run.

    The step marker is ``<case_id>_<uuid>_download`` in the current working
    directory; when it already exists only a log line is written. The auth
    token is exported as ``TOKEN`` and both proxy variables are set for the
    duration of the download, then all three are removed again even if the
    download raises. The timing frame is saved to ``time_mem_download``.

    Args:
        case_id: used in the step marker name.
        path: destination file; the curl output is redirected to it.
        uuid: GDC file id appended to the data endpoint URL.
        token: path of a text file whose first non-blank line is the token.
        engine: passed through to `df_util.save_df_to_sqlalchemy`.
        logger: logger used for progress messages and by the helpers.

    Raises:
        ValueError: if the token file contains no non-blank line.
    """
    step_dir = os.getcwd()
    step_name = case_id + '_' + uuid + '_download'
    if pipe_util.already_step(step_dir, step_name, logger):
        logger.info('already download %s' % uuid)
        return

    logger.info('downloading %s from GDC portal' % uuid)
    # Read the token once, close the file, and drop the trailing newline so
    # it does not end up inside the HTTP header.
    with open(token) as token_file:
        token_value = next((line.strip() for line in token_file if line.strip()), None)
    if token_value is None:
        raise ValueError('no token found in token file: %s' % token)

    proxy_url = "http://cloud-controller:3128"
    os.environ['TOKEN'] = token_value
    os.environ['https_proxy'] = proxy_url
    os.environ['http_proxy'] = proxy_url
    try:
        # "$TOKEN" is expanded by the shell, so the token never appears in
        # the command string that is logged and stored.
        cmd = ['curl', 'https://gdc-api.nci.nih.gov/data/' + uuid, '-H', '"X-Auth-Token:$TOKEN"', '> ' + path]
        shell_cmd = ' '.join(cmd)
        output = pipe_util.do_shell_command(shell_cmd, logger)
        df = time_util.store_time(uuid, shell_cmd, output, logger)
        df['uuid'] = uuid
        df['output'] = path
        table_name = 'time_mem_download'
        unique_key_dict = {'uuid': uuid, 'output': path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, step_name, logger)
        logger.info('completed downloading %s' % uuid)
    finally:
        # Do not leave the credential or the proxy settings in the process
        # environment, whether or not the download succeeded.
        for env_name in ('TOKEN', 'https_proxy', 'http_proxy'):
            os.environ.pop(env_name, None)
def trimmomatic(uuid, fastq_dir, adapter_pickle_path, thread_count, engine, logger):
    """Trim the fastq files in `fastq_dir` with Trimmomatic and record each step.

    Outputs and step markers are kept in ``<fastq_dir>/trimmomatic``. Paired
    reads (from `fastq_util.buildpefastqdict`) go through five checkpointed
    steps: PE trim, unified diff of trimmed vs. input, diff summary script,
    summary-to-db, and xz compression of the diffs. Single-end, o1 and o2
    reads (from the matching `fastq_util.build*fastqlist` helpers) go through
    an SE trim step and a log-to-db step.

    Args:
        uuid: identifier stored with every saved row.
        fastq_dir: directory holding the input fastq files.
        adapter_pickle_path: accepted but not used by this function.
        thread_count: passed to Trimmomatic as `-threads`.
        engine: passed through to `df_util.save_df_to_sqlalchemy`.
        logger: logger used for progress messages and by the helpers.

    TODO(review): the function is not runnable end to end yet. It still
    references `adapter_path` and `trimlog_path`, neither of which is defined
    here, and the SE commands pass a bare ``ILLUMINACLIP:`` argument. Those
    spots are marked inline and need a decision from the pipeline owner.
    """
    # NOTE(review): remove_duplicate_reads() calls buildfastqlist(fastq_dir,
    # logger); confirm which signature is correct.
    fastq_list = buildfastqlist(fastq_dir)
    logger.info('fastqlist=%s' % fastq_list)
    pefastqdict = fastq_util.buildpefastqdict(fastq_list)
    logger.info('pefastqdict=%s' % pefastqdict)
    sefastqlist = fastq_util.buildsefastqlist(fastq_list)
    logger.info('sefastqlist=%s' % sefastqlist)
    o1fastqlist = fastq_util.buildo1fastqlist(fastq_list)
    logger.info('o1fastqlist=%s' % o1fastqlist)
    o2fastqlist = fastq_util.buildo2fastqlist(fastq_list)
    logger.info('o2fastqlist=%s' % o2fastqlist)
    trimmomatic_dir = os.path.join(fastq_dir, 'trimmomatic')
    step_dir = trimmomatic_dir
    os.makedirs(trimmomatic_dir, exist_ok=True)

    # Resolved once up front: the single-end loops need the jar path even
    # when no paired-end trim runs in this call.
    home_dir = os.path.expanduser('~')
    trimmomatic_path = os.path.join(home_dir, 'tools', 'trimmomatic', 'dist', 'jar', 'trimmomatic.jar')
    # Command arguments must be strings.
    threads = str(thread_count)

    for read1 in sorted(pefastqdict):
        read2 = pefastqdict[read1]
        read1_name = os.path.splitext(read1)[0]
        fq1_in_path = os.path.join(fastq_dir, read1)
        fq2_in_path = os.path.join(fastq_dir, read2)
        fq1_out_path = os.path.join(trimmomatic_dir, read1)
        fq1_unpaired_path = fq1_out_path + 'UP'
        fq2_out_path = os.path.join(trimmomatic_dir, read2)
        fq2_unpaired_path = fq2_out_path + 'UP'
        diff1_path = fq1_out_path + '.diff'
        diff2_path = fq2_out_path + '.diff'
        diff1_name = os.path.basename(diff1_path)
        diff2_name = os.path.basename(diff2_path)
        fastq_type = 'PE'

        # generate trim
        if pipe_util.already_step(step_dir, 'trim_pe_' + read1_name, logger):
            logger.info('already completed pe trim on %s' % read1)
        else:
            logger.info('running step PE `trimmomatic` of: %s' % read1)
            # TODO(review): `adapter_path` is not defined in this function
            # (the parameter is `adapter_pickle_path`); decide what adapter
            # file and clip settings ILLUMINACLIP should receive.
            cmd = ['java', '-jar', trimmomatic_path, 'PE', '-threads', threads, '-phred33',
                   fq1_in_path, fq2_in_path, fq1_out_path, fq1_unpaired_path,
                   fq2_out_path, fq2_unpaired_path, 'ILLUMINACLIP:' + adapter_path]
            output = pipe_util.do_command(cmd, logger)

            # save time/mem to db
            df = time_util.store_time(uuid, cmd, output, logger)
            df['fastq'] = read1
            unique_key_dict = {'uuid': uuid, 'fastq': read1}
            table_name = 'time_mem_trimmomatic'
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'trim_pe_' + read1_name, logger)
            logger.info('completed step PE `trimmomatic` of: %s' % read1)

        # generate diff
        if pipe_util.already_step(step_dir, 'trim_pe_diff_' + read1_name, logger):
            logger.info('already generated diff of trimmomatic of %s' % read1_name)
        else:
            logger.info('generating PE diff of trimmomatic of %s' % read1_name)
            cmd1 = ['diff', '-u', fq1_out_path, fq1_in_path, '>', diff1_path]
            cmd2 = ['diff', '-u', fq2_out_path, fq2_in_path, '>', diff2_path]
            # Each mate gets its own shell string.
            shell_cmd1 = ' '.join(cmd1)
            shell_cmd2 = ' '.join(cmd2)
            output1 = pipe_util.do_shell_command(shell_cmd1, logger)
            output2 = pipe_util.do_shell_command(shell_cmd2, logger)

            # save time/mem to db
            df1 = time_util.store_time(uuid, cmd1, output1, logger)
            df2 = time_util.store_time(uuid, cmd2, output2, logger)
            df1['diff'] = diff1_name
            df2['diff'] = diff2_name
            df1['fastq_type'] = fastq_type
            df2['fastq_type'] = fastq_type
            unique_key_dict1 = {'uuid': uuid, 'diff': diff1_name, 'fastq_type': fastq_type}
            unique_key_dict2 = {'uuid': uuid, 'diff': diff2_name, 'fastq_type': fastq_type}
            table_name = 'time_mem_trimmomatic_diff'
            df_util.save_df_to_sqlalchemy(df1, unique_key_dict1, table_name, engine, logger)
            df_util.save_df_to_sqlalchemy(df2, unique_key_dict2, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'trim_pe_diff_' + read1_name, logger)
            logger.info('completed generating PE diff of trimmomatic of %s' % read1_name)

        # generate diff stats
        if pipe_util.already_step(step_dir, 'trim_pe_summary_diff_log_' + read1_name, logger):
            logger.info('already completed step `summary stats of diff` of %s' % read1_name)
        else:
            logger.info('running step PE `summary of diff` of %s' % read1_name)
            # The summary script is expected next to this source file.
            trimmomatic_summ_met_dir = os.path.dirname(os.path.realpath(__file__))
            trimmomatic_summ_met_path = os.path.join(trimmomatic_summ_met_dir, 'trimmomatic_summary_metrics_from_diff.py')
            cmd1 = [trimmomatic_summ_met_path, '-d', diff1_path]
            cmd2 = [trimmomatic_summ_met_path, '-d', diff2_path]
            output1 = pipe_util.do_command(cmd1, logger)
            output2 = pipe_util.do_command(cmd2, logger)

            # save time/mem to db
            df1 = time_util.store_time(uuid, cmd1, output1, logger)
            df2 = time_util.store_time(uuid, cmd2, output2, logger)
            df1['diff'] = diff1_name
            df2['diff'] = diff2_name
            # One key dict per mate, so each frame is saved under its own key.
            unique_key_dict1 = {'uuid': uuid, 'diff': diff1_name, 'fastq_type': fastq_type}
            unique_key_dict2 = {'uuid': uuid, 'diff': diff2_name, 'fastq_type': fastq_type}
            table_name = 'time_mem_trimmomatic_diff_summary'
            df_util.save_df_to_sqlalchemy(df1, unique_key_dict1, table_name, engine, logger)
            df_util.save_df_to_sqlalchemy(df2, unique_key_dict2, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'trim_pe_summary_diff_log_' + read1_name, logger)

        # save stats to db
        if pipe_util.already_step(step_dir, 'trim_pe_summary_db_' + read1_name + '_db', logger):
            logger.info('already stored PE `trimmomatic` of %s to db' % read1)
        else:
            logger.info('storing `trimmomatic` of %s to db' % read1)
            # TODO(review): `trimlog_path` is not defined in this function and
            # no step above produces a trim log; supply the intended input.
            df = trimmomatic_diff_summary_to_df(uuid, read1, trimlog_path, logger)
            df['uuid'] = uuid
            table_name = 'trimmomatic_diff_summary'
            # The frame is built from read1, so it is keyed on read1.
            # NOTE(review): confirm whether a second row keyed on the mate
            # was intended here.
            unique_key_dict = {'uuid': uuid, 'fastq': read1_name}
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'trim_pe_summary_db_' + read1_name + '_db', logger)
            logger.info('completed storing PE `trimmomatic` of %s to db' % read1)

        # compress diff
        if pipe_util.already_step(step_dir, 'xz_pe_diff_' + read1_name, logger):
            logger.info('already compressed PE diff: %s' % diff1_name)
        else:
            logger.info('compressing PE diff: %s' % diff1_name)
            cmd1 = ['xz', '-9', diff1_path]
            cmd2 = ['xz', '-9', diff2_path]
            output1 = pipe_util.do_command(cmd1, logger)
            output2 = pipe_util.do_command(cmd2, logger)

            # save time/mem to db
            df1 = time_util.store_time(uuid, cmd1, output1, logger)
            df2 = time_util.store_time(uuid, cmd2, output2, logger)
            df1['diff'] = diff1_name
            df2['diff'] = diff2_name
            unique_key_dict1 = {'uuid': uuid, 'diff': diff1_name, 'fastq_type': fastq_type}
            unique_key_dict2 = {'uuid': uuid, 'diff': diff2_name, 'fastq_type': fastq_type}
            table_name = 'time_mem_diff_xz'
            df_util.save_df_to_sqlalchemy(df1, unique_key_dict1, table_name, engine, logger)
            df_util.save_df_to_sqlalchemy(df2, unique_key_dict2, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'xz_pe_diff_' + read1_name, logger)
            logger.info('completed compressing PE diff: %s' % diff1_name)

    # Single-end, o1 and o2 reads share the same two steps; only the prefix
    # of the step marker differs, and each group uses its own prefix for
    # both markers.
    single_end_groups = (
        ('trim_se_', sefastqlist),
        ('trim_o1_', o1fastqlist),
        ('trim_o2_', o2fastqlist),
    )
    for step_prefix, reads in single_end_groups:
        for read in reads:
            read_name = os.path.splitext(read)[0]

            # generate trim
            if pipe_util.already_step(step_dir, step_prefix + read_name, logger):
                logger.info('already completed se trim on %s' % read)
            else:
                logger.info('running step SE `trimmomatic` of: %s' % read)
                fq_in_path = os.path.join(fastq_dir, read)
                fq_out_path = os.path.join(trimmomatic_dir, read)
                # TODO(review): 'ILLUMINACLIP:' is passed without an adapter
                # file or clip settings; fill in once the adapter input is
                # decided.
                cmd = ['java', '-jar', trimmomatic_path, 'SE', '-threads', threads, '-phred33',
                       fq_in_path, fq_out_path, 'ILLUMINACLIP:']
                output = pipe_util.do_command(cmd, logger)

                # save time/mem to db
                df = time_util.store_time(uuid, cmd, output, logger)
                df['fastq'] = read
                unique_key_dict = {'uuid': uuid, 'fastq': read}
                table_name = 'time_mem_trimmomatic'
                df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
                pipe_util.create_already_step(step_dir, step_prefix + read_name, logger)
                logger.info('completed running step SE `trimmomatic` of: %s' % fq_in_path)

            # save stats to db
            if pipe_util.already_step(step_dir, step_prefix + read_name + '_db', logger):
                logger.info('already stored SE `trimmomatic` of %s to db' % read)
            else:
                logger.info('storing `trimmomatic` of %s to db' % read)
                # TODO(review): `trimlog_path` is not defined in this function.
                df = trimmomatic_log_to_df(uuid, read, trimlog_path, logger)
                df['uuid'] = uuid
                table_name = 'trimmomatic_log'
                unique_key_dict = {'uuid': uuid, 'fastq': read}
                df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
                pipe_util.create_already_step(step_dir, step_prefix + read_name + '_db', logger)
                logger.info('completed storing SE `trimmomatic` of %s to db' % read)