def remove_duplicate_reads(uuid, fastq_dir, engine, logger):
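    """Remove duplicate mate-pair reads from each gzipped FASTQ in fastq_dir.

    Each FASTQ is streamed through a zcat | remove_duplicate_mate_pair.py | gzip
    pipeline into fastq_dir/rmdup; timing and duplicate statistics are stored to
    the database, and completed steps are skipped via already-step markers.
    """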
    outdir = os.path.join(fastq_dir, 'rmdup')
    os.makedirs(outdir, exist_ok=True)
    fastq_list = buildfastqlist(fastq_dir, logger)
    for fastq_name in fastq_list:
        fastq_basename, fastq_ext = os.path.splitext(fastq_name)
        fastq_path = os.path.join(fastq_dir, fastq_name)
        outfile = os.path.join(outdir, fastq_name)
        log_path = os.path.join(outdir, 'rmdup_' + fastq_basename + '.log')
        logger.info('remove_duplicate_reads() fastq_path=%s' % fastq_path)
        logger.info('remove_duplicate_reads() outfile=%s' % outfile)
        logger.info('remove_duplicate_reads() log_path=%s' % log_path)
        if pipe_util.already_step(outdir, fastq_name + '_rmdup', logger):
            logger.info('already completed rmdup of: %s' % fastq_name)
        else:
            logger.info('running rmdup of: %s' % fastq_name)
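            # build a shell pipeline: zcat <fastq.gz> | remove_duplicate_mate_pair.py | gzip > <rmdup fastq.gz>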
            decomp_cmd = [ 'zcat', '"' + fastq_path + '"' ]
            home_dir = os.path.expanduser('~')
            python_cmd = os.path.join(home_dir, '.virtualenvs', 'p3', 'bin', 'python3')
            rmdup_cmd_path = os.path.join(home_dir, 'pipelines', 'dnaseq', 'other', 'remove_duplicate_mate_pair.py')
            rmdup_cmd = [ rmdup_cmd_path, '-l', '"' + log_path + '"' , '-' ]
            comp_cmd = [ 'gzip', '-', '>', '"' + outfile + '"' ]
            decomp_cmd_shell = ' '.join(decomp_cmd)
            #python_cmd_shell = ' '.join(python_cmd)
            rmdup_cmd_shell = ' '.join(rmdup_cmd)
            comp_cmd_shell = ' '.join(comp_cmd)
            shell_cmd = decomp_cmd_shell + ' | ' + python_cmd + ' ' + rmdup_cmd_shell + ' | ' + comp_cmd_shell
            logger.info('remove_duplicate_reads() shell_cmd=%s' % shell_cmd)
            output = pipe_util.do_shell_command(shell_cmd, logger)

            #save time/mem to db
            df = time_util.store_time(uuid, shell_cmd, output, logger)
            df['fastq'] = fastq_name
            unique_key_dict = {'uuid': uuid, 'fastq': fastq_name}
            table_name = 'time_mem_rmdup_fastq'
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(outdir, fastq_name + '_rmdup', logger)
            logger.info('completed running rmdup of: %s' % fastq_name)

        #save stats to db
        if pipe_util.already_step(outdir, fastq_name + '_rmdup_db', logger):
            logger.info('already stored rmdup run of %s to db' % fastq_name)
        else:
            logger.info('storing rmdup run of %s to db' % fastq_name)
            #get details
            df = get_duplicate_qname_df(uuid, fastq_name, log_path, logger)
            table_name = 'rmdup_record_id'
            unique_key_dict = {'uuid':uuid, 'fastq_name': fastq_name, 'id':'impossible_match'}
            if len(df) > 0:
                df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)

            #get summary
            df = get_duplicate_summary(uuid, fastq_name, log_path, logger)
            table_name = 'rmdup_summary'
            unique_key_dict = {'uuid':uuid, 'fastq_name': fastq_name}
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)

            pipe_util.create_already_step(outdir, fastq_name + '_rmdup_db', logger)
            logger.info('completed storing rmdup run of %s to db' % fastq_name)
    return
def run_curl(case_id, path, uuid, token, engine, logger):
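    """Download the file identified by uuid from the GDC API with curl, using the
    auth token read from the `token` file, and store timing/memory stats to the
    database. The download is skipped if its already-step marker exists.
    """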
    step_dir = os.getcwd()
    os.makedirs(step_dir, exist_ok=True)
    if pipe_util.already_step(step_dir, case_id + '_' + uuid + '_download', logger):
        logger.info('already downloaded %s' % uuid)
    else:
        logger.info('downloading %s from GDC portal' % uuid)
        # read the auth token from the token file and strip the trailing newline
        with open(token) as token_file:
            token_value = token_file.read().strip()
        os.environ['TOKEN'] = token_value
        os.environ['https_proxy'] = 'http://cloud-controller:3128'
        os.environ['http_proxy'] = 'http://cloud-controller:3128'
        cmd = ['curl', 'https://gdc-api.nci.nih.gov/data/' + uuid, '-H', '"X-Auth-Token:$TOKEN"', '>', '"' + path + '"']
        shell_cmd = ' '.join(cmd)
        output = pipe_util.do_shell_command(shell_cmd, logger)
        df = time_util.store_time(uuid, shell_cmd, output, logger)
        df['uuid'] = uuid
        df['output'] = path
        table_name = 'time_mem_download'
        unique_key_dict = {'uuid': uuid, 'output': path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, case_id + '_' + uuid + '_download', logger)
        logger.info('completed downloading %s' % uuid)
        del os.environ['https_proxy']
        del os.environ['http_proxy']
def trimmomatic(uuid, fastq_dir, adapter_pickle_path, thread_count, engine, logger):
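    """Run Trimmomatic adapter trimming on the FASTQs in fastq_dir.

    Paired-end reads are trimmed in PE mode; single-end and orphan read-1/read-2
    FASTQs are trimmed in SE mode. For paired-end output, a diff against the input
    is generated, summarized, and xz-compressed. Timing and statistics for each
    step are stored to the database, with already-step markers for idempotency.
    """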
    logger.info('trimmomatic() fastq_dir=%s' % fastq_dir)
    fastq_list = buildfastqlist(fastq_dir, logger)
    logger.info('fastq_list=%s' % fastq_list)
    pefastqdict = fastq_util.buildpefastqdict(fastq_list)
    logger.info('pefastqdict=%s' % pefastqdict)
    sefastqlist = fastq_util.buildsefastqlist(fastq_list)
    logger.info('sefastqlist=%s' % sefastqlist)
    o1fastqlist = fastq_util.buildo1fastqlist(fastq_list)
    logger.info('o1fastqlist=%s' % o1fastqlist)
    o2fastqlist = fastq_util.buildo2fastqlist(fastq_list)
    logger.info('o2fastqlist=%s' % o2fastqlist)
    trimmomatic_dir = os.path.join(fastq_dir, 'trimmomatic')
    step_dir = trimmomatic_dir

    home_dir = os.path.expanduser('~')
    trimmomatic_path = os.path.join(home_dir, 'tools', 'trimmomatic', 'dist', 'jar', 'trimmomatic.jar')
    os.makedirs(trimmomatic_dir, exist_ok=True)
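    # paired-end reads: trim, diff trimmed output against input, summarize and compress the diff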
    for read1 in sorted(pefastqdict.keys()):
        read1_name, read1_ext = os.path.splitext(read1)
        fq1_in_path = os.path.join(fastq_dir, read1)
        fq2_in_path = os.path.join(fastq_dir, pefastqdict[read1])
        fq1_out_path = os.path.join(trimmomatic_dir, read1)
        fq1_unpaired_path = fq1_out_path + 'UP'
        fq2_out_path = os.path.join(trimmomatic_dir, pefastqdict[read1])
        fq2_unpaired_path = fq2_out_path + 'UP'
        diff1_path = fq1_out_path + '.diff'
        diff2_path = fq2_out_path + '.diff'
        diff1_xz_path = diff1_path + '.xz'
        diff2_xz_path = diff2_path + '.xz'
        diff1_name = os.path.basename(diff1_path)
        diff2_name = os.path.basename(diff2_path)
        fastq_type = 'PE'
        # assumed location of the Trimmomatic trim log (written via -trimlog below)
        trimlog_path = fq1_out_path + '.trimlog'
        #generate trim
        if pipe_util.already_step(step_dir, 'trim_pe_' + read1_name, logger):
            logger.info('already completed pe trim on %s' % read1)
        else:
            logger.info('running step PE `trimmomatic` of: %s' % read1)
            # assumption: adapter_pickle_path is the adapter file used for ILLUMINACLIP
            cmd = ['java', '-jar', trimmomatic_path, 'PE', '-threads', str(thread_count), '-phred33',
                   '-trimlog', trimlog_path, fq1_in_path, fq2_in_path, fq1_out_path, fq1_unpaired_path,
                   fq2_out_path, fq2_unpaired_path, 'ILLUMINACLIP:' + adapter_pickle_path]
            output = pipe_util.do_command(cmd, logger)

            #save time/mem to db
            df = time_util.store_time(uuid, cmd, output, logger)
            df['fastq'] = read1
            unique_key_dict = {'uuid': uuid, 'fastq': read1}
            table_name = 'time_mem_trimmomatic'
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'trim_pe_' + read1_name, logger)
            logger.info('completed step PE `trimmomatic` of: %s' % read1)
            
        #generate diff
        if pipe_util.already_step(step_dir, 'trim_pe_diff_' + read1_name, logger):
            logger.info('already generated diff of trimmomatic of %s' % read1_name)
        else:
            logger.info('generating PE diff of trimmomatic of %s' % read1_name)
            cmd1 = ['diff', '-u', fq1_out_path, fq1_in_path, '>', diff1_path]
            cmd2 = ['diff', '-u', fq2_out_path, fq2_in_path, '>', diff2_path]
            shell_cmd1 = ' '.join(cmd1)
            shell_cmd2 = ' '.join(cmd2)
            output1 = pipe_util.do_shell_command(shell_cmd1, logger)
            output2 = pipe_util.do_shell_command(shell_cmd2, logger)

            #save time/mem to db
            df1 = time_util.store_time(uuid, shell_cmd1, output1, logger)
            df2 = time_util.store_time(uuid, shell_cmd2, output2, logger)
            df1['diff'] = diff1_name
            df2['diff'] = diff2_name
            df1['fastq_type'] = fastq_type
            df2['fastq_type'] = fastq_type
            unique_key_dict1 = {'uuid': uuid, 'diff':diff1_name, 'fastq_type': fastq_type}
            unique_key_dict2 = {'uuid': uuid, 'diff':diff2_name, 'fastq_type': fastq_type}
            table_name = 'time_mem_trimmomatic_diff'
            df_util.save_df_to_sqlalchemy(df1, unique_key_dict1, table_name, engine, logger)
            df_util.save_df_to_sqlalchemy(df2, unique_key_dict2, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'trim_pe_diff_' + read1_name, logger)
            logger.info('completed generating PE diff of trimmomatic of %s' % read1_name)
            
        #generate diff stats
        if pipe_util.already_step(step_dir, 'trim_pe_summary_diff_log_' + read1_name, logger):
            logger.info('already completed step `summary stats of diff` of %s' % read1_name)
        else:
            logger.info('running step PE `summary of diff` of %s' % read1_name)
            trimmomatic_summ_met_dir = os.path.dirname(os.path.realpath(__file__))
            trimmomatic_summ_met_path = os.path.join(trimmomatic_summ_met_dir, 'trimmomatic_summary_metrics_from_diff.py')
            cmd1 = [trimmomatic_summ_met_path, '-d', diff1_path]
            cmd2 = [trimmomatic_summ_met_path, '-d', diff2_path]
            output1 = pipe_util.do_command(cmd1, logger)
            output2 = pipe_util.do_command(cmd2, logger)

            #save time/mem to db
            df1 = time_util.store_time(uuid, cmd1, output1, logger)
            df2 = time_util.store_time(uuid, cmd2, output2, logger)
            df1['diff'] = diff1_name
            df2['diff'] = diff2_name
            unique_key_dict1 = {'uuid': uuid, 'diff': diff1_name, 'fastq_type': fastq_type}
            unique_key_dict2 = {'uuid': uuid, 'diff': diff2_name, 'fastq_type': fastq_type}
            table_name = 'time_mem_trimmomatic_diff_summary'
            df_util.save_df_to_sqlalchemy(df1, unique_key_dict1, table_name, engine, logger)
            df_util.save_df_to_sqlalchemy(df2, unique_key_dict2, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'trim_pe_summary_diff_log_' + read1_name, logger)
            logger.info('completed step PE `summary of diff` of %s' % read1_name)
            
        #save stats to db
        if pipe_util.already_step(step_dir, 'trim_pe_summary_db_' + read1_name + '_db', logger):
            logger.info('already stored PE `trimmomatic` of %s to db' % read1)
        else:
            logger.info('storing PE `trimmomatic` of %s to db' % read1)
            df = trimmomatic_diff_summary_to_df(uuid, read1, trimlog_path, logger)
            df['uuid'] = uuid
            table_name = 'trimmomatic_diff_summary'
            unique_key_dict = {'uuid': uuid, 'fastq': read1_name}
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'trim_pe_summary_db_' + read1_name + '_db', logger)
            logger.info('completed storing PE `trimmomatic` of %s to db' % read1)

        #compress diff
        if pipe_util.already_step(step_dir, 'xz_pe_diff_' + read1_name, logger):
            logger.info('already compressed PE diff: %s' % diff1_name)
        else:
            logger.info('compressing PE diff: %s' % diff1_name)
            cmd1 = ['xz', '-9', diff1_path]
            cmd2 = ['xz', '-9', diff2_path]
            output1 = pipe_util.do_command(cmd1, logger)
            output2 = pipe_util.do_command(cmd2, logger)

            #save timem/mem to db
            df1 = time_util.store_time(uuid, cmd1, output1, logger)
            df2 = time_util.store_time(uuid, cmd2, output2, logger)
            df1['diff'] = diff1_name
            df2['diff'] = diff2_name
            unique_key_dict1 = {'uuid': uuid, 'diff': diff1_name, 'fastq_type': fastq_type}
            unique_key_dict2 = {'uuid': uuid, 'diff': diff2_name, 'fastq_type': fastq_type}
            table_name = 'time_mem_diff_xz'
            df_util.save_df_to_sqlalchemy(df1, unique_key_dict1, table_name, engine, logger)
            df_util.save_df_to_sqlalchemy(df2, unique_key_dict2, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'xz_pe_diff_' + read1_name, logger)
            logger.info('completed compressing PE diff: %s' % diff1_name)
            
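    # single-end reads: trim in SE mode and store the Trimmomatic log to the db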
    for seread in sefastqlist:
        read_name, read_ext = os.path.splitext(seread)
        # assumed location of the Trimmomatic trim log parsed by trimmomatic_log_to_df()
        trimlog_path = os.path.join(trimmomatic_dir, seread + '.trimlog')
        if pipe_util.already_step(step_dir, 'trim_se_' + read_name, logger):
            logger.info('already completed se trim on %s' % seread)
        else:
            logger.info('running step SE `trimmomatic` of: %s' % seread)
            fq_in_path = os.path.join(fastq_dir, seread)
            fq_out_path = os.path.join(trimmomatic_dir, seread)
            cmd = ['java', '-jar', trimmomatic_path, 'SE', '-threads', str(thread_count), '-phred33',
                   '-trimlog', trimlog_path, fq_in_path, fq_out_path, 'ILLUMINACLIP:' + adapter_pickle_path]
            output = pipe_util.do_command(cmd, logger)

            #save time/mem to db
            df = time_util.store_time(uuid, cmd, output, logger)
            df['fastq'] = seread
            unique_key_dict = {'uuid': uuid, 'fastq': seread}
            table_name = 'time_mem_trimmomatic'
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'trim_se_' + read_name, logger)
            logger.info('completed running step SE `trimmomatic` of: %s' % fq_in_path)
        #save stats to db
        if pipe_util.already_step(step_dir, 'trim_se_' + read_name + '_db', logger):
            logger.info('already stored SE `trimmomatic` of %s to db' % seread)
        else:
            logger.info('storing `trimmomatic` of %s to db' % seread)
            df = trimmomatic_log_to_df(uuid, seread, trimlog_path, logger)
            df['uuid'] = uuid
            table_name = 'trimmomatic_log'
            unique_key_dict = {'uuid': uuid, 'fastq': seread}
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'trim_se_' + read_name + '_db', logger)
            logger.info('completed storing SE `trimmomatic` of %s to db' % seread)
            
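    # o1 FASTQs are trimmed in SE mode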
    for o1read in o1fastqlist:
        read_name, read_ext = os.path.splitext(o1read)
        # assumed location of the Trimmomatic trim log parsed by trimmomatic_log_to_df()
        trimlog_path = os.path.join(trimmomatic_dir, o1read + '.trimlog')
        if pipe_util.already_step(step_dir, 'trim_o1_' + read_name, logger):
            logger.info('already completed se trim on %s' % o1read)
        else:
            logger.info('running step SE `trimmomatic` of: %s' % o1read)
            fq_in_path = os.path.join(fastq_dir, o1read)
            fq_out_path = os.path.join(trimmomatic_dir, o1read)
            cmd = ['java', '-jar', trimmomatic_path, 'SE', '-threads', str(thread_count), '-phred33',
                   '-trimlog', trimlog_path, fq_in_path, fq_out_path, 'ILLUMINACLIP:' + adapter_pickle_path]
            output = pipe_util.do_command(cmd, logger)

            #save time/mem to db
            df = time_util.store_time(uuid, cmd, output, logger)
            df['fastq'] = o1read
            unique_key_dict = {'uuid': uuid, 'fastq': o1read}
            table_name = 'time_mem_trimmomatic'
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'trim_o1_' + read_name, logger)
            logger.info('completed running step SE `trimmomatic` of: %s' % fq_in_path)
        #save stats to db
        if pipe_util.already_step(step_dir, 'trim_o1_' + read_name + '_db', logger):
            logger.info('already stored SE `trimmomatic` of %s to db' % o1read)
        else:
            logger.info('storing `trimmomatic` of %s to db' % o1read)
            df = trimmomatic_log_to_df(uuid, o1read, trimlog_path, logger)
            df['uuid'] = uuid
            table_name = 'trimmomatic_log'
            unique_key_dict = {'uuid': uuid, 'fastq': o1read}
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'trim_o1_' + read_name + '_db', logger)
            logger.info('completed storing SE `trimmomatic` of %s to db' % o1read)

            
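    # o2 FASTQs are trimmed in SE mode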
    for o2read in o2fastqlist:
        read_name, read_ext = os.path.splitext(o2read)
        # assumed location of the Trimmomatic trim log parsed by trimmomatic_log_to_df()
        trimlog_path = os.path.join(trimmomatic_dir, o2read + '.trimlog')
        if pipe_util.already_step(step_dir, 'trim_o2_' + read_name, logger):
            logger.info('already completed se trim on %s' % o2read)
        else:
            logger.info('running step SE `trimmomatic` of: %s' % o2read)
            fq_in_path = os.path.join(fastq_dir, o2read)
            fq_out_path = os.path.join(trimmomatic_dir, o2read)
            cmd = ['java', '-jar', trimmomatic_path, 'SE', '-threads', str(thread_count), '-phred33',
                   '-trimlog', trimlog_path, fq_in_path, fq_out_path, 'ILLUMINACLIP:' + adapter_pickle_path]
            output = pipe_util.do_command(cmd, logger)

            #save time/mem to db
            df = time_util.store_time(uuid, cmd, output, logger)
            df['fastq'] = o2read
            unique_key_dict = {'uuid': uuid, 'fastq': o2read}
            table_name = 'time_mem_trimmomatic'
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'trim_o2_' + read_name, logger)
            logger.info('completed running step SE `trimmomatic` of: %s' % o2read)
        #save stats to db
        if pipe_util.already_step(step_dir, 'trim_o2_' + read_name + '_db', logger):
            logger.info('already stored SE `trimmomatic` of %s to db' % o2read)
        else:
            logger.info('storing `trimmomatic` of %s to db' % o2read)
            df = trimmomatic_log_to_df(uuid, o2read, trimlog_path, logger)
            df['uuid'] = uuid
            table_name = 'trimmomatic_log'
            unique_key_dict = {'uuid': uuid, 'fastq': o2read}
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'trim_o2_' + read_name + '_db', logger)
            logger.info('completed storing SE `trimmomatic` of %s to db' % o2read)