def remove_duplicate_reads(uuid, fastq_dir, engine, logger):
    """Remove duplicate mate-pair reads from each gzipped FASTQ in `fastq_dir`.

    Each FASTQ is decompressed with zcat, piped through
    remove_duplicate_mate_pair.py, and re-compressed into `fastq_dir`/rmdup/.
    Time/memory stats and duplicate-read details/summary are stored to the
    database via `engine`. Steps are skipped when a prior completion marker
    exists (pipe_util.already_step).

    Parameters:
        uuid: run identifier, used as a db key.
        fastq_dir: directory containing the input *.fq files.
        engine: sqlalchemy engine for stat storage.
        logger: logger for progress messages.
    """
    outdir = os.path.join(fastq_dir, 'rmdup')
    os.makedirs(outdir, exist_ok=True)
    fastq_list = buildfastqlist(fastq_dir, logger)
    for fastq_name in fastq_list:
        fastq_basename, fastq_ext = os.path.splitext(fastq_name)
        fastq_path = os.path.join(fastq_dir, fastq_name)
        outfile = os.path.join(outdir, fastq_name)
        log_path = os.path.join(outdir, 'rmdup_' + fastq_basename + '.log')
        logger.info('remove_duplicate_reads() fastq_path=%s' % fastq_path)
        logger.info('remove_duplicate_reads() outfile=%s' % outfile)
        logger.info('remove_duplicate_reads() log_path=%s' % log_path)
        if pipe_util.already_step(outdir, fastq_name + '_rmdup', logger):
            logger.info('already completed rmdup of: %s' % fastq_name)
        else:
            logger.info('running rmdup of: %s' % fastq_name)
            decomp_cmd = ['zcat', '"' + fastq_path + '"']
            home_dir = os.path.expanduser('~')
            python_cmd = os.path.join(home_dir, '.virtualenvs', 'p3', 'bin', 'python3')
            rmdup_cmd_path = os.path.join(home_dir, 'pipelines', 'dnaseq', 'other', 'remove_duplicate_mate_pair.py')
            rmdup_cmd = [rmdup_cmd_path, '-l', '"' + log_path + '"', '-']
            comp_cmd = ['gzip', '-', '>', '"' + outfile + '"']
            decomp_cmd_shell = ' '.join(decomp_cmd)
            rmdup_cmd_shell = ' '.join(rmdup_cmd)
            comp_cmd_shell = ' '.join(comp_cmd)
            # python_cmd is a single string (an interpreter path), so it is
            # concatenated directly; ' '.join(python_cmd) would space-separate
            # every character.
            shell_cmd = decomp_cmd_shell + ' | ' + python_cmd + ' ' + rmdup_cmd_shell + ' | ' + comp_cmd_shell
            logger.info('remove_duplicate_reads() shell_cmd=%s' % shell_cmd)
            output = pipe_util.do_shell_command(shell_cmd, logger)
            # save time/mem to db
            df = time_util.store_time(uuid, shell_cmd, output, logger)
            df['fastq'] = fastq_name
            unique_key_dict = {'uuid': uuid, 'fastq': fastq_name}
            table_name = 'time_mem_rmdup_fastq'
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(outdir, fastq_name + '_rmdup', logger)
            logger.info('completed running rmdup of: %s' % fastq_name)
        # save stats to db
        if pipe_util.already_step(outdir, fastq_name + '_rmdup_db', logger):
            logger.info('already stored rmdup run of %s to db' % fastq_name)
        else:
            logger.info('storing rmdup run of %s to db' % fastq_name)
            # get details (per-record duplicate qnames)
            df = get_duplicate_qname_df(uuid, fastq_name, log_path, logger)
            table_name = 'rmdup_record_id'
            unique_key_dict = {'uuid': uuid, 'fastq_name': fastq_name, 'id': 'impossible_match'}
            if len(df) > 0:
                df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            # get summary
            df = get_duplicate_summary(uuid, fastq_name, log_path, logger)
            table_name = 'rmdup_summary'
            unique_key_dict = {'uuid': uuid, 'fastq_name': fastq_name}
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(outdir, fastq_name + '_rmdup_db', logger)
            # fixed typo in log message: 'o0f' -> 'of'
            logger.info('completed storing rmdup run of %s to db' % fastq_name)
    return
def run_curl(case_id, path, uuid, token, engine, logger):
    """Download file `uuid` from the GDC portal to `path` using curl.

    `token` is the path of a file containing the GDC auth token; the token is
    exported via the TOKEN environment variable and expanded by the shell into
    the X-Auth-Token header. An HTTP(S) proxy is set for the download and
    cleared afterwards. Time/memory stats are stored to the db via `engine`.

    Parameters:
        case_id: case identifier used in the step marker name.
        path: destination file path for the downloaded data.
        uuid: GDC file uuid to download (also a db key).
        token: path to the auth-token file.
        engine: sqlalchemy engine for stat storage.
        logger: logger for progress messages.
    """
    step_dir = os.getcwd()
    os.makedirs(step_dir, exist_ok=True)
    if pipe_util.already_step(step_dir, case_id + '_' + uuid + '_download', logger):
        logger.info('already download %s' % uuid)
    else:
        logger.info('downloading %s from GDC portal' % uuid)
        # Fixes: use `with` so the token file handle is closed (was leaked),
        # don't shadow the `token` parameter with the loop variable, and strip
        # the trailing newline so the shell-expanded $TOKEN header is valid.
        # If the file has several lines, the last one wins (as before).
        with open(token) as token_file:
            for token_line in token_file:
                os.environ['TOKEN'] = token_line.strip()
        os.environ['https_proxy'] = "http://cloud-controller:3128"
        os.environ['http_proxy'] = "http://cloud-controller:3128"
        cmd = ['curl', 'https://gdc-api.nci.nih.gov/data/' + uuid, '-H', '"X-Auth-Token:$TOKEN"', '> ' + path]
        shell_cmd = ' '.join(cmd)
        output = pipe_util.do_shell_command(shell_cmd, logger)
        df = time_util.store_time(uuid, shell_cmd, output, logger)
        df['uuid'] = uuid
        df['output'] = path
        table_name = 'time_mem_download'
        unique_key_dict = {'uuid': uuid, 'output': path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, case_id + '_' + uuid + '_download', logger)
        logger.info('completed downloading %s' % uuid)
        # Clear the proxy settings so later steps are unaffected. Kept inside
        # this branch: when the download was already done the variables were
        # never set and `del` would raise KeyError.
        del os.environ['https_proxy']
        del os.environ['http_proxy']
def _trim_se_fastq(uuid, fastq_dir, trimmomatic_dir, step_dir, trimmomatic_path, thread_count, engine, logger, fastq_name, step_tag):
    """Trim one single-ended/orphan FASTQ with trimmomatic SE and store stats.

    `step_tag` is 'se', 'o1', or 'o2' and distinguishes the already-step
    markers for the three input lists. Shared by the SE/o1/o2 loops below,
    which were three near-identical copies in the original.
    """
    read_name = os.path.splitext(fastq_name)[0]
    step_name = 'trim_' + step_tag + '_' + read_name
    # generate trim
    if pipe_util.already_step(step_dir, step_name, logger):
        logger.info('already completed se trim on %s' % fastq_name)
    else:
        logger.info('running step SE `trimmomatic` of: %s' % fastq_name)
        fq_in_path = os.path.join(fastq_dir, fastq_name)
        fq_out_path = os.path.join(trimmomatic_dir, fastq_name)
        # TODO(review): 'ILLUMINACLIP:' has no adapter spec appended in the
        # original -- confirm the intended adapter file for SE trimming.
        cmd = ['java', '-jar', trimmomatic_path, 'SE', '-threads', str(thread_count), '-phred33',
               fq_in_path, fq_out_path, 'ILLUMINACLIP:']
        output = pipe_util.do_command(cmd, logger)
        # save time/mem to db
        df = time_util.store_time(uuid, cmd, output, logger)
        df['fastq'] = fastq_name
        unique_key_dict = {'uuid': uuid, 'fastq': fastq_name}
        table_name = 'time_mem_trimmomatic'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, step_name, logger)
        logger.info('completed running step SE `trimmomatic` of: %s' % fq_in_path)
    # save stats to db
    if pipe_util.already_step(step_dir, step_name + '_db', logger):
        logger.info('already stored SE `trimmomatic` of %s to db' % fastq_name)
    else:
        logger.info('storing `trimmomatic` of %s to db' % fastq_name)
        # TODO(review): `trimlog_path` was never defined in the original
        # function -- confirm where the trimmomatic trim log is written.
        df = trimmomatic_log_to_df(uuid, fastq_name, trimlog_path, logger)
        df['uuid'] = uuid
        table_name = 'trimmomatic_log'
        unique_key_dict = {'uuid': uuid, 'fastq': fastq_name}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, step_name + '_db', logger)
        logger.info('completed storing SE `trimmomatic` of %s to db' % fastq_name)


def trimmomatic(uuid, fastq_dir, adapter_pickle_path, thread_count, engine, logger):
    """Run trimmomatic adapter trimming on every FASTQ in `fastq_dir`.

    Paired-end pairs, single-end reads, and orphan (o1/o2) reads are each
    trimmed into `fastq_dir`/trimmomatic/. For PE reads a unified diff
    against the input is generated, summarized, stored to the db, and
    xz-compressed. Time/memory stats for every command are stored via
    `engine`. Completed steps are skipped via pipe_util.already_step markers.

    Parameters:
        uuid: run identifier, used as a db key.
        fastq_dir: directory containing the input FASTQ files.
        adapter_pickle_path: adapter spec passed to ILLUMINACLIP (see TODO).
        thread_count: trimmomatic -threads value.
        engine: sqlalchemy engine for stat storage.
        logger: logger for progress messages.
    """
    logger.info('trimmomatic() starting')  # was logger.info() with no message (TypeError)
    fastq_list = buildfastqlist(fastq_dir, logger)  # was missing the logger argument
    logger.info('fastqlist=%s' % fastq_list)  # was logging.info (root logger, not this run's)
    pefastqdict = fastq_util.buildpefastqdict(fastq_list)
    logger.info('pefastqdict=%s' % pefastqdict)
    sefastqlist = fastq_util.buildsefastqlist(fastq_list)
    logger.info('sefastqlist=%s' % sefastqlist)
    o1fastqlist = fastq_util.buildo1fastqlist(fastq_list)
    logger.info('o1fastqlist=%s' % o1fastqlist)
    o2fastqlist = fastq_util.buildo2fastqlist(fastq_list)
    logger.info('o2fastqlist=%s' % o2fastqlist)
    trimmomatic_dir = os.path.join(fastq_dir, 'trimmomatic')
    step_dir = trimmomatic_dir
    home_dir = os.path.expanduser('~')
    # Hoisted out of the PE branch: the SE/o1/o2 loops also need this path,
    # but it was only assigned inside the PE trim step (NameError for
    # SE-only inputs or when the PE trim was already completed).
    trimmomatic_path = os.path.join(home_dir, 'tools', 'trimmomatic', 'dist', 'jar', 'trimmomatic.jar')
    # TODO(review): the original referenced an undefined `adapter_path`;
    # assuming it is the adapter_pickle_path parameter -- confirm.
    adapter_path = adapter_pickle_path
    os.makedirs(trimmomatic_dir, exist_ok=True)
    for read1 in sorted(pefastqdict.keys()):
        read1_name = os.path.splitext(read1)[0]
        fq1_in_path = os.path.join(fastq_dir, read1)
        fq2_in_path = os.path.join(fastq_dir, pefastqdict[read1])
        fq1_out_path = os.path.join(trimmomatic_dir, read1)
        fq1_unpaired_path = fq1_out_path + 'UP'
        fq2_out_path = os.path.join(trimmomatic_dir, pefastqdict[read1])
        fq2_unpaired_path = fq2_out_path + 'UP'
        diff1_path = fq1_out_path + '.diff'
        diff2_path = fq2_out_path + '.diff'
        diff1_name = os.path.basename(diff1_path)
        diff2_name = os.path.basename(diff2_path)
        fastq_type = 'PE'
        # generate trim
        if pipe_util.already_step(step_dir, 'trim_pe_' + read1_name, logger):
            logger.info('already completed pe trim on %s' % read1)
        else:
            logger.info('running step PE `trimmomatic` of: %s' % read1)
            cmd = ['java', '-jar', trimmomatic_path, 'PE', '-threads', str(thread_count), '-phred33',
                   fq1_in_path, fq2_in_path, fq1_out_path, fq1_unpaired_path,
                   fq2_out_path, fq2_unpaired_path, 'ILLUMINACLIP:' + adapter_path]
            output = pipe_util.do_command(cmd, logger)
            # save time/mem to db
            df = time_util.store_time(uuid, cmd, output, logger)
            df['fastq'] = read1
            unique_key_dict = {'uuid': uuid, 'fastq': read1}
            table_name = 'time_mem_trimmomatic'
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'trim_pe_' + read1_name, logger)
            logger.info('completed step PE `trimmomatic` of: %s' % read1)
        # generate diff
        if pipe_util.already_step(step_dir, 'trim_pe_diff_' + read1_name, logger):
            logger.info('already generated diff of trimmomatic of %s' % read1_name)
        else:
            logger.info('generating PE diff of trimmomatic of %s' % read1_name)
            cmd1 = ['diff', '-u', fq1_out_path, fq1_in_path, '>', diff1_path]
            cmd2 = ['diff', '-u', fq2_out_path, fq2_in_path, '>', diff2_path]
            shell_cmd1 = ' '.join(cmd1)
            # was `shell_cmd1 = ' '.join(cmd2)`, clobbering shell_cmd1 and
            # leaving shell_cmd2 undefined (NameError below)
            shell_cmd2 = ' '.join(cmd2)
            output1 = pipe_util.do_shell_command(shell_cmd1, logger)
            output2 = pipe_util.do_shell_command(shell_cmd2, logger)
            # save time/mem to db
            df1 = time_util.store_time(uuid, cmd1, output1, logger)
            df2 = time_util.store_time(uuid, cmd2, output2, logger)
            df1['diff'] = diff1_name
            df2['diff'] = diff2_name
            df1['fastq_type'] = fastq_type
            df2['fastq_type'] = fastq_type
            unique_key_dict1 = {'uuid': uuid, 'diff': diff1_name, 'fastq_type': fastq_type}
            unique_key_dict2 = {'uuid': uuid, 'diff': diff2_name, 'fastq_type': fastq_type}
            table_name = 'time_mem_trimmomatic_diff'
            df_util.save_df_to_sqlalchemy(df1, unique_key_dict1, table_name, engine, logger)
            df_util.save_df_to_sqlalchemy(df2, unique_key_dict2, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'trim_pe_diff_' + read1_name, logger)
            logger.info('completed generating PE diff of trimmomatic of %s' % read1_name)
        # generate diff stats
        if pipe_util.already_step(step_dir, 'trim_pe_summary_diff_log_' + read1_name, logger):
            logger.info('already completed step `summary stats of diff` of %s' % read1_name)
        else:
            logger.info('running step PE `summary of diff` of %s' % read1_name)
            trimmomatic_summ_met_dir = os.path.dirname(os.path.realpath(__file__))
            trimmomatic_summ_met_path = os.path.join(trimmomatic_summ_met_dir, 'trimmomatic_summary_metrics_from_diff.py')
            cmd1 = [trimmomatic_summ_met_path, '-d', diff1_path]
            cmd2 = [trimmomatic_summ_met_path, '-d', diff2_path]
            output1 = pipe_util.do_command(cmd1, logger)
            output2 = pipe_util.do_command(cmd2, logger)
            # save time/mem to db
            df1 = time_util.store_time(uuid, cmd1, output1, logger)
            df2 = time_util.store_time(uuid, cmd2, output2, logger)
            df1['diff'] = diff1_name
            df2['diff'] = diff2_name
            unique_key_dict1 = {'uuid': uuid, 'diff': diff1_name, 'fastq_type': fastq_type}
            # was assigned to unique_key_dict1 again, leaving
            # unique_key_dict2 undefined for the save below
            unique_key_dict2 = {'uuid': uuid, 'diff': diff2_name, 'fastq_type': fastq_type}
            table_name = 'time_mem_trimmomatic_diff_summary'
            df_util.save_df_to_sqlalchemy(df1, unique_key_dict1, table_name, engine, logger)
            df_util.save_df_to_sqlalchemy(df2, unique_key_dict2, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'trim_pe_summary_diff_log_' + read1_name, logger)
            logger.info('completed step PE `summary of diff` of %s' % read1_name)
        # save stats to db
        if pipe_util.already_step(step_dir, 'trim_pe_summary_db_' + read1_name + '_db', logger):
            logger.info('already stored PE `trimmomatic` of %s to db' % read1)
        else:
            logger.info('storing `trimmomatic` of %s to db' % read1)
            # TODO(review): `trimlog_path` is never defined in this function --
            # confirm where the trim log is written.
            df = trimmomatic_diff_summary_to_df(uuid, read1, trimlog_path, logger)
            df['uuid'] = uuid
            table_name = 'trimmomatic_diff_summary'
            # was: two key dicts (one using an undefined `read2_name`) then a
            # save with an undefined `unique_key_dict`; key on read1 only,
            # since only one summary df is produced.
            unique_key_dict = {'uuid': uuid, 'fastq': read1_name}
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'trim_pe_summary_db_' + read1_name + '_db', logger)
            logger.info('completed storing PE `trimmomatic` of %s to db' % read1)
        # compress diff
        if pipe_util.already_step(step_dir, 'xz_pe_diff_' + read1_name, logger):
            logger.info('already compressed PE diff: %s' % diff1_name)
        else:
            logger.info('compressing PE diff: %s' % diff1_name)
            cmd1 = ['xz', '-9', diff1_path]
            cmd2 = ['xz', '-9', diff2_path]
            output1 = pipe_util.do_command(cmd1, logger)
            output2 = pipe_util.do_command(cmd2, logger)
            # save time/mem to db
            df1 = time_util.store_time(uuid, cmd1, output1, logger)
            df2 = time_util.store_time(uuid, cmd2, output2, logger)
            df1['diff'] = diff1_name
            df2['diff'] = diff2_name
            unique_key_dict1 = {'uuid': uuid, 'diff': diff1_name, 'fastq_type': fastq_type}
            unique_key_dict2 = {'uuid': uuid, 'diff': diff2_name, 'fastq_type': fastq_type}
            table_name = 'time_mem_diff_xz'
            df_util.save_df_to_sqlalchemy(df1, unique_key_dict1, table_name, engine, logger)
            df_util.save_df_to_sqlalchemy(df2, unique_key_dict2, table_name, engine, logger)
            pipe_util.create_already_step(step_dir, 'xz_pe_diff_' + read1_name, logger)
            logger.info('completed compressing PE diff: %s' % diff1_name)
    # The three loops below were near-identical inline copies; the o2 copy's
    # db already-step was mislabeled 'trim_se_' (fixed to 'trim_o2_' via the
    # shared helper's step_tag).
    for seread in sefastqlist:
        _trim_se_fastq(uuid, fastq_dir, trimmomatic_dir, step_dir, trimmomatic_path, thread_count, engine, logger, seread, 'se')
    for o1read in o1fastqlist:
        _trim_se_fastq(uuid, fastq_dir, trimmomatic_dir, step_dir, trimmomatic_path, thread_count, engine, logger, o1read, 'o1')
    for o2read in o2fastqlist:
        _trim_se_fastq(uuid, fastq_dir, trimmomatic_dir, step_dir, trimmomatic_path, thread_count, engine, logger, o2read, 'o2')