def bam_validate(uuid, bam_path, engine, logger):
    step_dir = os.path.dirname(bam_path)
    bam_name = os.path.basename(bam_path)
    validate_file = bam_path + '.validate'
    if pipe_util.already_step(step_dir, bam_name + '_validate', logger):
        logger.info('already completed step `validate` of: %s' % bam_path)
    else:
        logger.info('running step validate of: %s' % bam_path)
        home_dir = os.path.expanduser('~')
        mo = int((2 ** 32) / 2) - 1
        cmd = ['java', '-d64', '-Xmx16G', '-jar', os.path.join(home_dir, 'tools/picard-tools/picard.jar'), 'ValidateSamFile', 'MO=' + str(mo), 'INPUT=' + bam_path, 'OUTPUT=' + validate_file]
        output = pipe_util.do_command(cmd, logger, allow_fail=True)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = bam_path
        df['validate_file'] = validate_file
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path, 'validate_file': validate_file}
        table_name = 'time_mem_picard_ValidateSamFile'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, bam_name + '_validate', logger)
        logger.info('completed running step `picard validate` of: %s' % bam_path)
    if pipe_util.already_step(step_dir, bam_name + '_validate_db', logger):
        logger.info('already stored `picard validate` to db')
    else:
        logger.info('storing `picard validate` to db')
        store_validate_error(uuid, bam_path, validate_file, engine, logger)
        pipe_util.create_already_step(step_dir, bam_name + '_validate_db', logger)
        logger.info('completed storing `picard validate` to db')
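The helper store_validate_error called above is not part of this listing. A minimal sketch of what it might look like, assuming Picard ValidateSamFile reports findings as lines beginning with "ERROR:" or "WARNING:" and assuming the same pandas/df_util conventions as the surrounding examples; the column names here are illustrative, not the pipeline's actual schema:

def store_validate_error(uuid, bam_path, validate_file, engine, logger):
    # count ERROR/WARNING lines in the ValidateSamFile report (an assumption
    # about the report format; adjust if the report layout differs)
    counts = {'ERROR': 0, 'WARNING': 0}
    with open(validate_file, 'r') as validate_open:
        for aline in validate_open:
            severity = aline.split(':', 1)[0].strip()
            if severity in counts:
                counts[severity] += 1
    df = pd.DataFrame({'uuid': [uuid], 'bam_path': bam_path,
                       'error_count': counts['ERROR'],
                       'warning_count': counts['WARNING']})
    unique_key_dict = {'uuid': uuid, 'bam_path': bam_path}
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, 'picard_ValidateSamFile', engine, logger)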
Example 2
def bam_validate(uuid, bam_path, engine, logger):
    step_dir = os.path.dirname(bam_path)
    validate_file = bam_path + '.validate'
    if pipe_util.already_step(step_dir, 'validate', logger):
        logger.info('already completed step `validate` of: %s' % bam_path)
    else:
        logger.info('running step validate of: %s' % bam_path)
        home_dir = os.path.expanduser('~')
        mo = int((2 ** 32) / 2) - 1
        
        cmd = ['java', '-d64', '-jar', os.path.join(home_dir, 'tools/picard-tools/picard.jar'), 'ValidateSamFile', 'MO=' + str(mo), 'INPUT=' + bam_path, 'OUTPUT=' + validate_file]
        output = pipe_util.do_command(cmd, logger, allow_fail=True)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = bam_path
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path}
        table_name = 'time_mem_picard_ValidateSamFile'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info('completed running step validate of: %s' % bam_path)
        pipe_util.create_already_step(step_dir, 'validate', logger)

    if pipe_util.already_step(step_dir, 'validate_db', logger):
        logger.info('already stored `picard validate` to db')
    else:
        logger.info('storing `picard validate` to db')
        store_validate_error(uuid, bam_path, validate_file, engine, logger)
        pipe_util.create_already_step(step_dir, 'validate_db', logger)
        logger.info('completed storing `picard validate` to db')
Example 3
def bwa_aln_single(uuid,bam_path,fastq_dir,read1,realn_dir,readkey,reference_fasta_path,
                   rg_str,thread_count,engine,logger):
    se_realn_dir=os.path.join(realn_dir,'bwa_aln_'+readkey)
    logger.info('se_realn_dir=%s' % se_realn_dir)
    logger.info('read1=%s' % read1)
    fastqbasename=read1.replace('_'+readkey+'.fq','')
    logger.info('fastqbasename=%s' % fastqbasename)
    outsai=os.path.basename(fastqbasename+'.sai')
    outbam=os.path.basename(fastqbasename+'.bam')
    outsai_path=os.path.join(se_realn_dir,outsai)
    outbam_path=os.path.join(se_realn_dir,outbam)
    f1=os.path.join(fastq_dir,read1)
    os.makedirs(se_realn_dir,exist_ok=True)
    if pipe_util.already_step(se_realn_dir,readkey+'_sai_'+fastqbasename,logger):
        logger.info('already completed step `bwa aln` of: %s' % read1)
    else:
        aln_cmd=['bwa','aln','-t',thread_count,reference_fasta_path,f1,'>',outsai_path]
        shell_aln_cmd=' '.join(aln_cmd)
        output=pipe_util.do_shell_command(shell_aln_cmd,logger)
        df=time_util.store_time(uuid,shell_aln_cmd,output,logger)
        df['bam_path']=outbam_path
        df['reference_fasta_path']=reference_fasta_path
        df['thread_count']=thread_count
        unique_key_dict={'uuid':uuid,'bam_path':bam_path,'reference_fasta_path':reference_fasta_path,
                         'thread_count':thread_count}
        table_name='time_mem_bwa_aln'
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        logger.info('completed running step `bwa aln` of: %s' % read1)
        pipe_util.create_already_step(se_realn_dir,readkey+'_sai_'+fastqbasename,logger)

    if pipe_util.already_step(se_realn_dir,readkey+'_samse_'+fastqbasename,logger):
        logger.info('already completed step `bwa samse` of: %s' % outbam_path)
    else:
        samse_cmd=['bwa','samse','-r','"'+rg_str+'"',reference_fasta_path,outsai_path,f1]
        samtools_cmd='samtools view -Shb -o '+outbam_path+' -'
        shell_samse_cmd=' '.join(samse_cmd)
        shell_cmd=shell_samse_cmd+' | '+samtools_cmd
        output=pipe_util.do_shell_command(shell_cmd,logger)
        df=time_util.store_time(uuid,shell_cmd,output,logger)
        df['bam_path']=outbam_path
        df['reference_fasta_path']=reference_fasta_path
        df['thread_count']=thread_count
        unique_key_dict={'uuid':uuid,'bam_path':bam_path,'reference_fasta_path':reference_fasta_path,
                         'thread_count':thread_count}
        table_name='time_mem_bwa_samse'
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        logger.info('completed running step `bwa samse` of: %s' % outbam_path)
        pipe_util.create_already_step(se_realn_dir,readkey+'_samse_'+fastqbasename,logger)
    return outbam_path
def RTC(uuid, analysis_ready_bam_list_path, thread_count, reference_fasta_name, known_1k_genome_indel_sites, engine, logger):
  RTC_dir = os.path.dirname(analysis_ready_bam_list_path)
  bam_list_name = os.path.basename(analysis_ready_bam_list_path)
  bam_base, bam_ext = os.path.splitext(bam_list_name)
  logger.info('RTC_dir=%s' % RTC_dir)
  step_dir = RTC_dir
  outintervals = bam_base + '.intervals'
  intervals_path = os.path.join(RTC_dir, outintervals)
  logger.info('intervals_path=%s' % intervals_path)
  if pipe_util.already_step(step_dir, uuid + '_RealignerTargetCreator', logger):
    logger.info('already completed step `RealignerTargetCreator` of: %s' % analysis_ready_bam_list_path)
  else:
    logger.info('running step `RealignerTargetCreator` of: %s' % analysis_ready_bam_list_path)
    home_dir = os.path.expanduser('~')
    gatk_path = os.path.join(home_dir, 'tools/GenomeAnalysisTK.jar')
    cmd = ['java', '-d64', '-Xmx16G', '-jar', gatk_path, '-nt ' + thread_count, '-T RealignerTargetCreator', '-R ' + reference_fasta_name, '-I ' + analysis_ready_bam_list_path, '-known ' + known_1k_genome_indel_sites, '-o ' + intervals_path]
    shell_cmd = ' '.join(cmd)
    output = pipe_util.do_shell_command(shell_cmd, logger)
    df = time_util.store_time(uuid, shell_cmd, output, logger)
    df['intervals_path'] = intervals_path
    df['analysis_ready_bam_list_path'] = analysis_ready_bam_list_path
    df['thread_count'] = thread_count
    table_name = 'time_mem_GATK_RTC'
    unique_key_dict = {'uuid': uuid,  'analysis_ready_bam_list_path': analysis_ready_bam_list_path, 'thread_count': thread_count, 'intervals_path': intervals_path}
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    pipe_util.create_already_step(step_dir, uuid + '_RealignerTargetCreator', logger)
    logger.info('completed running step `RealignerTargetCreator` of: %s' % analysis_ready_bam_list_path)
  return intervals_path
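The intervals file produced by RTC normally feeds GATK 3.x IndelRealigner before the PrintReads (PR) step shown later in this listing. That step is not included here; a sketch in the same style, using documented GATK 3.x flags but with helper wiring and table name that are assumptions, might look like:

def IR(uuid, analysis_ready_bam_list_path, reference_fasta_name, intervals_path, known_1k_genome_indel_sites, engine, logger):
  step_dir = os.path.dirname(analysis_ready_bam_list_path)
  bam_base, bam_ext = os.path.splitext(os.path.basename(analysis_ready_bam_list_path))
  IR_bam_path = os.path.join(step_dir, bam_base + '_IR.bam')
  if pipe_util.already_step(step_dir, uuid + '_IndelRealigner', logger):
    logger.info('already completed step `IndelRealigner` of: %s' % analysis_ready_bam_list_path)
  else:
    logger.info('running step `IndelRealigner` of: %s' % analysis_ready_bam_list_path)
    home_dir = os.path.expanduser('~')
    gatk_path = os.path.join(home_dir, 'tools/GenomeAnalysisTK.jar')
    cmd = ['java', '-d64', '-Xmx16G', '-jar', gatk_path, '-T IndelRealigner', '-R ' + reference_fasta_name, '-I ' + analysis_ready_bam_list_path, '-targetIntervals ' + intervals_path, '-known ' + known_1k_genome_indel_sites, '-o ' + IR_bam_path]
    shell_cmd = ' '.join(cmd)
    output = pipe_util.do_shell_command(shell_cmd, logger)
    df = time_util.store_time(uuid, shell_cmd, output, logger)
    df['IR_bam_path'] = IR_bam_path
    unique_key_dict = {'uuid': uuid, 'IR_bam_path': IR_bam_path}
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, 'time_mem_GATK_IR', engine, logger)
    pipe_util.create_already_step(step_dir, uuid + '_IndelRealigner', logger)
  return IR_bam_path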
Example 5
def do_guess_encoding(uuid,fastq_path,engine,logger):
    fastq_name=os.path.basename(fastq_path)
    fastq_dir=os.path.dirname(fastq_path)
    fastq_base,fastq_ext=os.path.splitext(fastq_name)
    if pipe_util.already_step(fastq_dir,'guess_'+fastq_base,logger):
        logger.info('already completed step `guess_encoding`: %s' % fastq_path)
    else:
        logger.info('running step `guess_encoding` of %s' % fastq_path)
        pipe_dir=os.path.dirname(os.path.realpath(sys.argv[0]))
        guess_path=os.path.join(pipe_dir,'guess-encoding.py')
        guess_cmd='python2 '+guess_path
        time_cmd='/usr/bin/time -v '+guess_cmd+' -f '+fastq_path
        proc=subprocess.Popen(time_cmd,shell=True,stderr=subprocess.STDOUT,stdout=subprocess.PIPE)
        output=proc.communicate()[0]
        logger.info('output=%s' % output)
        df=time_util.store_time(uuid,time_cmd,output,logger)
        df['fastq_path']=fastq_path
        table_name='time_mem_guessencoding'
        unique_key_dict={'uuid':uuid,'fastq_path':fastq_path}
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        #cmdlist=list()
        #cmdlist.append(awk_shlex)
        #cmdlist.append(guess_cmd)
        #output=pipe_util.do_piped_commands(cmdlist,logger)
        logger.info('do_guess_encoding output=%s' % output.decode())
        write_fastq_format(fastq_path,output,logger)
        pipe_util.create_already_step(fastq_dir,'guess_'+fastq_base,logger)
    return
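write_fastq_format, called above, is also not shown. Given that guess_enc_db (Example 9 below) reads a one-line <fastq>.format file, a plausible sketch is the following; exactly which line of the guess-encoding.py output carries the encoding name is an assumption:

def write_fastq_format(fastq_path, output, logger):
    format_path = fastq_path + '.format'
    # keep the first non-empty line of the guess-encoding.py output
    # (an assumption about where the encoding name appears)
    lines = [aline for aline in output.decode().splitlines() if aline.strip()]
    guess = lines[0].strip() if lines else ''
    logger.info('writing %s: %s' % (format_path, guess))
    with open(format_path, 'w') as format_open:
        format_open.write(guess)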
Example 6
def get_file_md5(uuid,file_path,engine,logger):
    file_dir=os.path.dirname(file_path)
    file_name=os.path.basename(file_path)
    file_shortname,file_ext=os.path.splitext(file_name)
    file_md5_name=file_name+'.md5'
    file_md5_path=os.path.join(file_dir,file_md5_name)
    if pipe_util.already_step(file_dir,file_name+'_md5sum',logger):
        logger.info('already completed step `md5sum` of: %s' % file_path)
        with open(file_md5_path,'r') as file_md5_path_open:
            file_md5=file_md5_path_open.readline().strip()
            return file_md5
    else:
        cmd=['md5sum',file_path]
        output=pipe_util.do_command(cmd,logger)
        file_md5=output.split()[0].decode()
        with open(file_md5_path,'w') as file_md5_path_open:
            file_md5_path_open.write(file_md5)
        df=time_util.store_time(uuid,cmd,output,logger)
        df['file_path']=file_path
        logger.info('df=%s' % df)
        unique_key_dict={'uuid':uuid,'file_path':file_path}
        table_name='time_mem_md5'
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        pipe_util.create_already_step(file_dir,file_name+'_md5sum',logger)
        return file_md5
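Shelling out to md5sum keeps the /usr/bin/time bookkeeping consistent with the other steps; if the timing row is not needed, a pure-Python alternative (a sketch, not what this pipeline does) avoids the subprocess entirely:

import hashlib

def compute_md5(file_path, chunk_size=2 ** 20):
    # stream the file in 1 MiB chunks so large BAMs are not loaded into memory
    md5 = hashlib.md5()
    with open(file_path, 'rb') as file_open:
        for chunk in iter(lambda: file_open.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()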
def HC(uuid, analysis_ready_bam_list_path, intervals, thread_count, reference_fasta_name, dbsnp_known_snp_sites, engine, logger):
  HC_dir = os.path.dirname(analysis_ready_bam_list_path)
  logger.info('HC_dir=%s' % HC_dir)
  step_dir = HC_dir
  hc_output_gvcfs = []
  with open(analysis_ready_bam_list_path) as f:
      analysis_ready_bam_paths = f.read().splitlines()
      for bam in analysis_ready_bam_paths:
        bam_name = os.path.basename(bam)
        bam_base, bam_ext = os.path.splitext(bam_name)
        out_gvcf = bam_base + '.raw.indels.raw.snps.g.vcf'
        out_gvcf_path = os.path.join(HC_dir, out_gvcf)
        logger.info('out_gvcf_path=%s' % out_gvcf_path)
        hc_output_gvcfs.append(out_gvcf_path)
        if pipe_util.already_step(step_dir, uuid + '_' + bam_base + '_HaplotypeCaller', logger):
          logger.info('already completed step `HaplotypeCaller` of: %s' % bam)
        else:
          logger.info('running step `HaplotypeCaller` of: %s' % bam)
          home_dir = os.path.expanduser('~')
          gatk_path = os.path.join(home_dir, 'tools/GenomeAnalysisTK.jar')
          cmd = ['java', '-d64', '-Xmx16G', '-jar', gatk_path, '-nct ' + thread_count, '-T HaplotypeCaller', '-R ' + reference_fasta_name, '-I ' + bam, '--emitRefConfidence GVCF', '--variant_index_type LINEAR', '--variant_index_parameter 128000', '--dbsnp ' + dbsnp_known_snp_sites, '-L ' + intervals, '--max_alternate_alleles 50', '-o ' + out_gvcf_path]
          shell_cmd = ' '.join(cmd)
          output = pipe_util.do_shell_command(shell_cmd, logger)
          df = time_util.store_time(uuid, shell_cmd, output, logger)
          df['out_gvcf_path'] = out_gvcf_path
          df['analysis_ready_bam_path'] = bam
          df['thread_count'] = thread_count
          table_name = 'time_mem_GATK_HaplotypeCaller'
          unique_key_dict = {'uuid': uuid,  'analysis_ready_bam_path': bam, 'thread_count': thread_count, 'out_gvcf': out_gvcf_path}
          df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
          pipe_util.create_already_step(step_dir, uuid + '_' + bam_base + '_HaplotypeCaller', logger)
          logger.info('completed running step `HaplotypeCaller` of: %s' % bam)
  return hc_output_gvcfs
def picard_CreateSequenceDictionary(uuid, reference_fasta_name, engine, logger):
    sd_dir = os.path.dirname(reference_fasta_name)
    ref_name = os.path.basename(reference_fasta_name)
    ref_base, ref_ext = os.path.splitext(ref_name)
    sd_file = ref_base + ".dict"
    sd_file_path = os.path.join(sd_dir, sd_file)
    if pipe_util.already_step(sd_dir, ref_name + "_dict", logger):
        logger.info("already completed step `Picard CreateSequenceDictionary` of %s" % reference_fasta_name)
    else:
        logger.info("running step `Picard CreateSequenceDictionary` of %s" % reference_fasta_name)
        home_dir = os.path.expanduser("~")
        picard_path = os.path.join(home_dir, "tools/picard-tools/picard.jar")
        cmd = [
            "java",
            "-d64",
            "-Xmx16G",
            "-jar",
            picard_path,
            "CreateSequenceDictionary",
            "R=" + reference_fasta_name,
            "O=" + sd_file_path,
        ]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df["reference_fasta"] = reference_fasta_name
        df["sequence_dictionary"] = sd_file_path
        unique_key_dict = {"uuid": uuid, "reference_fasta": reference_fasta_name, "sequence_dictionary": sd_file_path}
        table_name = "time_mem_picard_CreateSequenceDictionary"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info("completed running step `Picard CreateSequenceDictionary` of %s" % reference_fasta_name)
        pipe_util.create_already_step(sd_dir, ref_name + "_dict", logger)
    return sd_file_path
Example 9
def guess_enc_db(uuid, fq_path, engine, logger):
    fastq_dir = os.path.dirname(fq_path)
    fastq_name = os.path.basename(fq_path)
    fastq_base, fastq_ext = os.path.splitext(fastq_name)
    guess_enc_path = fq_path + '.format'
    guess_enc_value = str()
    with open(guess_enc_path, 'r') as guess_enc_open:
        guess_enc_value = guess_enc_open.readline().strip()
    data_dict = dict()
    if pipe_util.already_step(fastq_dir, 'fastq_encdb_' + fastq_base, logger):
        logger.info('already completed step `guess_enc_db`: %s' % fq_path)
    else:
        logger.info('writing `guess_enc_db`: %s' % fq_path)
        data_dict['uuid'] = [uuid]
        data_dict['fastq_name'] = fastq_name
        data_dict['guess'] = guess_enc_value
        df = pd.DataFrame(data_dict)
        table_name = 'guess_fastq_encoding'
        unique_key_dict = {'uuid': uuid, 'fastq_name': fastq_name}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine,
                                      logger)
        pipe_util.create_already_step(fastq_dir, 'fastq_encdb_' + fastq_base,
                                      logger)
        logger.info('completed writing `guess_enc_db`: %s' % fq_path)
    return
def sump_wxs(uuid, muse_call_output_path, dbsnp_known_snp_sites, engine, logger):
  sump_dir = os.path.dirname(muse_call_output_path)
  input_name = os.path.basename(muse_call_output_path)
  input_base, input_ext = os.path.splitext(input_name)
  sample_base, sample_ext = os.path.splitext(input_base)
  logger.info('MuSE_sump_dir=%s' % sump_dir)
  step_dir = sump_dir
  muse_sump_output = input_base + '.vcf'
  muse_sump_output_path = os.path.join(sump_dir, muse_sump_output)
  logger.info('muse_sump_output_path=%s' % muse_sump_output_path)
  if pipe_util.already_step(step_dir, sample_base + '_MuSE_sump', logger):
    logger.info('already completed step `MuSE sump` of: %s' % input_name)
  else:
    logger.info('running step `MuSE sump` of the tumor bam: %s' % input_name)
    home_dir = os.path.expanduser('~')
    muse_path = os.path.join(home_dir, 'tools', 'MuSEv1.0rc_submission_c039ffa')
    cmd = [muse_path, 'sump', '-I', muse_call_output_path, '-E', '-O', muse_sump_output_path, '-D', dbsnp_known_snp_sites]
    output = pipe_util.do_command(cmd, logger)
    df = time_util.store_time(uuid, cmd, output, logger)
    df['muse_call_output'] = muse_call_output_path
    df['muse_sump_output'] = muse_sump_output_path
    unique_key_dict = {'uuid': uuid, 'muse_call_output': muse_call_output_path, 'muse_sump_output': muse_sump_output_path}
    table_name = 'time_mem_MuSE_sump_wxs'
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    pipe_util.create_already_step(step_dir, sample_base + '_MuSE_sump', logger)
    logger.info('completed running `MuSE sump` of the tumor bam: %s' % input_name)
  return muse_sump_output_path
Example 11
def bam_to_fastq(uuid, bam_path, engine, logger):
    step_dir = os.path.dirname(bam_path)
    uuid_dir = step_dir

    logger.info('uuid_dir is: %s' % uuid_dir)
    fastq_dir = os.path.join(uuid_dir, 'fastq')
    logger.info('fastq_dir is: %s' % fastq_dir)
    if pipe_util.already_step(fastq_dir, 'fastq', logger):
        logger.info('already completed step `bamtofastq` of: %s' % bam_path)
    else:
        logger.info('running step `bamtofastq` of %s: ' % bam_path)
        os.makedirs(fastq_dir, exist_ok=True)
        tempfq = os.path.join(fastq_dir, 'tempfq')
        cmd = [
            'bamtofastq',
            'S=%s' % uuid + '.fq', 'filename=' + bam_path,
            'outputdir=' + fastq_dir, 'tryoq=1', 'collate=1',
            'outputperreadgroup=1', 'T=' + tempfq,
            'exclude=QCFAIL,SECONDARY,SUPPLEMENTARY'
        ]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = bam_path
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path}
        table_name = 'time_mem_bamtofastq'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine,
                                      logger)
        pipe_util.create_already_step(fastq_dir, 'fastq', logger)
        logger.info('completed running step `bamtofastq` of: %s' % bam_path)
    return
Example 12
def bam_sort(uuid, preharmonized_bam_path, bam_path_list, reference_fasta_path, engine, logger, be_lenient):
    out_bam_path_list = list()
    for input_bam in bam_path_list:
        bam_name = os.path.basename(input_bam)
        bam_base, bam_ext = os.path.splitext(bam_name)
        input_dir = os.path.dirname(input_bam)
        outdir_path = os.path.join(input_dir, 'sorted')
        outbam_path = os.path.join(outdir_path, bam_name)
        tmpfile = os.path.join(outdir_path, 'tmpfile_' + bam_name)
        logger.info('outbam_path=%s' % outbam_path)
        out_bam_path_list.append(outbam_path)
        if pipe_util.already_step(outdir_path, 'picard_sort_' + bam_base, logger):
            logger.info('already completed step `picard sort` of: %s' % bam_name)
        else:
            logger.info('running step `picard sort` of: %s' % bam_name)
            os.makedirs(outdir_path, exist_ok=True)
            home_dir = os.path.expanduser('~')
            cmd = ['java', '-d64', '-jar', os.path.join(home_dir, 'tools/picard-tools/picard.jar'), 'SortSam', 'SORT_ORDER=coordinate', 'INPUT=' + input_bam, 'OUTPUT=' + outbam_path, 'TMP_DIR=' + outdir_path, 'CREATE_INDEX=true', 'REFERENCE_SEQUENCE=' + reference_fasta_path]
            if be_lenient:
                cmd.append('VALIDATION_STRINGENCY=LENIENT')
            output = pipe_util.do_command(cmd, logger)
            df = time_util.store_time(uuid, cmd, output, logger)
            df['bam_path'] = outbam_path
            df['reference_fasta_path'] = reference_fasta_path
            unique_key_dict = {'uuid': uuid, 'bam_path': outbam_path, 'reference_fasta_path': reference_fasta_path}
            table_name = 'time_mem_picard_bamsort'
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(outdir_path, 'picard_sort_' + bam_base, logger)
            logger.info('completed running step `picard sort` of: %s' % bam_name)
    return out_bam_path_list
def PR(uuid, harmonized_IR_bam_path, thread_count, reference_fasta_name, BQSR_table_path, engine, logger):
  PR_dir = os.path.dirname(harmonized_IR_bam_path)
  bam_name = os.path.basename(harmonized_IR_bam_path)
  bam_base, bam_ext = os.path.splitext(bam_name)
  logger.info('PR_dir=%s' % PR_dir)
  step_dir = PR_dir
  out_BQSR_bam = bam_base + '_BQSR' + bam_ext
  BQSR_bam_path = os.path.join(PR_dir, out_BQSR_bam)
  logger.info('BQSR_bam_path=%s' % BQSR_bam_path)
  if pipe_util.already_step(step_dir, bam_name + '_PrintReads', logger):
    logger.info('already completed step `PrintReads` of: %s' % harmonized_IR_bam_path)
  else:
    logger.info('running step `PrintReads` of: %s' % harmonized_IR_bam_path)
    home_dir = os.path.expanduser('~')
    gatk_path = os.path.join(home_dir, 'tools/GenomeAnalysisTK.jar')
    cmd = ['java', '-d64', '-Xmx16G', '-jar', gatk_path, '-nct ' + thread_count, '-T PrintReads', '-R ' + reference_fasta_name, '-I ' + harmonized_IR_bam_path, '-BQSR ' + BQSR_table_path, '-o ' + BQSR_bam_path]
    shell_cmd = ' '.join(cmd)
    output = pipe_util.do_shell_command(shell_cmd, logger)
    df = time_util.store_time(uuid, shell_cmd, output, logger)
    df['BQSR_bam_path'] = BQSR_bam_path
    df['harmonized_IR_bam_path'] = harmonized_IR_bam_path
    df['thread_count'] = thread_count
    table_name = 'time_mem_GATK_PR'
    unique_key_dict = {'uuid': uuid, 'harmonized_IR_bam_path': harmonized_IR_bam_path, 'thread_count': thread_count, 'BQSR_bam_path': BQSR_bam_path}
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    pipe_util.create_already_step(step_dir, bam_name + '_PrintReads', logger)
    logger.info('completed running step `PrintReads` of: %s' % harmonized_IR_bam_path)
  return BQSR_bam_path
Example 14
def do_guess_encoding(uuid, fastq_path, engine, logger):
    fastq_name = os.path.basename(fastq_path)
    fastq_dir = os.path.dirname(fastq_path)
    fastq_base, fastq_ext = os.path.splitext(fastq_name)
    if pipe_util.already_step(fastq_dir, 'guess_' + fastq_base, logger):
        logger.info('already completed step `guess_encoding`: %s' % fastq_path)
    else:
        logger.info('running step `guess encoding` of %s' % fastq_path)
        pipe_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
        guess_path = os.path.join(pipe_dir, 'guess-encoding.py')
        guess_cmd = 'python2 ' + guess_path
        time_cmd = '/usr/bin/time -v ' + guess_cmd + ' -f ' + fastq_path
        proc = subprocess.Popen(time_cmd,
                                shell=True,
                                stderr=subprocess.STDOUT,
                                stdout=subprocess.PIPE)
        output = proc.communicate()[0]
        logger.info('output=%s' % output)
        df = time_util.store_time(uuid, time_cmd, output, logger)
        df['fastq_path'] = fastq_path
        table_name = 'time_mem_guessencoding'
        unique_key_dict = {'uuid': uuid, 'fastq_path': fastq_path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine,
                                      logger)
        logger.info('do_guess_encoding output=%s' % output.decode())
        write_fastq_format(fastq_path, output, logger)
        pipe_util.create_already_step(fastq_dir, 'guess_' + fastq_base, logger)
    return
Example 15
def bam_to_fastq(uuid, bam_path, engine, logger):
    step_dir = os.path.dirname(bam_path)
    uuid_dir = step_dir

    logger.info("uuid_dir is: %s" % uuid_dir)
    fastq_dir = os.path.join(uuid_dir, "fastq")
    logger.info("fastq_dir is: %s" % fastq_dir)
    if pipe_util.already_step(fastq_dir, "fastq", logger):
        logger.info("already completed step `bamtofastq` of: %s" % bam_path)
    else:
        logger.info("running step `bamtofastq` of %s: " % bam_path)
        os.makedirs(fastq_dir, exist_ok=True)
        tempfq = os.path.join(fastq_dir, "tempfq")
        cmd = [
            "bamtofastq",
            "S=%s" % uuid + ".fq",
            "filename=" + bam_path,
            "outputdir=" + fastq_dir,
            "tryoq=1",
            "collate=1",
            "outputperreadgroup=1",
            "T=" + tempfq,
            "exclude=QCFAIL,SECONDARY,SUPPLEMENTARY",
        ]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df["bam_path"] = bam_path
        unique_key_dict = {"uuid": uuid, "bam_path": bam_path}
        table_name = "time_mem_bamtofastq"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(fastq_dir, "fastq", logger)
        logger.info("completed running step `bamtofastq` of: %s" % bam_path)
    return
Example 16
def bam_mark_duplicates(uuid,bam_path,thread_count,engine,logger):
    merge_dir=os.path.dirname(bam_path)
    merge_parent_dir=os.path.dirname(merge_dir)
    md_dir=os.path.join(merge_parent_dir,'md')
    os.makedirs(md_dir,exist_ok=True)
    logger.info('md_dir=%s' % md_dir)
    step_dir=md_dir
    outbam=os.path.basename(bam_path)
    outbam_path=os.path.join(md_dir,outbam)
    logger.info('outbam_path=%s' % outbam_path)
    if pipe_util.already_step(step_dir,'markduplicates',logger):
        logger.info('already completed step `markduplicates` of: %s' % bam_path)
    else:
        logger.info('running step `markduplicates` of: %s' % bam_path)
        tmpfile=os.path.join(md_dir,'tmpfile_md')
        cmd=['bammarkduplicates2','markthreads='+thread_count,'rmdup=0','md5=1','index=1','level=-1','tmpfile='+tmpfile,'I='+bam_path,'O='+outbam_path]
        output=pipe_util.do_command(cmd,logger)

        #store time/mem to db
        df=time_util.store_time(uuid,cmd,output,logger)
        df['bam_path']=bam_path
        df['thread_count']=thread_count
        unique_key_dict={'uuid':uuid,'bam_path':bam_path,'thread_count':thread_count}
        table_name='time_mem_bammarkduplicates2'
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        pipe_util.create_already_step(step_dir,'markduplicates',logger)
        logger.info('completed running step `markduplicates` of: %s' % bam_path)
    return outbam_path
Example 17
def bam_merge(uuid,bam_path,bam_path_list,engine,logger):
    uuid_dir=os.path.dirname(bam_path)
    sort_dir=os.path.dirname(bam_path_list[0])
    sort_parent_dir=os.path.dirname(sort_dir)
    merge_dir=os.path.join(sort_parent_dir,'merge')
    os.makedirs(merge_dir,exist_ok=True)
    step_dir=merge_dir
    outbam=os.path.basename(bam_path)
    outbam_path=os.path.join(merge_dir,outbam)
    logger.info('bam_path_list=%s' % bam_path_list)
    if pipe_util.already_step(step_dir,'merge',logger):
        logger.info('already completed step `merge` of: %s' % bam_path)
    else:
        logger.info('running step `merge` of: %s' % bam_path)
        tmpfile=os.path.join(merge_dir,'tmpfile')
        cmd=['bammerge','SO=coordinate','level=-1','tmpfile='+tmpfile,'index=1']
        for input_bam in bam_path_list:
            input_string='I='+input_bam
            cmd.append(input_string)
        output=pipe_util.do_stdout_command(cmd,logger,stdout=outbam_path)

        #save time/mem to db
        df=time_util.store_time(uuid,cmd,output,logger)
        df['bam_path']=bam_path
        unique_key_dict={'uuid':uuid,'bam_path':bam_path}
        table_name='time_mem_bammerge'
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        pipe_util.create_already_step(step_dir,'merge',logger)
        logger.info('completed running step `merge` of: %s' % bam_path)
    return outbam_path
Example 18
def bwa_aln_paired(bam_path,fastq_dir,read1,read2,realn_dir,reference_fasta_path,logger):
    pe_realn_dir=os.path.join(realn_dir,'bwa_aln_pe')
    logger.info('pe_realn_dir=%s' % pe_realn_dir)
    logger.info('read1=%s' % read1)
    logger.info('read2=%s' % read2)
    fastqbasename=read1.replace('_1.fq','')
    logger.info('fastqbasename=%s' % fastqbasename)
    outbam=os.path.basename(fastqbasename+'.bam')
    outbam_path=os.path.join(pe_realn_dir,outbam)
    if pipe_util.already_step(pe_realn_dir,'pe_'+fastqbasename,logger):
        logger.info('already completed step `bwa aln paired` of: %s' % bam_path)
    else:
        os.makedirs(pe_realn_dir,exist_ok=True)
        f1_path=os.path.join(fastq_dir,read1)
        f2_path=os.path.join(fastq_dir,read2)
        sai1=fastqbasename+'_1.sai'
        sai2=fastqbasename+'_2.sai'
        sai1_path=os.path.join(pe_realn_dir,sai1)
        sai2_path=os.path.join(pe_realn_dir,sai2)
        bwa_aln_cmd1=['bwa','aln','-t 24',reference_fasta_path,f1_path]
        bwa_aln_cmd2=['bwa','aln','-t 24',reference_fasta_path,f2_path]
        sai1_open=open(sai1_path,'wb')
        pipe_util.do_command(bwa_aln_cmd1,logger,stdout=sai1_open,stderr=subprocess.PIPE)
        sai1_open.close()
        sai2_open=open(sai2_path,'wb')
        pipe_util.do_command(bwa_aln_cmd2,logger,stdout=sai2_open,stderr=subprocess.PIPE)
        sai2_open.close()
        bwa_aln_sampe_cmd=['bwa','sampe','-a 500',reference_fasta_path,sai1_path,sai2_path,f1_path,f2_path]
        samtools_cmd=['samtools','view','-Shb','-o',outbam_path,'-']
        cmdlist=list()
        cmdlist.append(bwa_aln_sampe_cmd)
        cmdlist.append(samtools_cmd)
        pipe_util.do_piped_commands(cmdlist,logger)
        pipe_util.create_already_step(pe_realn_dir,'pe_'+fastqbasename,logger)
    return outbam_path
Example 19
def run_hc(uuid,bam_path,reference_fasta_path,scratch_dir,engine,thread_count,logger):
    vcf_dir=os.path.join(scratch_dir,uuid,'hc')
    os.makedirs(vcf_dir,exist_ok=True)
    logger.info('hc vcf_dir=%s' % vcf_dir)
    bamname=os.path.basename(bam_path)
    bambase,bamext=os.path.splitext(bamname)
    outvcf=bambase+'.vcf'
    vcf_path=os.path.join(vcf_dir,outvcf)
    logger.info('vcf_path=%s' % vcf_path)
    home_dir=os.path.expanduser('~')
    if pipe_util.already_step(vcf_dir,'hc_'+bambase,logger):
        logger.info('already completed step `HaplotypeCaller` of: %s' % bam_path)
    else:
        #do work
        gatk_path=os.path.join(home_dir,'bin','GenomeAnalysisTK.jar')
        tmp_dir=os.path.join(scratch_dir,'tmp')
        shellcmd='java -d64 -Djava.io.tmpdir='+tmp_dir+' -jar '+gatk_path+' --analysis_type HaplotypeCaller --generate_md5 -nct '+thread_count+' --output_mode EMIT_VARIANTS_ONLY --input_file ' + bam_path + ' --reference_sequence ' + reference_fasta_path+' --out '+vcf_path
        #+' -L "1:500000-900000"'
        logger.info('shellcmd=%s' % shellcmd)
        cmd=shlex.split(shellcmd)
        logger.info('cmd=%s' % cmd)
        output=pipe_util.do_command(cmd,logger)
        #store timing/mem results in db. uuid+vcf_path are unique key
        df=time_util.store_time(uuid,cmd,output,logger)
        df['vcf_path']=vcf_path
        logger.info('df=%s' % df)
        table_name='time_mem_gatk_hc' #variable, consider making a parameter
        unique_key_dict={'uuid':uuid,'vcf_path':vcf_path}
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        
        #done flag
        pipe_util.create_already_step(vcf_dir,'hc_'+bambase,logger)
    return
Example 20
def do_picard_collectwgsmetrics(uuid,bam_path,reference_fasta_path,engine,logger):
    step_dir=os.path.dirname(bam_path)
    bam_name=os.path.basename(bam_path)
    bam_base,bam_ext=os.path.splitext(bam_name)
    home_dir=os.path.expanduser('~')
    picard_dir=os.path.join(home_dir,'tools','picard-tools')
    stats_outfile='picard_collectwgsmetrics_'+bam_base+'.txt'
    stats_path=os.path.join(step_dir,stats_outfile)

    if pipe_util.already_step(step_dir,'picard_collectwgsmetrics',logger):
        logger.info('already completed step `picard_collectwgsmetrics` of: %s' % bam_path)
    else:
        logger.info('running step `picard_collectwgsmetrics` of: %s' % bam_path)
        cmd=['java','-d64','-jar',os.path.join(picard_dir,'picard.jar'),'CollectWgsMetrics','INPUT='+bam_path,'OUTPUT='+stats_path,'REFERENCE_SEQUENCE='+reference_fasta_path,'INCLUDE_BQ_HISTOGRAM=true','VALIDATION_STRINGENCY=LENIENT']
        picard_cwgsm_output=pipe_util.do_command(cmd,logger)

        
        #with open(stats_path,'w') as stats_path_open:
        #    for aline in stats_output.decode().format():
        #        stats_path_open.write(aline)

        #save time/mem to db
        df=time_util.store_time(uuid,cmd,picard_cwgsm_output,logger)
        df['bam_path']=bam_path
        df['reference_fasta_path']=reference_fasta_path
        unique_key_dict={'uuid':uuid,'bam_path':bam_path}
        table_name='time_mem_picard_cwgsm'
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        pipe_util.create_already_step(step_dir,'picard_collectwgsmetrics',logger)
        logger.info('completed running step `picard_collectwgsmetrics` of: %s' % bam_path)


    #save stats to db
    if pipe_util.already_step(step_dir,'picard_collectwgsmetrics_db',logger):
        logger.info('already stored `picard collectwgsmetrics` of %s to db' % bam_path)
    else:
        data_dict=picard_wgs_to_dict(uuid,bam_path,stats_path,logger)
        data_dict['uuid']=[uuid]
        data_dict['bam_path']=bam_path
        data_dict['reference_fasta_path']=reference_fasta_path
        df=pd.DataFrame(data_dict)
        table_name='picard_collectwgsmetrics'
        unique_key_dict={'uuid':uuid,'bam_path':bam_path,'reference_fasta_path':reference_fasta_path}
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        pipe_util.create_already_step(step_dir,'picard_collectwgsmetrics_db',logger)
        logger.info('completed storing `picard collectwgsmetrics` to db')
    return
Example 21
def do_samtools_stats(uuid,bam_path,reference_fasta_path,engine,logger):
    step_dir=os.path.dirname(bam_path)
    bam_name=os.path.basename(bam_path)
    bam_base,bam_ext=os.path.splitext(bam_name)
    stats_outfile='stats_'+bam_base+'.txt'
    stats_path=os.path.join(step_dir,stats_outfile)

    if pipe_util.already_step(step_dir,'samtools_stats',logger):
        logger.info('already completed step `samtools stats` of: %s' % bam_path)
    else:
        logger.info('running step `samtools stats` of: %s' % bam_path)
        cmd=['samtools','stats',bam_path]
        stats_output=pipe_util.do_command(cmd,logger)
        with open(stats_path,'w') as stats_path_open:
            stats_path_open.write(stats_output.decode())

        #save time/mem to db
        df=time_util.store_time(uuid,cmd,stats_output,logger)
        df['bam_path']=bam_path
        df['reference_fasta_path']=reference_fasta_path
        unique_key_dict={'uuid':uuid,'bam_path':bam_path}
        table_name='time_mem_samtools_stats'
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        pipe_util.create_already_step(step_dir,'samtools_stats',logger)
        logger.info('completed running step `samtools stats` of: %s' % bam_path)


        
    #save stats to db
    if pipe_util.already_step(step_dir,'samtools_stats_db',logger):
        logger.info('already stored `samtools stats` of %s to db' % bam_path)
    else:
        data_dict=samtools_stats_to_dict(uuid,bam_path,stats_path,logger)
        data_dict['uuid']=[uuid]
        data_dict['bam_path']=bam_path
        data_dict['reference_fasta_path']=reference_fasta_path
        df=pd.DataFrame(data_dict)
        table_name='samtools_stats'
        unique_key_dict={'uuid':uuid,'bam_path':bam_path,'reference_fasta_path':reference_fasta_path}
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        pipe_util.create_already_step(step_dir,'samtools_stats_db',logger)
        logger.info('completed storing `samtools stats` to db')
    return
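samtools_stats_to_dict is not included in this listing. A sketch that pulls the SN (summary numbers) section out of the stats file written above; the key:value layout of SN rows matches documented samtools stats output, but the exact fields kept are an editorial choice:

def samtools_stats_to_dict(uuid, bam_path, stats_path, logger):
    data_dict = dict()
    with open(stats_path, 'r') as stats_open:
        for aline in stats_open:
            if not aline.startswith('SN\t'):
                continue
            # SN lines look like: SN<tab>raw total sequences:<tab>1000
            fields = aline.rstrip('\n').split('\t')
            key = fields[1].rstrip(':').replace(' ', '_')
            data_dict[key] = fields[2]
    logger.info('parsed %d SN fields from %s' % (len(data_dict), stats_path))
    return data_dict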
Example 22
def pull_cgquery_xml_to_file(uuid,outputxml,logger):
    file_dir=os.path.dirname(outputxml)
    if pipe_util.already_step(file_dir,'cgquery_xml',logger):
        logger.info('already completed step `cgquery` of: %s' % uuid)
        return
    else:
        logger.info('running command `cgquery` of: %s' % uuid)
        cmd=['cgquery','-a','analysis_id='+uuid,'-o',outputxml]
        output=pipe_util.do_command(cmd,logger)
        pipe_util.create_already_step(file_dir,'cgquery_xml',logger)
        return
Example 23
def write_readgroups(uuid,bam_path,engine,logger):
    bam_dir=os.path.dirname(bam_path)
    samfile=pysam.AlignmentFile(bam_path,'rb')
    readgroups=samfile.header['RG']
    readgroup_path_dict=dict()
    for readgroup in readgroups:
        rg_id=readgroup['ID']
        outfile=rg_id+'.RG'
        outfile_path=os.path.join(bam_dir,outfile)
        readgroup_path_dict[rg_id]=outfile_path
        if pipe_util.already_step(bam_dir,readgroup['ID']+'_rg_file',logger):
            logger.info('already wrote @RG to: %s' % outfile_path)
        else:
            outfile_open=open(outfile_path,'w')
            outstring='@RG'
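            # a literal backslash-t (not a tab) is written so the resulting
            # @RG string can be passed straight to `bwa ... -R <rg_str>`,
            # which expands \t itself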
            for rg_key in sorted(readgroup.keys()):
                outstring+='\\t'+rg_key+':'+readgroup[rg_key]
            outfile_open.write(outstring)
            outfile_open.close()
            pipe_util.create_already_step(bam_dir,readgroup['ID']+'_rg_file',logger)
    logger.info('readgroup_path_dict=%s' % readgroup_path_dict)

    #store @RG to db
    for readgroup in readgroups:
        if pipe_util.already_step(bam_dir,readgroup['ID']+'_rg_db',logger):
            logger.info('already wrote %s to db' % readgroup['ID'])
        else:
            table_name='readgroups'
            for rg_key in sorted(readgroup.keys()):
                rg_dict=dict()
                rg_dict['uuid']=[uuid]
                rg_dict['ID']=readgroup['ID']
                rg_dict['key']=rg_key
                rg_dict['value']=readgroup[rg_key]
                df=pd.DataFrame(rg_dict)
                unique_key_dict={'uuid':uuid,'ID':readgroup['ID'],'key':rg_key}
                df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
            pipe_util.create_already_step(bam_dir,readgroup['ID']+'_rg_db',logger)
            logger.info('completed storing @RG %s to db' % readgroup['ID'])
    return readgroup_path_dict
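The .RG files written here hold the strings that the rg_str arguments of the bwa_mem_*/bwa_aln_* examples expect. A sketch of reading one back, assuming the one-line format produced above:

def read_rg_str(rg_file_path, logger):
    # the file holds a single line such as @RG\tID:x\tSM:y (with a literal \t)
    with open(rg_file_path, 'r') as rg_open:
        rg_str = rg_open.readline().strip()
    logger.info('rg_str=%s' % rg_str)
    return rg_str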
Example 24
def bwa_mem_single(
    uuid, bam_path, fastq_dir, read1, realn_dir, readkey, reference_fasta_path, rg_str, thread_count, engine, logger
):
    se_realn_dir = os.path.join(realn_dir, "bwa_mem_" + readkey)
    logger.info("se_realn_dir=%s" % se_realn_dir)
    logger.info("read1=%s" % read1)
    fastqbasename = read1.replace("_" + readkey + ".fq", "")
    logger.info("fastqbasename=%s" % fastqbasename)
    outbam = os.path.basename(fastqbasename + ".bam")
    outbam_path = os.path.join(se_realn_dir, outbam)
    if pipe_util.already_step(se_realn_dir, readkey + "_" + fastqbasename, logger):
        logger.info("already completed step `bwa mem single` of: %s" % bam_path)
    else:
        os.makedirs(se_realn_dir, exist_ok=True)
        f1 = os.path.join(fastq_dir, read1)
        ##bwa_cmd='bwa mem -t '+thread_count+' -p -T 0 -R '+rg_str+' '+reference_fasta_path+' '+f1
        ##shlex_bwa_cmd=shlex.split(bwa_cmd)
        bwa_cmd = [
            "bwa",
            "mem",
            "-t " + thread_count,
            "-p",
            "-T 0",
            "-R " + '"' + rg_str + '"',
            reference_fasta_path,
            f1,
        ]
        # samtools_cmd=['samtools','view','-Shb','-o',outbam_path,'-']
        samtools_cmd = "samtools view -Shb -o " + outbam_path + " -"
        shell_bwa_cmd = " ".join(bwa_cmd)
        shell_samtools_cmd = " ".join(samtools_cmd)
        shell_cmd = shell_bwa_cmd + " | " + shell_samtools_cmd
        # shlex_samtools_cmd=shlex.split(samtools_cmd)
        # cmdlist=list()
        # cmdlist.append(shlex_bwa_cmd)
        # cmdlist.append(shlex_samtools_cmd)
        # output=pipe_util.do_piped_commands(cmdlist,logger)
        output = pipe_util.do_shell_command(shell_cmd, logger)
        df = time_util.store_time(uuid, shell_cmd, output, logger)
        df["bam_path"] = outbam_path
        df["reference_fasta_path"] = reference_fasta_path
        df["thread_count"] = thread_count
        unique_key_dict = {
            "uuid": uuid,
            "bam_path": bam_path,
            "reference_fasta_path": reference_fasta_path,
            "thread_count": thread_count,
        }
        table_name = "time_mem_bwa_mem_se"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info("completed running step `bwa mem single` of: %s" % bam_path)
        pipe_util.create_already_step(se_realn_dir, readkey + "_" + fastqbasename, logger)
    return outbam_path
Example 25
def bam_to_fastq(uuid,bam_path,engine,logger):
    uuid_dir=os.path.dirname(bam_path)
    logger.info('uuid_dir is: %s' % uuid_dir)
    fastq_dir=os.path.join(uuid_dir,'fastq')
    logger.info('fastq_dir is: %s' % fastq_dir)
    if pipe_util.already_step(fastq_dir,'fastq',logger):
        logger.info('already completed step `bamtofastq` of: %s' % bam_path)
    else:
        logger.info('running step `bamtofastq` of %s: ' % bam_path)
        os.makedirs(fastq_dir,exist_ok=True)
        tempfq=os.path.join(fastq_dir,'tempfq')
        cmd=['bamtofastq','filename='+bam_path,'outputdir='+fastq_dir,'tryoq=1','collate=1','outputperreadgroup=1','T='+tempfq]
        pipe_util.do_command(cmd,logger)
        pipe_util.create_already_step(fastq_dir,'fastq',logger)
    return
Example 26
def fastqc_to_db(uuid, fq_path, engine, logger):
    fastq_name = os.path.basename(fq_path)
    fastq_dir = os.path.dirname(fq_path)
    fastq_base, fastq_ext = os.path.splitext(fastq_name)
    if fastq_base.endswith('.fq'):
        fastq_base = fastq_base[:-len('.fq')]
    qc_report_dir = os.path.join(fastq_dir, fastq_base + '_fastqc')
    fastqc_data_path = os.path.join(qc_report_dir, 'fastqc_data.txt')
    fastqc_summary_path = os.path.join(qc_report_dir, 'summary.txt')
    if pipe_util.already_step(fastq_dir, 'fastqc_db_' + fastq_base, logger):
        logger.info('already completed step `fastqc db`: %s' % fq_path)
    else:
        logger.info('writing `fastqc db`: %s' % fq_path)
        summary_dict = dict()
        summary_dict['uuid'] = [uuid]
        summary_dict['fastq_name'] = fastq_name
        summary_dict = fastqc_summary_to_dict(summary_dict,
                                              fastqc_summary_path, engine,
                                              logger)
        df = pd.DataFrame(summary_dict)
        table_name = 'fastq_summary'
        unique_key_dict = {'uuid': uuid, 'fastq_name': fastq_name}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine,
                                      logger)

        data_key_list = [
            '>>Basic Statistics', '>>Per base sequence quality',
            '>>Per tile sequence quality', '>>Per sequence quality scores',
            '>>Per base sequence content', '>>Per sequence GC content',
            '>>Per base N content', '>>Sequence Length Distribution',
            '>>Sequence Duplication Levels', '>>Overrepresented sequences',
            '>>Adapter Content', '>>Kmer Content'
        ]
        for data_key in data_key_list:
            df = fastqc_detail_to_df(uuid, fq_path, fastqc_data_path, data_key,
                                     engine, logger)
            if df is None:
                continue
            table_name = 'fastqc_data_' + '_'.join(
                data_key.lstrip('>>').strip().split(' '))
            logger.info('fastqc_to_db() table_name=%s' % table_name)
            unique_key_dict = {'uuid': uuid, 'fastq_path': fq_path}
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name,
                                          engine, logger)

        pipe_util.create_already_step(fastq_dir, 'fastqc_db_' + fastq_base,
                                      logger)
        logger.info('completed writing `fastqc db`: %s' % fq_path)
    return
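fastqc_summary_to_dict is not part of this listing. FastQC's summary.txt is three tab-separated columns (status, module name, fastq file name), so a plausible sketch, matching the call signature above, is:

def fastqc_summary_to_dict(summary_dict, fastqc_summary_path, engine, logger):
    # engine is unused here but kept to match the call site above
    with open(fastqc_summary_path, 'r') as summary_open:
        for aline in summary_open:
            # each line: PASS/WARN/FAIL<tab>module name<tab>fastq file name
            status, module, _ = aline.rstrip('\n').split('\t')
            summary_dict['_'.join(module.split(' '))] = status
    return summary_dict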
Example 27
def bwa_mem_paired(
    uuid, bam_path, fastq_dir, read1, read2, realn_dir, reference_fasta_path, rg_str, thread_count, engine, logger
):
    pe_realn_dir = os.path.join(realn_dir, "bwa_mem_pe")
    logger.info("pe_realn_dir=%s" % pe_realn_dir)
    logger.info("read1=%s" % read1)
    logger.info("read2=%s" % read2)
    fastqbasename = read1.replace("_1.fq", "")
    logger.info("fastqbasename=%s" % fastqbasename)
    outbam = os.path.basename(fastqbasename + ".bam")
    outbam_path = os.path.join(pe_realn_dir, outbam)
    if pipe_util.already_step(pe_realn_dir, "pe_" + fastqbasename, logger):
        logger.info("already completed step `bwa mem paired` of: %s" % bam_path)
    else:
        os.makedirs(pe_realn_dir, exist_ok=True)
        f1 = os.path.join(fastq_dir, read1)
        f2 = os.path.join(fastq_dir, read2)
        ##bwa_cmd='bwa mem -t '+thread_count+' -p -T 0 -R '+rg_str+' '+reference_fasta_path+' '+f1+' '+f2
        ##shlex_bwa_cmd=shlex.split(bwa_cmd)
        # bwa_cmd=['bwa','mem','-t 24','-T 0','-R',rg_str,reference_fasta_path,f1,f2]
        bwa_cmd = ["bwa", "mem", "-t " + thread_count, "-T 0", "-R " + '"' + rg_str + '"', reference_fasta_path, f1, f2]
        samtools_cmd = ["samtools", "view", "-Shb", "-o", outbam_path, "-"]
        shell_bwa_cmd = " ".join(bwa_cmd)
        shell_samtools_cmd = " ".join(samtools_cmd)
        shell_cmd = shell_bwa_cmd + " | " + shell_samtools_cmd

        ##samtools_cmd='samtools view -Shb -o '+outbam_path+' -'
        ##shlex_samtools_cmd=shlex.split(samtools_cmd)
        # cmdlist=list()
        # cmdlist.append(bwa_cmd)
        # cmdlist.append(samtools_cmd)
        # output=pipe_util.do_piped_commands(cmdlist,logger)
        output = pipe_util.do_shell_command(shell_cmd, logger)
        df = time_util.store_time(uuid, shell_cmd, output, logger)
        df["bam_path"] = outbam_path
        df["reference_fasta_path"] = reference_fasta_path
        df["thread_count"] = thread_count
        unique_key_dict = {
            "uuid": uuid,
            "bam_path": bam_path,
            "reference_fasta_path": reference_fasta_path,
            "thread_count": thread_count,
        }
        table_name = "time_mem_bwa_mem_pe"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(pe_realn_dir, "pe_" + fastqbasename, logger)
    return outbam_path
Example 28
def bam_sort(uuid, preharmonized_bam_path, bam_path_list, reference_fasta_path,
             engine, logger, be_lenient):
    out_bam_path_list = list()
    for input_bam in bam_path_list:
        bam_name = os.path.basename(input_bam)
        bam_base, bam_ext = os.path.splitext(bam_name)
        input_dir = os.path.dirname(input_bam)
        outdir_path = os.path.join(input_dir, 'sorted')
        outbam_path = os.path.join(outdir_path, bam_name)
        tmpfile = os.path.join(outdir_path, 'tmpfile_' + bam_name)
        logger.info('outbam_path=%s' % outbam_path)
        out_bam_path_list.append(outbam_path)
        if pipe_util.already_step(outdir_path, 'picard_sort_' + bam_base,
                                  logger):
            logger.info('already completed step `picard sort` of: %s' %
                        bam_name)
        else:
            logger.info('running step `picard sort` of: %s' % bam_name)
            os.makedirs(outdir_path, exist_ok=True)
            home_dir = os.path.expanduser('~')
            cmd = [
                'java', '-d64', '-jar',
                os.path.join(home_dir, 'tools/picard-tools/picard.jar'),
                'SortSam', 'SORT_ORDER=coordinate', 'INPUT=' + input_bam,
                'OUTPUT=' + outbam_path, 'TMP_DIR=' + outdir_path,
                'CREATE_INDEX=true',
                'REFERENCE_SEQUENCE=' + reference_fasta_path
            ]
            if be_lenient:
                cmd.append('VALIDATION_STRINGENCY=LENIENT')
            output = pipe_util.do_command(cmd, logger)
            df = time_util.store_time(uuid, cmd, output, logger)
            df['bam_path'] = outbam_path
            df['reference_fasta_path'] = reference_fasta_path
            unique_key_dict = {
                'uuid': uuid,
                'bam_path': outbam_path,
                'reference_fasta_path': reference_fasta_path
            }
            table_name = 'time_mem_picard_bamsort'
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name,
                                          engine, logger)
            pipe_util.create_already_step(outdir_path,
                                          'picard_sort_' + bam_base, logger)
            logger.info('completed running step `picard sort` of: %s' %
                        bam_name)
    return out_bam_path_list
Example 29
def do_fastqc(uuid, fastq_path, engine, logger):
    fastq_name = os.path.basename(fastq_path)
    fastq_dir = os.path.dirname(fastq_path)
    fastq_base, fastq_ext = os.path.splitext(fastq_name)
    if pipe_util.already_step(fastq_dir, 'fastqc_' + fastq_base, logger):
        logger.info('already completed step `fastqc`: %s' % fastq_path)
    else:
        logger.info('running step `fastqc`: %s' % fastq_path)
        home_dir = os.path.expanduser('~')
        cmd = [os.path.join(home_dir, 'tools/FastQC/fastqc'), '--extract', fastq_path]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['fastq_path'] = fastq_path
        table_name = 'time_mem_fastqc'
        unique_key_dict = {'uuid': uuid, 'fastq_path': fastq_path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(fastq_dir, 'fastqc_' + fastq_base, logger)
    return
Example 30
def bam_merge(uuid, preharmonize_bam_path, bam_path_list, engine, logger,
              be_lenient):
    sorted_bam_dir = os.path.dirname(bam_path_list[0])
    bwa_alignment_dir = os.path.dirname(sorted_bam_dir)
    realn_dir = os.path.dirname(bwa_alignment_dir)
    out_dir = os.path.join(realn_dir, 'merge')
    os.makedirs(out_dir, exist_ok=True)
    step_dir = out_dir
    preharmbam = os.path.basename(preharmonize_bam_path)
    preharmbam_name, preharmbam_ext = os.path.splitext(preharmbam)
    outbam_name = preharmbam_name + '_gdc_realn.bam'
    outbam_path = os.path.join(out_dir, outbam_name)
    logger.info('bam_path_list=%s' % bam_path_list)
    if pipe_util.already_step(step_dir, 'picard_merge', logger):
        logger.info('already completed step `merge` of: %s' % outbam_path)
    else:
        logger.info('running step `picard merge` of: %s' % outbam_path)
        #tmpfile=os.path.join(merge_dir,'tmpfile')
        home_dir = os.path.expanduser('~')
        cmd = [
            'java', '-d64', '-jar',
            os.path.join(home_dir, 'tools/picard-tools/picard.jar'),
            'MergeSamFiles', 'USE_THREADING=true', 'ASSUME_SORTED=true',
            'SORT_ORDER=coordinate', 'OUTPUT=' + outbam_path,
            'TMP_DIR=' + out_dir
        ]
        for input_bam in bam_path_list:
            input_string = 'INPUT=' + input_bam
            cmd.append(input_string)
        if be_lenient:
            cmd.append('VALIDATION_STRINGENCY=LENIENT')
        output = pipe_util.do_command(cmd, logger)

        #save time/mem to db
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = outbam_path
        unique_key_dict = {'uuid': uuid, 'bam_name': outbam_name}
        table_name = 'time_mem_picard_bam_merge'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine,
                                      logger)
        pipe_util.create_already_step(step_dir, 'picard_merge', logger)
        logger.info('completed running step `picard merge` of: %s' %
                    outbam_path)
    return outbam_path
Example 31
def get_fastq_size(fq_path, logger):
    fq_dir = os.path.dirname(fq_path)
    fq_name = os.path.basename(fq_path)
    size_file = os.path.join(fq_dir, fq_name + '.size')
    if pipe_util.already_step(fq_dir, fq_name + 'size', logger):
        with open(size_file, 'r') as size_open:
            size_str = size_open.readline()
        size_value = int(size_str)
        logger.info('already determined size of fq %s: %s' % (fq_name, str(size_value)))
        return size_value
    else:
        logger.info('determining size of fq %s' % fq_name)
        size_value = os.path.getsize(fq_path)
        with open(size_file, 'w') as size_open:
            size_open.write(str(size_value))
        pipe_util.create_already_step(fq_dir, fq_name + 'size', logger)
        logger.info('determined size of fq %s: %s' % (fq_name, str(size_value)))
        return size_value
Example 32
def store_md5_size(uuid,file_path,engine,logger):
    file_dir=os.path.dirname(file_path)
    file_name=os.path.basename(file_path)
    if pipe_util.already_step(file_dir,file_name+'_store_md5_size',logger):
        logger.info('already completed step `store md5_size` of: %s' % file_path)
    else:
        file_md5=get_file_md5(uuid,file_path,engine,logger)
        file_size=get_file_size(uuid,file_path,engine,logger)
        df=pd.DataFrame({'uuid':[uuid],
                         'file_path':file_path,
                         'file_size':file_size,
                         'file_md5':file_md5})
        logger.info('df=%s' % df)
        table_name='file_size_md5'
        unique_key_dict={'uuid':uuid,'file_path':file_path}
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        pipe_util.create_already_step(file_dir,file_name+'_store_md5_size',logger)
    return
def samtools_bam_index(uuid, bam_path, engine, logger):
    bam_file = os.path.basename(bam_path)
    bam_name, bam_ext = os.path.splitext(bam_file)
    out_dir = os.path.dirname(bam_path)
    bai_path = bam_path + '.bai'
    if pipe_util.already_step(out_dir, bam_name + '_index', logger):
        logger.info('already completed step `samtools index` of %s' % bam_path)
    else:
        logger.info('running step `samtools index` of %s' % bam_path)
        cmd = ['samtools', 'index', bam_path]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = bam_path
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path}
        table_name = 'time_mem_samtools_index'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(out_dir, bam_name + '_index', logger)
        logger.info('completed running `samtools index` of %s' % bam_path)
    return bai_path
def samtools_bam_index(uuid, bam_path, engine, logger):
    bam_file = os.path.basename(bam_path)
    bam_name, bam_ext = os.path.splitext(bam_file)
    out_dir = os.path.dirname(bam_path)
    bai_path = bam_path + ".bai"
    if pipe_util.already_step(out_dir, bam_name + "_index", logger):
        logger.info("already completed step `samtools index` of %s" % bam_path)
    else:
        logger.info("running step `samtools index` of %s" % bam_path)
        cmd = ["samtools", "index", bam_path]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df["bam_path"] = bam_path
        unique_key_dict = {"uuid": uuid, "bam_path": bam_path}
        table_name = "time_mem_samtools_index"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info("completed running `samtools index` of %s" % bam_path)
    return bai_path
def samtools_faidx(uuid, reference_fasta_name, engine, logger):
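    """Index reference_fasta_name with `samtools faidx`, record timing stats, and return the .fai path."""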
    ref_file = os.path.basename(reference_fasta_name)
    fai_path = reference_fasta_name + ".fai"
    out_dir = os.path.dirname(reference_fasta_name)
    if pipe_util.already_step(out_dir, ref_file + "_faidx", logger):
        logger.info("already completed step `samtools faidx` of %s" % reference_fasta_name)
    else:
        logger.info("running step `samtools faidx` of %s" % reference_fasta_name)
        cmd = ["samtools", "faidx", reference_fasta_name]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df["reference_fasta"] = reference_fasta_name
        df["fai_path"] = fai_path
        unique_key_dict = {"uuid": uuid, "reference_fasta": reference_fasta_name, "fai_path": fai_path}
        table_name = "time_mem_samtools_faidx"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(out_dir, ref_file + "_faidx", logger)
        logger.info("completed running `samtools faidx` of %s" % reference_fasta_name)
    return fai_path
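# picard_CreateSequenceDictionary() is called by picard_sortvcf below when the reference
# .dict file is missing, but its definition is not part of this listing. A minimal sketch,
# assuming the same already_step/do_command/store_time helpers and the same picard.jar
# location used above; the step name and table name here are assumptions, not from the source.
def picard_CreateSequenceDictionary(uuid, reference_fasta_name, engine, logger):
    ref_dir = os.path.dirname(reference_fasta_name)
    ref_file = os.path.basename(reference_fasta_name)
    ref_base, ref_ext = os.path.splitext(ref_file)
    sd_file_path = os.path.join(ref_dir, ref_base + '.dict')
    if pipe_util.already_step(ref_dir, ref_file + '_dict', logger):
        logger.info('already completed step `CreateSequenceDictionary` of %s' % reference_fasta_name)
    else:
        logger.info('running step `CreateSequenceDictionary` of %s' % reference_fasta_name)
        home_dir = os.path.expanduser('~')
        picard_path = os.path.join(home_dir, 'tools/picard-tools/picard.jar')
        cmd = ['java', '-d64', '-Xmx16G', '-jar', picard_path, 'CreateSequenceDictionary',
               'R=' + reference_fasta_name, 'O=' + sd_file_path]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['reference_fasta'] = reference_fasta_name
        df['sequence_dictionary'] = sd_file_path
        unique_key_dict = {'uuid': uuid, 'reference_fasta': reference_fasta_name}
        table_name = 'time_mem_picard_CreateSequenceDictionary'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(ref_dir, ref_file + '_dict', logger)
        logger.info('completed running step `CreateSequenceDictionary` of %s' % reference_fasta_name)
    return sd_file_path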
def picard_sortvcf(uuid, muse_vcf, reference_fasta_name, engine, logger):
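    """Sort muse_vcf with `Picard SortVcf` against the reference sequence dictionary
    (creating the .dict first if it is missing) and return the sorted VCF path."""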
    sd_dir = os.path.dirname(reference_fasta_name)
    ref_name = os.path.basename(reference_fasta_name)
    ref_base, ref_ext = os.path.splitext(ref_name)
    sd_file = ref_base + ".dict"
    sd_file_path = os.path.join(sd_dir, sd_file)
    if os.path.isfile(sd_file_path):
        logger.info("reference_dict_path=%s" % sd_file_path)
    else:
        sd_file_path = picard_CreateSequenceDictionary(uuid, reference_fasta_name, engine, logger)
        logger.info("reference_dict_path=%s" % sd_file_path)
    srt_dir = os.path.dirname(muse_vcf)
    vcf_name = os.path.basename(muse_vcf)
    vcf_base, vcf_ext = os.path.splitext(vcf_name)
    srt_vcf = vcf_base + ".srt" + vcf_ext
    srt_vcf_path = os.path.join(srt_dir, srt_vcf)
    if pipe_util.already_step(srt_dir, vcf_name + "_sorted", logger):
        logger.info("already completed step `Picard SortVcf` of %s" % muse_vcf)
    else:
        logger.info("running step `Picard SortVcf` of %s" % muse_vcf)
        home_dir = os.path.expanduser("~")
        picard_path = os.path.join(home_dir, "tools/picard-tools/picard.jar")
        cmd = [
            "java",
            "-d64",
            "-Xmx16G",
            "-jar",
            picard_path,
            "SortVcf",
            "I=" + muse_vcf,
            "O=" + srt_vcf_path,
            "SD=" + sd_file_path,
        ]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df["MuSE_VCF"] = muse_vcf
        df["MuSE_sorted_VCF"] = srt_vcf_path
        unique_key_dict = {"uuid": uuid, "MuSE_VCF": muse_vcf, "MuSE_sorted_VCF": srt_vcf_path}
        table_name = "time_mem_picard_SortVcf"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info("completed running step `Picard SortVcf` of %s" % muse_vcf)
        pipe_util.create_already_step(srt_dir, vcf_name + "_sorted", logger)
    return srt_vcf_path
Example no. 37
def bwa_mem_paired(uuid,bam_path,fastq_dir,read1,read2,realn_dir,reference_fasta_path,
                   rg_str,thread_count,engine,logger):
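    """Align a paired-end FASTQ with `bwa mem`, piping the SAM stream into `samtools view`
    to produce a BAM under realn_dir/bwa_mem_pe, and return the output BAM path."""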
    pe_realn_dir=os.path.join(realn_dir,'bwa_mem_pe')
    logger.info('pe_realn_dir=%s' % pe_realn_dir)
    logger.info('read1=%s' % read1)
    logger.info('read2=%s' % read2)
    fastqbasename=read1.replace('_1.fq','')
    logger.info('fastqbasename=%s' % fastqbasename)
    outbam=os.path.basename(fastqbasename+'.bam')
    outbam_path=os.path.join(pe_realn_dir,outbam)
    if pipe_util.already_step(pe_realn_dir,'pe_'+fastqbasename,logger):
        logger.info('already completed step `bwa mem paired` of: %s' % bam_path)
    else:
        os.makedirs(pe_realn_dir,exist_ok=True)
        f1=os.path.join(fastq_dir,read1)
        f2=os.path.join(fastq_dir,read2)
        bwa_cmd=['bwa','mem','-t',str(thread_count),'-T','0','-R','"'+rg_str+'"',reference_fasta_path,f1,f2]
        samtools_cmd=['samtools','view','-Shb','-o',outbam_path,'-']
        shell_bwa_cmd=' '.join(bwa_cmd)
        shell_samtools_cmd=' '.join(samtools_cmd)
        # pipe the SAM stream from bwa directly into samtools so no intermediate SAM file is written
        shell_cmd=shell_bwa_cmd+' | '+shell_samtools_cmd
        output=pipe_util.do_shell_command(shell_cmd,logger)
        df=time_util.store_time(uuid,shell_cmd,output,logger)
        df['bam_path']=outbam_path
        df['reference_fasta_path']=reference_fasta_path
        df['thread_count']=thread_count
        unique_key_dict={'uuid':uuid,'bam_path':outbam_path,'reference_fasta_path':reference_fasta_path,
                         'thread_count':thread_count}
        table_name='time_mem_bwa_mem_pe'
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        pipe_util.create_already_step(pe_realn_dir,'pe_'+fastqbasename,logger)
    return outbam_path
Example no. 38
def do_fastqc(uuid, fastq_path, engine, logger):
    fastq_name = os.path.basename(fastq_path)
    fastq_dir = os.path.dirname(fastq_path)
    fastq_base, fastq_ext = os.path.splitext(fastq_name)
    if pipe_util.already_step(fastq_dir, 'fastqc_' + fastq_base, logger):
        logger.info('already completed step `fastqc`: %s' % fastq_path)
    else:
        logger.info('running step `fastqc`: %s' % fastq_path)
        home_dir = os.path.expanduser('~')
        cmd = [os.path.join(home_dir, 'tools/FastQC/fastqc'), '--extract',
               fastq_path]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['fastq_path'] = fastq_path
        table_name = 'time_mem_fastqc'
        unique_key_dict = {'uuid': uuid, 'fastq_path': fastq_path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine,
                                      logger)
        pipe_util.create_already_step(fastq_dir, 'fastqc_' + fastq_base,
                                      logger)
    return
Example no. 39
def get_fastq_size(fq_path, logger):
    fq_dir = os.path.dirname(fq_path)
    fq_name = os.path.basename(fq_path)
    size_file = os.path.join(fq_dir, fq_name + '.size')
    if pipe_util.already_step(fq_dir, fq_name + 'size', logger):
        with open(size_file, 'r') as size_open:
            size_str = size_open.readline()
        size_value = int(size_str)
        logger.info('already determined size of fq %s: %s' %
                    (fq_name, str(size_value)))
        return size_value
    else:
        logger.info('determining size of fq %s' % fq_name)
        size_value = os.path.getsize(fq_path)
        with open(size_file, 'w') as size_open:
            size_open.write(str(size_value))
        pipe_util.create_already_step(fq_dir, fq_name + 'size', logger)
        logger.info('determined size of fq %s: %s' %
                    (fq_name, str(size_value)))
        return size_value
Example no. 40
def write_readgroups(uuid, bam_path, engine, logger):
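    """Extract the @RG lines from the BAM header, write each readgroup to an <ID>.RG file
    next to the BAM, store the tags in the `readgroups` table, and return a dict mapping
    readgroup ID to its .RG file path."""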
    step_dir = os.path.dirname(bam_path)
    if pipe_util.already_step(step_dir, 'readgroups', logger):
        logger.info('already extracted readgroups from %s' % bam_path)
        readgroup_path_list = glob.glob(os.path.join(step_dir, '*.RG'))
        readgroup_path_dict = dict()
        for readgroup_path in readgroup_path_list:
            readgroup_file = os.path.basename(readgroup_path)
            readgroup = os.path.splitext(readgroup_file)[0]  # strip the '.RG' suffix; rstrip() would also eat trailing 'R'/'G'/'.' characters from the ID
            readgroup_path_dict[readgroup] = readgroup_path
        return readgroup_path_dict
    else:
        logger.info('extracting readgroups from %s' % bam_path)
        bam_dir = os.path.dirname(bam_path)
        samfile = pysam.AlignmentFile(bam_path, 'rb')
        header = samfile.text
        header_list = header.split('\n')
        header_rg_list = [
            header_line for header_line in header_list
            if header_line.startswith('@RG')
        ]
        readgroups = header_rg_list_to_rg_dicts(header_rg_list)

        readgroup_path_dict = dict()
        for readgroup in readgroups:
            rg_id = readgroup['ID']
            outfile = rg_id + '.RG'
            outfile_path = os.path.join(bam_dir, outfile)
            readgroup_path_dict[rg_id] = outfile_path
            if pipe_util.already_step(bam_dir, readgroup['ID'] + '_rg_file',
                                      logger):
                logger.info('already wrote @RG to: %s' % outfile_path)
            else:
                outfile_open = open(outfile_path, 'w')
                outstring = '@RG'
                for rg_key in sorted(readgroup.keys()):
                    outstring += '\\t' + rg_key + ':' + readgroup[rg_key]
                outfile_open.write(outstring)
                outfile_open.close()
                pipe_util.create_already_step(bam_dir,
                                              readgroup['ID'] + '_rg_file',
                                              logger)
        logger.info('readgroup_path_dict=%s' % readgroup_path_dict)
        pipe_util.create_already_step(step_dir, 'readgroups', logger)
        logger.info('completed extracting readgroups from %s' % bam_path)

    # Store @RG to db
    if pipe_util.already_step(step_dir, 'readgroups_db', logger):
        logger.info('already stored readgroups of %s to db' % bam_path)
    else:
        logger.info('storing readgroups of %s to db' % bam_path)
        table_name = 'readgroups'
        for readgroup in readgroups:
            if pipe_util.already_step(bam_dir, readgroup['ID'] + '_rg_db',
                                      logger):
                logger.info('already wrote %s to db' % readgroup['ID'])
            else:
                # store one row per @RG tag: (uuid, ID, key, value)
                for rg_key in sorted(readgroup.keys()):
                    rg_dict = dict()
                    rg_dict['uuid'] = [uuid]
                    rg_dict['ID'] = readgroup['ID']
                    rg_dict['key'] = rg_key
                    rg_dict['value'] = readgroup[rg_key]
                    df = pd.DataFrame(rg_dict)
                    unique_key_dict = {
                        'uuid': uuid,
                        'ID': readgroup['ID'],
                        'key': rg_key
                    }
                    df_util.save_df_to_sqlalchemy(df, unique_key_dict,
                                                  table_name, engine, logger)
                pipe_util.create_already_step(bam_dir,
                                              readgroup['ID'] + '_rg_db',
                                              logger)
                logger.info('completed storing @RG %s to db' % readgroup['ID'])
        pipe_util.create_already_step(step_dir, 'readgroups_db', logger)
    return readgroup_path_dict
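# header_rg_list_to_rg_dicts() is used above but not shown in this listing. A minimal
# sketch, assuming each @RG header line is a tab-delimited series of 'TAG:value' fields:
def header_rg_list_to_rg_dicts(header_rg_list):
    readgroups = list()
    for header_line in header_rg_list:
        rg_dict = dict()
        # skip the leading '@RG' token and keep the TAG:value pairs
        for field in header_line.split('\t')[1:]:
            tag, _, value = field.partition(':')
            rg_dict[tag] = value
        readgroups.append(rg_dict)
    return readgroups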
Example no. 41
def do_samtools_flagstat(uuid, bam_path, reference_fasta_path, engine, logger):
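    """Run `samtools flagstat` on bam_path, save the text report and timing stats, and
    store the parsed counts in the `samtools_flagstat` table."""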
    step_dir = os.path.dirname(bam_path)
    bam_name = os.path.basename(bam_path)
    bam_base, bam_ext = os.path.splitext(bam_name)
    flagstat_outfile = 'samtools_flagstat_' + bam_base + '.txt'
    flagstat_path = os.path.join(step_dir, flagstat_outfile)

    if pipe_util.already_step(step_dir, 'samtools_flagstat_' + bam_base,
                              logger):
        logger.info('already completed step `samtools flagstat` of: %s' %
                    bam_path)
    else:
        logger.info('running step `samtools flagstat` of: %s' % bam_path)
        cmd = ['samtools', 'flagstat', bam_path]
        flagstat_output = pipe_util.do_command(cmd, logger)
        with open(flagstat_path, 'w') as flagstat_path_open:
            flagstat_path_open.write(flagstat_output.decode())
        #save time/mem to db
        df = time_util.store_time(uuid, cmd, flagstat_output, logger)
        df['bam_path'] = bam_path
        df['reference_fasta_path'] = reference_fasta_path
        table_name = 'time_mem_samtools_flagstat'
        unique_key_dict = {
            'uuid': uuid,
            'bam_path': bam_path,
            'reference_fasta_path': reference_fasta_path
        }
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine,
                                      logger)
        pipe_util.create_already_step(step_dir,
                                      'samtools_flagstat_' + bam_base, logger)
        logger.info('completed running step `samtools flagstat` of: %s' %
                    bam_path)

    #save stats to db
    if pipe_util.already_step(step_dir,
                              'samtools_flagstat_' + bam_base + '_db', logger):
        logger.info('already stored `samtools flagstat` of %s to db' %
                    bam_path)
    else:
        data_dict = samtools_flagstat_to_dict(uuid, bam_path, flagstat_path,
                                              logger)
        data_dict['uuid'] = [uuid]
        data_dict['bam_path'] = bam_path
        data_dict['reference_fasta_path'] = reference_fasta_path
        df = pd.DataFrame(data_dict)
        table_name = 'samtools_flagstat'
        unique_key_dict = {
            'uuid': uuid,
            'bam_path': bam_path,
            'reference_fasta_path': reference_fasta_path
        }
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine,
                                      logger)
        pipe_util.create_already_step(step_dir,
                                      'samtools_flagstat_' + bam_base + '_db',
                                      logger)
        logger.info('completed storing `samtools flagstat` of %s to db' %
                    bam_path)
    return
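# samtools_flagstat_to_dict() is referenced above but not part of this listing. A minimal
# sketch, assuming the classic `samtools flagstat` text output in which every line has the
# form '<qc-passed> + <qc-failed> <description>'; the column naming is an assumption, not
# taken from the original source.
def samtools_flagstat_to_dict(uuid, bam_path, flagstat_path, logger):
    data_dict = dict()
    with open(flagstat_path, 'r') as flagstat_open:
        for line in flagstat_open:
            line = line.strip()
            if not line:
                continue
            passed, _, rest = line.partition(' + ')
            failed, _, description = rest.partition(' ')
            # use the description (up to any parenthetical) as the column name
            key = description.split('(')[0].strip().replace(' ', '_')
            data_dict[key] = [int(passed)]
    logger.info('flagstat data_dict=%s' % data_dict)
    return data_dict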
Example no. 42
def bwa_aln_single(uuid, bam_path, fastq_dir, read1, realn_dir, readkey,
                   reference_fasta_path, rg_str, fastq_encoding, engine,
                   logger):
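    """Align a single-end FASTQ with `bwa aln`/`bwa samse` (adding -I for older Illumina
    quality encodings), pipe the SAM into `samtools view`, and return the output BAM path."""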
    se_realn_dir = os.path.join(realn_dir, 'bwa_aln_' + readkey)
    logger.info('se_realn_dir=%s' % se_realn_dir)
    logger.info('read1=%s' % read1)
    fastqbasename = read1.replace('_' + readkey + '.fq', '')
    logger.info('fastqbasename=%s' % fastqbasename)
    outsai = os.path.basename(fastqbasename + '.sai')
    outbam = os.path.basename(fastqbasename + '.bam')
    outsai_path = os.path.join(se_realn_dir, outsai)
    outbam_path = os.path.join(se_realn_dir, outbam)
    read1_name, read1_ext = os.path.splitext(read1)
    sai1_name = read1_name + '.sai'
    sai1_path = os.path.join(se_realn_dir, sai1_name)
    f1 = os.path.join(fastq_dir, read1)
    os.makedirs(se_realn_dir, exist_ok=True)

    # BWA ALN Command
    if pipe_util.already_step(se_realn_dir, readkey + '_sai_' + fastqbasename,
                              logger):
        logger.info('already completed step `bwa aln` of: %s' % read1)
    else:
        aln_frontend = ['bwa', 'aln', reference_fasta_path, f1]

        if fastq_encoding == 'Illumina-1.8' or fastq_encoding == 'Sanger / Illumina 1.9':
            logger.info('%s is fastq_encoding, so use `bwa aln`' %
                        fastq_encoding)
        elif fastq_encoding == 'Illumina-1.3' or fastq_encoding == 'Illumina-1.5' or fastq_encoding == 'Illumina-1.5-HMS':
            logger.info('%s is fastq_encoding, so use `bwa aln -I`' %
                        fastq_encoding)
            aln_frontend.insert(2, '-I')  # place the option before the positional arguments
        else:
            logger.info('unhandled fastq_encoding: %s' % fastq_encoding)
            sys.exit(1)

        aln_backend = [' > ', outsai_path]
        aln_cmd = aln_frontend + aln_backend
        shell_aln_cmd = ' '.join(aln_cmd)
        aln_output = pipe_util.do_shell_command(shell_aln_cmd, logger)
        df = time_util.store_time(uuid, shell_aln_cmd, aln_output, logger)
        df['sai_path'] = outsai_path
        df['reference_fasta_path'] = reference_fasta_path
        # df['thread_count'] = thread_count
        unique_key_dict = {
            'uuid': uuid,
            'sai_path': outsai_path,
            'reference_fasta_path': reference_fasta_path
        }  # 'thread_count': thread_count}
        table_name = 'time_mem_bwa_aln'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine,
                                      logger)
        logger.info('completed running step `bwa single aln` of: %s' %
                    bam_path)
        pipe_util.create_already_step(se_realn_dir,
                                      readkey + '_sai_' + fastqbasename,
                                      logger)

    # BWA SAMSE Command
    if pipe_util.already_step(se_realn_dir,
                              readkey + '_samse_' + fastqbasename, logger):
        logger.info('already completed step `bwa samse` of: %s' % outbam_path)
    else:
        if rg_str is None:
            samse_cmd = [
                'bwa', 'samse', '-n 10', reference_fasta_path, outsai_path, f1
            ]
        else:
            samse_cmd = [
                'bwa', 'samse', '-n 10', reference_fasta_path,
                '-r' + '"' + rg_str + '"', outsai_path, f1
            ]
        samtools_cmd = 'samtools view -Shb -o ' + outbam_path + ' -'
        shell_samse_cmd = ' '.join(samse_cmd)
        shell_cmd = shell_samse_cmd + ' | ' + samtools_cmd
        logger.info('bwa_aln_single() shell_cmd=%s' % shell_cmd)
        samse_output = pipe_util.do_shell_command(shell_cmd, logger)
        df = time_util.store_time(uuid, shell_cmd, samse_output, logger)
        logger.info('bwa_aln_single() df=%s' % df)
        df['bam_path'] = outbam_path
        df['reference_fasta_path'] = reference_fasta_path
        unique_key_dict = {
            'uuid': uuid,
            'bam_path': outbam_path,
            'reference_fasta_path': reference_fasta_path
        }
        table_name = 'time_mem_bwa_samse'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine,
                                      logger)
        logger.info('completed running step `bwa single samse` of: %s' %
                    bam_path)
        pipe_util.create_already_step(se_realn_dir,
                                      readkey + '_samse_' + fastqbasename,
                                      logger)
    return outbam_path