Example #1
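All examples below assume roughly the following module-level imports; pipe_util, time_util, and df_util are project-local helper modules whose APIs are inferred from the calls shown here:

import argparse
import logging
import os
import shlex
import subprocess

import pandas as pd
import sqlalchemy

import df_util
import pipe_util
import time_util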
def bwa_aln_paired(bam_path,fastq_dir,read1,read2,realn_dir,reference_fasta_path,logger):
    pe_realn_dir=os.path.join(realn_dir,'bwa_aln_pe')
    logger.info('pe_realn_dir=%s' % pe_realn_dir)
    logger.info('read1=%s' % read1)
    logger.info('read2=%s' % read2)
    fastqbasename=read1.replace('_1.fq','')
    logger.info('fastqbasename=%s' % fastqbasename)
    outbam=os.path.basename(fastqbasename+'.bam')
    outbam_path=os.path.join(pe_realn_dir,outbam)
    if pipe_util.already_step(pe_realn_dir,'pe_'+fastqbasename,logger):
        logger.info('already completed step `bwa aln paired` of: %s' % bam_path)
    else:
        os.makedirs(pe_realn_dir,exist_ok=True)
        f1_path=os.path.join(fastq_dir,read1)
        f2_path=os.path.join(fastq_dir,read2)
        sai1=fastqbasename+'_1.sai'
        sai2=fastqbasename+'_2.sai'
        sai1_path=os.path.join(pe_realn_dir,sai1)
        sai2_path=os.path.join(pe_realn_dir,sai2)
        bwa_aln_cmd1=['bwa','aln','-t','24',reference_fasta_path,f1_path]
        bwa_aln_cmd2=['bwa','aln','-t','24',reference_fasta_path,f2_path]
        sai1_open=open(sai1_path,'wb')
        pipe_util.do_command(bwa_aln_cmd1,logger,stdout=sai1_open,stderr=subprocess.PIPE)
        sai1_open.close()
        sai2_open=open(sai2_path,'wb')
        pipe_util.do_command(bwa_aln_cmd2,logger,stdout=sai2_open,stderr=subprocess.PIPE)
        sai2_open.close()
        bwa_aln_sampe_cmd=['bwa','sampe','-a','500',reference_fasta_path,sai1_path,sai2_path,f1_path,f2_path]
        samtools_cmd=['samtools','view','-Shb','-o',outbam_path,'-']
        cmdlist=list()
        cmdlist.append(bwa_aln_sampe_cmd)
        cmdlist.append(samtools_cmd)
        pipe_util.do_piped_commands(cmdlist,logger)
        pipe_util.create_already_step(pe_realn_dir,'pe_'+fastqbasename,logger)
    return outbam_path
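A hypothetical invocation of the function above (all paths and file names are illustrative):

logger = logging.getLogger('realn')
outbam_path = bwa_aln_paired('/data/sample.bam', '/data/fastq', 'sampleX_1.fq', 'sampleX_2.fq',
                             '/data/realn', '/ref/GRCh38.fa', logger)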
Example #2
def bam_to_fastq(uuid,bam_path,engine,logger):
    uuid_dir=os.path.dirname(bam_path)
    logger.info('uuid_dir is: %s' % uuid_dir)
    fastq_dir=os.path.join(uuid_dir,'fastq')
    logger.info('fastq_dir is: %s' % fastq_dir)
    if pipe_util.already_step(fastq_dir,'fastq',logger):
        logger.info('already completed step `bamtofastq` of: %s' % bam_path)
    else:
        logger.info('running step `bamtofastq` of: %s' % bam_path)
        os.makedirs(fastq_dir,exist_ok=True)
        tempfq=os.path.join(fastq_dir,'tempfq')
        cmd=['bamtofastq','filename='+bam_path,'outputdir='+fastq_dir,'tryoq=1','collate=1','outputperreadgroup=1','T='+tempfq]
        pipe_util.do_command(cmd,logger)
        pipe_util.create_already_step(fastq_dir,'fastq',logger)
    return
Example #3
def picard_CreateSequenceDictionary(uuid, reference_fasta_name, engine, logger):
    sd_dir = os.path.dirname(reference_fasta_name)
    ref_name = os.path.basename(reference_fasta_name)
    ref_base, ref_ext = os.path.splitext(ref_name)
    sd_file = ref_base + ".dict"
    sd_file_path = os.path.join(sd_dir, sd_file)
    if pipe_util.already_step(sd_dir, ref_name + "_dict", logger):
        logger.info("already completed step `Picard CreateSequenceDictionary` of %s" % reference_fasta_name)
    else:
        logger.info("running step `Picard CreateSequenceDictionary` of %s" % reference_fasta_name)
        home_dir = os.path.expanduser("~")
        picard_path = os.path.join(home_dir, "tools/picard-tools/picard.jar")
        cmd = [
            "java",
            "-d64",
            "-Xmx16G",
            "-jar",
            picard_path,
            "CreateSequenceDictionary",
            "R=" + reference_fasta_name,
            "O=" + sd_file_path,
        ]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df["reference_fasta"] = reference_fasta_name
        df["sequence_dictionary"] = sd_file_path
        unique_key_dict = {"uuid": uuid, "reference_fasta": reference_fasta_name, "sequence_dictionary": sd_file_path}
        table_name = "time_mem_picard_CreateSequenceDictionary"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info("completed running step `Picard CreateSequenceDictionary` of %s" % reference_fasta_name)
        pipe_util.create_already_step(sd_dir, ref_name + "_dict", logger)
    return sd_file_path
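The returned sd_file_path is the Picard sequence dictionary that picard_sortvcf (Example #23 below) regenerates via this function when the .dict file is missing.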
Example #4
def bam_to_fastq(uuid, bam_path, engine, logger):
    step_dir = os.path.dirname(bam_path)
    uuid_dir = step_dir

    logger.info('uuid_dir is: %s' % uuid_dir)
    fastq_dir = os.path.join(uuid_dir, 'fastq')
    logger.info('fastq_dir is: %s' % fastq_dir)
    if pipe_util.already_step(fastq_dir, 'fastq', logger):
        logger.info('already completed step `bamtofastq` of: %s' % bam_path)
    else:
        logger.info('running step `bamtofastq` of: %s' % bam_path)
        os.makedirs(fastq_dir, exist_ok=True)
        tempfq = os.path.join(fastq_dir, 'tempfq')
        cmd = [
            'bamtofastq',
            'S=%s' % uuid + '.fq', 'filename=' + bam_path,
            'outputdir=' + fastq_dir, 'tryoq=1', 'collate=1',
            'outputperreadgroup=1', 'T=' + tempfq,
            'exclude=QCFAIL,SECONDARY,SUPPLEMENTARY'
        ]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = bam_path
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path}
        table_name = 'time_mem_bamtofastq'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine,
                                      logger)
        pipe_util.create_already_step(fastq_dir, 'fastq', logger)
        logger.info('completed running step `bamtofastq` of: %s' % bam_path)
    return
Example #5
def bam_mark_duplicates(uuid,bam_path,thread_count,engine,logger):
    merge_dir=os.path.dirname(bam_path)
    merge_parent_dir=os.path.dirname(merge_dir)
    md_dir=os.path.join(merge_parent_dir,'md')
    os.makedirs(md_dir,exist_ok=True)
    logger.info('md_dir=%s' % md_dir)
    step_dir=md_dir
    outbam=os.path.basename(bam_path)
    outbam_path=os.path.join(md_dir,outbam)
    logger.info('outbam_path=%s' % outbam_path)
    if pipe_util.already_step(step_dir,'markduplicates',logger):
        logger.info('already completed step `markduplicates` of: %s' % bam_path)
    else:
        logger.info('running step `markduplicates` of: %s' % bam_path)
        tmpfile=os.path.join(md_dir,'tmpfile_md')
        cmd=['bammarkduplicates2','markthreads='+thread_count,'rmdup=0','md5=1','index=1','level=-1','tmpfile='+tmpfile,'I='+bam_path,'O='+outbam_path]
        output=pipe_util.do_command(cmd,logger)

        #store time/mem to db
        df=time_util.store_time(uuid,cmd,output,logger)
        df['bam_path']=bam_path
        df['thread_count']=thread_count
        unique_key_dict={'uuid':uuid,'bam_path':bam_path,'thread_count':thread_count}
        table_name='time_mem_bammarkduplicates2'
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        pipe_util.create_already_step(step_dir,'markduplicates',logger)
        logger.info('completed running step `markduplicates` of: %s' % bam_path)
    return outbam_path
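Note that thread_count is concatenated into the markthreads argument, so callers must pass it as a string; a hypothetical call:

outbam_path = bam_mark_duplicates(uuid, '/data/merge/sample.bam', str(8), engine, logger)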
Example #6
def bam_to_fastq(uuid, bam_path, engine, logger):
    step_dir = os.path.dirname(bam_path)
    uuid_dir = step_dir

    logger.info("uuid_dir is: %s" % uuid_dir)
    fastq_dir = os.path.join(uuid_dir, "fastq")
    logger.info("fastq_dir is: %s" % fastq_dir)
    if pipe_util.already_step(fastq_dir, "fastq", logger):
        logger.info("already completed step `bamtofastq` of: %s" % bam_path)
    else:
        logger.info("running step `bamtofastq` of %s: " % bam_path)
        os.makedirs(fastq_dir, exist_ok=True)
        tempfq = os.path.join(fastq_dir, "tempfq")
        cmd = [
            "bamtofastq",
            "S=%s" % uuid + ".fq",
            "filename=" + bam_path,
            "outputdir=" + fastq_dir,
            "tryoq=1",
            "collate=1",
            "outputperreadgroup=1",
            "T=" + tempfq,
            "exclude=QCFAIL,SECONDARY,SUPPLEMENTARY",
        ]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df["bam_path"] = bam_path
        unique_key_dict = {"uuid": uuid, "bam_path": bam_path}
        table_name = "time_mem_bamtofastq"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(fastq_dir, "fastq", logger)
        logger.info("completed running step `bamtofastq` of: %s" % bam_path)
    return
Example #7
def get_file_md5(uuid,file_path,engine,logger):
    file_dir=os.path.dirname(file_path)
    file_name=os.path.basename(file_path)
    file_shortname,file_ext=os.path.splitext(file_name)
    file_md5_name=file_name+'.md5'
    file_md5_path=os.path.join(file_dir,file_md5_name)
    if pipe_util.already_step(file_dir,file_name+'_md5sum',logger):
        logger.info('already completed step `md5sum` of: %s' % file_path)
        with open(file_md5_path,'r') as file_md5_path_open:
            file_md5=file_md5_path_open.readline().strip()
            return file_md5
    else:
        cmd=['md5sum',file_path]
        output=pipe_util.do_command(cmd,logger)
        file_md5=output.split()[0].decode()
        file_md5_path_open=open(file_md5_path,'w')
        file_md5_path_open.write(file_md5)
        file_md5_path_open.close()
        df=time_util.store_time(uuid,cmd,output,logger)
        df['file_path']=file_path
        logger.info('df=%s' % df)
        unique_key_dict={'uuid':uuid,'file_path':file_path}
        table_name='time_mem_md5'
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        pipe_util.create_already_step(file_dir,file_name+'_md5sum',logger)
        return file_md5
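A pure-Python alternative to shelling out to md5sum, using only the standard library (a sketch, not part of the original pipeline):

import hashlib

def file_md5_hashlib(file_path):
    md5 = hashlib.md5()
    with open(file_path, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):  # 1 MiB chunks keep memory use flat
            md5.update(chunk)
    return md5.hexdigest()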
Example #8
def run_hc(uuid,bam_path,reference_fasta_path,scratch_dir,engine,thread_count,logger):
    vcf_dir=os.path.join(scratch_dir,uuid,'hc')
    os.makedirs(vcf_dir,exist_ok=True)
    logger.info('hc vcf_dir=%s' % vcf_dir)
    bamname=os.path.basename(bam_path)
    bambase,bamext=os.path.splitext(bamname)
    outvcf=bambase+'.vcf'
    vcf_path=os.path.join(vcf_dir,outvcf)
    logger.info('vcf_path=%s' % vcf_path)
    home_dir=os.path.expanduser('~')
    if pipe_util.already_step(vcf_dir,'hc_'+bambase,logger):
        logger.info('already completed step `HaplotypeCaller` of: %s' % bam_path)
    else:
        #do work
        gatk_path=os.path.join(home_dir,'bin','GenomeAnalysisTK.jar')
        tmp_dir=os.path.join(scratch_dir,'tmp')
        shellcmd='java -d64 -Djava.io.tmpdir='+tmp_dir+' -jar '+gatk_path+' --analysis_type HaplotypeCaller --generate_md5 -nct '+thread_count+' --output_mode EMIT_VARIANTS_ONLY --input_file ' + bam_path + ' --reference_sequence ' + reference_fasta_path+' --out '+vcf_path
        #+' -L "1:500000-900000"'
        logger.info('shellcmd=%s' % shellcmd)
        cmd=shlex.split(shellcmd)
        logger.info('cmd=%s' % cmd)
        output=pipe_util.do_command(cmd,logger)
        #store timing/mem results in db. uuid+vcf_path are unique key
        df=time_util.store_time(uuid,cmd,output,logger)
        df['vcf_path']=vcf_path
        logger.info('df=%s' % df)
        table_name='time_mem_gatk_hc' #variable, consider making a parameter
        unique_key_dict={'uuid':uuid,'vcf_path':vcf_path}
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        
        #done flag
        pipe_util.create_already_step(vcf_dir,'hc_'+bambase,logger)
    return
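The shell string above could also be built directly as an argument list (a sketch passing the same flags), which avoids the shlex round-trip:

cmd = ['java', '-d64', '-Djava.io.tmpdir=' + tmp_dir, '-jar', gatk_path,
       '--analysis_type', 'HaplotypeCaller', '--generate_md5',
       '-nct', thread_count, '--output_mode', 'EMIT_VARIANTS_ONLY',
       '--input_file', bam_path, '--reference_sequence', reference_fasta_path,
       '--out', vcf_path]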
Example #9
def bam_validate(uuid, bam_path, engine, logger):
    step_dir = os.path.dirname(bam_path)
    validate_file = bam_path + '.validate'
    if pipe_util.already_step(step_dir, 'validate', logger):
        logger.info('already completed step `validate` of: %s' % bam_path)
    else:
        logger.info('running step validate of: %s' % bam_path)
        home_dir = os.path.expanduser('~')
        mo = int((2 ** 32) / 2) - 1
        
        cmd = ['java', '-d64', '-jar', os.path.join(home_dir, 'tools/picard-tools/picard.jar'), 'ValidateSamFile', 'MO=' + str(mo), 'INPUT=' + bam_path, 'OUTPUT=' + validate_file]
        output = pipe_util.do_command(cmd, logger, allow_fail=True)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = bam_path
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path}
        table_name = 'time_mem_picard_ValidateSamFile'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info('completed running step validate of: %s' % bam_path)
        pipe_util.create_already_step(step_dir, 'validate', logger)

    if pipe_util.already_step(step_dir, 'validate_db', logger):
        logger.info('already stored `picard validate` to db')
    else:
        logger.info('storing `picard validate` to db')
        store_validate_error(uuid, bam_path, validate_file, engine, logger)
        pipe_util.create_already_step(step_dir, 'validate_db', logger)
        logger.info('completed storing `picard validate` to db')
Example #10
def sump_wxs(uuid, muse_call_output_path, dbsnp_known_snp_sites, engine, logger):
    sump_dir = os.path.dirname(muse_call_output_path)
    input_name = os.path.basename(muse_call_output_path)
    input_base, input_ext = os.path.splitext(input_name)
    sample_base, sample_ext = os.path.splitext(input_base)
    logger.info('MuSE_sump_dir=%s' % sump_dir)
    step_dir = sump_dir
    muse_sump_output = input_base + '.vcf'
    muse_sump_output_path = os.path.join(sump_dir, muse_sump_output)
    logger.info('muse_sump_output_path=%s' % muse_sump_output_path)
    if pipe_util.already_step(step_dir, sample_base + '_MuSE_sump', logger):
        logger.info('already completed step `MuSE sump` of: %s' % input_name)
    else:
        logger.info('running step `MuSE sump` of the tumor bam: %s' % input_name)
        home_dir = os.path.expanduser('~')
        muse_path = os.path.join(home_dir, 'tools', 'MuSEv1.0rc_submission_c039ffa')
        cmd = [muse_path, 'sump', '-I', muse_call_output_path, '-E', '-O', muse_sump_output_path, '-D', dbsnp_known_snp_sites]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['muse_call_output'] = muse_call_output_path
        df['muse_sump_output'] = muse_sump_output_path
        unique_key_dict = {'uuid': uuid, 'muse_call_output': muse_call_output_path, 'muse_sump_output': muse_sump_output_path}
        table_name = 'time_mem_MuSE_sump_wxs'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, sample_base + '_MuSE_sump', logger)
        logger.info('completed running `MuSE sump` of the tumor bam: %s' % input_name)
    return muse_sump_output_path
Example #11
def bam_validate(uuid, bam_path, engine, logger):
    step_dir = os.path.dirname(bam_path)
    bam_name = os.path.basename(bam_path)
    validate_file = bam_path + '.validate'
    if pipe_util.already_step(step_dir, bam_name + '_validate', logger):
        logger.info('already completed step `validate` of: %s' % bam_path)
    else:
        logger.info('running step validate of: %s' % bam_path)
        home_dir = os.path.expanduser('~')
        mo = int((2 ** 32) / 2) - 1
        cmd = ['java', '-d64', '-Xmx16G', '-jar', os.path.join(home_dir, 'tools/picard-tools/picard.jar'), 'ValidateSamFile', 'MO=' + str(mo), 'INPUT=' + bam_path, 'OUTPUT=' + validate_file]
        output = pipe_util.do_command(cmd, logger, allow_fail=True)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = bam_path
        df['validate_file'] = validate_file
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path, 'validate_file': validate_file}
        table_name = 'time_mem_picard_ValidateSamFile'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info('completed running step validate of: %s' % bam_path)
        pipe_util.create_already_step(step_dir, bam_name + '_validate', logger)
        logger.info('completed running step `picard validate` of: %s' % bam_path)
    if pipe_util.already_step(step_dir, bam_name + '_validate_db', logger):
        logger.info('already stored `picard validate` to db')
    else:
        logger.info('storing `picard validate` to db')
        store_validate_error(uuid, bam_path, validate_file, engine, logger)
        pipe_util.create_already_step(step_dir, bam_name + '_validate_db', logger)
        logger.info('completed storing `picard validate` to db')
Example #12
def bam_sort(uuid, preharmonized_bam_path, bam_path_list, reference_fasta_path, engine, logger, be_lenient):
    out_bam_path_list = list()
    for input_bam in bam_path_list:
        bam_name = os.path.basename(input_bam)
        bam_base, bam_ext = os.path.splitext(bam_name)
        input_dir = os.path.dirname(input_bam)
        outdir_path = os.path.join(input_dir, 'sorted')
        outbam_path = os.path.join(outdir_path, bam_name)
        tmpfile = os.path.join(outdir_path, 'tmpfile_' + bam_name)
        logger.info('outbam_path=%s' % outbam_path)
        out_bam_path_list.append(outbam_path)
        if pipe_util.already_step(outdir_path, 'picard_sort_' + bam_base, logger):
            logger.info('already completed step `picard sort` of: %s' % bam_name)
        else:
            logger.info('running step `picard sort` of: %s' % bam_name)
            os.makedirs(outdir_path, exist_ok=True)
            home_dir = os.path.expanduser('~')
            cmd = ['java', '-d64', '-jar', os.path.join(home_dir, 'tools/picard-tools/picard.jar'), 'SortSam', 'SORT_ORDER=coordinate', 'INPUT=' + input_bam, 'OUTPUT=' + outbam_path, 'TMP_DIR=' + outdir_path, 'CREATE_INDEX=true', 'REFERENCE_SEQUENCE=' + reference_fasta_path]
            if be_lenient:
                cmd.append('VALIDATION_STRINGENCY=LENIENT')
            output = pipe_util.do_command(cmd, logger)
            df = time_util.store_time(uuid, cmd, output, logger)
            df['bam_path'] = outbam_path
            df['reference_fasta_path'] = reference_fasta_path
            unique_key_dict = {'uuid': uuid, 'bam_path': outbam_path, 'reference_fasta_path': reference_fasta_path}
            table_name = 'time_mem_picard_bamsort'
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
            pipe_util.create_already_step(outdir_path, 'picard_sort_' + bam_base, logger)
            logger.info('completed running step `picard sort` of: %s' % bam_name)
    return out_bam_path_list
Example #13
def get_s3_md5(s3_bucket,s3_object,logger):
    #analysis_id is expected to be a module-level global (see Example #32)
    s3_path=os.path.join('s3://',s3_bucket,analysis_id,s3_object)
    cmd=['s3cmd','info',s3_path]
    output=pipe_util.do_command(cmd,logger)
    for line in output.decode().splitlines(): #output is bytes; decode before matching
        if 'MD5' in line:
            s3_md5=line.split(':')[1].strip()
            return s3_md5
    return None
Example #14
def pull_cgquery_xml_to_file(uuid,outputxml,logger):
    file_dir=os.path.dirname(outputxml)
    if pipe_util.already_step(file_dir,'cgquery_xml',logger):
        logger.info('already completed step `cgquery` of: %s' % uuid)
        return
    else:
        logger.info('running command `cgquery` of: %s' % uuid)
        cmd=['cgquery','-a','analysis_id='+uuid,'-o',outputxml]
        output=pipe_util.do_command(cmd,logger)
        pipe_util.create_already_step(file_dir,'cgquery_xml',logger)
        return
Example #15
def bam_sort(uuid, preharmonized_bam_path, bam_path_list, reference_fasta_path,
             engine, logger, be_lenient):
    out_bam_path_list = list()
    for input_bam in bam_path_list:
        bam_name = os.path.basename(input_bam)
        bam_base, bam_ext = os.path.splitext(bam_name)
        input_dir = os.path.dirname(input_bam)
        outdir_path = os.path.join(input_dir, 'sorted')
        outbam_path = os.path.join(outdir_path, bam_name)
        tmpfile = os.path.join(outdir_path, 'tmpfile_' + bam_name)
        logger.info('outbam_path=%s' % outbam_path)
        out_bam_path_list.append(outbam_path)
        if pipe_util.already_step(outdir_path, 'picard_sort_' + bam_base,
                                  logger):
            logger.info('already completed step `picard sort` of: %s' %
                        bam_name)
        else:
            logger.info('running step `picard sort` of: %s' % bam_name)
            os.makedirs(outdir_path, exist_ok=True)
            home_dir = os.path.expanduser('~')
            cmd = [
                'java', '-d64', '-jar',
                os.path.join(home_dir, 'tools/picard-tools/picard.jar'),
                'SortSam', 'SORT_ORDER=coordinate', 'INPUT=' + input_bam,
                'OUTPUT=' + outbam_path, 'TMP_DIR=' + outdir_path,
                'CREATE_INDEX=true',
                'REFERENCE_SEQUENCE=' + reference_fasta_path
            ]
            if be_lenient:
                cmd.append('VALIDATION_STRINGENCY=LENIENT')
            output = pipe_util.do_command(cmd, logger)
            df = time_util.store_time(uuid, cmd, output, logger)
            df['bam_path'] = outbam_path
            df['reference_fasta_path'] = reference_fasta_path
            unique_key_dict = {
                'uuid': uuid,
                'bam_path': outbam_path,
                'reference_fasta_path': reference_fasta_path
            }
            table_name = 'time_mem_picard_bamsort'
            df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name,
                                          engine, logger)
            pipe_util.create_already_step(outdir_path,
                                          'picard_sort_' + bam_base, logger)
            logger.info('completed running step `picard sort` of: %s' %
                        bam_name)
    return out_bam_path_list
Example #16
def do_picard_collectwgsmetrics(uuid,bam_path,reference_fasta_path,engine,logger):
    step_dir=os.path.dirname(bam_path)
    bam_name=os.path.basename(bam_path)
    bam_base,bam_ext=os.path.splitext(bam_name)
    home_dir=os.path.expanduser('~')
    picard_dir=os.path.join(home_dir,'tools','picard-tools')
    stats_outfile='picard_collectwgsmetrics_'+bam_base+'.txt'
    stats_path=os.path.join(step_dir,stats_outfile)

    if pipe_util.already_step(step_dir,'picard_collectwgsmetrics',logger):
        logger.info('already completed step `picard_collectwgsmetrics` of: %s' % bam_path)
    else:
        logger.info('running step `picard_collectwgsmetrics` of: %s' % bam_path)
        cmd=['java','-d64','-jar',os.path.join(picard_dir,'picard.jar'),'CollectWgsMetrics','INPUT='+bam_path,'OUTPUT='+stats_path,'REFERENCE_SEQUENCE='+reference_fasta_path,'INCLUDE_BQ_HISTOGRAM=true','VALIDATION_STRINGENCY=LENIENT']
        picard_cwgsm_output=pipe_util.do_command(cmd,logger)

        #save time/mem to db
        df=time_util.store_time(uuid,cmd,picard_cwgsm_output,logger)
        df['bam_path']=bam_path
        df['reference_fasta_path']=reference_fasta_path
        unique_key_dict={'uuid':uuid,'bam_path':bam_path}
        table_name='time_mem_picard_cwgsm'
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        pipe_util.create_already_step(step_dir,'picard_collectwgsmetrics',logger)
        logger.info('completed running step `picard_collectwgsmetrics` of: %s' % bam_path)


    #save stats to db
    if pipe_util.already_step(step_dir,'picard_collectwgsmetrics_db',logger):
        logger.info('already stored `picard collectwgsmetrics` of %s to db' % bam_path)
    else:
        data_dict=picard_wgs_to_dict(uuid,bam_path,stats_path,logger)
        data_dict['uuid']=[uuid]
        data_dict['bam_path']=bam_path
        data_dict['reference_fasta_path']=reference_fasta_path
        df=pd.DataFrame(data_dict)
        table_name='picard_collectwgsmetrics'
        unique_key_dict={'uuid':uuid,'bam_path':bam_path,'reference_fasta_path':reference_fasta_path}
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        pipe_util.create_already_step(step_dir,'picard_collectwgsmetrics_db',logger)
        logger.info('completed storing `picard collectwgsmetrics` to db')
    return
Example #17
def do_fastqc(uuid, fastq_path, engine, logger):
    fastq_name = os.path.basename(fastq_path)
    fastq_dir = os.path.dirname(fastq_path)
    fastq_base, fastq_ext = os.path.splitext(fastq_name)
    if pipe_util.already_step(fastq_dir, 'fastqc_' + fastq_base, logger):
        logger.info('already completed step `fastqc`: %s' % fastq_path)
    else:
        logger.info('running step `fastqc`: %s' % fastq_path)
        cmd = ['/home/ubuntu/tools/FastQC/fastqc', '--extract', fastq_path] # fix the path here
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['fastq_path'] = fastq_path
        table_name = 'time_mem_fastqc'
        unique_key_dict = {'uuid': uuid, 'fastq_path': fastq_path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(fastq_dir, 'fastqc_' + fastq_base, logger)
    return
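The hard-coded FastQC path flagged by the `# fix the path here` comment could be derived from the home directory, matching the style of the other examples (a sketch):

home_dir = os.path.expanduser('~')
fastqc_path = os.path.join(home_dir, 'tools', 'FastQC', 'fastqc')
cmd = [fastqc_path, '--extract', fastq_path]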
Example #18
def bam_merge(uuid, preharmonize_bam_path, bam_path_list, engine, logger,
              be_lenient):
    sorted_bam_dir = os.path.dirname(bam_path_list[0])
    bwa_alignment_dir = os.path.dirname(sorted_bam_dir)
    realn_dir = os.path.dirname(bwa_alignment_dir)
    out_dir = os.path.join(realn_dir, 'merge')
    os.makedirs(out_dir, exist_ok=True)
    step_dir = out_dir
    preharmbam = os.path.basename(preharmonize_bam_path)
    preharmbam_name, preharmbam_ext = os.path.splitext(preharmbam)
    outbam_name = preharmbam_name + '_gdc_realn.bam'
    outbam_path = os.path.join(out_dir, outbam_name)
    logger.info('bam_path_list=%s' % bam_path_list)
    lenient_merge = False
    if pipe_util.already_step(step_dir, 'picard_merge', logger):
        logger.info('already completed step `merge` of: %s' % outbam_path)
    else:
        logger.info('running step `picard merge` of: %s' % outbam_path)
        #tmpfile=os.path.join(merge_dir,'tmpfile')
        home_dir = os.path.expanduser('~')
        cmd = [
            'java', '-d64', '-jar',
            os.path.join(home_dir, 'tools/picard-tools/picard.jar'),
            'MergeSamFiles', 'USE_THREADING=true', 'ASSUME_SORTED=true',
            'SORT_ORDER=coordinate', 'OUTPUT=' + outbam_path,
            'TMP_DIR=' + out_dir
        ]
        for input_bam in bam_path_list:
            input_string = 'INPUT=' + input_bam
            cmd.append(input_string)
        if be_lenient:
            cmd.append('VALIDATION_STRINGENCY=LENIENT')
        output = pipe_util.do_command(cmd, logger)

        #save time/mem to db
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = outbam_path
        unique_key_dict = {'uuid': uuid, 'bam_name': outbam_name}
        table_name = 'time_mem_picard_bam_merge'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine,
                                      logger)
        pipe_util.create_already_step(step_dir, 'picard_merge', logger)
        logger.info('completed running step `picard merge` of: %s' %
                    outbam_path)
    return outbam_path
Example #19
def samtools_bam_index(uuid, bam_path, engine, logger):
    bam_file = os.path.basename(bam_path)
    bam_name, bam_ext = os.path.splitext(bam_file)
    out_dir = os.path.dirname(bam_path)
    bai_path = bam_path + ".bai"
    if pipe_util.already_step(out_dir, bam_name + "_index", logger):
        logger.info("already completed step `samtools index` of %s" % bam_path)
    else:
        logger.info("running step `samtools index` of %s" % bam_path)
        cmd = ["samtools", "index", bam_path]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df["bam_path"] = bam_path
        unique_key_dict = {"uuid": uuid, "bam_path": bam_path}
        table_name = "time_mem_samtools_index"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(out_dir, bam_name + "_index", logger)  # done flag so the already_step guard can fire
        logger.info("completed running `samtools index` of %s" % bam_path)
    return bai_path
Example #20
def samtools_bam_index(uuid, bam_path, engine, logger):
    bam_file = os.path.basename(bam_path)
    bam_name, bam_ext = os.path.splitext(bam_file)
    out_dir = os.path.dirname(bam_path)
    bai_path = bam_path + '.bai'
    if pipe_util.already_step(out_dir, bam_name + '_index', logger):
        logger.info('already completed step `samtools index` of %s' % bam_path)
    else:
        logger.info('running step `samtools index` of %s' % bam_path)
        cmd = ['samtools', 'index', bam_path]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = bam_path
        unique_key_dict = {'uuid': uuid, 'bam_path': bam_path}
        table_name = 'time_mem_samtools_index'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(out_dir, bam_name + '_index', logger)  # done flag so the already_step guard can fire
        logger.info('completed running `samtools index` of %s' % bam_path)
    return bai_path
Example #21
def do_samtools_stats(uuid,bam_path,reference_fasta_path,engine,logger):
    step_dir=os.path.dirname(bam_path)
    bam_name=os.path.basename(bam_path)
    bam_base,bam_ext=os.path.splitext(bam_name)
    stats_outfile='stats_'+bam_base+'.txt'
    stats_path=os.path.join(step_dir,stats_outfile)

    if pipe_util.already_step(step_dir,'samtools_stats',logger):
        logger.info('already completed step `samtools stats` of: %s' % bam_path)
    else:
        logger.info('running step `samtools stats` of: %s' % bam_path)
        cmd=['samtools','stats',bam_path]
        stats_output=pipe_util.do_command(cmd,logger)
        with open(stats_path,'w') as stats_path_open:
            stats_path_open.write(stats_output.decode()) #write the captured stats verbatim

        #save time/mem to db
        df=time_util.store_time(uuid,cmd,stats_output,logger)
        df['bam_path']=bam_path
        df['reference_fasta_path']=reference_fasta_path
        unique_key_dict={'uuid':uuid,'bam_path':bam_path}
        table_name='time_mem_samtools_stats'
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        pipe_util.create_already_step(step_dir,'samtools_stats',logger)
        logger.info('completed running step `samtools stats` of: %s' % bam_path)

    #save stats to db
    if pipe_util.already_step(step_dir,'samtools_stats_db',logger):
        logger.info('already stored `samtools stats` of %s to db' % bam_path)
    else:
        data_dict=samtools_stats_to_dict(uuid,bam_path,stats_path,logger)
        data_dict['uuid']=[uuid]
        data_dict['bam_path']=bam_path
        data_dict['reference_fasta_path']=reference_fasta_path
        df=pd.DataFrame(data_dict)
        table_name='samtools_stats'
        unique_key_dict={'uuid':uuid,'bam_path':bam_path,'reference_fasta_path':reference_fasta_path}
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        pipe_util.create_already_step(step_dir,'samtools_stats_db',logger)
        logger.info('completed storing `samtools stats` to db')
    return
Example #22
def samtools_faidx(uuid, reference_fasta_name, engine, logger):
    ref_file = os.path.basename(reference_fasta_name)
    fai_path = reference_fasta_name + ".fai"
    out_dir = os.path.dirname(reference_fasta_name)
    if pipe_util.already_step(out_dir, ref_file + "_faidx", logger):
        logger.info("already completed step `samtools faidx` of %s" % reference_fasta_name)
    else:
        logger.info("running step `samtools faidx` of %s" % reference_fasta_name)
        cmd = ["samtools", "faidx", reference_fasta_name]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df["reference_fasta"] = reference_fasta_name
        df["fai_path"] = fai_path
        unique_key_dict = {"uuid": uuid, "reference_fasta": reference_fasta_name, "fai_path": fai_path}
        table_name = "time_mem_samtools_faidx"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(out_dir, ref_file + "_faidx", logger)
        logger.info("completed running `samtools faidx` of %s" % reference_fasta_name)
    return fai_path
Example #23
def picard_sortvcf(uuid, muse_vcf, reference_fasta_name, engine, logger):
    sd_dir = os.path.dirname(reference_fasta_name)
    ref_name = os.path.basename(reference_fasta_name)
    ref_base, ref_ext = os.path.splitext(ref_name)
    sd_file = ref_base + ".dict"
    sd_file_path = os.path.join(sd_dir, sd_file)
    if os.path.isfile(sd_file_path):
        logger.info("reference_dict_path=%s" % sd_file_path)
    else:
        sd_file_path = picard_CreateSequenceDictionary(uuid, reference_fasta_name, engine, logger)
        logger.info("reference_dict_path=%s" % sd_file_path)
    srt_dir = os.path.dirname(muse_vcf)
    vcf_name = os.path.basename(muse_vcf)
    vcf_base, vcf_ext = os.path.splitext(vcf_name)
    srt_vcf = vcf_base + ".srt" + vcf_ext
    srt_vcf_path = os.path.join(srt_dir, srt_vcf)
    if pipe_util.already_step(srt_dir, vcf_name + "_sorted", logger):
        logger.info("already completed step `Picard SortVcf` of %s" % muse_vcf)
    else:
        logger.info("running step `Picard SortVcf` of %s" % muse_vcf)
        home_dir = os.path.expanduser("~")
        picard_path = os.path.join(home_dir, "tools/picard-tools/picard.jar")
        cmd = [
            "java",
            "-d64",
            "-Xmx16G",
            "-jar",
            picard_path,
            "SortVcf",
            "I=" + muse_vcf,
            "O=" + srt_vcf_path,
            "SD=" + sd_file_path,
        ]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df["MuSE_VCF"] = muse_vcf
        df["MuSE_sorted_VCF"] = srt_vcf_path
        unique_key_dict = {"uuid": uuid, "MuSE_VCF": muse_vcf, "MuSE_sorted_VCF": srt_vcf_path}
        table_name = "time_mem_picard_SortVcf"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        logger.info("completed running step `Picard SortVcf` of %s" % muse_vcf)
        pipe_util.create_already_step(srt_dir, vcf_name + "_sorted", logger)
    return srt_vcf_path
Example #24
def get_s3_objects(uuid,bucket,name,destination,s3cfg_dir,engine,logger):
    if pipe_util.already_have(destination,name,logger):
        logger.info('already have object(s) %s in %s' % (name,destination))
    else:
        logger.info('downloading object(s) %s to %s' % (name,destination))
        base_name=os.path.splitext(name)[0]
        s3_path=os.path.join('s3://',bucket,base_name)
        home_dir=os.path.expanduser('~')
        s3cmd_path=os.path.join(home_dir,'.local','bin','s3cmd')
        cmd=[s3cmd_path,'-c',os.path.join(s3cfg_dir,'.s3cfg'),'sync',s3_path,destination]
        output=pipe_util.do_command(cmd,logger)
        df=time_util.store_time(uuid,cmd,output,logger)
        df['bucket']=bucket
        df['name']=name
        table_name='time_mem_s3_sync'
        unique_key_dict={'bucket':bucket,'name':name}
        df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
        pipe_util.create_have(destination,name,logger)
        logger.info('finished downloading object(s) %s to %s' % (name,destination))
    return
Example #25
def do_fastqc(uuid, fastq_path, engine, logger):
    fastq_name = os.path.basename(fastq_path)
    fastq_dir = os.path.dirname(fastq_path)
    fastq_base, fastq_ext = os.path.splitext(fastq_name)
    if pipe_util.already_step(fastq_dir, 'fastqc_' + fastq_base, logger):
        logger.info('already completed step `fastqc`: %s' % fastq_path)
    else:
        logger.info('running step `fastqc`: %s' % fastq_path)
        cmd = ['/home/ubuntu/tools/FastQC/fastqc', '--extract',
               fastq_path]  # fix the path here
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df['fastq_path'] = fastq_path
        table_name = 'time_mem_fastqc'
        unique_key_dict = {'uuid': uuid, 'fastq_path': fastq_path}
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine,
                                      logger)
        pipe_util.create_already_step(fastq_dir, 'fastqc_' + fastq_base,
                                      logger)
    return
Example #26
def bam_sort(uuid,bam_path,bam_path_list,reference_fasta_path,thread_count,engine,logger):
    uuid_dir=os.path.dirname(bam_path)
    realn_dir=os.path.join(uuid_dir,'realn')
    out_bam_path_list=list()
    input_thread_count=str(int(int(thread_count)/2))
    output_thread_count=input_thread_count
    logger.info('bamsort input_thread_count=%s' % input_thread_count)
    logger.info('bamsort output_thread_count=%s' % output_thread_count)
    for input_bam in bam_path_list:
        bam_name=os.path.basename(input_bam)
        bam_base,bam_ext=os.path.splitext(bam_name)
        input_dir=os.path.dirname(input_bam)
        #indir_name=input_dir.split('/')[-1]
        #outdir_name=indir_name+'_sorted'
        outdir_path=os.path.join(input_dir,'sorted')
        outbam_path=os.path.join(outdir_path,bam_name)
        tmpfile=os.path.join(outdir_path,'tmpfile_'+bam_name)
        logger.info('outbam_path=%s' % outbam_path)
        out_bam_path_list.append(outbam_path)
        if pipe_util.already_step(outdir_path,'sort_'+bam_base,logger):
            logger.info('already completed step `sort` of: %s' % bam_name)
        else:
            logger.info('running step `sort` of: %s' % bam_name)
            os.makedirs(outdir_path,exist_ok=True)
            cmd=['bamsort','I='+input_bam,'O='+outbam_path,'inputthreads='+input_thread_count,'outputthreads='+output_thread_count,'calmdnm=1','calmdnmreference='+reference_fasta_path,'calmdnmrecompindetonly=1','tmpfile='+tmpfile,'index=1']
            #cmd=['bamsort','I='+input_bam,'O='+outbam_path,'inputthreads='+input_thread_count,'outputthreads='+output_thread_count,'tmpfile='+tmpfile]
            output=pipe_util.do_command(cmd,logger)
            df=time_util.store_time(uuid,cmd,output,logger)
            df['bam_path']=bam_path
            df['reference_fasta_path']=reference_fasta_path
            df['thread_count']=thread_count
            unique_key_dict={'uuid':uuid,'bam_path':bam_path,'reference_fasta_path':reference_fasta_path,
                             'thread_count':thread_count}
            table_name='time_mem_bamsort'
            df_util.save_df_to_sqlalchemy(df,unique_key_dict,table_name,engine,logger)
            pipe_util.create_already_step(outdir_path,'sort_'+bam_base,logger)
            logger.info('completed running step `sort` of: %s' % bam_name)
    return out_bam_path_list
Example #27
def tabix_index(uuid, dbsnp_known_snp_sites, engine, logger):
    dbsnp_file = os.path.basename(dbsnp_known_snp_sites)
    dbsnp_tbi_path = dbsnp_known_snp_sites + ".tbi"
    out_dir = os.path.dirname(dbsnp_known_snp_sites)
    if pipe_util.already_step(out_dir, dbsnp_file + "_tbi", logger):
        logger.info("already completed step `tbi index of dbsnp.vcf` of %s" % dbsnp_known_snp_sites)
    else:
        logger.info("running step `tbi index of dbsnp.vcf` of %s" % dbsnp_known_snp_sites)
        cmd = ["tabix", "-p", "vcf", dbsnp_known_snp_sites]
        output = pipe_util.do_command(cmd, logger)
        df = time_util.store_time(uuid, cmd, output, logger)
        df["dbsnp_known_snp_sites"] = dbsnp_known_snp_sites
        df["dbsnp_tbi_path"] = dbsnp_tbi_path
        unique_key_dict = {
            "uuid": uuid,
            "dbsnp_known_snp_sites": dbsnp_known_snp_sites,
            "dbsnp_tbi_path": dbsnp_tbi_path,
        }
        table_name = "time_mem_tabix_index_dbsnp_bgz"
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(out_dir, dbsnp_file + "_tbi", logger)
        logger.info("completed running `tbi index of dbsnp.vcf` of %s" % dbsnp_known_snp_sites)
    return dbsnp_tbi_path
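Note that `tabix -p vcf` requires bgzip-compressed input; the table name time_mem_tabix_index_dbsnp_bgz suggests dbsnp_known_snp_sites here is a .vcf.gz/.vcf.bgz file.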
Example #28
def bam_merge(uuid, preharmonize_bam_path, bam_path_list, engine, logger, be_lenient):
    sorted_bam_dir = os.path.dirname(bam_path_list[0])
    bwa_alignment_dir = os.path.dirname(sorted_bam_dir)
    realn_dir = os.path.dirname(bwa_alignment_dir)
    out_dir = os.path.join(realn_dir, 'merge')
    os.makedirs(out_dir, exist_ok=True)
    step_dir = out_dir
    preharmbam = os.path.basename(preharmonize_bam_path)
    preharmbam_name, preharmbam_ext = os.path.splitext(preharmbam)
    outbam_name = preharmbam_name + '_gdc_realn.bam'
    outbam_path = os.path.join(out_dir, outbam_name)
    logger.info('bam_path_list=%s' % bam_path_list)
    lenient_merge = False
    if pipe_util.already_step(step_dir, 'picard_merge', logger):
        logger.info('already completed step `merge` of: %s' % outbam_path)
    else:
        logger.info('running step `picard merge` of: %s' % outbam_path)
        #tmpfile=os.path.join(merge_dir,'tmpfile')
        home_dir = os.path.expanduser('~')
        cmd = ['java', '-d64', '-jar', os.path.join(home_dir, 'tools/picard-tools/picard.jar'), 'MergeSamFiles', 'USE_THREADING=true', 'ASSUME_SORTED=true', 'SORT_ORDER=coordinate', 'OUTPUT=' + outbam_path, 'TMP_DIR=' + out_dir]
        for input_bam in bam_path_list:
            input_string = 'INPUT=' + input_bam
            cmd.append(input_string)
        if be_lenient:
            cmd.append('VALIDATION_STRINGENCY=LENIENT')
        output = pipe_util.do_command(cmd, logger)

        #save time/mem to db
        df = time_util.store_time(uuid, cmd, output, logger)
        df['bam_path'] = outbam_path
        unique_key_dict = {'uuid': uuid, 'bam_name': outbam_name}
        table_name = 'time_mem_picard_bam_merge'
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
        pipe_util.create_already_step(step_dir, 'picard_merge', logger)
        logger.info('completed running step `picard merge` of: %s' % outbam_path)
    return outbam_path
Example #29
def main():
    parser = argparse.ArgumentParser("Graph generation", description="Generate graphs for different miRNA stats")

    # Logging flag
    parser.add_argument(
        "-d", "--debug", action="store_const", const=logging.DEBUG, dest="level", help="Enable debug logging."
    )
    parser.set_defaults(level=logging.INFO)

    # Required flags
    parser.add_argument("-s", "--sam_path", required=True, help="Path to SAM file")
    parser.add_argument("-f", "--filtered_taglen", required=True, help="Path to filtered_taglength.csv")
    parser.add_argument("-v", "--softclip_taglen", required=True, help="Path to softclip_taglength.csv")
    parser.add_argument("-a", "--adapter_report", required=True, help="Path to adapter report")
    parser.add_argument("-c", "--chastity_taglen", required=True, help="Path to chastity_taglength.csv")
    parser.add_argument("-l", "--alignment_stats", required=True, help="Path to alignment_stats.csv")
    parser.add_argument("-u", "--uuid", required=True, help="UUID/GDC_ID for the harmonized BAM.")
    parser.add_argument("-r", "--barcode", required=True, help="BAM barcode")

    # Optional DB Flags
    parser.add_argument("-y", "--db_cred_s3url", required=False, help="String s3url of the postgres db_cred file")
    parser.add_argument("-z", "--s3cfg_path", required=False, help="Path to the s3cfg file.")

    args = parser.parse_args()

    sam_path = args.sam_path
    filtered_taglen = args.filtered_taglen
    softclip_taglen = args.softclip_taglen
    adapter_report = args.adapter_report
    chastity_taglen = args.chastity_taglen
    alignment_stats = args.alignment_stats
    uuid = args.uuid
    barcode = args.barcode

    if args.db_cred_s3url:
        db_cred_s3url = args.db_cred_s3url
        s3cfg_path = args.s3cfg_path
    else:
        db_cred_s3url = None

    logger = pipe_util.setup_logging("mir_profiler_graph", args, uuid)

    if db_cred_s3url is not None:
        conn_dict = pipe_util.get_connect_dict(db_cred_s3url, s3cfg_path, logger)
        engine = sqlalchemy.create_engine(sqlalchemy.engine.url.URL(**conn_dict))
    else:  # local SQLite case
        sqlite_name = "mir_profiler_graph" + uuid + ".db"
        engine_path = "sqlite:///" + sqlite_name
        engine = sqlalchemy.create_engine(engine_path, isolation_level="SERIALIZABLE")

    # Generate the graphs for the annotation data
    logger.info("Beginning: Annotation graph generation")
    graph_CMD = [
        "perl",
        "/home/ubuntu/bin/mirna-profiler/v0.2.7/code/library_stats/graph_libs.pl",
        "-s",
        sam_path,
        "-f",
        filtered_taglen,
        "-o",
        softclip_taglen,
        "-a",
        adapter_report,
        "-c",
        chastity_taglen,
        "-t",
        alignment_stats,
    ]
    output = pipe_util.do_command(graph_CMD, logger)
    df = time_util.store_time(uuid, graph_CMD, output, logger)
    df["bam_name"] = barcode
    unique_key_dict = {"uuid": uuid, "bam_name": barcode}
    table_name = "time_mem_mir_graph"
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    # Store time command will go here
    logger.info("Completed: Annotation graph generation")
def main():
    parser = argparse.ArgumentParser('SAM Annotator', description = 'Annotates the SAM files with miRNA hits',)

    # Logging flag
    parser.add_argument('-d', '--debug',
                        action = 'store_const',
                        const = logging.DEBUG,
                        dest = 'level',
                        help = 'Enable debug logging.',
    )
    parser.set_defaults(level = logging.INFO)

    # Required flags
    parser.add_argument('-e', '--species_code',
                        required = True,
                        choices = ['hsa'],
                        help = 'Organism species code.',
    )
    parser.add_argument('-s', '--sam_path',
                        required = True,
                        help = 'Path to directory containing bams.',
    )
    parser.add_argument('-w', '--db_connect',                  
                        required = True,
                        help = 'Path to db_connection file',                  
    )
    parser.add_argument('-u', '--uuid',
                        required = True,
                        help = 'UUID/GDC_ID for the harmonized BAM.',
    )
    parser.add_argument('-r', '--barcode',
                        required = True,
                        help = 'BAM barcode',
    )
    

    # Optional DB Flags
    parser.add_argument('-y', '--db_cred_s3url',
                        required = False,
                        help = 'String s3url of the postgres db_cred file',
    )
    parser.add_argument('-z', '--s3cfg_path',
                        required = False,
                        help = 'Path to the s3cfg file.',
    )
    
    args = parser.parse_args()

    species_code = args.species_code
    sam_path = args.sam_path
    connect_path = args.db_connect
    uuid = args.uuid
    barcode = args.barcode

    if args.db_cred_s3url:
        db_cred_s3url = args.db_cred_s3url
        s3cfg_path = args.s3cfg_path
    else:
        db_cred_s3url = None

    logger = pipe_util.setup_logging('mir_profiler_annotator', args, uuid)

    if db_cred_s3url is not None:
        conn_dict = pipe_util.get_connect_dict(db_cred_s3url, s3cfg_path, logger)
        engine = sqlalchemy.create_engine(sqlalchemy.engine.url.URL(**conn_dict))
    else: # local SQLite case
        sqlite_name = 'mir_profiler_annotator' + uuid + '.db'
        engine_path = 'sqlite:///' + sqlite_name
        engine = sqlalchemy.create_engine(engine_path, isolation_level='SERIALIZABLE')

    # Annotate the SAM files
    logger.info('Beginning: SAM file annotation')
    annotate_CMD = ['perl', '/home/ubuntu/bin/mirna-profiler/v0.2.7/code/annotation/annotate.pl', '-d', connect_path, '-o', species_code, '-s', sam_path]
    output = pipe_util.do_command(annotate_CMD, logger)
    df = time_util.store_time(uuid, annotate_CMD, output, logger)
    df['bam_name'] = barcode
    unique_key_dict = {'uuid': uuid, 'bam_name': barcode}
    table_name = 'time_mem_mir_sam_annotator'
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    logger.info('Completed: SAM file annotation')
Example #31
def main():
    parser = argparse.ArgumentParser('miRNA matrix mimat development', description = 'Mature miRNA gene expression matrix generation',)

    # Logging flag
    parser.add_argument('-d', '--debug',
                        action = 'store_const',
                        const = logging.DEBUG,
                        dest = 'level',
                        help = 'Enable debug logging.',
    )
    parser.set_defaults(level = logging.INFO)

    # Required flags
    parser.add_argument('-w', '--db_connect',
                        required = True,
                        help = 'Name of desired miRbase.',
    )
    parser.add_argument('-e', '--species_code',
                        required = True,
                        choices = ['hsa'],
                        help = 'Organism species code.',
    )
    parser.add_argument('-s', '--sam_path',
                        required = True,
                        help = 'Path to SAM file',
    )
    parser.add_argument('-m', '--mirna_path',
                        required = True,
                        help = 'Path to miRNA.txt file',
    )
    parser.add_argument('-x', '--crossmapped_path',
                        required = True,
                        help = 'Path to crossmapped.txt file',
    )
    parser.add_argument('-u', '--uuid',
                        required = True,
                        help = 'UUID/GDC_ID for the harmonized BAM.',
    )
    parser.add_argument('-r', '--barcode',
                        required = True,
                        help = 'BAM barcode',
    )
    

    # Optional DB Flags
    parser.add_argument('-y', '--db_cred_s3url',
                        required = False,
                        help = 'String s3url of the postgres db_cred file',
    )
    parser.add_argument('-z', '--s3cfg_path',
                        required = False,
                        help = 'Path to the s3cfg file.',
    )

    args = parser.parse_args()

    db_connect = args.db_connect
    species_code = args.species_code
    sam_path = args.sam_path
    mirna_path = args.mirna_path
    crossmapped_path = args.crossmapped_path
    uuid = args.uuid
    barcode = args.barcode

    if args.db_cred_s3url:
        db_cred_s3url = args.db_cred_s3url
        s3cfg_path = args.s3cfg_path
    else:
        db_cred_s3url = None

    logger = pipe_util.setup_logging('mir_profiler_mimat', args, uuid)
    
    if db_cred_s3url is not None:
        conn_dict = pipe_util.get_connect_dict(db_cred_s3url, s3cfg_path, logger)
        engine = sqlalchemy.create_engine(sqlalchemy.engine.url.URL(**conn_dict))
    else: # local SQLite case
        sqlite_name = 'mir_profiler_mimat' + uuid + '.db'
        engine_path = 'sqlite:///' + sqlite_name
        engine = sqlalchemy.create_engine(engine_path, isolation_level='SERIALIZABLE')

    # Get stats from the alignment annotations
    logger.info('Beginning: Mature miRNA gene expression matrix generation')
    mimat_CMD = ['perl', '/home/ubuntu/bin/mirna-profiler/v0.2.7/code/library_stats/expression_matrix_mimat.pl', '-d', db_connect, '-o', species_code, '-s', sam_path, '-r', mirna_path, '-c', crossmapped_path]
    output = pipe_util.do_command(mimat_CMD, logger)
    df = time_util.store_time(uuid, mimat_CMD, output, logger)
    df['bam_name'] = barcode
    unique_key_dict = {'uuid': uuid, 'bam_name': barcode}
    table_name = 'time_mem_mir_expn_mimat'
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    logger.info('Completed: Mature miRNA gene expression matrix generation')
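The engine setup repeated across these main() functions could be factored into a small helper (a sketch reusing the same pipe_util and sqlalchemy calls; make_engine is a hypothetical name):

def make_engine(tool_name, uuid, db_cred_s3url, s3cfg_path, logger):
    # postgres engine from s3-hosted credentials, else a local SQLite file
    if db_cred_s3url is not None:
        conn_dict = pipe_util.get_connect_dict(db_cred_s3url, s3cfg_path, logger)
        return sqlalchemy.create_engine(sqlalchemy.engine.url.URL(**conn_dict))
    sqlite_name = tool_name + uuid + '.db'
    return sqlalchemy.create_engine('sqlite:///' + sqlite_name, isolation_level='SERIALIZABLE')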
Example #32
#module-level globals, populated from command-line arguments parsed elsewhere in this script
bam_analysis_id=bam_analysis_id.strip('/')
scratch_dir=args['scratch_dir']
thread_count=args['thread_count']



def get_s3_objects(uuid,bucket,name,destination,logger):
    #sync_name=name.split('.')[0]#temp hack to get reference.dict needed by GATK UG/HC
    if pipe_util.already_have(destination,name,logger):
        logger.info('already have object(s) %s in %s' % (name,destination))
    else:
        logger.info('downloading object(s) %s to %s' % (name,destination))
        base_name=os.path.splitext(name)[0]
        s3_path=os.path.join('s3://',bucket,base_name)
        cmd=['s3cmd','sync',s3_path,destination]
        output=pipe_util.do_command(cmd,logger)
        pipe_util.create_have(destination,name,logger)
        df=time_util.store_time(uuid,cmd,output,logger)
        


def main():
    ##logging
    uuid=pipe_util.get_uuid_from_path(bam_analysis_id)
    logging.basicConfig(filename='vcf_'+uuid+'.log',level=logging.DEBUG,filemode='a',
                        format='%(asctime)s %(levelname)s %(message)s', datefmt='%Y-%m-%d_%H:%M:%S_%Z')
    logger=logging.getLogger(__name__)

    ##open stats and timing db
    home_dir=os.path.expanduser('~')
    db_path=os.path.join(home_dir,'vcf_pipe.sqlite')
Example #33
def main():
    parser = argparse.ArgumentParser("TCGA", description="TCGA formatted results generation")

    # Logging flag
    parser.add_argument(
        "-d", "--debug", action="store_const", const=logging.DEBUG, dest="level", help="Enable debug logging."
    )
    parser.set_defaults(level=logging.INFO)

    # Required flags
    parser.add_argument("-w", "--db_connect", required=True, help="Name of desired miRbase.")
    parser.add_argument("-g", "--genome_version", required=True, choices=["hg38"], help="Genome Version of Annotation.")
    parser.add_argument("-e", "--species_code", required=True, choices=["hsa"], help="Organism species code.")
    parser.add_argument("-s", "--sam_path", required=True, help="Path to directory containing bams.")
    parser.add_argument("-p", "--mirna_species", required=True, help="Path to mirna_species.txt")
    parser.add_argument("-x", "--crossmapped", required=True, help="Path to crossmapped.txt")
    parser.add_argument("-i", "--isoforms", required=True, help="Path to isoforms.txt")
    parser.add_argument("-u", "--uuid", required=True, help="UUID/GDC_ID for the harmonized BAM.")
    parser.add_argument("-r", "--barcode", required=True, help="BAM barcode")

    # Optional DB Flags
    parser.add_argument("-y", "--db_cred_s3url", required=False, help="String s3url of the postgres db_cred file")
    parser.add_argument("-z", "--s3cfg_path", required=False, help="Path to the s3cfg file.")

    args = parser.parse_args()

    connect_path = args.db_connect
    genome_version = args.genome_version
    species_code = args.species_code
    sam_path = args.sam_path
    mirna_species = args.mirna_species
    crossmapped = args.crossmapped
    isoforms = args.isoforms
    uuid = args.uuid
    barcode = args.barcode

    if args.db_cred_s3url:
        db_cred_s3url = args.db_cred_s3url
        s3cfg_path = args.s3cfg_path
    else:
        db_cred_s3url = None

    logger = pipe_util.setup_logging("mir_profiler_tcga", args, uuid)

    if db_cred_s3url is not None:
        conn_dict = pipe_util.get_connect_dict(db_cred_s3url, s3cfg_path, logger)
        engine = sqlalchemy.create_engine(sqlalchemy.engine.url.URL(**conn_dict))
    else:  # local SQLite case
        sqlite_name = "mir_profiler_tcga" + uuid + ".db"
        engine_path = "sqlite:///" + sqlite_name
        engine = sqlalchemy.create_engine(engine_path, isolation_level="SERIALIZABLE")

    # Generate TCGA formatted results
    logger.info("Beginning: TCGA formatted results generation")
    tcga_CMD = [
        "perl",
        "/home/ubuntu/bin/mirna-profiler/v0.2.7/code/custom_output/tcga/tcga.pl",
        "-d",
        connect_path,
        "-o",
        species_code,
        "-g",
        genome_version,
        "-s",
        sam_path,
        "-r",
        mirna_species,
        "-c",
        crossmapped,
        "-i",
        isoforms,
    ]
    output = pipe_util.do_command(tcga_CMD, logger)
    df = time_util.store_time(uuid, tcga_CMD, output, logger)
    df["bam_name"] = barcode
    unique_key_dict = {"uuid": uuid, "bam_name": barcode}
    table_name = "time_mem_mir_tcga"
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    logger.info("Completed: TCGA formatted results generation")
Example #34
def do_samtools_flagstat(uuid, bam_path, reference_fasta_path, engine, logger):
    step_dir = os.path.dirname(bam_path)
    bam_name = os.path.basename(bam_path)
    bam_base, bam_ext = os.path.splitext(bam_name)
    flagstat_outfile = 'samtools_flagstat_' + bam_base + '.txt'
    flagstat_path = os.path.join(step_dir, flagstat_outfile)

    if pipe_util.already_step(step_dir, 'samtools_flagstat_' + bam_base,
                              logger):
        logger.info('already completed step `samtools flagstat` of: %s' %
                    bam_path)
    else:
        logger.info('running step `samtools flagstat` of: %s' % bam_path)
        cmd = ['samtools', 'flagstat', bam_path]
        flagstat_output = pipe_util.do_command(cmd, logger)
        with open(flagstat_path, 'w') as flagstat_path_open:
            flagstat_path_open.write(flagstat_output.decode())  # write the captured output verbatim
        #save time/mem to db
        df = time_util.store_time(uuid, cmd, flagstat_output, logger)
        df['bam_path'] = bam_path
        df['reference_fasta_path'] = reference_fasta_path
        table_name = 'time_mem_samtools_flagstat'
        unique_key_dict = {
            'uuid': uuid,
            'bam_path': bam_path,
            'reference_fasta_path': reference_fasta_path
        }
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine,
                                      logger)
        pipe_util.create_already_step(step_dir,
                                      'samtools_flagstat_' + bam_base, logger)
        logger.info('completed running step `samtools flagstat` of: %s' %
                    bam_path)

    #save stats to db
    if pipe_util.already_step(step_dir,
                              'samtools_flagstat_' + bam_base + '_db', logger):
        logger.info('already stored `samtools flagstat` of %s to db' %
                    bam_path)
    else:
        data_dict = samtools_flagstat_to_dict(uuid, bam_path, flagstat_path,
                                              logger)
        data_dict['uuid'] = [uuid]
        data_dict['bam_path'] = bam_path
        data_dict['reference_fasta_path'] = reference_fasta_path
        df = pd.DataFrame(data_dict)
        table_name = 'samtools_flagstat'
        unique_key_dict = {
            'uuid': uuid,
            'bam_path': bam_path,
            'reference_fasta_path': reference_fasta_path
        }
        df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine,
                                      logger)
        pipe_util.create_already_step(step_dir,
                                      'samtools_flagstat_' + bam_base + '_db',
                                      logger)
        logger.info('completed storing `samtools flagstat` of %s to db' %
                    bam_path)
    return
Example #35
def main():
    parser = argparse.ArgumentParser('SAM alignment stats', description = 'Generate alignment stats for the miRNA in the annotated SAM file',)

    # Logging flag
    parser.add_argument('-d', '--debug',
                        action = 'store_const',
                        const = logging.DEBUG,
                        dest = 'level',
                        help = 'Enable debug logging.',
    )
    parser.set_defaults(level = logging.INFO)

    # Required flags
    parser.add_argument('-s', '--sam_path',
                        required = True,
                        help = 'Path to SAM file',
    )
    parser.add_argument('-a', '--adapter_path',
                        required = True,
                        help = 'Path to adapter report',
    )
    parser.add_argument('-u', '--uuid',
                        required = True,
                        help = 'UUID/GDC_ID for the harmonized BAM.',
    )
    parser.add_argument('-r', '--barcode',
                        required = True,
                        help = 'BAM barcode',
    )
    # Optional DB Flags
    parser.add_argument('-y', '--db_cred_s3url',
                        required = False,
                        help = 'String s3url of the postgres db_cred file',
    )
    parser.add_argument('-z', '--s3cfg_path',
                        required = False,
                        help = 'Path to the s3cfg file.',
    )
    
    args = parser.parse_args()

    sam_path = args.sam_path
    adapter_path = args.adapter_path
    uuid = args.uuid
    barcode = args.barcode

    if args.db_cred_s3url:
        db_cred_s3url = args.db_cred_s3url
        s3cfg_path = args.s3cfg_path
    else:
        db_cred_s3url = None

    logger = pipe_util.setup_logging('mir_profiler_stats', args, uuid)
    
    if db_cred_s3url is not None:
        conn_dict = pipe_util.get_connect_dict(db_cred_s3url, s3cfg_path, logger)
        engine = sqlalchemy.create_engine(sqlalchemy.engine.url.URL(**conn_dict))
    else: # local SQLite case
        sqlite_name = 'mir_profiler_stats' + uuid + '.db'
        engine_path = 'sqlite:///' + sqlite_name
        engine = sqlalchemy.create_engine(engine_path, isolation_level='SERIALIZABLE')

    # Get stats from the alignment annotations
    logger.info('Beginning: Alignment stats generation')
    stats_CMD = ['perl', '/home/ubuntu/bin/mirna-profiler/v0.2.7/code/library_stats/alignment_stats.pl', '-s', sam_path, '-a', adapter_path]
    output = pipe_util.do_command(stats_CMD, logger)
    df = time_util.store_time(uuid, stats_CMD, output, logger)
    df['bam_name'] = barcode
    unique_key_dict = {'uuid': uuid, 'bam_name': barcode}
    table_name = 'time_mem_mir_alignment_stats'
    df_util.save_df_to_sqlalchemy(df, unique_key_dict, table_name, engine, logger)
    logger.info('Completed: Alignment stats generation')
Example #36
def get_file_size(uuid,file_path,engine,logger):
    cmd=['ls','-l',file_path]
    output=pipe_util.do_command(cmd,logger)
    filesize=output.split()[4].decode()
    logger.info('%s filesize=%s' % (file_path,filesize))
    return filesize
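The same value is available without spawning a process, via the standard library (a sketch; returns a string to match the ls-based version above):

def get_file_size_stat(file_path):
    return str(os.path.getsize(file_path))  # size in bytes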