Example #1
0
    # NOTE(review): this span is the tail of a function whose header lies
    # outside this view; names such as uuid, scratch_dir, thread_count and
    # logger are presumably bound earlier in the function — confirm upstream.

    ## Open a local SQLite database (in $HOME) used for pipeline bookkeeping.
    home_dir=os.path.expanduser('~')
    db_path=os.path.join(home_dir,'vcf_pipe.sqlite')
    logger.info('db_path=%s' % db_path)
    engine_path='sqlite:///'+db_path
    logger.info('engine_path=%s' % engine_path)
    engine=sqlalchemy.create_engine(engine_path)
    # NOTE(review): conn is opened here but the pipeline steps below take
    # `engine`, not `conn` — the connection appears unused until close().
    conn=engine.connect()

    
    ##get reference genome
    # Download the reference fasta from S3 into the scratch directory.
    get_s3_objects(uuid,s3_reference_bucket,reference_fasta_name,scratch_dir,logger)
    
    reference_fasta_path=os.path.join(scratch_dir,reference_fasta_name)
    logger.info('scratch_dir=%s' % scratch_dir)
    logger.info('reference_fasta_name=%s' % reference_fasta_name)
    logger.info('reference_fasta_path=%s' % reference_fasta_path)
    
    ##get bam to be called
    # Download the source BAM and resolve its on-disk path.
    get_s3_objects(uuid,s3_bam_bucket,bam_analysis_id,scratch_dir,logger)
    bam_path=bam_util.get_bam_path(bam_analysis_id,scratch_dir,logger)

    
    ##pipeline
    # Verify the BAM against cgquery metadata, then run variant calling.
    #verify_util.verify_cgquery_md5(reference_fasta_name,logger)
    verify_util.verify_cgquery(uuid,bam_path,engine,logger)
    call_vcf.call_vcf(uuid,bam_path,reference_fasta_path,scratch_dir,engine,thread_count,logger)
    conn.close()

# Script entry point: run the pipeline only when executed directly,
# not when imported as a module.
if __name__=='__main__':
    main()
Example #2
0
def main():
    """Entry point for the BAM harmonization pipeline.

    Parses command-line flags, configures per-UUID file logging, opens a
    postgres stats/timing database (unless writing stats to csv), fetches
    the reference genome and source BAM from S3, then runs validation,
    stats collection, BAM-to-fastq conversion, and bwa-mem realignment
    (sort, merge, mark-duplicates, md5/size recording).
    """
    parser = argparse.ArgumentParser('harmonization pipeline')

    # Logging flags.
    # -d stores logging.DEBUG into args.level; default is INFO (set below).
    parser.add_argument('-d','--debug',
        action='store_const',
        const=logging.DEBUG,
        dest='level',
        help='Enable debug logging.',
    )
    parser.set_defaults(level=logging.INFO)

    # Required flags.
    parser.add_argument('-g','--s3_reference_bucket',
                        required=True,
                        help='S3 bucket name containing reference fasta.',
    )
    parser.add_argument('-b','--s3_bam_bucket',
                        required=True,
                        help='S3 bucket name containing source bam file.',
    )
    parser.add_argument('-r','--reference_fasta_name',
                        required=True,
                        help='Reference fasta S3 key name.',
    )
    parser.add_argument('-a','--bam_analysis_id',
                        required=True,
                        help='Source bam file S3 key name.',
    )
    parser.add_argument('-s','--scratch_dir',
                        required=True,
                        type=is_dir,
                        help='Scratch file directory.',
    )
    parser.add_argument('-l','--log_dir',
                        required=True,
                        type=is_dir,
                        help='Log file directory.',
    )
    parser.add_argument('-f','--s3cfg_dir',
                        required=True,
                        type=is_dir,
                        help='.s3cfg file directory.',
    )
    parser.add_argument('-t','--thread_count',
                        required=True,
                        type=is_nat,
                        help='Maximum number of threads for execution.',
    )
    parser.add_argument('-c','--csv_stats',
                        required=True,
                        help='Write to csv rather than postgres'
    )
    parser.add_argument('-u','--postgres_user',
                        required=False,
                        help='postgres username'
    )
    parser.add_argument('-p','--postgres_password',
                        required=False,
                        help='postgres password'
    )
    parser.add_argument('-n','--postgres_hostname',
                        required=False,
                        help='postgres hostname'
    )

    args = parser.parse_args()
    s3_reference_bucket = args.s3_reference_bucket
    s3_bam_bucket = args.s3_bam_bucket
    reference_fasta_name = args.reference_fasta_name
    # Strip leading/trailing slashes so S3 key names join cleanly with
    # scratch_dir via os.path.join below.
    reference_fasta_name = reference_fasta_name.strip('/')
    bam_analysis_id = args.bam_analysis_id
    bam_analysis_id = bam_analysis_id.strip('/')
    scratch_dir = args.scratch_dir
    log_dir = args.log_dir
    s3cfg_dir= args.s3cfg_dir
    # Downstream tool wrappers apparently take the thread count as a string.
    thread_count = str(args.thread_count)
    # NOTE(review): literal_eval parses the -c value (e.g. 'True'/'False');
    # an unparsable value raises ValueError here. A boolean flag
    # (action='store_true') would be safer — confirm callers.
    csv_stats = literal_eval(args.csv_stats)
    postgres_user = args.postgres_user
    postgres_password = args.postgres_password
    postgres_hostname = args.postgres_hostname


    ##logging
    # One append-mode log file per BAM UUID, written under log_dir.
    uuid=pipe_util.get_uuid_from_path(bam_analysis_id)
    logging.basicConfig(
        filename=os.path.join(log_dir,'aln_'+uuid+'.log'), #/host for docker
        level=args.level,
        filemode='a',
        format='%(asctime)s %(levelname)s %(message)s',
        datefmt='%Y-%m-%d_%H:%M:%S_%Z',
    )
    # NOTE(review): sqlalchemy.engine at INFO logs emitted SQL; with the
    # password embedded in the engine URL, verify nothing sensitive is logged.
    logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
    logger=logging.getLogger(__name__)
    hostname=os.uname()[1]
    logger.info('hostname=%s' % hostname)


    ##open stats and timing db
    # NOTE(review): `engine` is bound only when csv_stats is falsy; if
    # -c evaluates to True, the get_s3_objects()/pipeline calls below
    # raise NameError on `engine` — confirm the csv code path.
    if not csv_stats:
        if (postgres_user is None) or (postgres_password is None) or (postgres_hostname is None):
            # NOTE(review): fatal configuration error is logged at DEBUG,
            # invisible at the default INFO level — consider logger.error().
            logger.debug('must enter postgres user, password and hostname if not writing to csv (or use -c True)')
            sys.exit(1)
        engine_path='postgresql://'+postgres_user+':'+postgres_password+'@'+postgres_hostname+'/gdc_harmonize'
        engine=sqlalchemy.create_engine(engine_path,isolation_level='SERIALIZABLE')

    
    ##get reference genome
    # Download the target reference fasta from S3 into scratch_dir.
    get_s3_objects(uuid,s3_reference_bucket,reference_fasta_name,scratch_dir,s3cfg_dir,engine,logger)
    
    reference_fasta_path=os.path.join(scratch_dir,reference_fasta_name)
    logger.info('scratch_dir=%s' % scratch_dir)
    logger.info('reference_fasta_name=%s' % reference_fasta_name)
    logger.info('reference_fasta_path=%s' % reference_fasta_path)
    
    ##get bam to be harmonized
    get_s3_objects(uuid,s3_bam_bucket,bam_analysis_id,scratch_dir,s3cfg_dir,engine,logger)
    bam_path=bam_util.get_bam_path(bam_analysis_id,scratch_dir,logger)
    #get original reference genome for stats
    # The BAM header names the reference it was originally aligned to;
    # normalize to a .fa/.fasta key and fetch it for the stats step.
    original_fasta_name=bam_util.get_bam_reference(bam_path,logger)
    if not (original_fasta_name.endswith('.fa') or original_fasta_name.endswith('.fasta')):
        original_fasta_name+='.fa'
    original_fasta_path=os.path.join(scratch_dir,original_fasta_name)
    get_s3_objects(uuid,s3_reference_bucket,original_fasta_name,scratch_dir,s3cfg_dir,engine,logger)

    
    ##pipeline
    # Validate the input BAM, collect stats against its original reference,
    # convert to fastq, and record readgroups (to file and db).
    #verify_util.verify_cgquery_md5(reference_fasta_name,logger)
    verify_util.verify_cgquery(uuid,bam_path,engine,logger)
    bam_validate.bam_validate(uuid,bam_path,engine,logger)
    bam_stats.bam_stats(uuid,bam_path,original_fasta_path,engine,logger)
    bam_util.bam_to_fastq(uuid,bam_path,engine,logger)
    # NOTE(review): fastq_length is unused — its consumer (the bwa-aln
    # branch below) is commented out. Confirm before removing.
    fastq_length=fastq_validate.fastqc_validate(uuid,bam_path,thread_count,engine,logger)
    readgroup_path_dict=bam_util.write_readgroups(uuid,bam_path,engine,logger)#to file and db

    
    #MEM_ALN_CUTOFF=70
    #if fastq_length<MEM_ALN_CUTOFF: # do bwa aln...
    #    bam_path_list=bwa.bwa(uuid,bam_path,reference_fasta_path,readgroup_path_dict,thread_count,engine,logger)
    #    bam_sort_path_list=bam_sort.bam_sort(uuid,bam_path,bam_path_list,reference_fasta_path,thread_count,engine,logger)
    #    bam_merge_path=bam_merge.bam_merge(uuid,bam_path,bam_sort_path_list,engine,logger)
    #    bam_md_path=bam_mark_duplicates.bam_mark_duplicates(uuid,bam_merge_path,thread_count,engine,logger)
    #    bam_validate.bam_validate(uuid,bam_md_path,engine,logger)
    #    bam_stats.bam_stats(uuid,bam_md_path,reference_fasta_path,engine,logger)
    #    verify_util.store_md5_size(uuid,bam_md_path,engine,logger)
    #    bai_md_path=bam_md_path+'.bai'
    #    verify_util.store_md5_size(uuid,bai_md_path,engine,logger)
    
    # ...and do bwa mem
    # Realign per readgroup, then sort, merge, mark duplicates, re-validate,
    # and record md5/size of the final BAM and its index.
    bam_path_list=bwa_mem.bwa_mem(uuid,bam_path,reference_fasta_path,readgroup_path_dict,thread_count,engine,logger)
    bam_sort_path_list=bam_sort.bam_sort(uuid,bam_path,bam_path_list,reference_fasta_path,thread_count,engine,logger)
    bam_merge_path=bam_merge.bam_merge(uuid,bam_path,bam_sort_path_list,engine,logger)
    bam_md_path=bam_mark_duplicates.bam_mark_duplicates(uuid,bam_merge_path,thread_count,engine,logger)
    bam_validate.bam_validate(uuid,bam_md_path,engine,logger)
    #bam_stats.bam_stats(uuid,bam_md_path,reference_fasta_path,engine,logger)
    verify_util.store_md5_size(uuid,bam_md_path,engine,logger)
    bai_md_path=bam_md_path+'.bai'
    verify_util.store_md5_size(uuid,bai_md_path,engine,logger)