def main():
    """Run the Broad co-cleaning (indel realignment + BQSR) pipeline.

    Command-line entry point: parses flags, prepares logging and a SQLite
    status/timing database, then indexes the input bams, runs
    RealignerTargetCreator/IndelRealigner over the whole bam list, applies
    BaseRecalibrator/PrintReads per bam, and validates the results.
    """
    parser = argparse.ArgumentParser('Broad cocleaning (Inderrealignment and BQSR) pipeline')

    # Logging flags.
    parser.add_argument('-d', '--debug',
        action = 'store_const',
        const = logging.DEBUG,
        dest = 'level',
        help = 'Enable debug logging.',
    )
    parser.set_defaults(level = logging.INFO)

    # Required flags.
    parser.add_argument('-r', '--reference_fasta_name',
                        required = True,
                        help = 'Reference fasta path.',
    )
    parser.add_argument('-indel','--known_1k_genome_indel_sites',
                        required=True,
                        help='Reference INDEL path.',
    )
    parser.add_argument('-snp','--dbsnp_known_snp_sites',
                        required=True,
                        help='Reference SNP path.',
    )
    parser.add_argument('-b', '--harmonized_bam_path',
                        required = False,
                        action="append",
                        help = 'Source bam path.',
    )
    parser.add_argument('-list', '--harmonized_bam_list_path',
                        required = False,
                        help = 'Source bam list path.',
    )
    parser.add_argument('-s', '--scratch_dir',
                        required = False,
                        type = is_dir,
                        help = 'Scratch file directory.',
    )
    parser.add_argument('-l', '--log_dir',
                        required = False,
                        type = is_dir,
                        help = 'Log file directory.',
    )
    parser.add_argument('-j', '--thread_count',
                        required = True,
                        type = is_nat,
                        help = 'Maximum number of threads for execution.',
    )
    parser.add_argument('-u', '--uuid',
                        required = True,
                        help = 'analysis_id string',
    )
    parser.add_argument('-m', '--md5',
                        required = False,
                        action = 'store_true',
                        help = 'calculate final size/MD5',
    )
    parser.add_argument('-e', '--eliminate_intermediate_files',
                        required = False,
                        action = 'store_true',
                        help = 'do not (really) reduce disk usage. set if you want to use more disk space!'
    )

    args = parser.parse_args()
    reference_fasta_name = args.reference_fasta_name
    known_1k_genome_indel_sites = args.known_1k_genome_indel_sites
    dbsnp_known_snp_sites = args.dbsnp_known_snp_sites
    uuid = args.uuid
    harmonized_bam_path = args.harmonized_bam_path

    # At least one input source is required; previously omitting both flags
    # crashed with a TypeError on `harmonized_bam_path[0]` below.
    if not harmonized_bam_path and not args.harmonized_bam_list_path:
        parser.error('one of -b/--harmonized_bam_path or -list/--harmonized_bam_list_path is required')

    if not args.harmonized_bam_list_path:
        # No list file given: write one next to the first bam so downstream
        # GATK steps can consume a single .list file.
        list_dir = os.path.dirname(harmonized_bam_path[0])
        harmonized_bam_list_path = os.path.join(list_dir, uuid + '_harmonized_bam_list.list')
        with open(harmonized_bam_list_path, "w") as handle:
            for bam in harmonized_bam_path:
                handle.write(bam + "\n")
    else:
        harmonized_bam_list_path = args.harmonized_bam_list_path

    # Scratch and log directories default to the bam list's directory.
    scratch_dir = args.scratch_dir if args.scratch_dir else os.path.dirname(harmonized_bam_list_path)
    log_dir = args.log_dir if args.log_dir else os.path.dirname(harmonized_bam_list_path)
    thread_count = str(args.thread_count)
    # NOTE: intermediate files are removed by default; passing -e *disables*
    # removal (see the flag's help text).
    eliminate_intermediate_files = not args.eliminate_intermediate_files
    md5 = bool(args.md5)

    ##logging
    logging.basicConfig(
        filename=os.path.join(log_dir, 'Broad_cocleaning_' + uuid + '.log'),  # /host for docker
        level=args.level,
        filemode='a',
        format='%(asctime)s %(levelname)s %(message)s',
        datefmt='%Y-%m-%d_%H:%M:%S_%Z',
    )
    logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
    logger = logging.getLogger(__name__)
    hostname = os.uname()[1]
    logger.info('hostname=%s' % hostname)
    logger.info('harmonized_bam_list_path=%s' % harmonized_bam_list_path)
    if not args.harmonized_bam_path:
        # Only a list file was supplied: recover the individual bam paths.
        with open(harmonized_bam_list_path) as f:
            harmonized_bam_path = f.read().splitlines()
    for path in harmonized_bam_path:
        logger.info('harmonized_bam_path=%s' % path)

    # SQLite database recording per-step status and timing.
    engine_path = 'sqlite:///' + os.path.join(log_dir, uuid + '_Broad_cocleaning.db')
    engine = sqlalchemy.create_engine(engine_path, isolation_level='SERIALIZABLE')

    ##Pipeline
    #check .bai file, call samtools index if not exist
    RealignerTargetCreator.index(uuid, harmonized_bam_list_path, engine, logger)

    #call RealignerTargetCreator for harmonized bam list
    harmonized_bam_intervals_path = RealignerTargetCreator.RTC(uuid, harmonized_bam_list_path, thread_count, reference_fasta_name, known_1k_genome_indel_sites, engine, logger)

    #call IndelRealigner together but save the reads in the output coresponding to the input that the read came from.
    harmonized_IR_bam_list_path = IndelRealigner.IR(uuid, harmonized_bam_list_path, reference_fasta_name, known_1k_genome_indel_sites, harmonized_bam_intervals_path, engine, logger)

    #call BQSR table individually and apply it on bam
    Analysis_ready_bam_list_path = []
    for bam in harmonized_IR_bam_list_path:
        harmonized_IR_bam_BQSR_table_path = BaseRecalibrator.BQSR(uuid, bam, thread_count, reference_fasta_name, dbsnp_known_snp_sites, engine, logger)
        Analysis_ready_bam_path = PrintReads.PR(uuid, bam, thread_count, reference_fasta_name, harmonized_IR_bam_BQSR_table_path, engine, logger)
        bam_validate.bam_validate(uuid, Analysis_ready_bam_path, engine, logger)
        Analysis_ready_bam_list_path.append(Analysis_ready_bam_path)

    if md5:
        # Record final size/MD5 for each analysis-ready bam and its .bai index.
        for bam in Analysis_ready_bam_list_path:
            bam_name = os.path.basename(bam)
            bam_dir = os.path.dirname(bam)
            bam_basename, bam_ext = os.path.splitext(bam_name)
            bai_name = bam_basename + '.bai'
            bai_path = os.path.join(bam_dir, bai_name)
            verify_util.store_md5_size(uuid, bam, engine, logger)
            verify_util.store_md5_size(uuid, bai_path, engine, logger)

    if eliminate_intermediate_files:
        pipe_util.remove_file_list(uuid, harmonized_IR_bam_list_path, engine, logger)

    # Final validation pass over the analysis-ready bams.
    for bam in Analysis_ready_bam_list_path:
        validate_file = bam_validate.bam_validate(uuid, bam, engine, logger)
Beispiel #2
0
def main():
    """Run the miRNA harmonization pipeline.

    Command-line entry point: validates the input bam, extracts fastq and
    readgroups, realigns with bwa, sorts and merges per-readgroup bams, and
    records stats in a per-analysis SQLite database.
    """
    parser = argparse.ArgumentParser('miRNA harmonization')

    # Logging flag
    parser.add_argument(
        '-d',
        '--debug',
        action='store_const',
        const=logging.DEBUG,
        dest='level',
        help='Enable debug logging.',
    )
    parser.set_defaults(level=logging.INFO)

    # Required flags
    parser.add_argument(
        '-r',
        '--reference_fasta_path',
        required=False,
        help='Reference fasta path.',
    )
    parser.add_argument(
        '-b',
        '--bam_path',
        nargs='?',
        default=[sys.stdin],
        help='Source bam path.',
    )
    parser.add_argument(
        '-l',
        '--log_dir',
        required=False,
        type=is_dir,
        help='Log file directory.',
    )
    parser.add_argument(
        '-u',
        '--uuid',
        required=False,
        help='analysis_id string',
    )
    args = parser.parse_args()

    reference_fasta_path = args.reference_fasta_path
    preharmonized_bam_path = args.bam_path
    log_dir = args.log_dir
    uuid = args.uuid

    # -b defaults to [sys.stdin], which is a list (not a filesystem path) and
    # previously crashed in the os.path calls below; require a real path.
    if not isinstance(preharmonized_bam_path, str):
        parser.error('-b/--bam_path must be given a bam file path')
    # log_dir and uuid are marked optional but are used unconditionally to
    # build the log and database paths; fail fast instead of a TypeError.
    if log_dir is None:
        parser.error('-l/--log_dir is required')
    if uuid is None:
        parser.error('-u/--uuid is required')

    # Logging Setup
    logging.basicConfig(
        filename=os.path.join(log_dir, 'aln_' + uuid + '.log'),
        filemode='a',
        level=args.level,
        format='%(asctime)s %(levelname)s %(message)s',
        datefmt='%Y-%m-%d_%H:%M:%S_%Z',
    )
    logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
    logger = logging.getLogger(__name__)
    hostname = os.uname()[1]
    logger.info('hostname=%s' % hostname)
    logger.info('preharmonized_bam_path=%s' % preharmonized_bam_path)

    # SQLite database recording per-step status and timing.
    engine_path = 'sqlite:///' + os.path.join(log_dir, uuid + '_harmonize.db')
    engine = sqlalchemy.create_engine(engine_path,
                                      isolation_level='SERIALIZABLE')

    # Pre-flight: validate input and record flagstat before touching it.
    bam_validate.bam_validate(uuid, preharmonized_bam_path, engine, logger)
    bam_stats.do_samtools_flagstat(uuid, preharmonized_bam_path,
                                   reference_fasta_path, engine, logger)
    readgroup_path_dict = bam_util.write_readgroups(uuid,
                                                    preharmonized_bam_path,
                                                    engine, logger)
    bam_util.bam_to_fastq(uuid, preharmonized_bam_path, engine, logger)

    # bam_to_fastq writes into a 'fastq' directory next to the source bam.
    top_dir = os.path.dirname(preharmonized_bam_path)
    fastq_dir = os.path.join(top_dir, 'fastq')

    fastq_validate.fastq_validate(uuid, fastq_dir, engine, logger)

    # Harmonization
    be_lenient = False
    harmonized_readgroup_bam_path_list = bam_util.bwa(uuid,
                                                      preharmonized_bam_path,
                                                      reference_fasta_path,
                                                      readgroup_path_dict,
                                                      engine, logger)

    fastq_list = fastq_util.buildfastqlist(fastq_dir, logger)
    fastq_path_list = [os.path.join(fastq_dir, fastq) for fastq in fastq_list]

    # Relax downstream picard validation if any readgroup came from bwa aln.
    for harmonized_readgroup_bam_path in harmonized_readgroup_bam_path_list:
        if pipe_util.is_aln_bam(harmonized_readgroup_bam_path, logger):
            be_lenient = True

    harmonized_sorted_bam_path_list = picard_bam_sort.bam_sort(
        uuid, preharmonized_bam_path, harmonized_readgroup_bam_path_list,
        reference_fasta_path, engine, logger, be_lenient)

    harmonized_bam_merge_path = picard_bam_merge.bam_merge(
        uuid, preharmonized_bam_path, harmonized_sorted_bam_path_list, engine,
        logger, be_lenient)

    # Post-flight: validate the merged result and record its flagstat.
    bam_validate.bam_validate(uuid, harmonized_bam_merge_path, engine, logger)
    bam_stats.do_samtools_flagstat(uuid, harmonized_bam_merge_path,
                                   reference_fasta_path, engine, logger)
Beispiel #3
0
def main():
    """Run the miRNA harmonization pipeline.

    Command-line entry point: validates the input bam, extracts fastq and
    readgroups, realigns with bwa, sorts and merges per-readgroup bams, and
    records stats in a per-analysis SQLite database.
    """
    parser = argparse.ArgumentParser('miRNA harmonization')

    # Logging flag
    parser.add_argument('-d', '--debug',
                        action = 'store_const',
                        const = logging.DEBUG,
                        dest = 'level',
                        help = 'Enable debug logging.',
    )
    parser.set_defaults(level = logging.INFO)

    # Required flags
    parser.add_argument('-r', '--reference_fasta_path',
                        required = False,
                        help = 'Reference fasta path.',
    )
    parser.add_argument('-b', '--bam_path',
                        nargs = '?',
                        default = [sys.stdin],
                        help = 'Source bam path.',
    )
    parser.add_argument('-l', '--log_dir',
                        required = False,
                        type = is_dir,
                        help = 'Log file directory.',
    )
    parser.add_argument('-u', '--uuid',
                        required = False,
                        help = 'analysis_id string',
    )
    args = parser.parse_args()

    reference_fasta_path = args.reference_fasta_path
    preharmonized_bam_path = args.bam_path
    log_dir = args.log_dir
    uuid = args.uuid

    # -b defaults to [sys.stdin], which is a list (not a filesystem path) and
    # previously crashed in the os.path calls below; require a real path.
    if not isinstance(preharmonized_bam_path, str):
        parser.error('-b/--bam_path must be given a bam file path')
    # log_dir and uuid are marked optional but are used unconditionally to
    # build the log and database paths; fail fast instead of a TypeError.
    if log_dir is None:
        parser.error('-l/--log_dir is required')
    if uuid is None:
        parser.error('-u/--uuid is required')

    # Logging Setup
    logging.basicConfig(
        filename = os.path.join(log_dir, 'aln_' + uuid + '.log'),
        filemode = 'a',
        level = args.level,
        format='%(asctime)s %(levelname)s %(message)s',
        datefmt='%Y-%m-%d_%H:%M:%S_%Z',
    )
    logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
    logger = logging.getLogger(__name__)
    hostname = os.uname()[1]
    logger.info('hostname=%s' % hostname)
    logger.info('preharmonized_bam_path=%s' % preharmonized_bam_path)

    # SQLite database recording per-step status and timing.
    engine_path = 'sqlite:///' + os.path.join(log_dir, uuid + '_harmonize.db')
    engine = sqlalchemy.create_engine(engine_path, isolation_level='SERIALIZABLE')

    # Pre-flight: validate input and record flagstat before touching it.
    bam_validate.bam_validate(uuid, preharmonized_bam_path, engine, logger)
    bam_stats.do_samtools_flagstat(uuid, preharmonized_bam_path, reference_fasta_path, engine, logger)
    readgroup_path_dict = bam_util.write_readgroups(uuid, preharmonized_bam_path, engine, logger)
    bam_util.bam_to_fastq(uuid, preharmonized_bam_path, engine, logger)

    # bam_to_fastq writes into a 'fastq' directory next to the source bam.
    top_dir = os.path.dirname(preharmonized_bam_path)
    fastq_dir = os.path.join(top_dir, 'fastq')

    fastq_validate.fastq_validate(uuid, fastq_dir, engine, logger)

    # Harmonization
    be_lenient = False
    harmonized_readgroup_bam_path_list = bam_util.bwa(uuid, preharmonized_bam_path, reference_fasta_path, readgroup_path_dict, engine, logger)

    fastq_list = fastq_util.buildfastqlist(fastq_dir, logger)
    fastq_path_list = [os.path.join(fastq_dir, fastq) for fastq in fastq_list]

    # Relax downstream picard validation if any readgroup came from bwa aln.
    for harmonized_readgroup_bam_path in harmonized_readgroup_bam_path_list:
        if pipe_util.is_aln_bam(harmonized_readgroup_bam_path, logger):
            be_lenient = True

    harmonized_sorted_bam_path_list = picard_bam_sort.bam_sort(uuid, preharmonized_bam_path, harmonized_readgroup_bam_path_list, reference_fasta_path, engine, logger, be_lenient)

    harmonized_bam_merge_path = picard_bam_merge.bam_merge(uuid, preharmonized_bam_path, harmonized_sorted_bam_path_list, engine, logger, be_lenient)

    # Post-flight: validate the merged result and record its flagstat.
    bam_validate.bam_validate(uuid, harmonized_bam_merge_path, engine, logger)
    bam_stats.do_samtools_flagstat(uuid, harmonized_bam_merge_path, reference_fasta_path, engine, logger)
Beispiel #4
0
def main():
    """Run the S3-backed harmonization pipeline.

    Command-line entry point: downloads the reference fasta and source bam
    from S3, validates the bam, extracts fastq/readgroups, realigns with
    bwa mem, sorts/merges/mark-duplicates the result, and stores MD5/size
    records in a postgres stats database.
    """
    parser = argparse.ArgumentParser('harmonization pipeline')

    # Logging flags.
    parser.add_argument('-d','--debug',
        action='store_const',
        const=logging.DEBUG,
        dest='level',
        help='Enable debug logging.',
    )
    parser.set_defaults(level=logging.INFO)

    # Required flags.
    parser.add_argument('-g','--s3_reference_bucket',
                        required=True,
                        help='S3 bucket name containing reference fasta.',
    )
    parser.add_argument('-b','--s3_bam_bucket',
                        required=True,
                        help='S3 bucket name containing source bam file.',
    )
    parser.add_argument('-r','--reference_fasta_name',
                        required=True,
                        help='Reference fasta S3 key name.',
    )
    parser.add_argument('-a','--bam_analysis_id',
                        required=True,
                        help='Source bam file S3 key name.',
    )
    parser.add_argument('-s','--scratch_dir',
                        required=True,
                        type=is_dir,
                        help='Scratch file directory.',
    )
    parser.add_argument('-l','--log_dir',
                        required=True,
                        type=is_dir,
                        help='Log file directory.',
    )
    parser.add_argument('-f','--s3cfg_dir',
                        required=True,
                        type=is_dir,
                        help='.s3cfg file directory.',
    )
    parser.add_argument('-t','--thread_count',
                        required=True,
                        type=is_nat,
                        help='Maximum number of threads for execution.',
    )
    parser.add_argument('-c','--csv_stats',
                        required=True,
                        help='Write to csv rather than postgres'
    )
    parser.add_argument('-u','--postgres_user',
                        required=False,
                        help='postgres username'
    )
    parser.add_argument('-p','--postgres_password',
                        required=False,
                        help='postgres password'
    )
    parser.add_argument('-n','--postgres_hostname',
                        required=False,
                        help='postgres hostname'
    )

    args = parser.parse_args()
    s3_reference_bucket = args.s3_reference_bucket
    s3_bam_bucket = args.s3_bam_bucket
    # S3 key names must not carry leading/trailing slashes.
    reference_fasta_name = args.reference_fasta_name.strip('/')
    bam_analysis_id = args.bam_analysis_id.strip('/')
    scratch_dir = args.scratch_dir
    log_dir = args.log_dir
    s3cfg_dir = args.s3cfg_dir
    thread_count = str(args.thread_count)
    # -c is parsed with literal_eval so callers can pass Python literals
    # such as True/False.
    csv_stats = literal_eval(args.csv_stats)
    postgres_user = args.postgres_user
    postgres_password = args.postgres_password
    postgres_hostname = args.postgres_hostname

    ##logging
    uuid = pipe_util.get_uuid_from_path(bam_analysis_id)
    logging.basicConfig(
        filename=os.path.join(log_dir,'aln_'+uuid+'.log'), #/host for docker
        level=args.level,
        filemode='a',
        format='%(asctime)s %(levelname)s %(message)s',
        datefmt='%Y-%m-%d_%H:%M:%S_%Z',
    )
    logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
    logger = logging.getLogger(__name__)
    hostname = os.uname()[1]
    logger.info('hostname=%s' % hostname)

    ##open stats and timing db
    if not csv_stats:
        if (postgres_user is None) or (postgres_password is None) or (postgres_hostname is None):
            # Fatal misconfiguration: log at ERROR (was DEBUG, which is
            # normally filtered out) before exiting.
            logger.error('must enter postgres user, password and hostname if not writing to csv (or use -c True)')
            sys.exit(1)
        engine_path='postgresql://'+postgres_user+':'+postgres_password+'@'+postgres_hostname+'/gdc_harmonize'
        engine=sqlalchemy.create_engine(engine_path,isolation_level='SERIALIZABLE')
    else:
        # The csv branch never created `engine`, yet every step below needs
        # one, so -c True used to die with a NameError at the first
        # get_s3_objects() call. Fail fast with a clear message instead.
        logger.error('csv stats output is not implemented; rerun with -c False and postgres credentials')
        sys.exit(1)

    ##get reference genome
    get_s3_objects(uuid,s3_reference_bucket,reference_fasta_name,scratch_dir,s3cfg_dir,engine,logger)

    reference_fasta_path=os.path.join(scratch_dir,reference_fasta_name)
    logger.info('scratch_dir=%s' % scratch_dir)
    logger.info('reference_fasta_name=%s' % reference_fasta_name)
    logger.info('reference_fasta_path=%s' % reference_fasta_path)

    ##get bam to be harmonized
    get_s3_objects(uuid,s3_bam_bucket,bam_analysis_id,scratch_dir,s3cfg_dir,engine,logger)
    bam_path=bam_util.get_bam_path(bam_analysis_id,scratch_dir,logger)
    #get original reference genome for stats
    original_fasta_name=bam_util.get_bam_reference(bam_path,logger)
    if not (original_fasta_name.endswith('.fa') or original_fasta_name.endswith('.fasta')):
        original_fasta_name+='.fa'
    original_fasta_path=os.path.join(scratch_dir,original_fasta_name)
    get_s3_objects(uuid,s3_reference_bucket,original_fasta_name,scratch_dir,s3cfg_dir,engine,logger)

    ##pipeline
    #verify_util.verify_cgquery_md5(reference_fasta_name,logger)
    verify_util.verify_cgquery(uuid,bam_path,engine,logger)
    bam_validate.bam_validate(uuid,bam_path,engine,logger)
    # Stats against the *original* reference the bam was aligned to.
    bam_stats.bam_stats(uuid,bam_path,original_fasta_path,engine,logger)
    bam_util.bam_to_fastq(uuid,bam_path,engine,logger)
    fastq_length=fastq_validate.fastqc_validate(uuid,bam_path,thread_count,engine,logger)
    readgroup_path_dict=bam_util.write_readgroups(uuid,bam_path,engine,logger)#to file and db

    #MEM_ALN_CUTOFF=70
    #if fastq_length<MEM_ALN_CUTOFF: # do bwa aln...
    #    bam_path_list=bwa.bwa(uuid,bam_path,reference_fasta_path,readgroup_path_dict,thread_count,engine,logger)
    #    bam_sort_path_list=bam_sort.bam_sort(uuid,bam_path,bam_path_list,reference_fasta_path,thread_count,engine,logger)
    #    bam_merge_path=bam_merge.bam_merge(uuid,bam_path,bam_sort_path_list,engine,logger)
    #    bam_md_path=bam_mark_duplicates.bam_mark_duplicates(uuid,bam_merge_path,thread_count,engine,logger)
    #    bam_validate.bam_validate(uuid,bam_md_path,engine,logger)
    #    bam_stats.bam_stats(uuid,bam_md_path,reference_fasta_path,engine,logger)
    #    verify_util.store_md5_size(uuid,bam_md_path,engine,logger)
    #    bai_md_path=bam_md_path+'.bai'
    #    verify_util.store_md5_size(uuid,bai_md_path,engine,logger)

    # ...and do bwa mem
    bam_path_list=bwa_mem.bwa_mem(uuid,bam_path,reference_fasta_path,readgroup_path_dict,thread_count,engine,logger)
    bam_sort_path_list=bam_sort.bam_sort(uuid,bam_path,bam_path_list,reference_fasta_path,thread_count,engine,logger)
    bam_merge_path=bam_merge.bam_merge(uuid,bam_path,bam_sort_path_list,engine,logger)
    bam_md_path=bam_mark_duplicates.bam_mark_duplicates(uuid,bam_merge_path,thread_count,engine,logger)
    bam_validate.bam_validate(uuid,bam_md_path,engine,logger)
    #bam_stats.bam_stats(uuid,bam_md_path,reference_fasta_path,engine,logger)
    # Record size/MD5 of the final bam and its index.
    verify_util.store_md5_size(uuid,bam_md_path,engine,logger)
    bai_md_path=bam_md_path+'.bai'
    verify_util.store_md5_size(uuid,bai_md_path,engine,logger)